diff --git a/.gitignore b/.gitignore index 7afd412dadd2..265959544978 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ *.symversions *.tab.[ch] *.tar +*.usyms *.xz *.zst Module.symvers diff --git a/BUILD.bazel b/BUILD.bazel new file mode 100644 index 000000000000..484c2d6c7f79 --- /dev/null +++ b/BUILD.bazel @@ -0,0 +1,566 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2021 The Android Open Source Project +load("//build/bazel_common_rules/dist:dist.bzl", "copy_to_dist_dir") +load("//build/kernel/kleaf:common_kernels.bzl", "define_common_kernels", "define_db845c") +load( + "//build/kernel/kleaf:kernel.bzl", + "ddk_headers", + "kernel_abi", + "kernel_build", + "kernel_images", + "kernel_modules_install", + "kernel_unstripped_modules_archive", +) +load(":modules.bzl", "COMMON_GKI_MODULES_LIST") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +_aarch64_additional_kmi_symbol_lists = [ + # keep sorted + "android/abi_gki_aarch64_db845c", + "android/abi_gki_aarch64_exynos", + "android/abi_gki_aarch64_pixel", + "android/abi_gki_aarch64_virtual_device", +] + +define_common_kernels(target_configs = { + "kernel_aarch64": { + # TODO(b/188620248): re-enable trimming + "trim_nonlisted_kmi": False, + "kmi_symbol_list_strict_mode": False, + "module_implicit_outs": COMMON_GKI_MODULES_LIST, + "kmi_symbol_list": "android/abi_gki_aarch64", + "additional_kmi_symbol_lists": _aarch64_additional_kmi_symbol_lists, + }, + "kernel_aarch64_16k": { + "module_implicit_outs": COMMON_GKI_MODULES_LIST, + }, + "kernel_aarch64_debug": { + # TODO(b/188620248): re-enable trimming + "trim_nonlisted_kmi": False, + "kmi_symbol_list_strict_mode": False, + "module_implicit_outs": COMMON_GKI_MODULES_LIST, + "kmi_symbol_list": "android/abi_gki_aarch64", + "additional_kmi_symbol_lists": _aarch64_additional_kmi_symbol_lists, + }, + "kernel_x86_64": { + "kmi_symbol_list_strict_mode": False, + "module_implicit_outs": COMMON_GKI_MODULES_LIST, + }, + "kernel_x86_64_debug": { + "kmi_symbol_list_strict_mode": False, + "module_implicit_outs": COMMON_GKI_MODULES_LIST, + }, +}) + +define_db845c( + name = "db845c", + outs = [ + "arch/arm64/boot/dts/qcom/qrb5165-rb5.dtb", + "arch/arm64/boot/dts/qcom/sdm845-db845c.dtb", + ], + define_abi_targets = True, + kmi_symbol_list = "//common:android/abi_gki_aarch64_db845c", + kmi_symbol_list_add_only = True, + module_outs = [ + # keep sorted + "crypto/michael_mic.ko", + "drivers/base/regmap/regmap-sdw.ko", + "drivers/base/regmap/regmap-slimbus.ko", + "drivers/bus/mhi/core/mhi.ko", + "drivers/clk/qcom/clk-qcom.ko", + "drivers/clk/qcom/clk-rpmh.ko", + "drivers/clk/qcom/clk-spmi-pmic-div.ko", + "drivers/clk/qcom/dispcc-sdm845.ko", + "drivers/clk/qcom/dispcc-sm8250.ko", + "drivers/clk/qcom/gcc-sdm845.ko", + "drivers/clk/qcom/gcc-sm8250.ko", + "drivers/clk/qcom/gpucc-sdm845.ko", + "drivers/clk/qcom/gpucc-sm8250.ko", + "drivers/clk/qcom/lpass-gfm-sm8250.ko", + "drivers/clk/qcom/videocc-sdm845.ko", + "drivers/clk/qcom/videocc-sm8250.ko", + "drivers/cpufreq/qcom-cpufreq-hw.ko", + "drivers/dma-buf/heaps/system_heap.ko", + "drivers/dma/qcom/bam_dma.ko", + "drivers/extcon/extcon-usb-gpio.ko", + "drivers/firmware/qcom-scm.ko", + "drivers/gpio/gpio-wcd934x.ko", + "drivers/gpu/drm/bridge/display-connector.ko", + "drivers/gpu/drm/bridge/lontium-lt9611.ko", + "drivers/gpu/drm/bridge/lontium-lt9611uxc.ko", + "drivers/gpu/drm/msm/msm.ko", + "drivers/gpu/drm/scheduler/gpu-sched.ko", + "drivers/hwspinlock/qcom_hwspinlock.ko", + "drivers/i2c/busses/i2c-designware-core.ko", + 
"drivers/i2c/busses/i2c-designware-platform.ko", + "drivers/i2c/busses/i2c-qcom-geni.ko", + "drivers/i2c/busses/i2c-qup.ko", + "drivers/i2c/busses/i2c-rk3x.ko", + "drivers/i2c/i2c-dev.ko", + "drivers/i2c/i2c-mux.ko", + "drivers/i2c/muxes/i2c-mux-pca954x.ko", + "drivers/iio/adc/qcom-spmi-adc5.ko", + "drivers/iio/adc/qcom-vadc-common.ko", + "drivers/input/misc/pm8941-pwrkey.ko", + "drivers/interconnect/qcom/icc-bcm-voter.ko", + "drivers/interconnect/qcom/icc-osm-l3.ko", + "drivers/interconnect/qcom/icc-rpmh.ko", + "drivers/interconnect/qcom/qnoc-sdm845.ko", + "drivers/interconnect/qcom/qnoc-sm8250.ko", + "drivers/iommu/arm/arm-smmu/arm_smmu.ko", + "drivers/irqchip/qcom-pdc.ko", + "drivers/leds/led-class-multicolor.ko", + "drivers/mailbox/qcom-apcs-ipc-mailbox.ko", + "drivers/mailbox/qcom-ipcc.ko", + "drivers/mfd/qcom-spmi-pmic.ko", + "drivers/mfd/wcd934x.ko", + "drivers/misc/fastrpc.ko", + "drivers/mmc/host/cqhci.ko", + "drivers/mmc/host/sdhci-msm.ko", + "drivers/net/can/spi/mcp251xfd/mcp251xfd.ko", + "drivers/net/wireless/ath/ath.ko", + "drivers/net/wireless/ath/ath10k/ath10k_core.ko", + "drivers/net/wireless/ath/ath10k/ath10k_pci.ko", + "drivers/net/wireless/ath/ath10k/ath10k_snoc.ko", + "drivers/net/wireless/ath/ath11k/ath11k.ko", + "drivers/net/wireless/ath/ath11k/ath11k_ahb.ko", + "drivers/net/wireless/ath/ath11k/ath11k_pci.ko", + "drivers/nvmem/nvmem_qfprom.ko", + "drivers/phy/qualcomm/phy-qcom-qmp.ko", + "drivers/phy/qualcomm/phy-qcom-qusb2.ko", + "drivers/phy/qualcomm/phy-qcom-snps-femto-v2.ko", + "drivers/phy/qualcomm/phy-qcom-usb-hs.ko", + "drivers/pinctrl/qcom/pinctrl-lpass-lpi.ko", + "drivers/pinctrl/qcom/pinctrl-msm.ko", + "drivers/pinctrl/qcom/pinctrl-sdm845.ko", + "drivers/pinctrl/qcom/pinctrl-sm8250.ko", + "drivers/pinctrl/qcom/pinctrl-spmi-gpio.ko", + "drivers/pinctrl/qcom/pinctrl-spmi-mpp.ko", + "drivers/power/reset/qcom-pon.ko", + "drivers/power/reset/reboot-mode.ko", + "drivers/power/reset/syscon-reboot-mode.ko", + "drivers/regulator/gpio-regulator.ko", + "drivers/regulator/qcom-rpmh-regulator.ko", + "drivers/regulator/qcom_spmi-regulator.ko", + "drivers/regulator/qcom_usb_vbus-regulator.ko", + "drivers/remoteproc/qcom_common.ko", + "drivers/remoteproc/qcom_pil_info.ko", + "drivers/remoteproc/qcom_q6v5.ko", + "drivers/remoteproc/qcom_q6v5_adsp.ko", + "drivers/remoteproc/qcom_q6v5_mss.ko", + "drivers/remoteproc/qcom_q6v5_pas.ko", + "drivers/remoteproc/qcom_q6v5_wcss.ko", + "drivers/remoteproc/qcom_sysmon.ko", + "drivers/reset/reset-qcom-aoss.ko", + "drivers/reset/reset-qcom-pdc.ko", + "drivers/rpmsg/qcom_glink.ko", + "drivers/rpmsg/qcom_glink_rpm.ko", + "drivers/rpmsg/qcom_glink_smem.ko", + "drivers/rpmsg/qcom_smd.ko", + "drivers/rpmsg/rpmsg_ns.ko", + "drivers/rtc/rtc-pm8xxx.ko", + "drivers/slimbus/slim-qcom-ngd-ctrl.ko", + "drivers/slimbus/slimbus.ko", + "drivers/soc/qcom/apr.ko", + "drivers/soc/qcom/cmd-db.ko", + "drivers/soc/qcom/llcc-qcom.ko", + "drivers/soc/qcom/mdt_loader.ko", + "drivers/soc/qcom/pdr_interface.ko", + "drivers/soc/qcom/qcom_aoss.ko", + "drivers/soc/qcom/qcom_rpmh.ko", + "drivers/soc/qcom/qmi_helpers.ko", + "drivers/soc/qcom/rmtfs_mem.ko", + "drivers/soc/qcom/rpmhpd.ko", + "drivers/soc/qcom/smem.ko", + "drivers/soc/qcom/smp2p.ko", + "drivers/soc/qcom/smsm.ko", + "drivers/soc/qcom/socinfo.ko", + "drivers/soundwire/soundwire-bus.ko", + "drivers/soundwire/soundwire-qcom.ko", + "drivers/spi/spi-geni-qcom.ko", + "drivers/spi/spi-pl022.ko", + "drivers/spi/spi-qcom-qspi.ko", + "drivers/spi/spi-qup.ko", + "drivers/spmi/spmi-pmic-arb.ko", + 
"drivers/thermal/qcom/lmh.ko", + "drivers/thermal/qcom/qcom-spmi-adc-tm5.ko", + "drivers/thermal/qcom/qcom-spmi-temp-alarm.ko", + "drivers/thermal/qcom/qcom_tsens.ko", + "drivers/tty/serial/msm_serial.ko", + "drivers/ufs/host/ufs_qcom.ko", + "drivers/usb/common/ulpi.ko", + "drivers/usb/host/ohci-hcd.ko", + "drivers/usb/host/ohci-pci.ko", + "drivers/usb/host/ohci-platform.ko", + "drivers/usb/typec/qcom-pmic-typec.ko", + "drivers/watchdog/pm8916_wdt.ko", + "drivers/watchdog/qcom-wdt.ko", + "net/qrtr/ns.ko", + "net/qrtr/qrtr.ko", + "net/qrtr/qrtr-mhi.ko", + "net/qrtr/qrtr-smd.ko", + "net/qrtr/qrtr-tun.ko", + "sound/soc/codecs/snd-soc-dmic.ko", + "sound/soc/codecs/snd-soc-hdmi-codec.ko", + "sound/soc/codecs/snd-soc-lpass-va-macro.ko", + "sound/soc/codecs/snd-soc-lpass-wsa-macro.ko", + "sound/soc/codecs/snd-soc-max98927.ko", + "sound/soc/codecs/snd-soc-rl6231.ko", + "sound/soc/codecs/snd-soc-rt5663.ko", + "sound/soc/codecs/snd-soc-wcd-mbhc.ko", + "sound/soc/codecs/snd-soc-wcd9335.ko", + "sound/soc/codecs/snd-soc-wcd934x.ko", + "sound/soc/codecs/snd-soc-wsa881x.ko", + "sound/soc/qcom/qdsp6/q6adm.ko", + "sound/soc/qcom/qdsp6/q6afe.ko", + "sound/soc/qcom/qdsp6/q6afe-clocks.ko", + "sound/soc/qcom/qdsp6/q6afe-dai.ko", + "sound/soc/qcom/qdsp6/q6asm.ko", + "sound/soc/qcom/qdsp6/q6asm-dai.ko", + "sound/soc/qcom/qdsp6/q6core.ko", + "sound/soc/qcom/qdsp6/q6dsp-common.ko", + "sound/soc/qcom/qdsp6/q6routing.ko", + "sound/soc/qcom/snd-soc-qcom-common.ko", + "sound/soc/qcom/snd-soc-sdm845.ko", + "sound/soc/qcom/snd-soc-sm8250.ko", + ], +) + +# TODO(b/258259749): Convert rockpi4 to mixed build +kernel_build( + name = "rockpi4", + outs = [ + "Image", + "System.map", + "modules.builtin", + "modules.builtin.modinfo", + "rk3399-rock-pi-4b.dtb", + "vmlinux", + "vmlinux.symvers", + ], + build_config = "build.config.rockpi4", + collect_unstripped_modules = True, + kmi_symbol_list = "//common:android/abi_gki_rockpi4", + module_outs = COMMON_GKI_MODULES_LIST + [ + # keep sorted + "drivers/block/virtio_blk.ko", + "drivers/char/hw_random/virtio-rng.ko", + "drivers/clk/clk-rk808.ko", + "drivers/cpufreq/cpufreq-dt.ko", + "drivers/dma/pl330.ko", + "drivers/gpu/drm/bridge/analogix/analogix_dp.ko", + "drivers/gpu/drm/bridge/synopsys/dw-hdmi.ko", + "drivers/gpu/drm/bridge/synopsys/dw-mipi-dsi.ko", + "drivers/gpu/drm/rockchip/rockchipdrm.ko", + "drivers/i2c/busses/i2c-rk3x.ko", + "drivers/iio/adc/rockchip_saradc.ko", + "drivers/iio/buffer/industrialio-triggered-buffer.ko", + "drivers/iio/buffer/kfifo_buf.ko", + "drivers/mfd/rk808.ko", + "drivers/mmc/core/pwrseq_simple.ko", + "drivers/mmc/host/cqhci.ko", + "drivers/mmc/host/dw_mmc.ko", + "drivers/mmc/host/dw_mmc-pltfm.ko", + "drivers/mmc/host/dw_mmc-rockchip.ko", + "drivers/mmc/host/sdhci-of-arasan.ko", + "drivers/net/ethernet/stmicro/stmmac/dwmac-rk.ko", + "drivers/net/ethernet/stmicro/stmmac/stmmac.ko", + "drivers/net/ethernet/stmicro/stmmac/stmmac-platform.ko", + "drivers/net/net_failover.ko", + "drivers/net/pcs/pcs_xpcs.ko", + "drivers/net/virtio_net.ko", + "drivers/nvmem/nvmem_rockchip_efuse.ko", + "drivers/pci/controller/pcie-rockchip-host.ko", + "drivers/phy/rockchip/phy-rockchip-emmc.ko", + "drivers/phy/rockchip/phy-rockchip-inno-usb2.ko", + "drivers/phy/rockchip/phy-rockchip-pcie.ko", + "drivers/phy/rockchip/phy-rockchip-typec.ko", + "drivers/pwm/pwm-rockchip.ko", + "drivers/regulator/fan53555.ko", + "drivers/regulator/pwm-regulator.ko", + "drivers/regulator/rk808-regulator.ko", + "drivers/rtc/rtc-rk808.ko", + "drivers/soc/rockchip/io-domain.ko", + 
"drivers/thermal/rockchip_thermal.ko", + "drivers/usb/host/ohci-hcd.ko", + "drivers/usb/host/ohci-platform.ko", + "drivers/virtio/virtio_pci.ko", + "drivers/virtio/virtio_pci_modern_dev.ko", + "drivers/watchdog/dw_wdt.ko", + "net/core/failover.ko", + ], +) + +kernel_abi( + name = "rockpi4_abi", + kernel_build = "//common:rockpi4", + kmi_symbol_list_add_only = True, +) + +kernel_modules_install( + name = "rockpi4_modules_install", + kernel_build = "//common:rockpi4", +) + +kernel_unstripped_modules_archive( + name = "rockpi4_unstripped_modules_archive", + kernel_build = ":rockpi4", +) + +kernel_images( + name = "rockpi4_images", + build_initramfs = True, + kernel_build = "//common:rockpi4", + kernel_modules_install = "//common:rockpi4_modules_install", +) + +copy_to_dist_dir( + name = "rockpi4_dist", + data = [ + ":rockpi4", + ":rockpi4_images", + ":rockpi4_modules_install", + ":rockpi4_unstripped_modules_archive", + ], + dist_dir = "out/rockpi4/dist", + flat = True, +) + +kernel_build( + name = "fips140", + outs = [], + base_kernel = ":kernel_aarch64", + build_config = "build.config.gki.aarch64.fips140", + module_outs = ["crypto/fips140.ko"], +) + +copy_to_dist_dir( + name = "fips140_dist", + data = [ + ":fips140", + ], + dist_dir = "out/fips140/dist", + flat = True, +) + +# allmodconfig build tests. +# These are build tests only, so: +# - outs are intentionally set to empty to not copy anything to DIST_DIR +# - --allow-undeclared-modules must be used so modules are not declared or copied. +# - No dist target because these are build tests. We don't care about the artifacts. + +# tools/bazel build --allow_undeclared_modules //common:kernel_aarch64_allmodconfig +kernel_build( + name = "kernel_aarch64_allmodconfig", + # Hack to actually check the build. + # Otherwise, Bazel thinks that there are no output files, and skip building. + outs = [".config"], + build_config = "build.config.allmodconfig.aarch64", + visibility = ["//visibility:private"], +) + +# tools/bazel build --allow_undeclared_modules //common:kernel_x86_64_allmodconfig +kernel_build( + name = "kernel_x86_64_allmodconfig", + # Hack to actually check the build. + # Otherwise, Bazel thinks that there are no output files, and skip building. + outs = [".config"], + build_config = "build.config.allmodconfig.x86_64", + visibility = ["//visibility:private"], +) + +# tools/bazel build --allow_undeclared_modules //common:kernel_arm_allmodconfig +kernel_build( + name = "kernel_arm_allmodconfig", + # Hack to actually check the build. + # Otherwise, Bazel thinks that there are no output files, and skip building. + outs = [".config"], + build_config = "build.config.allmodconfig.arm", + visibility = ["//visibility:private"], +) + +# DDK Headers +# All headers. These are the public targets for DDK modules to use. 
+alias( + name = "all_headers", + actual = "all_headers_aarch64", + visibility = ["//visibility:public"], +) + +ddk_headers( + name = "all_headers_aarch64", + hdrs = [":all_headers_allowlist_aarch64"] + select({ + "//build/kernel/kleaf:allow_ddk_unsafe_headers_set": [":all_headers_unsafe"], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], +) + +ddk_headers( + name = "all_headers_arm", + hdrs = [":all_headers_allowlist_arm"] + select({ + "//build/kernel/kleaf:allow_ddk_unsafe_headers_set": [":all_headers_unsafe"], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], +) + +ddk_headers( + name = "all_headers_x86_64", + hdrs = [":all_headers_allowlist_x86_64"] + select({ + "//build/kernel/kleaf:allow_ddk_unsafe_headers_set": [":all_headers_unsafe"], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], +) + +# Implementation details for DDK headers. The targets below cannot be directly +# depended on by DDK modules. + +# DDK headers allowlist. This is the list of all headers and include +# directories that are safe to use in DDK modules. +ddk_headers( + name = "all_headers_allowlist_aarch64", + hdrs = [ + ":all_headers_allowlist_aarch64_globs", + ":all_headers_allowlist_common_globs", + ], + # The list of include directories where source files can #include headers + # from. In other words, these are the `-I` option to the C compiler. + # These are prepended to LINUXINCLUDE. + linux_includes = [ + "arch/arm64/include", + "arch/arm64/include/uapi", + "include", + "include/uapi", + ], + visibility = ["//visibility:private"], +) + +ddk_headers( + name = "all_headers_allowlist_arm", + hdrs = [ + ":all_headers_allowlist_arm_globs", + ":all_headers_allowlist_common_globs", + ], + # The list of include directories where source files can #include headers + # from. In other words, these are the `-I` option to the C compiler. + # These are prepended to LINUXINCLUDE. + linux_includes = [ + "arch/arm/include", + "arch/arm/include/uapi", + "include", + "include/uapi", + ], + visibility = ["//visibility:private"], +) + +ddk_headers( + name = "all_headers_allowlist_x86_64", + hdrs = [ + ":all_headers_allowlist_common_globs", + ":all_headers_allowlist_x86_64_globs", + ], + # The list of include directories where source files can #include headers + # from. In other words, these are the `-I` option to the C compiler. + # These are prepended to LINUXINCLUDE. + linux_includes = [ + "arch/x86/include", + "arch/x86/include/uapi", + "include", + "include/uapi", + ], + visibility = ["//visibility:private"], +) + +# List of DDK headers allowlist that are glob()-ed to avoid changes of BUILD +# file when the list of files changes. All headers in these directories +# are safe to use. +# These are separate filegroup targets so the all_headers_allowlist_* are +# more friendly to batch BUILD file update tools like buildozer. 
+ +# globs() for arm only +filegroup( + name = "all_headers_allowlist_arm_globs", + srcs = glob(["arch/arm/include/**/*.h"]), + visibility = ["//visibility:private"], +) + +# globs() for arm64 only +filegroup( + name = "all_headers_allowlist_aarch64_globs", + srcs = glob(["arch/arm64/include/**/*.h"]), + visibility = ["//visibility:private"], +) + +# globs() for x86 only +filegroup( + name = "all_headers_allowlist_x86_64_globs", + srcs = glob(["arch/x86/include/**/*.h"]), + visibility = ["//visibility:private"], +) + +# globs() for all architectures +filegroup( + name = "all_headers_allowlist_common_globs", + srcs = glob(["include/**/*.h"]), + visibility = ["//visibility:private"], +) + +# DDK headers unsafe list. This is the list of all headers and include +# directories that may be used during migration from kernel_module's, but +# should be avoided in general. +# Use with caution; items may: +# - be removed without notice +# - be moved into all_headers +ddk_headers( + name = "all_headers_unsafe", + hdrs = [ + "drivers/devfreq/governor.h", + "drivers/dma-buf/heaps/deferred-free-helper.h", + "drivers/dma-buf/heaps/page_pool.h", + "drivers/dma/dmaengine.h", + "drivers/pci/controller/dwc/pcie-designware.h", + "drivers/pinctrl/core.h", + "drivers/pinctrl/samsung/pinctrl-samsung.h", + "drivers/staging/android/debug_kinfo.h", + "drivers/thermal/thermal_core.h", + "drivers/thermal/thermal_netlink.h", + "drivers/usb/core/phy.h", + "drivers/usb/dwc3/core.h", + "drivers/usb/dwc3/debug.h", + "drivers/usb/dwc3/gadget.h", + "drivers/usb/dwc3/io.h", + "drivers/usb/dwc3/trace.h", + "drivers/usb/gadget/configfs.h", + "drivers/usb/gadget/function/u_serial.h", + "drivers/usb/host/pci-quirks.h", + "drivers/usb/host/xhci.h", + "drivers/usb/host/xhci-ext-caps.h", + "drivers/usb/host/xhci-mvebu.h", + "drivers/usb/host/xhci-plat.h", + "drivers/usb/host/xhci-rcar.h", + "drivers/usb/typec/tcpm/tcpci.h", + ], + # The list of include directories where source files can #include headers + # from. In other words, these are the `-I` option to the C compiler. + # Unsafe include directories are appended to ccflags-y. + includes = [ + "drivers/devfreq", + "drivers/dma", + "drivers/dma-buf", + "drivers/pci/controller/dwc", + "drivers/pinctrl", + "drivers/scsi/ufs", + "drivers/thermal", + "drivers/usb", + "drivers/usb/gadget/function", + "drivers/usb/typec", + ], + visibility = ["//visibility:private"], +) diff --git a/Documentation/ABI/stable/sysfs-module b/Documentation/ABI/stable/sysfs-module index 6272ae5fb366..46bcd2d00c96 100644 --- a/Documentation/ABI/stable/sysfs-module +++ b/Documentation/ABI/stable/sysfs-module @@ -32,3 +32,21 @@ Description: Note: If the module is built into the kernel, or if the CONFIG_MODULE_UNLOAD kernel configuration value is not enabled, this file will not be present. + +What: /sys/module/MODULENAME/scmversion +Date: November 2020 +KernelVersion: Android Common Kernel -- android12-5.10+ +Contact: Will McVicker +Description: This read-only file will appear if modpost was supplied with an + SCM version for the module. It can be enabled with the config + MODULE_SCMVERSION. The SCM version is retrieved by + scripts/setlocalversion, which means that the presence of this + file depends on CONFIG_LOCALVERSION_AUTO=y. When read, the SCM + version that the module was compiled with is returned. The SCM + version is returned in the following format:: + + === + Git: g[a-f0-9]\+(-dirty)\? + Mercurial: hg[a-f0-9]\+(-dirty)\? 
+ Subversion: svn[0-9]\+ + === diff --git a/Documentation/ABI/testing/OWNERS b/Documentation/ABI/testing/OWNERS new file mode 100644 index 000000000000..3ab5dca46dbe --- /dev/null +++ b/Documentation/ABI/testing/OWNERS @@ -0,0 +1 @@ +per-file sysfs-fs-f2fs=file:/fs/f2fs/OWNERS diff --git a/Documentation/ABI/testing/configfs-usb-gadget-uvc b/Documentation/ABI/testing/configfs-usb-gadget-uvc index 889ed45be4ca..f00cff6d8c5c 100644 --- a/Documentation/ABI/testing/configfs-usb-gadget-uvc +++ b/Documentation/ABI/testing/configfs-usb-gadget-uvc @@ -7,6 +7,7 @@ Description: UVC function directory streaming_maxburst 0..15 (ss only) streaming_maxpacket 1..1023 (fs), 1..3072 (hs/ss) streaming_interval 1..16 + function_name string [32] =================== ============================= What: /config/usb-gadget/gadget/functions/uvc.name/control @@ -196,7 +197,7 @@ Description: Specific MJPEG format descriptors read-only bmaControls this format's data for bmaControls in the streaming header - bmInterfaceFlags specifies interlace information, + bmInterlaceFlags specifies interlace information, read-only bAspectRatioY the X dimension of the picture aspect ratio, read-only @@ -252,7 +253,7 @@ Description: Specific uncompressed format descriptors read-only bmaControls this format's data for bmaControls in the streaming header - bmInterfaceFlags specifies interlace information, + bmInterlaceFlags specifies interlace information, read-only bAspectRatioY the X dimension of the picture aspect ratio, read-only diff --git a/Documentation/ABI/testing/dm-bow b/Documentation/ABI/testing/dm-bow new file mode 100644 index 000000000000..d663e81ca360 --- /dev/null +++ b/Documentation/ABI/testing/dm-bow @@ -0,0 +1,19 @@ +What: /sys/block/dm-/bow/free +Date: January 2023 +KernelVersion: 5.15 +Contact: paullawrence@google.com +Description: free space + Free space on device in bytes. Only valid in state 0 +Users: Android vold to determine if there is sufficient space for expected size + of checksum + +What: /sys/block/dm-/bow/state +Date: January 2023 +KernelVersion: 5.15 +Contact: paullawrence@google.com +Description: dm-bow state + Read-write string containing 0, 1 or 2 + 0: Trim mode + 1: Checkpoint mode + 2: Committed mode + See Documentation/device-mapper/dm-bow for details diff --git a/Documentation/ABI/testing/sysfs-driver-typec-displayport b/Documentation/ABI/testing/sysfs-driver-typec-displayport index 231471ad0d4b..256c87c5219a 100644 --- a/Documentation/ABI/testing/sysfs-driver-typec-displayport +++ b/Documentation/ABI/testing/sysfs-driver-typec-displayport @@ -47,3 +47,18 @@ Description: USB SuperSpeed protocol. From user perspective pin assignments C and E are equal, where all channels on the connector are used for carrying DisplayPort protocol (allowing higher resolutions). + +What: /sys/bus/typec/devices/.../displayport/hpd +Date: Dec 2022 +Contact: Badhri Jagan Sridharan +Description: + VESA DisplayPort Alt Mode on USB Type-C Standard defines how + HotPlugDetect(HPD) shall be supported on the USB-C connector when + operating in DisplayPort Alt Mode. This is a read only node which + reflects the current state of HPD. + + Valid values: + - 1: when HPD’s logical state is high (HPD_High) as defined + by VESA DisplayPort Alt Mode on USB Type-C Standard. + - 0 when HPD’s logical state is low (HPD_Low) as defined by + VESA DisplayPort Alt Mode on USB Type-C Standard. 
diff --git a/Documentation/ABI/testing/sysfs-driver-ufs b/Documentation/ABI/testing/sysfs-driver-ufs index ec3a7149ced5..a5b48ef716c6 100644 --- a/Documentation/ABI/testing/sysfs-driver-ufs +++ b/Documentation/ABI/testing/sysfs-driver-ufs @@ -1299,6 +1299,15 @@ Description: This node is used to set or display whether UFS WriteBooster is platform that doesn't support UFSHCD_CAP_CLK_SCALING, we can disable/enable WriteBooster through this sysfs node. +What: /sys/bus/platform/drivers/ufshcd/*/enable_wb_buf_flush +What: /sys/bus/platform/devices/*.ufs/enable_wb_buf_flush +Date: July 2022 +Contact: Jinyoung Choi +Description: This entry shows the status of WriteBooster buffer flushing + and it can be used to enable or disable the flushing. + If flushing is enabled, the device executes the flush + operation when the command queue is empty. + What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/hpb_version Date: June 2021 Contact: Daejun Park @@ -1394,7 +1403,7 @@ Description: This entry shows the number of reads that cannot be changed to The file is read only. -What: /sys/class/scsi_device/*/device/hpb_stats/rb_noti_cnt +What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_noti_cnt Date: June 2021 Contact: Daejun Park Description: This entry shows the number of response UPIUs that has @@ -1402,19 +1411,23 @@ Description: This entry shows the number of response UPIUs that has The file is read only. -What: /sys/class/scsi_device/*/device/hpb_stats/rb_active_cnt +What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_active_cnt Date: June 2021 Contact: Daejun Park -Description: This entry shows the number of active sub-regions recommended by - response UPIUs. +Description: For the HPB device control mode, this entry shows the number of + active sub-regions recommended by response UPIUs. For the HPB host control + mode, this entry shows the number of active sub-regions recommended by the + HPB host control mode heuristic algorithm. The file is read only. -What: /sys/class/scsi_device/*/device/hpb_stats/rb_inactive_cnt +What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_inactive_cnt Date: June 2021 Contact: Daejun Park -Description: This entry shows the number of inactive regions recommended by - response UPIUs. +Description: For the HPB device control mode, this entry shows the number of + inactive regions recommended by response UPIUs. For the HPB host control + mode, this entry shows the number of inactive regions recommended by the + HPB host control mode heuristic algorithm. The file is read only. @@ -1461,6 +1474,43 @@ Description: This entry shows the status of HPB. The file is read only. +Contact: Daniil Lunev +What: /sys/bus/platform/drivers/ufshcd/*/capabilities/ +What: /sys/bus/platform/devices/*.ufs/capabilities/ +Date: August 2022 +Description: The group represents the effective capabilities of the + host-device pair. i.e. the capabilities which are enabled in the + driver for the specific host controller, supported by the host + controller and are supported and/or have compatible + configuration on the device side. + +Contact: Daniil Lunev +What: /sys/bus/platform/drivers/ufshcd/*/capabilities/clock_scaling +What: /sys/bus/platform/devices/*.ufs/capabilities/clock_scaling +Date: August 2022 +Contact: Daniil Lunev +Description: Indicates status of clock scaling. + + == ============================ + 0 Clock scaling is not supported. + 1 Clock scaling is supported. + == ============================ + + The file is read only. 
+ +What: /sys/bus/platform/drivers/ufshcd/*/capabilities/write_booster +What: /sys/bus/platform/devices/*.ufs/capabilities/write_booster +Date: August 2022 +Contact: Daniil Lunev +Description: Indicates status of Write Booster. + + == ============================ + 0 Write Booster cannot be enabled. + 1 Write Booster can be enabled. + == ============================ + + The file is read only. + What: /sys/class/scsi_device/*/device/hpb_param_sysfs/activation_thld Date: February 2021 Contact: Avri Altman diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs new file mode 100644 index 000000000000..a9512594dc4c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-erofs @@ -0,0 +1,7 @@ +What: /sys/fs/erofs/features/ +Date: November 2021 +Contact: "Huang Jianan" +Description: Shows all enabled kernel features. + Supported features: + zero_padding, compr_cfgs, big_pcluster, chunked_file, + device_table, compr_head2, sb_chksum. diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 48d41b669627..9e3756625a81 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -55,8 +55,9 @@ Description: Controls the in-place-update policy. 0x04 F2FS_IPU_UTIL 0x08 F2FS_IPU_SSR_UTIL 0x10 F2FS_IPU_FSYNC - 0x20 F2FS_IPU_ASYNC, + 0x20 F2FS_IPU_ASYNC 0x40 F2FS_IPU_NOCACHE + 0x80 F2FS_IPU_HONOR_OPU_WRITE ==== ================= Refer segment.h for details. @@ -98,13 +99,47 @@ Description: Controls the issue rate of discard commands that consist of small checkpoint is triggered, and issued during the checkpoint. By default, it is disabled with 0. +What: /sys/fs/f2fs//max_ordered_discard +Date: October 2022 +Contact: "Yangtao Li" +Description: Controls the maximum ordered discard, the unit size is one block(4KB). + Set it to 16 by default. + +What: /sys/fs/f2fs//max_discard_request +Date: December 2021 +Contact: "Konstantin Vyshetsky" +Description: Controls the number of discards a thread will issue at a time. + A higher number will allow the discard thread to finish its work + faster, at the cost of higher latency for incoming I/O. + +What: /sys/fs/f2fs//min_discard_issue_time +Date: December 2021 +Contact: "Konstantin Vyshetsky" +Description: Controls the interval the discard thread will wait between + issuing discard requests when there are discards to be issued and + no I/O aware interruptions occur. + +What: /sys/fs/f2fs//mid_discard_issue_time +Date: December 2021 +Contact: "Konstantin Vyshetsky" +Description: Controls the interval the discard thread will wait between + issuing discard requests when there are discards to be issued and + an I/O aware interruption occurs. + +What: /sys/fs/f2fs//max_discard_issue_time +Date: December 2021 +Contact: "Konstantin Vyshetsky" +Description: Controls the interval the discard thread will wait when there are + no discard operations to be issued. + What: /sys/fs/f2fs//discard_granularity Date: July 2017 Contact: "Chao Yu" Description: Controls discard granularity of inner discard thread. Inner thread will not issue discards with size that is smaller than granularity. The unit size is one block(4KB), now only support configuring - in range of [1, 512]. Default value is 4(=16KB). + in range of [1, 512]. Default value is 16. + For small devices, default value is 1. What: /sys/fs/f2fs//umount_discard_timeout Date: January 2019 @@ -112,6 +147,11 @@ Contact: "Jaegeuk Kim" Description: Set timeout to issue discard commands during umount.
Default: 5 secs +What: /sys/fs/f2fs//pending_discard +Date: November 2021 +Contact: "Jaegeuk Kim" +Description: Shows the number of pending discard commands in the queue. + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" @@ -202,7 +242,7 @@ Description: Shows total written kbytes issued to disk. What: /sys/fs/f2fs//features Date: July 2017 Contact: "Jaegeuk Kim" -Description: /feature_list/ +Description: /feature_list/> Shows all enabled features in current device. Supported features: encryption, blkzoned, extra_attr, projquota, inode_checksum, @@ -264,11 +304,16 @@ Description: Shows current reserved blocks in system, it may be temporarily What: /sys/fs/f2fs//gc_urgent Date: August 2017 Contact: "Jaegeuk Kim" -Description: Do background GC aggressively when set. When gc_urgent = 1, - background thread starts to do GC by given gc_urgent_sleep_time - interval. When gc_urgent = 2, F2FS will lower the bar of - checking idle in order to process outstanding discard commands - and GC a little bit aggressively. It is set to 0 by default. +Description: Do background GC aggressively when set. Set to 0 by default. + gc urgent high(1): does GC forcibly in a period of given + gc_urgent_sleep_time and ignores I/O idling check. uses greedy + GC approach and turns SSR mode on. + gc urgent low(2): lowers the bar of checking I/O idling in + order to process outstanding discard commands and GC a + little bit aggressively. uses cost benefit GC approach. + gc urgent mid(3): does GC forcibly in a period of given + gc_urgent_sleep_time and executes a mid level of I/O idling check. + uses cost benefit GC approach. What: /sys/fs/f2fs//gc_urgent_sleep_time Date: August 2017 @@ -428,6 +473,30 @@ Description: Show status of f2fs superblock in real time. 0x4000 SBI_IS_FREEZING freefs is in process ====== ===================== ================================= +What: /sys/fs/f2fs//stat/cp_status +Date: September 2022 +Contact: "Chao Yu" +Description: Show status of f2fs checkpoint in real time. + + =============================== ============================== + cp flag value + CP_UMOUNT_FLAG 0x00000001 + CP_ORPHAN_PRESENT_FLAG 0x00000002 + CP_COMPACT_SUM_FLAG 0x00000004 + CP_ERROR_FLAG 0x00000008 + CP_FSCK_FLAG 0x00000010 + CP_FASTBOOT_FLAG 0x00000020 + CP_CRC_RECOVERY_FLAG 0x00000040 + CP_NAT_BITS_FLAG 0x00000080 + CP_TRIMMED_FLAG 0x00000100 + CP_NOCRC_RECOVERY_FLAG 0x00000200 + CP_LARGE_NAT_BITMAP_FLAG 0x00000400 + CP_QUOTA_NEED_FSCK_FLAG 0x00000800 + CP_DISABLED_FLAG 0x00001000 + CP_DISABLED_QUICK_FLAG 0x00002000 + CP_RESIZEFS_FLAG 0x00004000 + =============================== ============================== + What: /sys/fs/f2fs//ckpt_thread_ioprio Date: January 2021 Contact: "Daeho Jeong" @@ -499,7 +568,7 @@ Date: July 2021 Contact: "Daeho Jeong" Description: Show how many segments have been reclaimed by GC during a specific GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy, - 3: GC idle AT, 4: GC urgent high, 5: GC urgent low) + 3: GC idle AT, 4: GC urgent high, 5: GC urgent low 6: GC urgent mid) You can re-initialize this value to "0". What: /sys/fs/f2fs//gc_segment_mode @@ -513,3 +582,90 @@ Date: July 2021 Contact: "Daeho Jeong" Description: You can control the multiplier value of bdi device readahead window size between 2 (default) and 256 for POSIX_FADV_SEQUENTIAL advise option. + +What: /sys/fs/f2fs//max_fragment_chunk +Date: August 2021 +Contact: "Daeho Jeong" +Description: With "mode=fragment:block" mount options, we can scatter block allocation. 
+ f2fs will allocate 1.. blocks in a chunk and make a hole + in the length of 1.. by turns. This value can be set + between 1..512 and the default value is 4. + +What: /sys/fs/f2fs//max_fragment_hole +Date: August 2021 +Contact: "Daeho Jeong" +Description: With "mode=fragment:block" mount options, we can scatter block allocation. + f2fs will allocate 1.. blocks in a chunk and make a hole + in the length of 1.. by turns. This value can be set + between 1..512 and the default value is 4. + +What: /sys/fs/f2fs//gc_remaining_trials +Date: October 2022 +Contact: "Yangtao Li" +Description: You can set the trial count limit for GC urgent and idle mode with this value. + If the GC thread reaches the limit, the mode will turn back to GC normal mode. + By default, the value is zero, which means there is no limit, as before. + +What: /sys/fs/f2fs//max_roll_forward_node_blocks +Date: January 2022 +Contact: "Jaegeuk Kim" +Description: Controls max # of node block writes to be used for roll forward + recovery. This can limit the roll forward recovery time. + +What: /sys/fs/f2fs//unusable_blocks_per_sec +Date: June 2022 +Contact: "Jaegeuk Kim" +Description: Shows the number of unusable blocks in a section which is defined by + the zone capacity reported by the underlying zoned device. + +What: /sys/fs/f2fs//current_atomic_write +Date: July 2022 +Contact: "Daeho Jeong" +Description: Show the total current atomic write block count, which is not committed yet. + This is a read-only entry. + +What: /sys/fs/f2fs//peak_atomic_write +Date: July 2022 +Contact: "Daeho Jeong" +Description: Show the peak value of total current atomic write block count after boot. + Writing "0" here re-initializes it to "0". + +What: /sys/fs/f2fs//committed_atomic_block +Date: July 2022 +Contact: "Daeho Jeong" +Description: Show the accumulated total committed atomic write block count after boot. + Writing "0" here re-initializes it to "0". + +What: /sys/fs/f2fs//revoked_atomic_block +Date: July 2022 +Contact: "Daeho Jeong" +Description: Show the accumulated total revoked atomic write block count after boot. + Writing "0" here re-initializes it to "0". + +What: /sys/fs/f2fs//gc_mode +Date: October 2022 +Contact: "Yangtao Li" +Description: Show the current gc_mode as a string. + This is a read-only entry. + +What: /sys/fs/f2fs//discard_urgent_util +Date: November 2022 +Contact: "Yangtao Li" +Description: When space utilization exceeds this, do background DISCARD aggressively. + Does DISCARD forcibly in a period of given min_discard_issue_time when the number + of discards is not 0 and sets the discard granularity to 1. + Default: 80 + +What: /sys/fs/f2fs//hot_data_age_threshold +Date: November 2022 +Contact: "Ping Xiong" +Description: When DATA SEPARATION is on, it controls the age threshold to indicate + the data blocks as hot. By default it is initialized to 262144 blocks + (equal to 1GB). + +What: /sys/fs/f2fs//warm_data_age_threshold +Date: November 2022 +Contact: "Ping Xiong" +Description: When DATA SEPARATION is on, it controls the age threshold to indicate + the data blocks as warm. By default it is initialized to 2621440 blocks + (equal to 10GB).
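Since the f2fs entries above are plain-text sysfs files, tuning them from a program is just open/write/read. A minimal sketch, assuming the filesystem sits on a device named "sda" (the device component of the paths is a placeholder)::

	/* Hedged sketch: set the hot data age threshold documented above
	 * and read back the pending discard count. */
	#include <stdio.h>

	static int write_knob(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		FILE *f;
		long pending;

		if (write_knob("/sys/fs/f2fs/sda/hot_data_age_threshold",
			       "262144"))
			perror("hot_data_age_threshold");

		f = fopen("/sys/fs/f2fs/sda/pending_discard", "r");
		if (f && fscanf(f, "%ld", &pending) == 1)
			printf("pending discard commands: %ld\n", pending);
		if (f)
			fclose(f);
		return 0;
	}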
diff --git a/Documentation/ABI/testing/sysfs-fs-fuse b/Documentation/ABI/testing/sysfs-fs-fuse new file mode 100644 index 000000000000..b9956842b36f --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-fuse @@ -0,0 +1,19 @@ +What: /sys/fs/fuse/features/fuse_bpf +Date: December 2022 +Contact: Paul Lawrence +Description: + Read-only file that contains the word 'supported' if fuse-bpf is + supported, and does not exist otherwise + +What: /sys/fs/fuse/bpf_prog_type_fuse +Date: December 2022 +Contact: Paul Lawrence +Description: + bpf_prog_type_fuse defines the program type of bpf programs that + may be passed to fuse-bpf. For upstream bpf program types, this + is a constant defined in a contiguous array of constants. + bpf_prog_type_fuse is appended to the end of the list, so it may + change and therefore its value must be read from this file. + + The contents are the ASCII decimal representation of bpf_prog_type_fuse. + diff --git a/Documentation/ABI/testing/sysfs-fs-incfs b/Documentation/ABI/testing/sysfs-fs-incfs new file mode 100644 index 000000000000..e4e05f950f9e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-incfs @@ -0,0 +1,70 @@ +What: /sys/fs/incremental-fs/features/corefs +Date: 2019 +Contact: Paul Lawrence +Description: Reads 'supported'. Always present. + +What: /sys/fs/incremental-fs/features/v2 +Date: April 2021 +Contact: Paul Lawrence +Description: Reads 'supported'. Present if all v2 features of incfs are + supported. + +What: /sys/fs/incremental-fs/features/zstd +Date: April 2021 +Contact: Paul Lawrence +Description: Reads 'supported'. Present if zstd compression is supported + for data blocks. + +What: /sys/fs/incremental-fs/features/bugfix_throttling +Date: January 2023 +Contact: Paul Lawrence +Description: Reads 'supported'. Present if the throttling lock bug is fixed + https://android-review.git.corp.google.com/c/kernel/common/+/2381827 + +What: /sys/fs/incremental-fs/instances/[name] +Date: April 2021 +Contact: Paul Lawrence +Description: Folder created when incfs is mounted with the sysfs_name=[name] + option. If this option is used, the following values are created + in this folder. + +What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_min +Date: April 2021 +Contact: Paul Lawrence +Description: Returns a count of the number of reads that were delayed as a + result of the per-UID read timeout's minimum time setting. + +What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_min_us +Date: April 2021 +Contact: Paul Lawrence +Description: Returns total delay time for all files since first mount as a + result of the per-UID read timeout's minimum time setting. + +What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_pending +Date: April 2021 +Contact: Paul Lawrence +Description: Returns a count of the number of reads that were delayed as a + result of waiting for a pending read. + +What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_pending_us +Date: April 2021 +Contact: Paul Lawrence +Description: Returns total delay time for all files since first mount as a + result of waiting for a pending read. + +What: /sys/fs/incremental-fs/instances/[name]/reads_failed_hash_verification +Date: April 2021 +Contact: Paul Lawrence +Description: Returns number of reads that failed because of hash verification + failures. + +What: /sys/fs/incremental-fs/instances/[name]/reads_failed_other +Date: April 2021 +Contact: Paul Lawrence +Description: Returns number of reads that failed for reasons other than + timing out or hash failures.
+ +What: /sys/fs/incremental-fs/instances/[name]/reads_failed_timed_out +Date: April 2021 +Contact: Paul Lawrence +Description: Returns number of reads that timed out. diff --git a/Documentation/ABI/testing/sysfs-kernel-dmaheap b/Documentation/ABI/testing/sysfs-kernel-dmaheap new file mode 100644 index 000000000000..f496181e9bf8 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-dmaheap @@ -0,0 +1,7 @@ +What: /sys/kernel/dma_heap/total_pools_kb +Date: Feb 2021 +KernelVersion: 5.10 +Contact: Hridya Valsaraju , +Description: + The total_pools_kb file is read-only and specifies how much + memory in Kb is allocated to DMA-BUF heap pools. diff --git a/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons new file mode 100644 index 000000000000..acb19b91c192 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons @@ -0,0 +1,16 @@ +What: /sys/kernel/wakeup_reasons/last_resume_reason +Date: February 2014 +Contact: Ruchi Kandoi +Description: + The /sys/kernel/wakeup_reasons/last_resume_reason is + used to report wakeup reasons after system exited suspend. + +What: /sys/kernel/wakeup_reasons/last_suspend_time +Date: March 2015 +Contact: jinqian +Description: + The /sys/kernel/wakeup_reasons/last_suspend_time is + used to report time spent in last suspend cycle. It contains + two numbers (in seconds) separated by space. First number is + the time spent in suspend and resume processes. Second number + is the time spent in sleep state. \ No newline at end of file diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4d8c27eca96b..b1847f6414d9 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1296,6 +1296,11 @@ PAGE_SIZE multiple when read back. pagetables Amount of memory allocated for page tables. + sec_pagetables + Amount of memory allocated for secondary page tables, + this currently includes KVM mmu allocations on x86 + and arm64. + percpu (npn) Amount of memory used for storing per-cpu kernel data structures. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bcb102c91b19..aa5c2b18552b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -961,6 +961,10 @@ can be useful when debugging issues that require an SLB miss to occur. + disable_dma32= [KNL] + Dynamically disable ZONE_DMA32 on kernels compiled with + CONFIG_ZONE_DMA32=y. + stress_slb [PPC] Limits the number of kernel SLB entries, and flushes them frequently to increase the rate of SLB faults @@ -1396,6 +1400,10 @@ Format: { "fix" } Permit 'security.evm' to be updated regardless of current integrity status. + export_pmu_events + [KNL,ARM64] Sets the PMU export bit (PMCR_EL0.X), which enables + the exporting of events over an IMPLEMENTATION DEFINED PMU event + export bus to another device. failslab= fail_usercopy= @@ -1646,6 +1654,10 @@ If specified, z/VM IUCV HVC accepts connections from listed z/VM user IDs only. + hvc_dcc.enable= [ARM,ARM64] Enable DCC driver at runtime. For GKI, + disabled at runtime by default to prevent + crashes in devices which do not support DCC. + hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations which allow the hypervisor to 'idle' the guest on lock contention. @@ -2056,6 +2068,14 @@ forcing Dual Address Cycle for PCI cards supporting greater than 32-bit addressing. 
+ iommu.max_align_shift= + [ARM64, X86] Limit the alignment of IOVAs to a maximum + PAGE_SIZE order. Larger IOVAs will be aligned to this + specified order. The order is expressed as a power of + two multiplied by the PAGE_SIZE. + Format: { "4" | "5" | "6" | "7" | "8" | "9" } + Default: 9 + iommu.strict= [ARM64, X86] Configure TLB invalidation behaviour Format: { "0" | "1" } 0 - Lazy mode. @@ -2078,6 +2098,9 @@ 1 - Bypass the IOMMU for DMA. unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH. + ioremap_guard [ARM64] enable the KVM MMIO guard functionality + if available. + io7= [HW] IO7 for Marvel-based Alpha systems See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. @@ -2367,14 +2390,19 @@ kvm-arm.mode= [KVM,ARM] Select one of KVM/arm64's modes of operation. + none: Forcefully disable KVM. + nvhe: Standard nVHE-based mode, without support for protected guests. protected: nVHE-based mode with support for guests whose - state is kept private from the host. - Not valid if the kernel is running in EL2. + state is kept private from the host. See + Documentation/virt/kvm/arm/pkvm.rst for more + information about this mode of operation. - Defaults to VHE/nVHE based on hardware support. + Defaults to VHE/nVHE based on hardware support. Setting + mode to "protected" will disable kexec and hibernation + for the host. kvm-arm.vgic_v3_group0_trap= [KVM,ARM] Trap guest accesses to GICv3 group-0 diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index cbd19d5e625f..d42ee3d39cac 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -32,6 +32,7 @@ the Linux memory management. idle_page_tracking ksm memory-hotplug + multigen_lru nommu-mmap numa_memory_policy numaperf diff --git a/Documentation/admin-guide/mm/multigen_lru.rst b/Documentation/admin-guide/mm/multigen_lru.rst new file mode 100644 index 000000000000..33e068830497 --- /dev/null +++ b/Documentation/admin-guide/mm/multigen_lru.rst @@ -0,0 +1,162 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +Multi-Gen LRU +============= +The multi-gen LRU is an alternative LRU implementation that optimizes +page reclaim and improves performance under memory pressure. Page +reclaim decides the kernel's caching policy and ability to overcommit +memory. It directly impacts the kswapd CPU usage and RAM efficiency. + +Quick start +=========== +Build the kernel with the following configurations. + +* ``CONFIG_LRU_GEN=y`` +* ``CONFIG_LRU_GEN_ENABLED=y`` + +All set! + +Runtime options +=============== +``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the +following subsections. + +Kill switch +----------- +``enabled`` accepts different values to enable or disable the +following components. Its default value depends on +``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled +unless some of them have unforeseen side effects. Writing to +``enabled`` has no effect when a component is not supported by the +hardware, and valid values will be accepted even when the main switch +is off. + +====== =============================================================== +Values Components +====== =============================================================== +0x0001 The main switch for the multi-gen LRU. +0x0002 Clearing the accessed bit in leaf page table entries in large + batches, when MMU sets it (e.g., on x86). This behavior can + theoretically worsen lock contention (mmap_lock). 
If it is + disabled, the multi-gen LRU will suffer a minor performance + degradation for workloads that contiguously map hot pages, + whose accessed bits can be otherwise cleared by fewer larger + batches. +0x0004 Clearing the accessed bit in non-leaf page table entries as + well, when MMU sets it (e.g., on x86). This behavior was not + verified on x86 varieties other than Intel and AMD. If it is + disabled, the multi-gen LRU will suffer a negligible + performance degradation. +[yYnN] Apply to all the components above. +====== =============================================================== + +E.g., +:: + + echo y >/sys/kernel/mm/lru_gen/enabled + cat /sys/kernel/mm/lru_gen/enabled + 0x0007 + echo 5 >/sys/kernel/mm/lru_gen/enabled + cat /sys/kernel/mm/lru_gen/enabled + 0x0005 + +Thrashing prevention +-------------------- +Personal computers are more sensitive to thrashing because it can +cause janks (lags when rendering UI) and negatively impact user +experience. The multi-gen LRU offers thrashing prevention to the +majority of laptop and desktop users who do not have ``oomd``. + +Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of +``N`` milliseconds from getting evicted. The OOM killer is triggered +if this working set cannot be kept in memory. In other words, this +option works as an adjustable pressure relief valve, and when open, it +terminates applications that are hopefully not being used. + +Based on the average human detectable lag (~100ms), ``N=1000`` usually +eliminates intolerable janks due to thrashing. Larger values like +``N=3000`` make janks less noticeable at the risk of premature OOM +kills. + +The default value ``0`` means disabled. + +Experimental features +===================== +``/sys/kernel/debug/lru_gen`` accepts commands described in the +following subsections. Multiple command lines are supported, as is +concatenation with delimiters ``,`` and ``;``. + +``/sys/kernel/debug/lru_gen_full`` provides additional stats for +debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from +evicted generations in this file. + +Working set estimation +---------------------- +Working set estimation measures how much memory an application needs +in a given time interval, and it is usually done with little impact on +the performance of the application. E.g., data centers want to +optimize job scheduling (bin packing) to improve memory utilization. +When a new job comes in, the job scheduler needs to find out whether +each server it manages can allocate a certain amount of memory for +this new job before it can pick a candidate. To do so, the job +scheduler needs to estimate the working sets of the existing jobs. + +When it is read, ``lru_gen`` returns a histogram of numbers of pages +accessed over different time intervals for each memcg and node. +``MAX_NR_GENS`` decides the number of bins for each histogram. The +histograms are noncumulative. +:: + + memcg memcg_id memcg_path + node node_id + min_gen_nr age_in_ms nr_anon_pages nr_file_pages + ... + max_gen_nr age_in_ms nr_anon_pages nr_file_pages + +Each bin contains an estimated number of pages that have been accessed +within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages +and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of +the former is the largest and that of the latter is the smallest.
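As a sketch of the working set estimation step (assuming debugfs is mounted at /sys/kernel/debug and the kernel was built with CONFIG_LRU_GEN=y), a job scheduler could snapshot the histogram like this; parsing the memcg/node/generation rows into bins is left out::

	/* Hedged sketch: dump the lru_gen histogram described above. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/kernel/debug/lru_gen", "r");
		char line[256];

		if (!f) {
			perror("lru_gen (is debugfs mounted?)");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout); /* memcg/node headers + per-gen rows */
		fclose(f);
		return 0;
	}

The write commands accepted by the same file are described next.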
+ +Users can write the following command to ``lru_gen`` to create a new +generation ``max_gen_nr+1``: + + ``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]`` + +``can_swap`` defaults to the swap setting and, if it is set to ``1``, +it forces the scan of anon pages when swap is off, and vice versa. +``force_scan`` defaults to ``1`` and, if it is set to ``0``, it +employs heuristics to reduce the overhead, which is likely to reduce +the coverage as well. + +A typical use case is that a job scheduler runs this command at a +certain time interval to create new generations, and it ranks the +servers it manages based on the sizes of their cold pages defined by +this time interval. + +Proactive reclaim +----------------- +Proactive reclaim induces page reclaim when there is no memory +pressure. It usually targets cold pages only. E.g., when a new job +comes in, the job scheduler wants to proactively reclaim cold pages on +the server it selected, to improve the chance of successfully landing +this new job. + +Users can write the following command to ``lru_gen`` to evict +generations less than or equal to ``min_gen_nr``. + + ``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]`` + +``min_gen_nr`` should be less than ``max_gen_nr-1``, since +``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to +the active list) and therefore cannot be evicted. ``swappiness`` +overrides the default value in ``/proc/sys/vm/swappiness``. +``nr_to_reclaim`` limits the number of pages to evict. + +A typical use case is that a job scheduler runs this command before it +tries to land a new job on a server. If it fails to materialize enough +cold pages because of the overestimation, it retries on the next +server according to the ranking result obtained from the working set +estimation step. This less forceful approach limits the impacts on the +existing jobs. diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 48b91c485c99..a1994a9ad2e1 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -267,6 +267,17 @@ domain names are in general different. For a detailed discussion see the ``hostname(1)`` man page. +export_pmu_events (arm64 only) +============================== + +Controls the PMU export bit (PMCR_EL0.X), which enables the exporting of +events over an IMPLEMENTATION DEFINED PMU event export bus to another device. + +0: disables exporting of events (default). + +1: enables exporting of events. + + firmware_config =============== @@ -915,6 +926,17 @@ enabled, otherwise writing to this file will return ``-EBUSY``. The default value is 8. +perf_user_access (arm64 only) +================================= + +Controls user space access for reading perf event counters. When set to 1, +user space can read performance monitor counter registers directly. + +The default value is 0 (access disabled). + +See Documentation/arm64/perf.rst for more information. + + pid_max ======= diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst index 749ae970c319..c7adc7897df6 100644 --- a/Documentation/arm64/cpu-feature-registers.rst +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -92,7 +92,7 @@ operation if the source belongs to the supported system register space. 
The infrastructure emulates only the following system register space:: - Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7 + Op0=3, Op1=0, CRn=0, CRm=0,2,3,4,5,6,7 (See Table C5-6 'System instruction encodings for non-Debug System register accesses' in ARMv8 ARM DDI 0487A.h, for the list of @@ -290,6 +290,44 @@ infrastructure: +------------------------------+---------+---------+ | RPRES | [7-4] | y | +------------------------------+---------+---------+ + | WFXT | [3-0] | y | + +------------------------------+---------+---------+ + + 10) MVFR0_EL1 - AArch32 Media and VFP Feature Register 0 + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | FPDP | [11-8] | y | + +------------------------------+---------+---------+ + + 11) MVFR1_EL1 - AArch32 Media and VFP Feature Register 1 + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | SIMDFMAC | [31-28] | y | + +------------------------------+---------+---------+ + | SIMDSP | [19-16] | y | + +------------------------------+---------+---------+ + | SIMDInt | [15-12] | y | + +------------------------------+---------+---------+ + | SIMDLS | [11-8] | y | + +------------------------------+---------+---------+ + + 12) ID_ISAR5_EL1 - AArch32 Instruction Set Attribute Register 5 + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | CRC32 | [19-16] | y | + +------------------------------+---------+---------+ + | SHA2 | [15-12] | y | + +------------------------------+---------+---------+ + | SHA1 | [11-8] | y | + +------------------------------+---------+---------+ + | AES | [7-4] | y | + +------------------------------+---------+---------+ Appendix I: Example diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index b72ff17d600a..3d116fb536c5 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -259,6 +259,48 @@ HWCAP2_RPRES Functionality implied by ID_AA64ISAR2_EL1.RPRES == 0b0001. +HWCAP2_MTE3 + + Functionality implied by ID_AA64PFR1_EL1.MTE == 0b0011, as described + by Documentation/arm64/memory-tagging-extension.rst. + +HWCAP2_SME + + Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described + by Documentation/arm64/sme.rst. + +HWCAP2_SME_I16I64 + + Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111. + +HWCAP2_SME_F64F64 + + Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1. + +HWCAP2_SME_I8I32 + + Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111. + +HWCAP2_SME_F16F32 + + Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1. + +HWCAP2_SME_B16F32 + + Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1. + +HWCAP2_SME_F32F32 + + Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1. + +HWCAP2_SME_FA64 + + Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1. + +HWCAP2_WFXT + + Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010. + 4. 
Unused AT_HWCAP bits ----------------------- diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst index 4f840bac083e..ae21f8118830 100644 --- a/Documentation/arm64/index.rst +++ b/Documentation/arm64/index.rst @@ -21,6 +21,7 @@ ARM64 Architecture perf pointer-authentication silicon-errata + sme sve tagged-address-abi tagged-pointers diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst index 7b99c8f428eb..7e812a51e506 100644 --- a/Documentation/arm64/memory-tagging-extension.rst +++ b/Documentation/arm64/memory-tagging-extension.rst @@ -76,6 +76,9 @@ configurable behaviours: with ``.si_code = SEGV_MTEAERR`` and ``.si_addr = 0`` (the faulting address is unknown). +- *Asymmetric* - Reads are handled as for synchronous mode while writes + are handled as for asynchronous mode. + The user can select the above modes, per thread, using the ``prctl(PR_SET_TAGGED_ADDR_CTRL, flags, 0, 0, 0)`` system call where ``flags`` contains any number of the following values in the ``PR_MTE_TCF_MASK`` @@ -139,18 +142,25 @@ tag checking mode as the CPU's preferred tag checking mode. The preferred tag checking mode for each CPU is controlled by ``/sys/devices/system/cpu/cpu<N>/mte_tcf_preferred``, to which a -privileged user may write the value ``async`` or ``sync``. The default -preferred mode for each CPU is ``async``. +privileged user may write the value ``async``, ``sync`` or ``asymm``. The +default preferred mode for each CPU is ``async``. To allow a program to potentially run in the CPU's preferred tag checking mode, the user program may set multiple tag check fault mode bits in the ``flags`` argument to the ``prctl(PR_SET_TAGGED_ADDR_CTRL, -flags, 0, 0, 0)`` system call. If the CPU's preferred tag checking -mode is in the task's set of provided tag checking modes (this will -always be the case at present because the kernel only supports two -tag checking modes, but future kernels may support more modes), that -mode will be selected. Otherwise, one of the modes in the task's mode -set will be selected in a currently unspecified manner. +flags, 0, 0, 0)`` system call. If both synchronous and asynchronous +modes are requested then asymmetric mode may also be selected by the +kernel. If the CPU's preferred tag checking mode is in the task's set +of provided tag checking modes, that mode will be selected. Otherwise, +one of the modes in the task's mode set will be selected by the +kernel using the preference order: + + 1. Asynchronous + 2. Asymmetric + 3. Synchronous + +Note that there is no way for userspace to request multiple modes and +also disable asymmetric mode. Initial process state --------------------- diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 1cee230338a2..861e357abc6e 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -102,12 +102,26 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A510 | #2658417 | ARM64_ERRATUM_2658417 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A710 | #2224489 | ARM64_ERRATUM_2224489 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1349291 | N/A | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1542419 | ARM64_ERRATUM_1542419 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-N2 | #2139208 | ARM64_ERRATUM_2139208 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-N2 | #2067961 | ARM64_ERRATUM_2067961 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-N2 | #2253138 | ARM64_ERRATUM_2253138 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | MMU-500 | #841119,826419 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ diff --git a/Documentation/arm64/sme.rst b/Documentation/arm64/sme.rst new file mode 100644 index 000000000000..8ba677b87e90 --- /dev/null +++ b/Documentation/arm64/sme.rst @@ -0,0 +1,428 @@ +=================================================== +Scalable Matrix Extension support for AArch64 Linux +=================================================== + +This document outlines briefly the interface provided to userspace by Linux in +order to support use of the ARM Scalable Matrix Extension (SME). + +This is an outline of the most important features and issues only and not +intended to be exhaustive. It should be read in conjunction with the SVE +documentation in sve.rst which provides details on the Streaming SVE mode +included in SME. + +This document does not aim to describe the SME architecture or programmer's +model. To aid understanding, a minimal description of relevant programmer's +model features for SME is included in Appendix A. + + +1. General +----------- + +* PSTATE.SM, PSTATE.ZA, the streaming mode vector length, the ZA + register state and TPIDR2_EL0 are tracked per thread. + +* The presence of SME is reported to userspace via HWCAP2_SME in the aux vector + AT_HWCAP2 entry. Presence of this flag implies the presence of the SME + instructions and registers, and the Linux-specific system interfaces + described in this document. SME is reported in /proc/cpuinfo as "sme". + +* Support for the execution of SME instructions in userspace can also be + detected by reading the CPU ID register ID_AA64PFR1_EL1 using an MRS + instruction, and checking that the value of the SME field is nonzero. 
[3] + + It does not guarantee the presence of the system interfaces described in the + following sections: software that needs to verify that those interfaces are + present must check for HWCAP2_SME instead. + +* There are a number of optional SME features; their presence is reported + through AT_HWCAP2 via: + + HWCAP2_SME_I16I64 + HWCAP2_SME_F64F64 + HWCAP2_SME_I8I32 + HWCAP2_SME_F16F32 + HWCAP2_SME_B16F32 + HWCAP2_SME_F32F32 + HWCAP2_SME_FA64 + + This list may be extended over time as the SME architecture evolves. + + These extensions are also reported via the CPU ID register ID_AA64SMFR0_EL1, + which userspace can read using an MRS instruction. See elf_hwcaps.txt and + cpu-feature-registers.txt for details. + +* Debuggers should restrict themselves to interacting with the target via the + NT_ARM_SVE, NT_ARM_SSVE and NT_ARM_ZA regsets. The recommended way + of detecting support for these regsets is to connect to a target process + first and then attempt a + + ptrace(PTRACE_GETREGSET, pid, NT_ARM_<regset>, &iov). + +* Whenever ZA register values are exchanged in memory between userspace and + the kernel, the register value is encoded in memory as a series of horizontal + vectors from 0 to VL/8-1 stored in the same endianness invariant format as is + used for SVE vectors. + +* On thread creation TPIDR2_EL0 is preserved unless CLONE_SETTLS is specified, + in which case it is set to 0. + +2. Vector lengths +------------------ + +SME defines a second vector length similar to the SVE vector length which +controls the size of the streaming mode SVE vectors and the ZA matrix array. +The ZA matrix is square with each side having as many bytes as a streaming +mode SVE vector. + + +3. Sharing of streaming and non-streaming mode SVE state +--------------------------------------------------------- + +It is implementation defined which, if any, parts of the SVE state are shared +between streaming and non-streaming modes. When switching between modes +via software interfaces such as ptrace, if no register content is provided as +part of the switch, no state is assumed to be shared and everything is zeroed. + + +4. System call behaviour +------------------------- + +* On syscall PSTATE.ZA is preserved; if PSTATE.ZA==1 then the contents of the + ZA matrix are preserved. + +* On syscall PSTATE.SM will be cleared and the SVE registers will be handled + as per the standard SVE ABI. + +* Neither the SVE registers nor ZA are used to pass arguments to or receive + results from any syscall. + +* On process creation (e.g., clone()) the newly created process will have + PSTATE.SM cleared. + +* All other SME state of a thread, including the currently configured vector + length, the state of the PR_SME_VL_INHERIT flag, and the deferred vector + length (if any), is preserved across all syscalls, subject to the specific + exceptions for execve() described in section 7. + + +5. Signal handling +------------------- + +* Signal handlers are invoked with streaming mode and ZA disabled. + +* A new signal frame record za_context encodes the ZA register contents on + signal delivery. [1] + +* The signal frame record for ZA always contains basic metadata, in particular + the thread's vector length (in za_context.vl). + +* The ZA matrix may or may not be included in the record, depending on + the value of PSTATE.ZA. The registers are present if and only if: + za_context.head.size >= ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za_context.vl)) + in which case PSTATE.ZA == 1. + +* If matrix data is present, the remainder of the record has a vl-dependent + size and layout. Macros ZA_SIG_* are defined [1] to facilitate access to + them. + +* The matrix is stored as a series of horizontal vectors in the same format as + is used for SVE vectors. + +* If the ZA context is too big to fit in sigcontext.__reserved[], then extra + space is allocated on the stack, an extra_context record is written in + __reserved[] referencing this space. za_context is then written in the + extra space. Refer to [1] for further details about this mechanism.
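As a concrete illustration of the record layout described above (an editor's sketch, not part of the patch): a signal handler installed with SA_SIGINFO might locate the za_context record roughly as follows, using the definitions from [1]. It assumes uapi headers new enough to define ZA_MAGIC and the ZA_SIG_* macros, and for brevity it does not follow extra_context records::

    #include <signal.h>
    #include <ucontext.h>
    #include <asm/sigcontext.h>   /* _aarch64_ctx, za_context, ZA_* [1] */

    static void handler(int sig, siginfo_t *info, void *ucp)
    {
            ucontext_t *uc = ucp;
            struct _aarch64_ctx *head =
                    (struct _aarch64_ctx *)uc->uc_mcontext.__reserved;

            /* Records are laid out back to back; magic 0 terminates. */
            for (; head->magic; head = (struct _aarch64_ctx *)
                                ((char *)head + head->size)) {
                    struct za_context *za;

                    if (head->magic != ZA_MAGIC)
                            continue;

                    za = (struct za_context *)head;
                    if (za->head.size >=
                        ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za->vl))) {
                            /* PSTATE.ZA was 1: matrix data is present. */
                    }
                    break;
            }
    }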
+ + +6. Signal return +----------------- + +When returning from a signal handler: + +* If there is no za_context record in the signal frame, or if the record is + present but contains no register data as described in the previous section, + then ZA is disabled. + +* If za_context is present in the signal frame and contains matrix data then + PSTATE.ZA is set to 1 and ZA is populated with the specified data. + +* The vector length cannot be changed via signal return. If za_context.vl in + the signal frame does not match the current vector length, the signal return + attempt is treated as illegal, resulting in a forced SIGSEGV. + + +7. prctl extensions +-------------------- + +Some new prctl() calls are added to allow programs to manage the SME vector +length: + +prctl(PR_SME_SET_VL, unsigned long arg) + + Sets the vector length of the calling thread and related flags, where + arg == vl | flags. Other threads of the calling process are unaffected. + + vl is the desired vector length, where sve_vl_valid(vl) must be true. + + flags: + + PR_SME_VL_INHERIT + + Inherit the current vector length across execve(). Otherwise, the + vector length is reset to the system default at execve(). (See + Section 10.) + + PR_SME_SET_VL_ONEXEC + + Defer the requested vector length change until the next execve() + performed by this thread. + + The effect is equivalent to implicit execution of the following + call immediately after the next execve() (if any) by the thread: + + prctl(PR_SME_SET_VL, arg & ~PR_SME_SET_VL_ONEXEC) + + This allows launching of a new program with a different vector + length, while avoiding runtime side effects in the caller. + + Without PR_SME_SET_VL_ONEXEC, the requested change takes effect + immediately. + + + Return value: a nonnegative value on success, or a negative value on error: + EINVAL: SME not supported, invalid vector length requested, or + invalid flags. + + + On success: + + * Either the calling thread's vector length or the deferred vector length + to be applied at the next execve() by the thread (dependent on whether + PR_SME_SET_VL_ONEXEC is present in arg), is set to the largest value + supported by the system that is less than or equal to vl. If vl == + SVE_VL_MAX, the value set will be the largest value supported by the + system. + + * Any previously outstanding deferred vector length change in the calling + thread is cancelled. + + * The returned value describes the resulting configuration, encoded as for + PR_SME_GET_VL. The vector length reported in this value is the new + current vector length for this thread if PR_SME_SET_VL_ONEXEC was not + present in arg; otherwise, the reported vector length is the deferred + vector length that will be applied at the next execve() by the calling + thread. + + * Changing the vector length causes all of ZA, P0..P15, FFR and all bits of + Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become + unspecified, including both streaming and non-streaming SVE state. + Calling PR_SME_SET_VL with vl equal to the thread's current vector + length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag, + does not constitute a change to the vector length for this purpose. + + * Changing the vector length causes PSTATE.ZA and PSTATE.SM to be cleared. + Calling PR_SME_SET_VL with vl equal to the thread's current vector + length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag, + does not constitute a change to the vector length for this purpose. + + +prctl(PR_SME_GET_VL) + + Gets the vector length of the calling thread. + + The following flag may be OR-ed into the result: + + PR_SME_VL_INHERIT + + Vector length will be inherited across execve(). + + There is no way to determine whether there is an outstanding deferred + vector length change (which would only normally be the case between a + fork() or vfork() and the corresponding execve() in typical use). + + To extract the vector length from the result, bitwise and it with + PR_SME_VL_LEN_MASK. + + Return value: a nonnegative value on success, or a negative value on error: + EINVAL: SME not supported.
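For illustration (editor's sketch, not part of the patch), a program might pick a streaming vector length like this; the prctl numbers and PR_SME_* constants are taken from <linux/prctl.h> as added by the SME series, with fallback definitions here in case the libc headers are older::

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SME_SET_VL
    #define PR_SME_SET_VL      63      /* from <linux/prctl.h> */
    #define PR_SME_GET_VL      64
    #define PR_SME_VL_LEN_MASK 0xffff
    #endif

    int main(void)
    {
            /* Ask for a 32-byte (256-bit) streaming vector length. */
            if (prctl(PR_SME_SET_VL, 32) < 0) {
                    perror("PR_SME_SET_VL");   /* e.g. EINVAL: no SME */
                    return 1;
            }

            /* The kernel clamps to the largest supported VL <= 32. */
            int vl = prctl(PR_SME_GET_VL) & PR_SME_VL_LEN_MASK;
            printf("streaming VL: %d bytes\n", vl);
            return 0;
    }

Note that prctl(PR_SME_GET_VL) can also return the PR_SME_VL_INHERIT flag OR-ed into the result, which is why the value is masked before use.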
+ + +8. ptrace extensions +--------------------- + +* A new regset NT_ARM_SSVE is defined for access to streaming mode SVE + state via PTRACE_GETREGSET and PTRACE_SETREGSET; this is documented in + sve.rst. + +* A new regset NT_ARM_ZA is defined for access to ZA state via + PTRACE_GETREGSET and PTRACE_SETREGSET. + + Refer to [2] for definitions. + +The regset data starts with struct user_za_header, containing: + + size + + Size of the complete regset, in bytes. + This depends on vl and possibly on other things in the future. + + If a call to PTRACE_GETREGSET requests less data than the value of + size, the caller can allocate a larger buffer and retry in order to + read the complete regset. + + max_size + + Maximum size in bytes that the regset can grow to for the target + thread. The regset won't grow bigger than this even if the target + thread changes its vector length etc. + + vl + + Target thread's current streaming vector length, in bytes. + + max_vl + + Maximum possible streaming vector length for the target thread. + + flags + + Zero or more of the following flags, which have the same + meaning and behaviour as the corresponding PR_SET_VL_* flags: + + SME_PT_VL_INHERIT + + SME_PT_VL_ONEXEC (SETREGSET only). + +* The effects of changing the vector length and/or flags are equivalent to + those documented for PR_SME_SET_VL. + + The caller must make a further GETREGSET call if it needs to know what VL is + actually set by SETREGSET, unless it is known in advance that the requested + VL is supported. + +* The size and layout of the payload depends on the header fields. The + SME_PT_ZA_*() macros are provided to facilitate access to the data. + +* For SETREGSET it is permissible to omit the payload, in which + case the vector length and flags are changed and PSTATE.ZA is set to 0 + (along with any consequences of those changes). If a payload is provided + then PSTATE.ZA will be set to 1. + +* For SETREGSET, if the requested VL is not supported, the effect will be the + same as if the payload were omitted, except that an EIO error is reported. + No attempt is made to translate the payload data to the correct layout + for the vector length actually set. It is up to the caller to translate the + payload layout for the actual VL and retry. + +* The effect of writing a partial, incomplete payload is unspecified.
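As an editor's sketch (not part of the patch), a debugger might probe the streaming state of a stopped tracee by reading just the NT_ARM_ZA header; struct user_za_header comes from [2], and the NT_ARM_ZA note number from <linux/elf.h> (0x40c at the time of writing, with a fallback definition here for older libc headers)::

    #include <elf.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/uio.h>
    #include <asm/ptrace.h>            /* struct user_za_header [2] */

    #ifndef NT_ARM_ZA
    #define NT_ARM_ZA 0x40c            /* assumed; check <linux/elf.h> */
    #endif

    /* Read only the header: a cheap way to learn the streaming VL and
     * whether ZA holds live data (size beyond the header implies data). */
    static long read_za_header(pid_t pid, struct user_za_header *hdr)
    {
            struct iovec iov = {
                    .iov_base = hdr,
                    .iov_len  = sizeof(*hdr),
            };

            return ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZA, &iov);
    }

A SETREGSET that supplies only this header clears PSTATE.ZA, as described above, which gives a debugger a simple way to discard a target's ZA state.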
+ + +9. ELF coredump extensions +--------------------------- + +* A NT_ARM_SSVE note will be added to each coredump for + each thread of the dumped process. The contents will be equivalent to the + data that would have been read if a PTRACE_GETREGSET of the corresponding + type were executed for each thread when the coredump was generated. + +* A NT_ARM_ZA note will be added to each coredump for each thread of the + dumped process. The contents will be equivalent to the data that would have + been read if a PTRACE_GETREGSET of NT_ARM_ZA were executed for each thread + when the coredump was generated. + + +10. System runtime configuration +--------------------------------- + +* To mitigate the ABI impact of expansion of the signal frame, a policy + mechanism is provided for administrators, distro maintainers and developers + to set the default vector length for userspace processes: + +/proc/sys/abi/sme_default_vector_length + + Writing the text representation of an integer to this file sets the system + default vector length to the specified value, unless the value is greater + than the maximum vector length supported by the system in which case the + default vector length is set to that maximum. + + The result can be determined by reopening the file and reading its + contents. + + At boot, the default vector length is initially set to 32 or the maximum + supported vector length, whichever is smaller and supported. This + determines the initial vector length of the init process (PID 1). + + Reading this file returns the current system default vector length. + +* At every execve() call, the new vector length of the new process is set to + the system default vector length, unless + + * PR_SME_VL_INHERIT (or equivalently SME_PT_VL_INHERIT) is set for the + calling thread, or + + * a deferred vector length change is pending, established via the + PR_SME_SET_VL_ONEXEC flag (or SME_PT_VL_ONEXEC). + +* Modifying the system default vector length does not affect the vector length + of any existing process or thread that does not make an execve() call. + + +Appendix A. SME programmer's model (informative) +================================================= + +This section provides a minimal description of the additions made by SME to the +ARMv8-A programmer's model that are relevant to this document. + +Note: This section is for information only and not intended to be complete or +to replace any architectural specification. + +A.1. Registers +--------------- + +In A64 state, SME adds the following: + +* A new mode, streaming mode, in which a subset of the normal FPSIMD and SVE + features are available. When supported, EL0 software may enter and leave + streaming mode at any time. + + For best system performance it is strongly encouraged for software to enable + streaming mode only when it is actively being used. + +* A new vector length controlling the size of ZA and the Z registers when in + streaming mode, separately to the vector length used for SVE when not in + streaming mode. There is no requirement that either the currently selected + vector length or the set of vector lengths supported for the two modes in + a given system have any relationship. The streaming mode vector length + is referred to as SVL. + +* A new ZA matrix register. This is a square matrix of SVLxSVL bits. Most + operations on ZA require that streaming mode be enabled but ZA can be + enabled without streaming mode in order to load, save and retain data.
+ + For best system performance it is strongly encouraged for software to enable + ZA only when it is actively being used. + +* Two new 1-bit fields in PSTATE which may be controlled via the SMSTART and + SMSTOP instructions or by access to the SVCR system register: + + * PSTATE.ZA: if this is 1 then the ZA matrix is accessible and has valid + data, while if it is 0 then ZA cannot be accessed. When PSTATE.ZA is + changed from 0 to 1 all bits in ZA are cleared. + + * PSTATE.SM: if this is 1 then the PE is in streaming mode. When the value + of PSTATE.SM is changed it is implementation defined whether the subset + of the floating point register bits valid in both modes may be retained. + Any other bits will be cleared. + + +References +========== + +[1] arch/arm64/include/uapi/asm/sigcontext.h + AArch64 Linux signal ABI definitions + +[2] arch/arm64/include/uapi/asm/ptrace.h + AArch64 Linux ptrace ABI definitions + +[3] Documentation/arm64/cpu-feature-registers.rst diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst index 03137154299e..93c2c2990584 100644 --- a/Documentation/arm64/sve.rst +++ b/Documentation/arm64/sve.rst @@ -7,7 +7,9 @@ Author: Dave Martin Date: 4 August 2017 This document outlines briefly the interface provided to userspace by Linux in -order to support use of the ARM Scalable Vector Extension (SVE). +order to support use of the ARM Scalable Vector Extension (SVE), including +interactions with Streaming SVE mode added by the Scalable Matrix Extension +(SME). This is an outline of the most important features and issues only and not intended to be exhaustive. @@ -23,6 +25,10 @@ model features for SVE is included in Appendix A. * SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are tracked per-thread. +* In streaming mode FFR is not accessible unless HWCAP2_SME_FA64 is present + in the system; when it is not supported and these interfaces are used to + access streaming mode, FFR is read and written as zero. + * The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector AT_HWCAP entry. Presence of this flag implies the presence of the SVE instructions and registers, and the Linux-specific system interfaces @@ -53,10 +59,19 @@ model features for SVE is included in Appendix A. which userspace can read using an MRS instruction. See elf_hwcaps.txt and cpu-feature-registers.txt for details. +* On hardware that supports the SME extensions, HWCAP2_SME will also be + reported in the AT_HWCAP2 aux vector entry. Among other things SME adds + streaming mode which provides a subset of the SVE feature set using a + separate SME vector length and the same Z/V registers. See sme.rst + for more details. + * Debuggers should restrict themselves to interacting with the target via the NT_ARM_SVE regset. The recommended way of detecting support for this regset is to connect to a target process first and then attempt a - ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov). + ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov). Note that when SME is + present and streaming SVE mode is in use, the FPSIMD subset of registers + will be read via NT_ARM_SVE and NT_ARM_SVE writes will exit streaming mode + in the target. * Whenever SVE scalable register values (Zn, Pn, FFR) are exchanged in memory between userspace and the kernel, the register value is encoded in memory in @@ -126,6 +141,11 @@ the SVE instruction set architecture. are only present in fpsimd_context.
For convenience, the content of V0..V31 is duplicated between sve_context and fpsimd_context. +* The record contains a flag field which includes a flag SVE_SIG_FLAG_SM which + if set indicates that the thread is in streaming mode and the vector length + and register data (if present) describe the streaming SVE data and vector + length. + * The signal frame record for SVE always contains basic metadata, in particular the thread's vector length (in sve_context.vl). @@ -170,6 +190,11 @@ When returning from a signal handler: the signal frame does not match the current vector length, the signal return attempt is treated as illegal, resulting in a forced SIGSEGV. +* It is permitted to enter or leave streaming mode by setting or clearing + the SVE_SIG_FLAG_SM flag but applications should take care to ensure that + when doing so sve_context.vl and any register data are appropriate for the + vector length in the new mode. + 6. prctl extensions -------------------- @@ -255,7 +280,7 @@ prctl(PR_SVE_GET_VL) vector length change (which would only normally be the case between a fork() or vfork() and the corresponding execve() in typical use). - To extract the vector length from the result, and it with + To extract the vector length from the result, bitwise and it with PR_SVE_VL_LEN_MASK. Return value: a nonnegative value on success, or a negative value on error: @@ -265,8 +290,14 @@ prctl(PR_SVE_GET_VL) 7. ptrace extensions --------------------- -* A new regset NT_ARM_SVE is defined for use with PTRACE_GETREGSET and - PTRACE_SETREGSET. +* New regsets NT_ARM_SVE and NT_ARM_SSVE are defined for use with + PTRACE_GETREGSET and PTRACE_SETREGSET. NT_ARM_SSVE describes the + streaming mode SVE registers and NT_ARM_SVE describes the + non-streaming mode SVE registers. + + In this description a register set is referred to as being "live" when + the target is in the appropriate streaming or non-streaming mode and is + using data beyond the subset shared with the FPSIMD Vn registers. Refer to [2] for definitions. @@ -297,7 +328,7 @@ The regset data starts with struct user_sve_header, containing: flags - either + at most one of SVE_PT_REGS_FPSIMD @@ -331,6 +362,10 @@ The regset data starts with struct user_sve_header, containing: SVE_PT_VL_ONEXEC (SETREGSET only). + If neither FPSIMD nor SVE flags are provided then no register + payload is available; this is only possible when SME is implemented. + + * The effects of changing the vector length and/or flags are equivalent to those documented for PR_SVE_SET_VL. @@ -346,6 +381,13 @@ The regset data starts with struct user_sve_header, containing: case only the vector length and flags are changed (along with any consequences of those changes). +* In systems supporting SME, when in streaming mode a GETREGSET for + NT_ARM_SVE will return only the user_sve_header with no register data; + similarly a GETREGSET for NT_ARM_SSVE will not return any register data + when not in streaming mode. + +* A GETREGSET for NT_ARM_SSVE will never return SVE_PT_REGS_FPSIMD. + * For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the requested VL is not supported, the effect will be the same as if the payload were omitted, except that an EIO error is reported. No @@ -355,17 +397,25 @@ The regset data starts with struct user_sve_header, containing: unspecified. It is up to the caller to translate the payload layout for the actual VL and retry.
+* Where SME is implemented it is not possible to GETREGSET the register + state for normal SVE when in streaming mode, nor the streaming mode + register state when in normal mode, regardless of the implementation defined + behaviour of the hardware for sharing data between the two modes. + +* Any SETREGSET of NT_ARM_SVE will exit streaming mode if the target was in + streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode + if the target was not in streaming mode. + * The effect of writing a partial, incomplete payload is unspecified. 8. ELF coredump extensions --------------------------- -* A NT_ARM_SVE note will be added to each coredump for each thread of the - dumped process. The contents will be equivalent to the data that would have - been read if a PTRACE_GETREGSET of NT_ARM_SVE were executed for each thread - when the coredump was generated. - +* NT_ARM_SVE and NT_ARM_SSVE notes will be added to each coredump for + each thread of the dumped process. The contents will be equivalent to the + data that would have been read if a PTRACE_GETREGSET of the corresponding + type were executed for each thread when the coredump was generated. 9. System runtime configuration -------------------------------- diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 7f9b40d6b416..4fd2b1171301 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -1,5 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 +.. _inline_encryption: + ================= Inline Encryption ================= @@ -7,230 +9,269 @@ Inline Encryption Background ========== -Inline encryption hardware sits logically between memory and the disk, and can -en/decrypt data as it goes in/out of the disk. Inline encryption hardware has a -fixed number of "keyslots" - slots into which encryption contexts (i.e. the -encryption key, encryption algorithm, data unit size) can be programmed by the -kernel at any time. Each request sent to the disk can be tagged with the index -of a keyslot (and also a data unit number to act as an encryption tweak), and -the inline encryption hardware will en/decrypt the data in the request with the -encryption context programmed into that keyslot. This is very different from -full disk encryption solutions like self encrypting drives/TCG OPAL/ATA -Security standards, since with inline encryption, any block on disk could be -encrypted with any encryption context the kernel chooses. +Inline encryption hardware sits logically between memory and disk, and can +en/decrypt data as it goes in/out of the disk. For each I/O request, software +can control exactly how the inline encryption hardware will en/decrypt the data +in terms of key, algorithm, data unit size (the granularity of en/decryption), +and data unit number (a value that determines the initialization vector(s)). +Some inline encryption hardware accepts all encryption parameters including raw +keys directly in low-level I/O requests. However, most inline encryption +hardware instead has a fixed number of "keyslots" and requires that the key, +algorithm, and data unit size first be programmed into a keyslot. Each +low-level I/O request then just contains a keyslot index and data unit number. + +Note that inline encryption hardware is very different from traditional crypto +accelerators, which are supported through the kernel crypto API. 
Traditional +crypto accelerators operate on memory regions, whereas inline encryption +hardware operates on I/O requests. Thus, inline encryption hardware needs to be +managed by the block layer, not the kernel crypto API. + +Inline encryption hardware is also very different from "self-encrypting drives", +such as those based on the TCG Opal or ATA Security standards. Self-encrypting +drives don't provide fine-grained control of encryption and provide no way to +verify the correctness of the resulting ciphertext. Inline encryption hardware +provides fine-grained control of encryption, including the choice of key and +initialization vector for each sector, and can be tested for correctness. Objective ========= -We want to support inline encryption (IE) in the kernel. -To allow for testing, we also want a crypto API fallback when actual -IE hardware is absent. We also want IE to work with layered devices -like dm and loopback (i.e. we want to be able to use the IE hardware -of the underlying devices if present, or else fall back to crypto API -en/decryption). - +We want to support inline encryption in the kernel. To make testing easier, we +also want support for falling back to the kernel crypto API when actual inline +encryption hardware is absent. We also want inline encryption to work with +layered devices like device-mapper and loopback (i.e. we want to be able to use +the inline encryption hardware of the underlying devices if present, or else +fall back to crypto API en/decryption). Constraints and notes ===================== -- IE hardware has a limited number of "keyslots" that can be programmed - with an encryption context (key, algorithm, data unit size, etc.) at any time. - One can specify a keyslot in a data request made to the device, and the - device will en/decrypt the data using the encryption context programmed into - that specified keyslot. When possible, we want to make multiple requests with - the same encryption context share the same keyslot. +- We need a way for upper layers (e.g. filesystems) to specify an encryption + context to use for en/decrypting a bio, and device drivers (e.g. UFSHCD) need + to be able to use that encryption context when they process the request. + Encryption contexts also introduce constraints on bio merging; the block layer + needs to be aware of these constraints. -- We need a way for upper layers like filesystems to specify an encryption - context to use for en/decrypting a struct bio, and a device driver (like UFS) - needs to be able to use that encryption context when it processes the bio. +- Different inline encryption hardware has different supported algorithms, + supported data unit sizes, maximum data unit numbers, etc. We call these + properties the "crypto capabilities". We need a way for device drivers to + advertise crypto capabilities to upper layers in a generic way. -- We need a way for device drivers to expose their inline encryption - capabilities in a unified way to the upper layers. +- Inline encryption hardware usually (but not always) requires that keys be + programmed into keyslots before being used. Since programming keyslots may be + slow and there may not be very many keyslots, we shouldn't just program the + key for every I/O request, but rather keep track of which keys are in the + keyslots and reuse an already-programmed keyslot when possible. +- Upper layers typically define a specific end-of-life for crypto keys, e.g. + when an encrypted directory is locked or when a crypto mapping is torn down. 
+ At these times, keys are wiped from memory. We must provide a way for upper + layers to also evict keys from any keyslots they are present in. -Design -====== +- When possible, device-mapper devices must be able to pass through the inline + encryption support of their underlying devices. However, it doesn't make + sense for device-mapper devices to have keyslots themselves. -We add a struct bio_crypt_ctx to struct bio that can -represent an encryption context, because we need to be able to pass this -encryption context from the upper layers (like the fs layer) to the -device driver to act upon. +Basic design +============ -While IE hardware works on the notion of keyslots, the FS layer has no -knowledge of keyslots - it simply wants to specify an encryption context to -use while en/decrypting a bio. +We introduce ``struct blk_crypto_key`` to represent an inline encryption key and +how it will be used. This includes the type of the key (standard or +hardware-wrapped); the actual bytes of the key; the size of the key; the +algorithm and data unit size the key will be used with; and the number of bytes +needed to represent the maximum data unit number the key will be used with. -We introduce a keyslot manager (KSM) that handles the translation from -encryption contexts specified by the FS to keyslots on the IE hardware. -This KSM also serves as the way IE hardware can expose its capabilities to -upper layers. The generic mode of operation is: each device driver that wants -to support IE will construct a KSM and set it up in its struct request_queue. -Upper layers that want to use IE on this device can then use this KSM in -the device's struct request_queue to translate an encryption context into -a keyslot. The presence of the KSM in the request queue shall be used to mean -that the device supports IE. +We introduce ``struct bio_crypt_ctx`` to represent an encryption context. It +contains a data unit number and a pointer to a blk_crypto_key. We add pointers +to a bio_crypt_ctx to ``struct bio`` and ``struct request``; this allows users +of the block layer (e.g. filesystems) to provide an encryption context when +creating a bio and have it be passed down the stack for processing by the block +layer and device drivers. Note that the encryption context doesn't explicitly +say whether to encrypt or decrypt, as that is implicit from the direction of the +bio; WRITE means encrypt, and READ means decrypt. -The KSM uses refcounts to track which keyslots are idle (either they have no -encryption context programmed, or there are no in-flight struct bios -referencing that keyslot). When a new encryption context needs a keyslot, it -tries to find a keyslot that has already been programmed with the same -encryption context, and if there is no such keyslot, it evicts the least -recently used idle keyslot and programs the new encryption context into that -one. If no idle keyslots are available, then the caller will sleep until there -is at least one. +We also introduce ``struct blk_crypto_profile`` to contain all generic inline +encryption-related state for a particular inline encryption device. The +blk_crypto_profile serves as the way that drivers for inline encryption hardware +advertise their crypto capabilities and provide certain functions (e.g., +functions to program and evict keys) to upper layers. Each device driver that +wants to support inline encryption will construct a blk_crypto_profile, then +associate it with the disk's request_queue. 
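As an editor's sketch of what that construction looks like in a driver (not part of the patch; the ``myhw_`` names, the surrounding device structure, and MYHW_NUM_KEYSLOTS are hypothetical, and exact signatures vary by kernel version)::

    #include <linux/blk-crypto-profile.h>

    static int myhw_keyslot_program(struct blk_crypto_profile *profile,
                                    const struct blk_crypto_key *key,
                                    unsigned int slot)
    {
            /* Program the key into hardware keyslot @slot here. */
            return 0;
    }

    static int myhw_keyslot_evict(struct blk_crypto_profile *profile,
                                  const struct blk_crypto_key *key,
                                  unsigned int slot)
    {
            /* Zero out hardware keyslot @slot here. */
            return 0;
    }

    static const struct blk_crypto_ll_ops myhw_crypto_ops = {
            .keyslot_program = myhw_keyslot_program,
            .keyslot_evict   = myhw_keyslot_evict,
    };

    /* struct myhw_dev, ->dev and ->queue stand in for driver state. */
    static int myhw_init_crypto(struct myhw_dev *hw)
    {
            struct blk_crypto_profile *profile = &hw->crypto_profile;
            int err;

            err = devm_blk_crypto_profile_init(hw->dev, profile,
                                               MYHW_NUM_KEYSLOTS);
            if (err)
                    return err;

            profile->ll_ops = myhw_crypto_ops;
            profile->max_dun_bytes_supported = 8;
            /* Advertise AES-256-XTS on 4096-byte data units. */
            profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] |= 4096;

            /* Return-value check omitted for brevity. */
            blk_crypto_register(profile, hw->queue);
            return 0;
    }

The capability fields and ``ll_ops`` used here are described in more detail in "API presented to device drivers" below.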
+The blk_crypto_profile also manages the hardware's keyslots, when applicable. +This happens in the block layer, so that users of the block layer can just +specify encryption contexts and don't need to know about keyslots at all, nor do +device drivers need to care about most details of keyslot management. -blk-mq changes, other block layer changes and blk-crypto-fallback -================================================================= +Specifically, for each keyslot, the block layer (via the blk_crypto_profile) +keeps track of which blk_crypto_key that keyslot contains (if any), and how many +in-flight I/O requests are using it. When the block layer creates a +``struct request`` for a bio that has an encryption context, it grabs a keyslot +that already contains the key if possible. Otherwise it waits for an idle +keyslot (a keyslot that isn't in-use by any I/O), then programs the key into the +least-recently-used idle keyslot using the function the device driver provided. +In both cases, the resulting keyslot is stored in the ``crypt_keyslot`` field of +the request, where it is then accessible to device drivers and is released after +the request completes. -We add a pointer to a ``bi_crypt_context`` and ``keyslot`` to -struct request. These will be referred to as the ``crypto fields`` -for the request. This ``keyslot`` is the keyslot into which the -``bi_crypt_context`` has been programmed in the KSM of the ``request_queue`` -that this request is being sent to. +``struct request`` also contains a pointer to the original bio_crypt_ctx. +Requests can be built from multiple bios, and the block layer must take the +encryption context into account when trying to merge bios and requests. For two +bios/requests to be merged, they must have compatible encryption contexts: both +unencrypted, or both encrypted with the same key and contiguous data unit +numbers. Only the encryption context for the first bio in a request is +retained, since the remaining bios have been verified to be merge-compatible +with the first bio. -We introduce ``block/blk-crypto-fallback.c``, which allows upper layers to remain -blissfully unaware of whether or not real inline encryption hardware is present -underneath. When a bio is submitted with a target ``request_queue`` that doesn't -support the encryption context specified with the bio, the block layer will -en/decrypt the bio with the blk-crypto-fallback. +To make it possible for inline encryption to work with request_queue based +layered devices, when a request is cloned, its encryption context is cloned as +well. When the cloned request is submitted, it is then processed as usual; this +includes getting a keyslot from the clone's target device if needed. -If the bio is a ``WRITE`` bio, a bounce bio is allocated, and the data in the bio -is encrypted stored in the bounce bio - blk-mq will then proceed to process the -bounce bio as if it were not encrypted at all (except when blk-integrity is -concerned). ``blk-crypto-fallback`` sets the bounce bio's ``bi_end_io`` to an -internal function that cleans up the bounce bio and ends the original bio. +blk-crypto-fallback +=================== -If the bio is a ``READ`` bio, the bio's ``bi_end_io`` (and also ``bi_private``) -is saved and overwritten by ``blk-crypto-fallback`` to -``bio_crypto_fallback_decrypt_bio``. The bio's ``bi_crypt_context`` is also -overwritten with ``NULL``, so that to the rest of the stack, the bio looks -as if it was a regular bio that never had an encryption context specified. 
-``bio_crypto_fallback_decrypt_bio`` will decrypt the bio, restore the original -``bi_end_io`` (and also ``bi_private``) and end the bio again. +It is desirable for the inline encryption support of upper layers (e.g. +filesystems) to be testable without real inline encryption hardware, and +likewise for the block layer's keyslot management logic. It is also desirable +to allow upper layers to just always use inline encryption rather than have to +implement encryption in multiple ways. -Regardless of whether real inline encryption hardware is used or the +Therefore, we also introduce *blk-crypto-fallback*, which is an implementation +of inline encryption using the kernel crypto API. blk-crypto-fallback is built +into the block layer, so it works on any block device without any special setup. +Essentially, when a bio with an encryption context is submitted to a +block_device that doesn't support that encryption context, the block layer will +handle en/decryption of the bio using blk-crypto-fallback. + +For encryption, the data cannot be encrypted in-place, as callers usually rely +on it being unmodified. Instead, blk-crypto-fallback allocates bounce pages, +fills a new bio with those bounce pages, encrypts the data into those bounce +pages, and submits that "bounce" bio. When the bounce bio completes, +blk-crypto-fallback completes the original bio. If the original bio is too +large, multiple bounce bios may be required; see the code for details. + +For decryption, blk-crypto-fallback "wraps" the bio's completion callback +(``bi_end_io``) and private data (``bi_private``) with its own, unsets the +bio's encryption context, then submits the bio. If the read completes +successfully, blk-crypto-fallback restores the bio's original completion +callback and private data, then decrypts the bio's data in-place using the +kernel crypto API. Decryption happens from a workqueue, as it may sleep. +Afterwards, blk-crypto-fallback completes the bio. + +In both cases, the bios that blk-crypto-fallback submits no longer have an +encryption context. Therefore, lower layers only see standard unencrypted I/O. + +blk-crypto-fallback also defines its own blk_crypto_profile and has its own +"keyslots"; its keyslots contain ``struct crypto_skcipher`` objects. The reason +for this is twofold. First, it allows the keyslot management logic to be tested +without actual inline encryption hardware. Second, similar to actual inline +encryption hardware, the crypto API doesn't accept keys directly in requests but +rather requires that keys be set ahead of time, and setting keys can be +expensive; moreover, allocating a crypto_skcipher can't happen on the I/O path +at all due to the locks it takes. Therefore, the concept of keyslots still +makes sense for blk-crypto-fallback. + +Note that regardless of whether real inline encryption hardware or blk-crypto-fallback is used, the ciphertext written to disk (and hence the -on-disk format of data) will be the same (assuming the hardware's implementation -of the algorithm being used adheres to spec and functions correctly). - -If a ``request queue``'s inline encryption hardware claimed to support the -encryption context specified with a bio, then it will not be handled by the -``blk-crypto-fallback``. We will eventually reach a point in blk-mq when a -struct request needs to be allocated for that bio.
At that point, -blk-mq tries to program the encryption context into the ``request_queue``'s -keyslot_manager, and obtain a keyslot, which it stores in its newly added -``keyslot`` field. This keyslot is released when the request is completed. - -When the first bio is added to a request, ``blk_crypto_rq_bio_prep`` is called, -which sets the request's ``crypt_ctx`` to a copy of the bio's -``bi_crypt_context``. bio_crypt_do_front_merge is called whenever a subsequent -bio is merged to the front of the request, which updates the ``crypt_ctx`` of -the request so that it matches the newly merged bio's ``bi_crypt_context``. In particular, the request keeps a copy of the ``bi_crypt_context`` of the first -bio in its bio-list (blk-mq needs to be careful to maintain this invariant -during bio and request merges). - -To make it possible for inline encryption to work with request queue based -layered devices, when a request is cloned, its ``crypto fields`` are cloned as -well. When the cloned request is submitted, blk-mq programs the -``bi_crypt_context`` of the request into the clone's request_queue's keyslot -manager, and stores the returned keyslot in the clone's ``keyslot``. +on-disk format of data) will be the same (assuming that both the inline +encryption hardware's implementation and the kernel crypto API's implementation +of the algorithm being used adhere to spec and function correctly). +blk-crypto-fallback is optional and is controlled by the +``CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK`` kernel configuration option. API presented to users of the block layer ========================================= -``struct blk_crypto_key`` represents a crypto key (the raw key, size of the -key, the crypto algorithm to use, the data unit size to use, and the number of -bytes required to represent data unit numbers that will be specified with the -``bi_crypt_context``). +``blk_crypto_config_supported()`` allows users to check ahead of time whether +inline encryption with particular crypto settings will work on a particular +block_device -- either via hardware or via blk-crypto-fallback. This function +takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits +the actual bytes of the key and instead just contains the algorithm, data unit +size, etc. This function can be useful if blk-crypto-fallback is disabled. -``blk_crypto_init_key`` allows upper layers to initialize such a -``blk_crypto_key``. +``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key. -``bio_crypt_set_ctx`` should be called on any bio that a user of -the block layer wants en/decrypted via inline encryption (or the -blk-crypto-fallback, if hardware support isn't available for the desired -crypto configuration). This function takes the ``blk_crypto_key`` and the -data unit number (DUN) to use when en/decrypting the bio. +Users must call ``blk_crypto_start_using_key()`` before actually starting to use +a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()`` +was called earlier). This is needed to initialize blk-crypto-fallback if it +will be needed. This must not be called from the data path, as this may have to +allocate resources, which may deadlock in that case. -``blk_crypto_config_supported`` allows upper layers to query whether or not the -an encryption context passed to request queue can be handled by blk-crypto -(either by real inline encryption hardware, or by the blk-crypto-fallback). -This is useful e.g. 
when blk-crypto-fallback is disabled, and the upper layer -wants to use an algorithm that may not supported by hardware - this function -lets the upper layer know ahead of time that the algorithm isn't supported, -and the upper layer can fallback to something else if appropriate. +Next, to attach an encryption context to a bio, users should call +``bio_crypt_set_ctx()``. This function allocates a bio_crypt_ctx and attaches +it to a bio, given the blk_crypto_key and the data unit number that will be used +for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx +later, as that happens automatically when the bio is freed or reset. -``blk_crypto_start_using_key`` - Upper layers must call this function on -``blk_crypto_key`` and a ``request_queue`` before using the key with any bio -headed for that ``request_queue``. This function ensures that either the -hardware supports the key's crypto settings, or the crypto API fallback has -transforms for the needed mode allocated and ready to go. Note that this -function may allocate an ``skcipher``, and must not be called from the data -path, since allocating ``skciphers`` from the data path can deadlock. +Finally, when done using inline encryption with a blk_crypto_key on a +block_device, users must call ``blk_crypto_evict_key()``. This ensures that +the key is evicted from all keyslots it may be programmed into and unlinked from +any kernel data structures it may be linked into. -``blk_crypto_evict_key`` *must* be called by upper layers before a -``blk_crypto_key`` is freed. Further, it *must* only be called only once -there are no more in-flight requests that use that ``blk_crypto_key``. -``blk_crypto_evict_key`` will ensure that a key is removed from any keyslots in -inline encryption hardware that the key might have been programmed into (or the blk-crypto-fallback). +In summary, for users of the block layer, the lifecycle of a blk_crypto_key is +as follows: + +1. ``blk_crypto_config_supported()`` (optional) +2. ``blk_crypto_init_key()`` +3. ``blk_crypto_start_using_key()`` +4. ``bio_crypt_set_ctx()`` (potentially many times) +5. ``blk_crypto_evict_key()`` (after all I/O has completed) +6. Zeroize the blk_crypto_key (this has no dedicated function) + +If a blk_crypto_key is being used on multiple block_devices, then +``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``, +and ``blk_crypto_evict_key()`` must be called on each block_device. API presented to device drivers =============================== -A :c:type:``struct blk_keyslot_manager`` should be set up by device drivers in -the ``request_queue`` of the device. The device driver needs to call -``blk_ksm_init`` (or its resource-managed variant ``devm_blk_ksm_init``) on the -``blk_keyslot_manager``, while specifying the number of keyslots supported by -the hardware. +A device driver that wants to support inline encryption must set up a +blk_crypto_profile in the request_queue of its device. To do this, it first +must call ``blk_crypto_profile_init()`` (or its resource-managed variant +``devm_blk_crypto_profile_init()``), providing the number of keyslots. -The device driver also needs to tell the KSM how to actually manipulate the -IE hardware in the device to do things like programming the crypto key into -the IE hardware into a particular keyslot. All this is achieved through the -struct blk_ksm_ll_ops field in the KSM that the device driver -must fill up after initing the ``blk_keyslot_manager``. 
+Next, it must advertise its crypto capabilities by setting fields in the +blk_crypto_profile, e.g. ``modes_supported`` and ``max_dun_bytes_supported``. -The KSM also handles runtime power management for the device when applicable -(e.g. when it wants to program a crypto key into the IE hardware, the device -must be runtime powered on) - so the device driver must also set the ``dev`` -field in the ksm to point to the `struct device` for the KSM to use for runtime -power management. +It then must set function pointers in the ``ll_ops`` field of the +blk_crypto_profile to tell upper layers how to control the inline encryption +hardware, e.g. how to program and evict keyslots. Most drivers will need to +implement ``keyslot_program`` and ``keyslot_evict``. For details, see the +comments for ``struct blk_crypto_ll_ops``. -``blk_ksm_reprogram_all_keys`` can be called by device drivers if the device -needs each and every of its keyslots to be reprogrammed with the key it -"should have" at the point in time when the function is called. This is useful -e.g. if a device loses all its keys on runtime power down/up. +Once the driver registers a blk_crypto_profile with a request_queue, I/O +requests the driver receives via that queue may have an encryption context. All +encryption contexts will be compatible with the crypto capabilities declared in +the blk_crypto_profile, so drivers don't need to worry about handling +unsupported requests. Also, if a nonzero number of keyslots was declared in the +blk_crypto_profile, then all I/O requests that have an encryption context will +also have a keyslot which was already programmed with the appropriate key. -If the driver used ``blk_ksm_init`` instead of ``devm_blk_ksm_init``, then -``blk_ksm_destroy`` should be called to free up all resources used by a -``blk_keyslot_manager`` once it is no longer needed. +If the driver implements runtime suspend and its blk_crypto_ll_ops don't work +while the device is runtime-suspended, then the driver must also set the ``dev`` +field of the blk_crypto_profile to point to the ``struct device`` that will be +resumed before any of the low-level operations are called. + +If there are situations where the inline encryption hardware loses the contents +of its keyslots, e.g. device resets, the driver must handle reprogramming the +keyslots. To do this, the driver may call ``blk_crypto_reprogram_all_keys()``. + +Finally, if the driver used ``blk_crypto_profile_init()`` instead of +``devm_blk_crypto_profile_init()``, then it is responsible for calling +``blk_crypto_profile_destroy()`` when the crypto profile is no longer needed. Layered Devices =============== -Request queue based layered devices like dm-rq that wish to support IE need to -create their own keyslot manager for their request queue, and expose whatever -functionality they choose. When a layered device wants to pass a clone of that -request to another ``request_queue``, blk-crypto will initialize and prepare the -clone as necessary - see ``blk_crypto_insert_cloned_request`` in -``blk-crypto.c``. - - -Future Optimizations for layered devices -======================================== - -Creating a keyslot manager for a layered device uses up memory for each -keyslot, and in general, a layered device merely passes the request on to a -"child" device, so the keyslots in the layered device itself are completely -unused, and don't need any refcounting or keyslot programming. 
We can instead -define a new type of KSM; the "passthrough KSM", that layered devices can use -to advertise an unlimited number of keyslots, and support for any encryption -algorithms they choose, while not actually using any memory for each keyslot. -Another use case for the "passthrough KSM" is for IE devices that do not have a -limited number of keyslots. - +Request queue based layered devices like dm-rq that wish to support inline +encryption need to create their own blk_crypto_profile for their request_queue, +and expose whatever functionality they choose. When a layered device wants to +pass a clone of that request to another request_queue, blk-crypto will +initialize and prepare the clone as necessary; see +``blk_crypto_insert_cloned_request()``. Interaction between inline encryption and blk integrity ======================================================= @@ -257,7 +298,220 @@ Because there isn't any real hardware yet, it seems prudent to assume that hardware implementations might not implement both features together correctly, and disallow the combination for now. Whenever a device supports integrity, the kernel will pretend that the device does not support hardware inline encryption -(by essentially setting the keyslot manager in the request_queue of the device -to NULL). When the crypto API fallback is enabled, this means that all bios with -and encryption context will use the fallback, and IO will complete as usual. -When the fallback is disabled, a bio with an encryption context will be failed. +(by setting the blk_crypto_profile in the request_queue of the device to NULL). +When the crypto API fallback is enabled, this means that all bios with an +encryption context will use the fallback, and IO will complete as usual. When +the fallback is disabled, a bio with an encryption context will be failed. + +.. _hardware_wrapped_keys: + +Hardware-wrapped keys +===================== + +Motivation and threat model +--------------------------- + +Linux storage encryption (dm-crypt, fscrypt, eCryptfs, etc.) traditionally +relies on the raw encryption key(s) being present in kernel memory so that the +encryption can be performed. This traditionally isn't seen as a problem because +the key(s) won't be present during an offline attack, which is the main type of +attack that storage encryption is intended to protect from. + +However, there is an increasing desire to also protect users' data from other +types of attacks (to the extent possible), including: + +- Cold boot attacks, where an attacker with physical access to a system suddenly + powers it off, then immediately dumps the system memory to extract recently + in-use encryption keys, then uses these keys to decrypt user data on-disk. + +- Online attacks where the attacker is able to read kernel memory without fully + compromising the system, followed by an offline attack where any extracted + keys can be used to decrypt user data on-disk. An example of such an online + attack would be if the attacker is able to run some code on the system that + exploits a Meltdown-like vulnerability but is unable to escalate privileges. + +- Online attacks where the attacker fully compromises the system, but their data + exfiltration is significantly time-limited and/or bandwidth-limited, so in + order to completely exfiltrate the data they need to extract the encryption + keys to use in a later offline attack.
+ +Hardware-wrapped keys are a feature of inline encryption hardware that is +designed to protect users' data from the above attacks (to the extent possible), +without introducing limitations such as a maximum number of keys. + +Note that it is impossible to **fully** protect users' data from these attacks. +Even in the attacks where the attacker "just" gets read access to kernel memory, +they can still extract any user data that is present in memory, including +plaintext pagecache pages of encrypted files. The focus here is just on +protecting the encryption keys, as those instantly give access to **all** user +data in any following offline attack, rather than just some of it (where which +data is included in that "some" might not be controlled by the attacker). + +Solution overview +----------------- + +Inline encryption hardware typically has "keyslots" into which software can +program keys for the hardware to use; the contents of keyslots typically can't +be read back by software. As such, the above security goals could be achieved +if the kernel simply erased its copy of the key(s) after programming them into +keyslot(s) and thereafter only referred to them via keyslot number. + +However, that naive approach runs into the problem that it limits the number of +unlocked keys to the number of keyslots, which typically is a small number. In +cases where there is only one encryption key system-wide (e.g., a full-disk +encryption key), that can be tolerable. However, in general there can be many +logged-in users with many different keys, and/or many running applications with +application-specific encrypted storage areas. This is especially true if +file-based encryption (e.g. fscrypt) is being used. + +Thus, it is important for the kernel to still have a way to "remind" the +hardware about a key, without actually having the raw key itself. This would +ensure that the number of hardware keyslots only limits the number of active I/O +requests, not other things such as the number of logged-in users, the number of +running apps, or the number of encrypted storage areas that apps can create. + +Somewhat less importantly, it is also desirable that the raw keys are never +visible to software at all, even while being initially unlocked. This would +ensure that a read-only compromise of system memory will never allow a key to be +extracted to be used off-system, even if it occurs when a key is being unlocked. + +To solve all these problems, some vendors of inline encryption hardware have +made their hardware support *hardware-wrapped keys*. Hardware-wrapped keys +are encrypted keys that can only be unwrapped (decrypted) and used by hardware +-- either by the inline encryption hardware itself, or by a dedicated hardware +block that can directly provision keys to the inline encryption hardware. + +(We refer to them as "hardware-wrapped keys" rather than simply "wrapped keys" +to add some clarity in cases where there could be other types of wrapped keys, +such as in file-based encryption. Key wrapping is a commonly used technique.) + +The key which wraps (encrypts) hardware-wrapped keys is a hardware-internal key +that is never exposed to software; it is either a persistent key (a "long-term +wrapping key") or a per-boot key (an "ephemeral wrapping key"). The long-term +wrapped form of the key is what is initially unlocked, but it is erased from +memory as soon as it is converted into an ephemerally-wrapped key. In-use +hardware-wrapped keys are always ephemerally-wrapped, not long-term wrapped. 
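+To make that lifecycle concrete, here is a minimal C sketch of the unlock flow
+just described. Every function in it is a hypothetical placeholder: real
+systems perform these steps through vendor-specific hardware interfaces, not
+through any kernel API shown here::
+
+    #include <linux/string.h>
+    #include <linux/types.h>
+
+    /* Hypothetical hardware hooks -- placeholders, not a real kernel API. */
+    int hw_rewrap_key(const u8 *in, size_t in_len, u8 *out, size_t out_len);
+    int hw_program_keyslot(const u8 *eph_blob, size_t blob_len);
+
+    static int unlock_hw_wrapped_key(u8 *longterm_blob, size_t blob_len)
+    {
+            u8 ephemeral_blob[128];    /* size is hardware-specific */
+            int err;
+
+            /* Have the hardware rewrap the key with its ephemeral key. */
+            err = hw_rewrap_key(longterm_blob, blob_len,
+                                ephemeral_blob, sizeof(ephemeral_blob));
+            if (err)
+                    return err;
+
+            /* The long-term form is no longer needed in memory: erase it. */
+            memzero_explicit(longterm_blob, blob_len);
+
+            /*
+             * From now on, only the ephemerally-wrapped form is kept, and it
+             * is handed to a keyslot as needed; the raw (unwrapped) key never
+             * appears in software-visible memory.
+             */
+            return hw_program_keyslot(ephemeral_blob, sizeof(ephemeral_blob));
+    }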
+
+As inline encryption hardware can only be used to encrypt/decrypt data on-disk,
+the hardware also includes a level of indirection; it doesn't use the unwrapped
+key directly for inline encryption, but rather derives both an inline encryption
+key and a "software secret" from it. Software can use the "software secret" for
+tasks that can't use the inline encryption hardware, such as filenames
+encryption. The software secret is not protected from memory compromise.
+
+Key hierarchy
+-------------
+
+Here is the key hierarchy for a hardware-wrapped key::
+
+                       Hardware-wrapped key
+                                |
+                                |
+                         <Hardware KDF>
+                                |
+                  -----------------------------
+                  |                           |
+        Inline encryption key       Software secret
+
+The components are:
+
+- *Hardware-wrapped key*: a key for the hardware's KDF (Key Derivation
+  Function), in ephemerally-wrapped form. The key wrapping algorithm is a
+  hardware implementation detail that doesn't impact kernel operation, but a
+  strong authenticated encryption algorithm such as AES-256-GCM is recommended.
+
+- *Hardware KDF*: a KDF (Key Derivation Function) which the hardware uses to
+  derive subkeys after unwrapping the wrapped key. The hardware's choice of KDF
+  doesn't impact kernel operation, but it does need to be known for testing
+  purposes, and it's also assumed to have at least a 256-bit security strength.
+  All known hardware uses the SP800-108 KDF in Counter Mode with AES-256-CMAC,
+  with a particular choice of labels and contexts; new hardware should use this
+  already-vetted KDF.
+
+- *Inline encryption key*: a derived key which the hardware directly provisions
+  to a keyslot of the inline encryption hardware, without exposing it to
+  software. In all known hardware, this will always be an AES-256-XTS key.
+  However, in principle other encryption algorithms could be supported too.
+  Hardware must derive distinct subkeys for each supported encryption algorithm.
+
+- *Software secret*: a derived key which the hardware returns to software so
+  that software can use it for cryptographic tasks that can't use inline
+  encryption. This value is cryptographically isolated from the inline
+  encryption key, i.e. knowing one doesn't reveal the other. (The KDF ensures
+  this.) Currently, the software secret is always 32 bytes and thus is suitable
+  for cryptographic applications that require up to a 256-bit security strength.
+  Some use cases (e.g. full-disk encryption) won't require the software secret.
+
+Example: in the case of fscrypt, the fscrypt master key (the key that protects a
+particular set of encrypted directories) is made hardware-wrapped. The inline
+encryption key is used as the file contents encryption key, while the software
+secret (rather than the master key directly) is used to key fscrypt's KDF
+(HKDF-SHA512) to derive other subkeys such as filenames encryption keys.
+
+Note that currently this design assumes a single inline encryption key per
+hardware-wrapped key, without any further key derivation. Thus, in the case of
+fscrypt, currently hardware-wrapped keys are only compatible with the "inline
+encryption optimized" settings, which use one file contents encryption key per
+encryption policy rather than one per file. This design could be extended to
+make the hardware derive per-file keys using per-file nonces passed down the
+storage stack, and in fact some hardware already supports this; future work is
+planned to remove this limitation by adding the corresponding kernel support.
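+For testing (see the "Testability" section below), the hardware KDF can be
+reproduced in software when the unwrapped key is known. The following is a
+rough kernel-style C sketch of an SP800-108 counter-mode KDF using
+AES-256-CMAC. The message layout (counter || label || 0x00 || context ||
+output length) is the standard SP800-108 one, but the actual label and context
+values are hardware-specific, so the parameters here are assumptions, not a
+specification::
+
+    #include <crypto/aes.h>
+    #include <crypto/hash.h>
+    #include <linux/err.h>
+    #include <linux/minmax.h>
+    #include <linux/string.h>
+
+    static int sp800_108_cmac_kdf(const u8 *key, unsigned int key_len,
+                                  const u8 *label, unsigned int label_len,
+                                  const u8 *context, unsigned int context_len,
+                                  u8 *out, unsigned int out_len)
+    {
+            const __be32 L = cpu_to_be32(out_len * 8);  /* output bits */
+            const u8 zero = 0;
+            struct crypto_shash *tfm;
+            unsigned int i;
+            int err;
+
+            tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
+            if (IS_ERR(tfm))
+                    return PTR_ERR(tfm);
+            err = crypto_shash_setkey(tfm, key, key_len);
+
+            /* KO[i] = CMAC(K, i || label || 0x00 || context || L) */
+            for (i = 0; !err && i < out_len; i += AES_BLOCK_SIZE) {
+                    SHASH_DESC_ON_STACK(desc, tfm);
+                    const __be32 ctr = cpu_to_be32(i / AES_BLOCK_SIZE + 1);
+                    u8 block[AES_BLOCK_SIZE];
+
+                    desc->tfm = tfm;
+                    err = crypto_shash_init(desc) ?:
+                          crypto_shash_update(desc, (const u8 *)&ctr, 4) ?:
+                          crypto_shash_update(desc, label, label_len) ?:
+                          crypto_shash_update(desc, &zero, 1) ?:
+                          crypto_shash_update(desc, context, context_len) ?:
+                          crypto_shash_update(desc, (const u8 *)&L, 4) ?:
+                          crypto_shash_final(desc, block);
+                    if (!err)
+                            memcpy(out + i, block,
+                                   min_t(unsigned int, out_len - i,
+                                         AES_BLOCK_SIZE));
+            }
+            crypto_free_shash(tfm);
+            return err;
+    }
+
+In real hardware the KDF runs inside the secure environment; a software
+reimplementation like this is only useful for validating the outputs produced
+for imported (known) keys.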
+ +Kernel support +-------------- + +The inline encryption support of the kernel's block layer ("blk-crypto") has +been extended to support hardware-wrapped keys as an alternative to standard +keys, when hardware support is available. This works in the following way: + +- A ``key_types_supported`` field is added to the crypto capabilities in + ``struct blk_crypto_profile``. This allows device drivers to declare that + they support standard keys, hardware-wrapped keys, or both. + +- ``struct blk_crypto_key`` can now contain a hardware-wrapped key as an + alternative to a standard key; a ``key_type`` field is added to + ``struct blk_crypto_config`` to distinguish between the different key types. + This allows users of blk-crypto to en/decrypt data using a hardware-wrapped + key in a way very similar to using a standard key. + +- A new method ``blk_crypto_ll_ops::derive_sw_secret`` is added. Device drivers + that support hardware-wrapped keys must implement this method. Users of + blk-crypto can call ``blk_crypto_derive_sw_secret()`` to access this method. + +- The programming and eviction of hardware-wrapped keys happens via + ``blk_crypto_ll_ops::keyslot_program`` and + ``blk_crypto_ll_ops::keyslot_evict``, just like it does for standard keys. If + a driver supports hardware-wrapped keys, then it must handle hardware-wrapped + keys being passed to these methods. + +blk-crypto-fallback doesn't support hardware-wrapped keys. Therefore, +hardware-wrapped keys can only be used with actual inline encryption hardware. + +Currently, the kernel only works with hardware-wrapped keys in +ephemerally-wrapped form. No generic kernel interfaces are provided for +generating or importing hardware-wrapped keys in the first place, or converting +them to ephemerally-wrapped form. In Android, SoC vendors are required to +support these operations in their KeyMint implementation (a hardware abstraction +layer in userspace); for details, see the `Android documentation +`_. + +Testability +----------- + +Both the hardware KDF and the inline encryption itself are well-defined +algorithms that don't depend on any secrets other than the unwrapped key. +Therefore, if the unwrapped key is known to software, these algorithms can be +reproduced in software in order to verify the ciphertext that is written to disk +by the inline encryption hardware. + +However, the unwrapped key will only be known to software for testing if the +"import" functionality is used. Proper testing is not possible in the +"generate" case where the hardware generates the key itself. The correct +operation of the "generate" mode thus relies on the security and correctness of +the hardware RNG and its use to generate the key, as well as the testing of the +"import" mode as that should cover all parts other than the key generation. + +For an example of a test that verifies the ciphertext written to disk in the +"import" mode, see the fscrypt hardware-wrapped key tests in xfstests, or +`Android's vts_kernel_encryption_test +`_. diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 21dc03bc10a4..e66916a483cd 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -4,39 +4,76 @@ The Kernel Address Sanitizer (KASAN) Overview -------- -KernelAddressSANitizer (KASAN) is a dynamic memory safety error detector -designed to find out-of-bound and use-after-free bugs. 
KASAN has three modes: +Kernel Address Sanitizer (KASAN) is a dynamic memory safety error detector +designed to find out-of-bounds and use-after-free bugs. -1. generic KASAN (similar to userspace ASan), -2. software tag-based KASAN (similar to userspace HWASan), -3. hardware tag-based KASAN (based on hardware memory tagging). +KASAN has three modes: -Generic KASAN is mainly used for debugging due to a large memory overhead. -Software tag-based KASAN can be used for dogfood testing as it has a lower -memory overhead that allows using it with real workloads. Hardware tag-based -KASAN comes with low memory and performance overheads and, therefore, can be -used in production. Either as an in-field memory bug detector or as a security -mitigation. +1. Generic KASAN +2. Software Tag-Based KASAN +3. Hardware Tag-Based KASAN -Software KASAN modes (#1 and #2) use compile-time instrumentation to insert -validity checks before every memory access and, therefore, require a compiler -version that supports that. +Generic KASAN, enabled with CONFIG_KASAN_GENERIC, is the mode intended for +debugging, similar to userspace ASan. This mode is supported on many CPU +architectures, but it has significant performance and memory overheads. -Generic KASAN is supported in GCC and Clang. With GCC, it requires version -8.3.0 or later. Any supported Clang version is compatible, but detection of -out-of-bounds accesses for global variables is only supported since Clang 11. +Software Tag-Based KASAN or SW_TAGS KASAN, enabled with CONFIG_KASAN_SW_TAGS, +can be used for both debugging and dogfood testing, similar to userspace HWASan. +This mode is only supported for arm64, but its moderate memory overhead allows +using it for testing on memory-restricted devices with real workloads. -Software tag-based KASAN mode is only supported in Clang. +Hardware Tag-Based KASAN or HW_TAGS KASAN, enabled with CONFIG_KASAN_HW_TAGS, +is the mode intended to be used as an in-field memory bug detector or as a +security mitigation. This mode only works on arm64 CPUs that support MTE +(Memory Tagging Extension), but it has low memory and performance overheads and +thus can be used in production. -The hardware KASAN mode (#3) relies on hardware to perform the checks but -still requires a compiler version that supports memory tagging instructions. -This mode is supported in GCC 10+ and Clang 11+. +For details about the memory and performance impact of each KASAN mode, see the +descriptions of the corresponding Kconfig options. -Both software KASAN modes work with SLUB and SLAB memory allocators, -while the hardware tag-based KASAN currently only supports SLUB. +The Generic and the Software Tag-Based modes are commonly referred to as the +software modes. The Software Tag-Based and the Hardware Tag-Based modes are +referred to as the tag-based modes. -Currently, generic KASAN is supported for the x86_64, arm, arm64, xtensa, s390, -and riscv architectures, and tag-based KASAN modes are supported only for arm64. +Support +------- + +Architectures +~~~~~~~~~~~~~ + +Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, and +xtensa, and the tag-based KASAN modes are supported only on arm64. + +Compilers +~~~~~~~~~ + +Software KASAN modes use compile-time instrumentation to insert validity checks +before every memory access and thus require a compiler version that provides +support for that. 
The Hardware Tag-Based mode relies on hardware to perform +these checks but still requires a compiler version that supports the memory +tagging instructions. + +Generic KASAN requires GCC version 8.3.0 or later +or any Clang version supported by the kernel. + +Software Tag-Based KASAN requires GCC 11+ +or any Clang version supported by the kernel. + +Hardware Tag-Based KASAN requires GCC 10+ or Clang 12+. + +Memory types +~~~~~~~~~~~~ + +Generic KASAN supports finding bugs in all of slab, page_alloc, vmap, vmalloc, +stack, and global memory. + +Software Tag-Based KASAN supports slab, page_alloc, vmalloc, and stack memory. + +Hardware Tag-Based KASAN supports slab, page_alloc, and non-executable vmalloc +memory. + +For slab, both software KASAN modes support SLUB and SLAB allocators, while +Hardware Tag-Based KASAN only supports SLUB. Usage ----- @@ -45,18 +82,81 @@ To enable KASAN, configure the kernel with:: CONFIG_KASAN=y -and choose between ``CONFIG_KASAN_GENERIC`` (to enable generic KASAN), -``CONFIG_KASAN_SW_TAGS`` (to enable software tag-based KASAN), and -``CONFIG_KASAN_HW_TAGS`` (to enable hardware tag-based KASAN). +and choose between ``CONFIG_KASAN_GENERIC`` (to enable Generic KASAN), +``CONFIG_KASAN_SW_TAGS`` (to enable Software Tag-Based KASAN), and +``CONFIG_KASAN_HW_TAGS`` (to enable Hardware Tag-Based KASAN). -For software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and +For the software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and ``CONFIG_KASAN_INLINE``. Outline and inline are compiler instrumentation types. -The former produces a smaller binary while the latter is 1.1-2 times faster. +The former produces a smaller binary while the latter is up to 2 times faster. To include alloc and free stack traces of affected slab objects into reports, enable ``CONFIG_STACKTRACE``. To include alloc and free stack traces of affected physical pages, enable ``CONFIG_PAGE_OWNER`` and boot with ``page_owner=on``. +Boot parameters +~~~~~~~~~~~~~~~ + +KASAN is affected by the generic ``panic_on_warn`` command line parameter. +When it is enabled, KASAN panics the kernel after printing a bug report. + +By default, KASAN prints a bug report only for the first invalid memory access. +With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This +effectively disables ``panic_on_warn`` for KASAN reports. + +Alternatively, independent of ``panic_on_warn``, the ``kasan.fault=`` boot +parameter can be used to control panic and reporting behaviour: + +- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN + report or also panic the kernel (default: ``report``). The panic happens even + if ``kasan_multi_shot`` is enabled. + +Software and Hardware Tag-Based KASAN modes (see the section about various +modes below) support altering stack trace collection behavior: + +- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack + traces collection (default: ``on``). +- ``kasan.stack_ring_size=`` specifies the number of entries + in the stack ring (default: ``32768``). + +Hardware Tag-Based KASAN mode is intended for use in production as a security +mitigation. Therefore, it supports additional boot parameters that allow +disabling KASAN altogether or controlling its features: + +- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). + +- ``kasan.mode=sync``, ``=async`` or ``=asymm`` controls whether KASAN + is configured in synchronous, asynchronous or asymmetric mode of + execution (default: ``sync``). 
+ Synchronous mode: a bad access is detected immediately when a tag + check fault occurs. + Asynchronous mode: a bad access detection is delayed. When a tag check + fault occurs, the information is stored in hardware (in the TFSR_EL1 + register for arm64). The kernel periodically checks the hardware and + only reports tag faults during these checks. + Asymmetric mode: a bad access is detected synchronously on reads and + asynchronously on writes. + +- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc + allocations (default: ``on``). + +- ``kasan.page_alloc.sample=`` makes KASAN tag only every + Nth page_alloc allocation with the order equal or greater than + ``kasan.page_alloc.sample.order``, where N is the value of the ``sample`` + parameter (default: ``1``, or tag every such allocation). + This parameter is intended to mitigate the performance overhead introduced + by KASAN. + Note that enabling this parameter makes Hardware Tag-Based KASAN skip checks + of allocations chosen by sampling and thus miss bad accesses to these + allocations. Use the default value for accurate bug detection. + +- ``kasan.page_alloc.sample.order=`` specifies the minimum + order of allocations that are affected by sampling (default: ``3``). + Only applies when ``kasan.page_alloc.sample`` is set to a value greater + than ``1``. + This parameter is intended to allow sampling only large page_alloc + allocations, which is the biggest source of the performance overhead. + Error reports ~~~~~~~~~~~~~ @@ -146,7 +246,7 @@ is either 8 or 16 aligned bytes depending on KASAN mode. Each number in the memory state section of the report shows the state of one of the memory granules that surround the accessed address. -For generic KASAN, the size of each memory granule is 8. The state of each +For Generic KASAN, the size of each memory granule is 8. The state of each granule is encoded in one shadow byte. Those 8 bytes can be accessible, partially accessible, freed, or be a part of a redzone. KASAN uses the following encoding for each shadow byte: 00 means that all 8 bytes of the corresponding @@ -171,41 +271,6 @@ traces point to places in code that interacted with the object but that are not directly present in the bad access stack trace. Currently, this includes call_rcu() and workqueue queuing. -Boot parameters -~~~~~~~~~~~~~~~ - -KASAN is affected by the generic ``panic_on_warn`` command line parameter. -When it is enabled, KASAN panics the kernel after printing a bug report. - -By default, KASAN prints a bug report only for the first invalid memory access. -With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This -effectively disables ``panic_on_warn`` for KASAN reports. - -Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot -parameter can be used to control panic and reporting behaviour: - -- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN - report or also panic the kernel (default: ``report``). The panic happens even - if ``kasan_multi_shot`` is enabled. - -Hardware tag-based KASAN mode (see the section about various modes below) is -intended for use in production as a security mitigation. Therefore, it supports -additional boot parameters that allow disabling KASAN or controlling features: - -- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). - -- ``kasan.mode=sync`` or ``=async`` controls whether KASAN is configured in - synchronous or asynchronous mode of execution (default: ``sync``). 
- Synchronous mode: a bad access is detected immediately when a tag - check fault occurs. - Asynchronous mode: a bad access detection is delayed. When a tag check - fault occurs, the information is stored in hardware (in the TFSR_EL1 - register for arm64). The kernel periodically checks the hardware and - only reports tag faults during these checks. - -- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack - traces collection (default: ``on``). - Implementation details ---------------------- @@ -244,49 +309,46 @@ outline-instrumented kernel. Generic KASAN is the only mode that delays the reuse of freed objects via quarantine (see mm/kasan/quarantine.c for implementation). -Software tag-based KASAN +Software Tag-Based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ -Software tag-based KASAN uses a software memory tagging approach to checking +Software Tag-Based KASAN uses a software memory tagging approach to checking access validity. It is currently only implemented for the arm64 architecture. -Software tag-based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs +Software Tag-Based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs to store a pointer tag in the top byte of kernel pointers. It uses shadow memory to store memory tags associated with each 16-byte memory cell (therefore, it dedicates 1/16th of the kernel memory for shadow memory). -On each memory allocation, software tag-based KASAN generates a random tag, tags +On each memory allocation, Software Tag-Based KASAN generates a random tag, tags the allocated memory with this tag, and embeds the same tag into the returned pointer. -Software tag-based KASAN uses compile-time instrumentation to insert checks +Software Tag-Based KASAN uses compile-time instrumentation to insert checks before each memory access. These checks make sure that the tag of the memory that is being accessed is equal to the tag of the pointer that is used to access -this memory. In case of a tag mismatch, software tag-based KASAN prints a bug +this memory. In case of a tag mismatch, Software Tag-Based KASAN prints a bug report. -Software tag-based KASAN also has two instrumentation modes (outline, which +Software Tag-Based KASAN also has two instrumentation modes (outline, which emits callbacks to check memory accesses; and inline, which performs the shadow memory checks inline). With outline instrumentation mode, a bug report is printed from the function that performs the access check. With inline instrumentation, a ``brk`` instruction is emitted by the compiler, and a dedicated ``brk`` handler is used to print bug reports. -Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through +Software Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Software tag-based KASAN currently only supports tagging of slab and page_alloc -memory. - -Hardware tag-based KASAN +Hardware Tag-Based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ -Hardware tag-based KASAN is similar to the software mode in concept but uses +Hardware Tag-Based KASAN is similar to the software mode in concept but uses hardware memory tagging support instead of compiler instrumentation and shadow memory. 
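+To illustrate the tagging concept shared by both tag-based modes, here is a
+hedged, userspace-style C sketch of how a tag in the top byte of a pointer is
+assigned and checked. The constants mirror the semantics described in this
+document; real KASAN performs these steps in the kernel, in software for the
+SW_TAGS mode and in MTE hardware for the HW_TAGS mode::
+
+    #include <stdint.h>
+
+    #define TAG_SHIFT   56      /* tag lives in bits 56..63 (top byte, TBI) */
+    #define TAG_KERNEL  0xffu   /* match-all tag: accesses are not checked */
+
+    static uint64_t set_tag(uint64_t addr, uint8_t tag)
+    {
+            return (addr & ~(0xffULL << TAG_SHIFT)) |
+                   ((uint64_t)tag << TAG_SHIFT);
+    }
+
+    static uint8_t get_tag(uint64_t addr)
+    {
+            return addr >> TAG_SHIFT;
+    }
+
+    /* The check done before (SW_TAGS) or during (HW_TAGS) each access. */
+    static int access_is_valid(uint64_t ptr, uint8_t memory_tag)
+    {
+            uint8_t tag = get_tag(ptr);
+
+            return tag == TAG_KERNEL || tag == memory_tag;
+    }
+
+On allocation, a random tag is generated, recorded for the allocated memory
+granules, and embedded into the returned pointer much like ``set_tag()`` above.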
-Hardware tag-based KASAN is currently only implemented for arm64 architecture +Hardware Tag-Based KASAN is currently only implemented for arm64 architecture and based on both arm64 Memory Tagging Extension (MTE) introduced in ARMv8.5 Instruction Set Architecture and Top Byte Ignore (TBI). @@ -296,26 +358,25 @@ access, hardware makes sure that the tag of the memory that is being accessed is equal to the tag of the pointer that is used to access this memory. In case of a tag mismatch, a fault is generated, and a report is printed. -Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through +Hardware Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Hardware tag-based KASAN currently only supports tagging of slab and page_alloc -memory. - -If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN +If the hardware does not support MTE (pre ARMv8.5), Hardware Tag-Based KASAN will not be enabled. In this case, all KASAN boot parameters are ignored. Note that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being enabled. Even when ``kasan.mode=off`` is provided or when the hardware does not support MTE (but supports TBI). -Hardware tag-based KASAN only reports the first found bug. After that, MTE tag +Hardware Tag-Based KASAN only reports the first found bug. After that, MTE tag checking gets disabled. Shadow memory ------------- +The contents of this section are only applicable to software KASAN modes. + The kernel maps memory in several different parts of the address space. The range of kernel virtual addresses is large: there is not enough real memory to support a real shadow region for every address that could be @@ -346,7 +407,7 @@ CONFIG_KASAN_VMALLOC With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the cost of greater memory usage. Currently, this is supported on x86, -riscv, s390, and powerpc. +arm64, riscv, s390, and powerpc. This works by hooking into vmalloc and vmap and dynamically allocating real shadow memory to back the mappings. @@ -406,19 +467,18 @@ generic ``noinstr`` one. Note that disabling compiler instrumentation (either on a per-file or a per-function basis) makes KASAN ignore the accesses that happen directly in that code for software KASAN modes. It does not help when the accesses happen -indirectly (through calls to instrumented functions) or with the hardware -tag-based mode that does not use compiler instrumentation. +indirectly (through calls to instrumented functions) or with Hardware +Tag-Based KASAN, which does not use compiler instrumentation. For software KASAN modes, to disable KASAN reports in a part of the kernel code for the current task, annotate this part of the code with a ``kasan_disable_current()``/``kasan_enable_current()`` section. This also disables the reports for indirect accesses that happen through function calls. -For tag-based KASAN modes (include the hardware one), to disable access -checking, use ``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that -temporarily disabling access checking via ``page_kasan_tag_reset()`` requires -saving and restoring the per-page KASAN tag via -``page_kasan_tag``/``page_kasan_tag_set``. +For tag-based KASAN modes, to disable access checking, use +``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. 
Note that temporarily
+disabling access checking via ``page_kasan_tag_reset()`` requires saving and
+restoring the per-page KASAN tag via ``page_kasan_tag``/``page_kasan_tag_set``.

Tests
~~~~~

diff --git a/Documentation/device-mapper/dm-bow.txt b/Documentation/device-mapper/dm-bow.txt
new file mode 100644
index 000000000000..e3fc4d22e0f4
--- /dev/null
+++ b/Documentation/device-mapper/dm-bow.txt
@@ -0,0 +1,99 @@
+dm_bow (backup on write)
+========================
+
+dm_bow is a device mapper driver that uses the free space on a device to back up
+data that is overwritten. The changes can then be committed by a simple state
+change, or rolled back by removing the dm_bow device and running a command line
+utility over the underlying device.
+
+dm_bow has three states, set by writing ‘1’ or ‘2’ to /sys/block/dm-?/bow/state.
+It is only possible to go from state 0 (initial state) to state 1, and then from
+state 1 to state 2.
+
+State 0: dm_bow collects all trims to the device and assumes that these mark
+free space on the overlying file system that can be safely used. Typically the
+mount code would create the dm_bow device, mount the file system, call the
+FITRIM ioctl on the file system, then switch to state 1. These trims are not
+propagated to the underlying device.
+
+State 1: All writes to the device cause the underlying data to be backed up to
+the free (trimmed) area as needed in such a way that they can be restored.
+However, the writes, with one exception, then happen exactly as they would
+without dm_bow, so the device is always in a good final state. The exception is
+that sector 0 is used to keep a log of the latest changes, both to indicate that
+we are in this state and to allow rollback. See below for all details. If there
+isn't enough free space, writes are failed with -ENOSPC.
+
+State 2: The transition to state 2 triggers replacing the special sector 0 with
+the normal sector 0, and the freeing of all state information. dm_bow then
+becomes a pass-through driver, allowing the device to continue to be used with
+minimal performance impact.
+
+Usage
+=====
+dm-bow takes one command line parameter, the name of the underlying device.
+
+dm-bow will typically be used in the following way. dm-bow will be loaded with a
+suitable underlying device and the resultant device will be mounted. A file
+system trim will be issued via the FITRIM ioctl, then the device will be
+switched to state 1. The file system will now be used as normal. At some point,
+the changes can either be committed by switching to state 2, or rolled back by
+unmounting the file system, removing the dm-bow device and running the command
+line utility. Note that rebooting the device will be equivalent to unmounting
+and removing, but the command line utility must still be run.
+
+Details of operation in state 1
+===============================
+
+dm_bow maintains a type for all sectors. A sector can be any of:
+
+SECTOR0
+SECTOR0_CURRENT
+UNCHANGED
+FREE
+CHANGED
+BACKUP
+
+SECTOR0 is the first sector on the device, and is used to hold the log of
+changes. This is the one exception.
+
+SECTOR0_CURRENT is a sector picked from the FREE sectors, and is where reads and
+writes from the true sector zero are redirected to. Note that like any backup
+sector, if the sector is written to directly, it must be moved again.
+
+UNCHANGED means that the sector has not been changed since we entered state 1.
+Thus if it is written to or trimmed, the contents must first be backed up.
+
+FREE means that the sector was trimmed in state 0 and has not yet been written
+to or used for backup. On being written to, a FREE sector is changed to CHANGED.
+
+CHANGED means that the sector has been modified, and can be further modified
+without further backup.
+
+BACKUP means that this is a free sector being used as a backup. On being written
+to, the contents must first be backed up again.
+
+All backup operations are logged to the first sector. The log sector has the
+format:
+--------------------------------------------------------
+| Magic | Count | Sequence | Log entry | Log entry | …
+--------------------------------------------------------
+
+Magic is a magic number. Count is the number of log entries. Sequence is 0
+initially. A log entry is
+
+-----------------------------------
+| Source | Dest | Size | Checksum |
+-----------------------------------
+
+When SECTOR0 is full, the log sector is backed up and another empty log sector
+is created with a sequence number one higher. The first entry in any log sector
+with sequence > 0 therefore must be the log of the backing up of the previous
+log sector. Note that sequence is not strictly needed, but is a useful sanity
+check and potentially limits the time spent trying to restore a corrupted
+snapshot.
+
+On entering state 1, dm_bow has a list of free sectors. All other sectors are
+unchanged. Sector0_current is selected from the free sectors and the contents of
+sector 0 are copied there. Sector 0 is then backed up, which triggers the first
+log entry to be written.
+
diff --git a/Documentation/devicetree/bindings/misc/qemu,vcpu-stall-detector.yaml b/Documentation/devicetree/bindings/misc/qemu,vcpu-stall-detector.yaml
new file mode 100644
index 000000000000..1aebeb696ee0
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/qemu,vcpu-stall-detector.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/misc/qemu,vcpu-stall-detector.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: VCPU stall detector
+
+description:
+  This binding describes a CPU stall detector mechanism for virtual CPUs
+  which is accessed through MMIO.
+
+maintainers:
+  - Sebastian Ene
+
+properties:
+  compatible:
+    enum:
+      - qemu,vcpu-stall-detector
+
+  reg:
+    maxItems: 1
+
+  clock-frequency:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: |
+      The internal clock of the stall detector peripheral, measured in Hz,
+      used to decrement its internal counter register on each tick.
+      Defaults to 10 if unset.
+    default: 10
+
+  timeout-sec:
+    description: |
+      The stall detector expiration timeout measured in seconds.
+      Defaults to 8 if unset. Please note that it also takes into account the
+      time spent while the VCPU is not running.
+    default: 8
+
+required:
+  - compatible
+
+additionalProperties: false
+
+examples:
+  - |
+    vmwdt@9030000 {
+      compatible = "qemu,vcpu-stall-detector";
+      reg = <0x9030000 0x10000>;
+      clock-frequency = <10>;
+      timeout-sec = <8>;
+    };
diff --git a/Documentation/devicetree/bindings/mmc/mtk-sd.yaml b/Documentation/devicetree/bindings/mmc/mtk-sd.yaml
index e866e985549e..82768a807294 100644
--- a/Documentation/devicetree/bindings/mmc/mtk-sd.yaml
+++ b/Documentation/devicetree/bindings/mmc/mtk-sd.yaml
@@ -119,6 +119,18 @@ properties:
If present, HS400 command responses are sampled on rising edges.
If not present, HS400 command responses are sampled on falling edges.
+  mediatek,hs400-ds-dly3:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Gear of the third delay line for DS for input data latch in the data
+      pad macro; there are 32 stages, from 0 to 31.
+      The delay of one step differs between corner ICs; it is about 100ps.
+      The value is confirmed by doing a scan and calibration to find the best
+      value for a given corner IC, and it is valid only for HS400 mode.
+    minimum: 0
+    maximum: 31
+  mediatek,latch-ck:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
diff --git a/Documentation/devicetree/bindings/perf/arm,cmn.yaml b/Documentation/devicetree/bindings/perf/arm,cmn.yaml
index 42424ccbdd0c..2d4219ec7eda 100644
--- a/Documentation/devicetree/bindings/perf/arm,cmn.yaml
+++ b/Documentation/devicetree/bindings/perf/arm,cmn.yaml
@@ -12,12 +12,14 @@ maintainers:
properties:
  compatible:
-    const: arm,cmn-600
+    enum:
+      - arm,cmn-600
+      - arm,ci-700
  reg:
    items:
      - description: Physical address of the base (PERIPHBASE) and
-          size (up to 64MB) of the configuration address space.
+          size of the configuration address space.
  interrupts:
    minItems: 1
@@ -31,14 +33,23 @@ properties:
  arm,root-node:
    $ref: /schemas/types.yaml#/definitions/uint32
-    description: Offset from PERIPHBASE of the configuration
-      discovery node (see TRM definition of ROOTNODEBASE).
+    description: Offset from PERIPHBASE of CMN-600's configuration
+      discovery node (see TRM definition of ROOTNODEBASE). Not
+      relevant for newer CMN/CI products.
required:
  - compatible
  - reg
  - interrupts
-  - arm,root-node
+
+if:
+  properties:
+    compatible:
+      contains:
+        const: arm,cmn-600
+then:
+  required:
+    - arm,root-node
additionalProperties: false
diff --git a/Documentation/devicetree/bindings/reserved-memory/google,open-dice.yaml b/Documentation/devicetree/bindings/reserved-memory/google,open-dice.yaml
new file mode 100644
index 000000000000..257a0b51994a
--- /dev/null
+++ b/Documentation/devicetree/bindings/reserved-memory/google,open-dice.yaml
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/reserved-memory/google,open-dice.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Open Profile for DICE Device Tree Bindings
+
+description: |
+  This binding represents a reserved memory region containing data
+  generated by the Open Profile for DICE protocol.
+ + See https://pigweed.googlesource.com/open-dice/ + +maintainers: + - David Brazdil + +allOf: + - $ref: "reserved-memory.yaml" + +properties: + compatible: + const: google,open-dice + + reg: + description: page-aligned region of memory containing DICE data + +required: + - compatible + - reg + - no-map + +unevaluatedProperties: false + +examples: + - | + reserved-memory { + #address-cells = <2>; + #size-cells = <1>; + + dice: dice@12340000 { + compatible = "google,open-dice"; + reg = <0x00 0x12340000 0x2000>; + no-map; + }; + }; diff --git a/Documentation/filesystems/OWNERS b/Documentation/filesystems/OWNERS new file mode 100644 index 000000000000..a63dbf4659f2 --- /dev/null +++ b/Documentation/filesystems/OWNERS @@ -0,0 +1 @@ +per-file f2fs**=file:/fs/f2fs/OWNERS diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index b97579b7d8fb..7119aa213be7 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios: immutable and bit-for-bit identical to the official golden image for their releases due to security and other considerations and - - hope to save some extra storage space with guaranteed end-to-end performance - by using reduced metadata and transparent file compression, especially - for those embedded devices with limited memory (ex, smartphone); + - hope to minimize extra storage space with guaranteed end-to-end performance + by using compact layout, transparent file compression and direct access, + especially for those embedded devices with limited memory and high-density + hosts with numerous containers; Here is the main features of EROFS: @@ -51,7 +52,9 @@ Here is the main features of EROFS: - Support POSIX.1e ACLs by using xattrs; - Support transparent data compression as an option: - LZ4 algorithm with the fixed-sized output compression for high performance. + LZ4 algorithm with the fixed-sized output compression for high performance; + + - Multiple device support for multi-layer container images. The following git tree provides the file system user-space tools under development (ex, formatting tool mkfs.erofs): @@ -87,8 +90,17 @@ cache_strategy=%s Select a strategy for cached decompression from now on: dax={always,never} Use direct access (no page cache). See Documentation/filesystems/dax.rst. dax A legacy option which is an alias for ``dax=always``. +device=%s Specify a path to an extra device to be used together. =================== ========================================================= +Sysfs Entries +============= + +Information about mounted erofs file systems can be found in /sys/fs/erofs. +Each mounted filesystem will have a directory in /sys/fs/erofs based on its +device name (i.e., /sys/fs/erofs/sda). +(see also Documentation/ABI/testing/sysfs-fs-erofs) + On-disk details =============== diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 7fe50b0bccde..b725d8190bef 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -25,10 +25,14 @@ a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs). 
- git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git

-For reporting bugs and sending patches, please use the following mailing list:
+For sending patches, please use the following mailing list:

- linux-f2fs-devel@lists.sourceforge.net

+For reporting bugs, please use the following f2fs bug tracker link:
+
+- https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs
+
Background and Design issues
============================

@@ -154,6 +158,8 @@
nobarrier This option can be used if underlying storage guarantees
 If this option is set, no cache_flush commands are issued but f2fs still
 guarantees the write ordering of all the data writes.
+barrier If this option is set, cache_flush commands are allowed to be
+ issued.
fastboot This option is used when a system wants to reduce mount time
 as much as possible, even though normal performance can be sacrificed.

@@ -198,10 +204,30 @@
fault_type=%d Support configuring fault injection type, should be
 FAULT_WRITE_IO 0x000004000
 FAULT_SLAB_ALLOC 0x000008000
 FAULT_DQUOT_INIT 0x000010000
+ FAULT_LOCK_OP 0x000020000
+ FAULT_BLKADDR 0x000040000
 =================== ===========
mode=%s Control block allocation mode which supports "adaptive"
 and "lfs". In "lfs" mode, there should be no random writes towards main area.
+ "fragment:segment" and "fragment:block" are newly added here.
+ These are developer options for experiments to simulate filesystem
+ fragmentation/after-GC situations. The developers use these
+ modes to understand filesystem fragmentation/after-GC conditions well,
+ and eventually get some insights to handle them better.
+ In "fragment:segment", f2fs allocates a new segment in random
+ position. With this, we can simulate the after-GC condition.
+ In "fragment:block", we can scatter block allocation with
+ "max_fragment_chunk" and "max_fragment_hole" sysfs nodes.
+ We added some randomness to both chunk and hole size to make
+ it close to realistic IO pattern. So, in this mode, f2fs will allocate
+ 1..<max_fragment_chunk> blocks in a chunk and make a hole in the
+ length of 1..<max_fragment_hole> by turns. With this, the newly
+ allocated blocks will be scattered throughout the whole partition.
+ Note that "fragment:block" implicitly enables "fragment:segment"
+ option for more randomness.
+ Please use these options for your experiments and we strongly
+ recommend re-formatting the filesystem after using these options.
io_bits=%u Set the bit size of write IO requests. It should be set
 with "mode=lfs".
usrquota Enable plain user disk quota accounting.

@@ -216,12 +242,6 @@
offgrpjquota Turn off group journalled quota.
offprjjquota Turn off project journalled quota.
quota Enable plain user disk quota accounting.
noquota Disable all plain disk quota option.
-whint_mode=%s Control which write hints are passed down to block
- layer. This supports "off", "user-based", and
- "fs-based". In "off" mode (default), f2fs does not pass
- down hints. In "user-based" mode, f2fs tries to pass
- down hints given by users. And in "fs-based" mode, f2fs
- passes down hints with its policy.
alloc_mode=%s Adjust block allocation policy, which supports "reuse"
 and "default".
fsync_mode=%s Control the policy of fsync. Currently supports "posix",

@@ -323,6 +343,15 @@
discard_unit=%s Control discard unit, the argument can be "block", "segment"
 default, it is helpful for large sized SMR or ZNS devices to
 reduce memory cost by getting rid of fs metadata supports small discard.
+memory=%s Control memory mode. This supports "normal" and "low" modes.
+ "low" mode is introduced to support low memory devices. + Because of the nature of low memory devices, in this mode, f2fs + will try to save memory sometimes by sacrificing performance. + "normal" mode is the default mode and same as before. +age_extent_cache Enable an age extent cache based on rb-tree. It records + data block update frequency of the extent per inode, in + order to provide better temperature hints for data block + allocation. ======================== ============================================================ Debugfs Entries @@ -732,70 +761,6 @@ In order to identify whether the data in the victim segment are valid or not, F2FS manages a bitmap. Each bit represents the validity of a block, and the bitmap is composed of a bit stream covering whole blocks in main area. -Write-hint Policy ------------------ - -1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - -2) whint_mode=user-based. F2FS tries to pass down hints given by -users. - -===================== ======================== =================== -User F2FS Block -===================== ======================== =================== -N/A META WRITE_LIFE_NOT_SET -N/A HOT_NODE " -N/A WARM_NODE " -N/A COLD_NODE " -ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME -extension list " " - --- buffered io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " " -WRITE_LIFE_MEDIUM " " -WRITE_LIFE_LONG " " - --- direct io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " WRITE_LIFE_NONE -WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM -WRITE_LIFE_LONG " WRITE_LIFE_LONG -===================== ======================== =================== - -3) whint_mode=fs-based. F2FS passes down hints with its policy. - -===================== ======================== =================== -User F2FS Block -===================== ======================== =================== -N/A META WRITE_LIFE_MEDIUM; -N/A HOT_NODE WRITE_LIFE_NOT_SET -N/A WARM_NODE " -N/A COLD_NODE WRITE_LIFE_NONE -ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME -extension list " " - --- buffered io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG -WRITE_LIFE_NONE " " -WRITE_LIFE_MEDIUM " " -WRITE_LIFE_LONG " " - --- direct io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " WRITE_LIFE_NONE -WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM -WRITE_LIFE_LONG " WRITE_LIFE_LONG -===================== ======================== =================== - Fallocate(2) Policy ------------------- diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 7940a45d3952..85e221f7d240 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -77,11 +77,11 @@ Side-channel attacks fscrypt is only resistant to side-channel attacks, such as timing or electromagnetic attacks, to the extent that the underlying Linux -Cryptographic API algorithms are. If a vulnerable algorithm is used, -such as a table-based implementation of AES, it may be possible for an -attacker to mount a side channel attack against the online system. -Side channel attacks may also be mounted against applications -consuming decrypted data. 
+Cryptographic API algorithms or inline encryption hardware are. If a +vulnerable algorithm is used, such as a table-based implementation of +AES, it may be possible for an attacker to mount a side channel attack +against the online system. Side channel attacks may also be mounted +against applications consuming decrypted data. Unauthorized file access ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -337,6 +337,8 @@ Currently, the following pairs of encryption modes are supported: - AES-256-XTS for contents and AES-256-CTS-CBC for filenames - AES-128-CBC for contents and AES-128-CTS-CBC for filenames - Adiantum for both contents and filenames +- AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only) +- SM4-XTS for contents and SM4-CTS-CBC for filenames (v2 policies only) If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair. @@ -357,6 +359,23 @@ To use Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast implementations of ChaCha and NHPoly1305 should be enabled, e.g. CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM. +AES-256-HCTR2 is another true wide-block encryption mode that is intended for +use on CPUs with dedicated crypto instructions. AES-256-HCTR2 has the property +that a bitflip in the plaintext changes the entire ciphertext. This property +makes it desirable for filename encryption since initialization vectors are +reused within a directory. For more details on AES-256-HCTR2, see the paper +"Length-preserving encryption with HCTR2" +(https://eprint.iacr.org/2021/1441.pdf). To use AES-256-HCTR2, +CONFIG_CRYPTO_HCTR2 must be enabled. Also, fast implementations of XCTR and +POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and +CRYPTO_AES_ARM64_CE_BLK for ARM64. + +SM4 is a Chinese block cipher that is an alternative to AES. It has +not seen as much security review as AES, and it only has a 128-bit key +size. It may be useful in cases where its use is mandated. +Otherwise, it should not be used. For SM4 support to be available, it +also needs to be enabled in the kernel crypto API. + New encryption modes can be added relatively easily, without changes to individual filesystems. However, authenticated encryption (AE) modes are not currently supported because of the difficulty of dealing @@ -404,11 +423,11 @@ alternatively has the file's nonce (for `DIRECT_KEY policies`_) or inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs. Thus, IV reuse is limited to within a single directory. -With CTS-CBC, the IV reuse means that when the plaintext filenames -share a common prefix at least as long as the cipher block size (16 -bytes for AES), the corresponding encrypted filenames will also share -a common prefix. This is undesirable. Adiantum does not have this -weakness, as it is a wide-block encryption mode. +With CTS-CBC, the IV reuse means that when the plaintext filenames share a +common prefix at least as long as the cipher block size (16 bytes for AES), the +corresponding encrypted filenames will also share a common prefix. This is +undesirable. Adiantum and HCTR2 do not have this weakness, as they are +wide-block encryption modes. All supported filenames encryption modes accept any plaintext length >= 16 bytes; cipher block alignment is not required. However, @@ -1047,8 +1066,8 @@ astute users may notice some differences in behavior: may be used to overwrite the source files but isn't guaranteed to be effective on all filesystems and storage devices. -- Direct I/O is not supported on encrypted files. 
Attempts to use - direct I/O on such files will fall back to buffered I/O. +- Direct I/O is supported on encrypted files only under some + circumstances. For details, see `Direct I/O support`_. - The fallocate operations FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE are not supported on encrypted files and will @@ -1135,6 +1154,71 @@ where applications may later write sensitive data. It is recommended that systems implementing a form of "verified boot" take advantage of this by validating all top-level encryption policies prior to access. +Inline encryption support +========================= + +By default, fscrypt uses the kernel crypto API for all cryptographic +operations (other than HKDF, which fscrypt partially implements +itself). The kernel crypto API supports hardware crypto accelerators, +but only ones that work in the traditional way where all inputs and +outputs (e.g. plaintexts and ciphertexts) are in memory. fscrypt can +take advantage of such hardware, but the traditional acceleration +model isn't particularly efficient and fscrypt hasn't been optimized +for it. + +Instead, many newer systems (especially mobile SoCs) have *inline +encryption hardware* that can encrypt/decrypt data while it is on its +way to/from the storage device. Linux supports inline encryption +through a set of extensions to the block layer called *blk-crypto*. +blk-crypto allows filesystems to attach encryption contexts to bios +(I/O requests) to specify how the data will be encrypted or decrypted +in-line. For more information about blk-crypto, see +:ref:`Documentation/block/inline-encryption.rst `. + +On supported filesystems (currently ext4 and f2fs), fscrypt can use +blk-crypto instead of the kernel crypto API to encrypt/decrypt file +contents. To enable this, set CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y in +the kernel configuration, and specify the "inlinecrypt" mount option +when mounting the filesystem. + +Note that the "inlinecrypt" mount option just specifies to use inline +encryption when possible; it doesn't force its use. fscrypt will +still fall back to using the kernel crypto API on files where the +inline encryption hardware doesn't have the needed crypto capabilities +(e.g. support for the needed encryption algorithm and data unit size) +and where blk-crypto-fallback is unusable. (For blk-crypto-fallback +to be usable, it must be enabled in the kernel configuration with +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y.) + +Currently fscrypt always uses the filesystem block size (which is +usually 4096 bytes) as the data unit size. Therefore, it can only use +inline encryption hardware that supports that data unit size. + +Inline encryption doesn't affect the ciphertext or other aspects of +the on-disk format, so users may freely switch back and forth between +using "inlinecrypt" and not using "inlinecrypt". + +Direct I/O support +================== + +For direct I/O on an encrypted file to work, the following conditions +must be met (in addition to the conditions for direct I/O on an +unencrypted file): + +* The file must be using inline encryption. Usually this means that + the filesystem must be mounted with ``-o inlinecrypt`` and inline + encryption hardware must be present. However, a software fallback + is also available. For details, see `Inline encryption support`_. + +* The I/O request must be fully aligned to the filesystem block size. 
+  This means that the file position the I/O is targeting, the lengths
+  of all I/O segments, and the memory addresses of all I/O buffers
+  must be multiples of this value. Note that the filesystem block
+  size may be greater than the logical block size of the block device.
+
+If either of the above conditions is not met, then direct I/O on the
+encrypted file will fall back to buffered I/O.
+
Implementation details
======================

@@ -1184,6 +1268,13 @@
keys`_ and `DIRECT_KEY policies`_.

Data path changes
-----------------

+When inline encryption is used, filesystems just need to associate
+encryption contexts with bios to specify how the block layer or the
+inline encryption hardware will encrypt/decrypt the file contents.
+
+When inline encryption isn't used, filesystems must encrypt/decrypt
+the file contents themselves, as described below:
+
For the read path (->readpage()) of regular files, filesystems can
read the ciphertext into the page cache and decrypt it in-place. The
page lock must be held until decryption has finished, to prevent the

@@ -1197,18 +1288,6 @@
buffer. Some filesystems, such as UBIFS, already use temporary
buffers regardless of encryption. Other filesystems, such as ext4 and
F2FS, have to allocate bounce pages specially for encryption.

-Fscrypt is also able to use inline encryption hardware instead of the
-kernel crypto API for en/decryption of file contents. When possible,
-and if directed to do so (by specifying the 'inlinecrypt' mount option
-for an ext4/F2FS filesystem), it adds encryption contexts to bios and
-uses blk-crypto to perform the en/decryption instead of making use of
-the above read/write path changes. Of course, even if directed to
-make use of inline encryption, fscrypt will only be able to do so if
-either hardware inline encryption support is available for the
-selected encryption algorithm or CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK
-is selected. If neither is the case, fscrypt will fall back to using
-the above mentioned read/write path changes for en/decryption.
-
Filename hashing and encoding
-----------------------------

diff --git a/Documentation/filesystems/incfs.rst b/Documentation/filesystems/incfs.rst
new file mode 100644
index 000000000000..f0fb1d02d530
--- /dev/null
+++ b/Documentation/filesystems/incfs.rst
@@ -0,0 +1,85 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================================
+incfs: A stacked incremental filesystem for Linux
+=================================================
+
+/sys/fs interface
+=================
+
+Please update Documentation/ABI/testing/sysfs-fs-incfs if you update this
+section.
+
+incfs creates the following files in /sys/fs.
+
+Features
+--------
+
+/sys/fs/incremental-fs/features/corefs
+  Reads 'supported'. Always present.
+
+/sys/fs/incremental-fs/features/v2
+  Reads 'supported'. Present if all v2 features of incfs are supported. These
+  are:
+    fs-verity support
+    inotify support
+    ioctls:
+      INCFS_IOC_SET_READ_TIMEOUTS
+      INCFS_IOC_GET_READ_TIMEOUTS
+      INCFS_IOC_GET_BLOCK_COUNT
+      INCFS_IOC_CREATE_MAPPED_FILE
+    .incomplete folder
+    .blocks_written pseudo file
+    report_uid mount option
+
+/sys/fs/incremental-fs/features/zstd
+  Reads 'supported'. Present if zstd compression is supported for data blocks.
+
+/sys/fs/incremental-fs/features/bugfix_throttling
Present if the throttling lock bug is fixed. + +Optional per mount +------------------ + +For each incfs mount, the mount option sysfs_name=[name] creates a /sys/fs +node called: + +/sys/fs/incremental-fs/instances/[name] + +This will contain the following files: + +/sys/fs/incremental-fs/instances/[name]/reads_delayed_min + Returns a count of the number of reads that were delayed as a result of the + per UID read timeouts min time setting. + +/sys/fs/incremental-fs/instances/[name]/reads_delayed_min_us + Returns total delay time for all files since first mount as a result of the + per UID read timeouts min time setting. + +/sys/fs/incremental-fs/instances/[name]/reads_delayed_pending + Returns a count of the number of reads that were delayed as a result of + waiting for a pending read. + +/sys/fs/incremental-fs/instances/[name]/reads_delayed_pending_us + Returns total delay time for all files since first mount as a result of + waiting for a pending read. + +/sys/fs/incremental-fs/instances/[name]/reads_failed_hash_verification + Returns number of reads that failed because of hash verification failures. + +/sys/fs/incremental-fs/instances/[name]/reads_failed_other + Returns number of reads that failed for reasons other than timing out or + hash failures. + +/sys/fs/incremental-fs/instances/[name]/reads_failed_timed_out + Returns number of reads that timed out. + +For reads_delayed_*** settings, note that a file can count for both +reads_delayed_min and reads_delayed_pending if incfs first waits for a pending +read then has to wait further for the min time. In that case, the time spent +waiting is split between reads_delayed_pending_us, which is increased by the +time spent waiting for the pending read, and reads_delayed_min_us, which is +increased by the remainder of the time spent waiting. + +Reads that timed out are not added to the reads_delayed_pending or the +reads_delayed_pending_us counters. diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst index 7da6c30ed596..d7cd4032134a 100644 --- a/Documentation/filesystems/overlayfs.rst +++ b/Documentation/filesystems/overlayfs.rst @@ -195,7 +195,7 @@ handle it in two different ways: 1. return EXDEV error: this error is returned by rename(2) when trying to move a file or directory across filesystem boundaries. Hence - applications are usually prepared to hande this error (mv(1) for example + applications are usually prepared to handle this error (mv(1) for example recursively copies the directory tree). This is the default behavior. 2. If the "redirect_dir" feature is enabled, then the directory will be @@ -324,6 +324,30 @@ and The resulting access permissions should be the same. The difference is in the time of copy (on-demand vs. up-front). +Non overlapping credentials +--------------------------- + +As noted above, all access to the upper, lower and work directories is +performed with the recorded mounter's MAC and DAC credentials. The incoming +accesses are checked against the caller's credentials. + +In the case where the caller's MAC or DAC credentials do not overlap the +mounter's, a use case available in older versions of the driver, the +override_creds mount flag can be turned off. This suits usage patterns where +the caller has legitimate credentials that the mounter lacks. For example, +init may have been the mounter, but the caller may have execute or read MAC +permissions where init does not. With override_creds off, all access, +incoming, upper, lower or working, will be tested against the caller.
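+ +As a minimal illustration, an overlay that tests all access against the +caller could be mounted as sketched below. The paths are placeholders, and +the "override_creds=off" option string is an assumption inferred from the +flag described above rather than taken from the driver source:: + +    /* Hedged sketch: mount an overlay with the mounter credential +     * override disabled, so every access is checked against the caller. +     * Paths and the exact option syntax are assumptions. */ +    #include <stdio.h> +    #include <sys/mount.h> + +    int main(void) +    { +            const char *opts = "lowerdir=/lower,upperdir=/upper," +                               "workdir=/work,override_creds=off"; + +            if (mount("overlay", "/merged", "overlay", 0, opts)) { +                    perror("mount"); +                    return 1; +            } +            return 0; +    }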
+ +Several unintended side effects will occur, though. A caller without certain +key capabilities, or with lower privilege, will not always be able to delete +files or directories, create nodes, or search some restricted directories. The +ability to search and read a directory entry is inconsistent because the cache +mechanism does not re-test credentials; the assumption is that a privileged +caller can fill the cache, after which a less privileged caller can read the +directory cache. The uneven security model, where cache, upperdir and workdir +are opened at privilege but accessed without creating a form of privilege +escalation, should only be used with a strict understanding of the side +effects and of the security policies. Multiple lower layers --------------------- diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 042c418f4090..82ccad3b6e0e 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. If the mapping is not associated with a file: - ======= ==================================== + ============= ==================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - ======= ==================================== + [anon:<name>] an anonymous mapping that has been + named by userspace + ============= ==================================== or if empty, the mapping is anonymous. @@ -971,6 +973,7 @@ You may not have all of these fields. SReclaimable: 159856 kB SUnreclaim: 124508 kB PageTables: 24448 kB + SecPageTables: 0 kB NFS_Unstable: 0 kB Bounce: 0 kB WritebackTmp: 0 kB @@ -1065,6 +1068,9 @@ SUnreclaim PageTables amount of memory dedicated to the lowest level of page tables. +SecPageTables + Memory consumed by secondary page tables; this currently + includes KVM mmu allocations on x86 and arm64. NFS_Unstable Always zero. Previous counted pages which had been written to the server, but has not been committed to stable storage. diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst index 2d1fc03d346e..ef19b9c13523 100644 --- a/Documentation/kbuild/kbuild.rst +++ b/Documentation/kbuild/kbuild.rst @@ -77,6 +77,17 @@ HOSTLDLIBS ---------- Additional libraries to link against when building host programs. +.. _userkbuildflags: + +USERCFLAGS +---------- +Additional options used for $(CC) when compiling userprogs. + +USERLDFLAGS +----------- +Additional options used for $(LD) when linking userprogs. userprogs are linked +with CC, so $(USERLDFLAGS) should include "-Wl," prefix as applicable. + KBUILD_KCONFIG -------------- Set the top-level Kconfig file to the value of this environment diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index db3af0b45baf..64878c281d1f 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -982,6 +982,8 @@ The syntax is quite similar. The difference is to use "userprogs" instead of When linking bpfilter_umh, it will be passed the extra option -static. + From command line, :ref:`USERCFLAGS and USERLDFLAGS <userkbuildflags>` will also be used.
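+ + For illustration, a minimal userprog and the kbuild lines that build it + might look as follows. The Makefile fragment in the comment and the sample + command line are assumptions patterned on the text above, not an in-tree + example:: + +    /* +     * hello.c -- trivial userprog. An assumed Makefile fragment: +     * +     *     userprogs := hello +     *     always-y  := $(userprogs) +     * +     * and an assumed invocation exercising the new variables: +     * +     *     make USERCFLAGS="-Wall -O2" USERLDFLAGS="-Wl,--as-needed" +     */ +    #include <stdio.h> + +    int main(void) +    { +            printf("hello from a userprog\n"); +            return 0; +    }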
+ 5.4 When userspace programs are actually built ---------------------------------------------- diff --git a/Documentation/kbuild/modules.rst b/Documentation/kbuild/modules.rst index a1f3eb7a43e2..e9b6fe2342b6 100644 --- a/Documentation/kbuild/modules.rst +++ b/Documentation/kbuild/modules.rst @@ -21,6 +21,7 @@ This document describes how to build an out-of-tree kernel module. --- 4.1 Kernel Includes --- 4.2 Single Subdirectory --- 4.3 Several Subdirectories + --- 4.4 UAPI Headers Installation === 5. Module Installation --- 5.1 INSTALL_MOD_PATH --- 5.2 INSTALL_MOD_DIR @@ -131,6 +132,10 @@ executed to make module versioning work. /lib/modules//extra/, but a prefix may be added with INSTALL_MOD_PATH (discussed in section 5). + headers_install + Export headers in a format suitable for userspace. The default + location is $PWD/usr. INSTALL_HDR_PATH can change this path. + clean Remove all generated files in the module directory only. @@ -406,6 +411,17 @@ according to the following rule: pointing to the directory where the currently executing kbuild file is located. +4.4 UAPI Headers Installation +----------------------------- + + External modules may export headers to userspace in a similar + fashion to the in-tree counterpart drivers. kbuild supports + running the headers_install target for an out-of-tree module. The location + where kbuild searches for headers is $(M)/include/uapi and + $(M)/arch/$(SRCARCH)/include/uapi. + + See also Documentation/kbuild/headers_install.rst. + 5. Module Installation ====================== diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index ba0e8e6337c0..8011bcf5a249 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1133,6 +1133,19 @@ ip_local_reserved_ports - list of comma separated ranges Default: Empty +ip_local_unbindable_ports - list of comma separated ranges + Specify the ports which are not directly bind()able. + + Usually you would use this to block the use of ports which + are invalid due to something outside of the control of the + kernel. For example, a port stolen by the NIC for serial + console, remote power management or debugging. + + There's a relatively high chance you will also want to list + these ports in 'ip_local_reserved_ports' to prevent autobinding. + + Default: Empty + ip_unprivileged_port_start - INTEGER This is a per-namespace sysctl. It defines the first unprivileged port in the network namespace. Privileged ports diff --git a/Documentation/scheduler/sched-energy.rst b/Documentation/scheduler/sched-energy.rst index 8fbce5e767d9..83f11798617c 100644 --- a/Documentation/scheduler/sched-energy.rst +++ b/Documentation/scheduler/sched-energy.rst @@ -402,7 +402,7 @@ Consequently, the only sane governor to use together with EAS is schedutil, because it is the only one providing some degree of consistency between frequency requests and energy predictions. -Using EAS with any other governor than schedutil is not supported. +Using EAS with any other governor than schedutil is not recommended. 6.5 Scale-invariant utilization signals diff --git a/Documentation/scsi/scsi_eh.rst b/Documentation/scsi/scsi_eh.rst index 7d78c2475615..edf37f14331d 100644 --- a/Documentation/scsi/scsi_eh.rst +++ b/Documentation/scsi/scsi_eh.rst @@ -93,16 +93,19 @@ function 1. invokes optional hostt->eh_timed_out() callback. Return value can be one of - - BLK_EH_RESET_TIMER + - SCSI_EH_RESET_TIMER This indicates that more time is required to finish the command. Timer is restarted. This action is counted as a retry and only allowed scmd->allowed + 1(!) times. Once the limit is reached, action for BLK_EH_DONE is taken instead. - - BLK_EH_DONE + - SCSI_EH_NOT_HANDLED eh_timed_out() callback did not handle the command. Step #2 is taken. + - SCSI_EH_DONE + eh_timed_out() completed the command. + 2. scsi_abort_command() is invoked to schedule an asynchronous abort. Asynchronous aborts are not invoked for commands for which the SCSI_EH_ABORT_SCHEDULED flag is set (this indicates that the command diff --git a/Documentation/scsi/ufs.rst b/Documentation/scsi/ufs.rst index a920c0a5a1f6..522d7230f768 100644 --- a/Documentation/scsi/ufs.rst +++ b/Documentation/scsi/ufs.rst @@ -17,6 +17,8 @@ Universal Flash Storage 3.2 UTP Transfer requests 3.3 UFS error handling 3.4 SCSI Error handling + 4. BSG Support + 5. UFS Reference Clock Frequency configuration 1. Overview @@ -193,3 +195,16 @@ UFS Specifications can be found at: - UFS - http://www.jedec.org/sites/default/files/docs/JESD220.pdf - UFSHCI - http://www.jedec.org/sites/default/files/docs/JESD223.pdf + +5. UFS Reference Clock Frequency configuration +============================================== + +Devicetree can define a clock named "ref_clk" under the UFS controller node +to specify the intended reference clock frequency for the UFS storage +parts. ACPI-based systems can specify the frequency using an ACPI +Device-Specific Data property named "ref-clk-freq". In both cases the value +is interpreted as frequency in Hz and must match one of the values given in +the UFS specification. The UFS subsystem will attempt to read the value when +executing common controller initialization. If the value is available, the UFS +subsystem will ensure the bRefClkFreq attribute of the UFS storage device is +set accordingly and will modify it if there is a mismatch. diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst index 5d093fb4896b..21ab5e6f7062 100644 --- a/Documentation/sound/alsa-configuration.rst +++ b/Documentation/sound/alsa-configuration.rst @@ -100,6 +100,15 @@ amidi_map MIDI device number maps assigned to the 2nd OSS device; Default: 1 +Module snd-soc-core +------------------- + +The soc core module. It is used by all ALSA card drivers. +It takes the following options which have global effects. + +prealloc_buffer_size_kbytes + Specify prealloc buffer size in kbytes (default: 512). + Common parameters for top sound card modules -------------------------------------------- diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index a78350a8fed4..69354e1e7b25 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -1763,6 +1763,21 @@ using the same key and variable from yet another event:: # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger +Expressions support the use of addition, subtraction, multiplication and +division operators (+-\*/). + +Note that if division by zero cannot be detected at parse time (i.e. the +divisor is not a constant), the result will be -1. + +Numeric constants can also be used directly in an expression:: + + # echo 'hist:keys=next_pid:timestamp_secs=common_timestamp/1000000 ...' >> event/trigger + +or assigned to a variable and referenced in a subsequent expression:: + + # echo 'hist:keys=next_pid:us_per_sec=1000000 ...' >> event/trigger + # echo 'hist:keys=next_pid:timestamp_secs=common_timestamp/$us_per_sec ...'
>> event/trigger + 2.2.2 Synthetic Events ---------------------- diff --git a/Documentation/usb/gadget-testing.rst b/Documentation/usb/gadget-testing.rst index c18113077889..58781a268890 100644 --- a/Documentation/usb/gadget-testing.rst +++ b/Documentation/usb/gadget-testing.rst @@ -784,6 +784,7 @@ The uvc function provides these attributes in its function directory: streaming_maxpacket maximum packet size this endpoint is capable of sending or receiving when this configuration is selected + function_name name of the interface =================== ================================================ There are also "control" and "streaming" subdirectories, each of which contain diff --git a/Documentation/userspace-api/media/v4l/control.rst b/Documentation/userspace-api/media/v4l/control.rst index f8d0b923da20..3eec65174260 100644 --- a/Documentation/userspace-api/media/v4l/control.rst +++ b/Documentation/userspace-api/media/v4l/control.rst @@ -242,8 +242,17 @@ Control IDs * - ``V4L2_COLORFX_SET_CBCR`` - The Cb and Cr chroma components are replaced by fixed coefficients determined by ``V4L2_CID_COLORFX_CBCR`` control. + * - ``V4L2_COLORFX_SET_RGB`` + - The RGB components are replaced by the fixed RGB components determined + by ``V4L2_CID_COLORFX_RGB`` control. +``V4L2_CID_COLORFX_RGB`` ``(integer)`` + Determines the Red, Green, and Blue coefficients for + ``V4L2_COLORFX_SET_RGB`` color effect. + Bits [7:0] of the supplied 32 bit value are interpreted as Blue component, + bits [15:8] as Green component, bits [23:16] as Red component, and + bits [31:24] must be zero. ``V4L2_CID_COLORFX_CBCR`` ``(integer)`` Determines the Cb and Cr coefficients for ``V4L2_COLORFX_SET_CBCR`` diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b550f43214c7..baf709bc7c42 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -414,7 +414,7 @@ kvm_run' (see below). ----------------- :Capability: basic -:Architectures: all except ARM, arm64 +:Architectures: all except arm64 :Type: vcpu ioctl :Parameters: struct kvm_regs (out) :Returns: 0 on success, -1 on error @@ -447,7 +447,7 @@ Reads the general purpose registers from the vcpu. ----------------- :Capability: basic -:Architectures: all except ARM, arm64 +:Architectures: all except arm64 :Type: vcpu ioctl :Parameters: struct kvm_regs (in) :Returns: 0 on success, -1 on error @@ -804,7 +804,7 @@ Writes the floating point state to the vcpu. ----------------------- :Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) -:Architectures: x86, ARM, arm64, s390 +:Architectures: x86, arm64, s390 :Type: vm ioctl :Parameters: none :Returns: 0 on success, -1 on error @@ -813,7 +813,7 @@ Creates an interrupt controller model in the kernel. On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. -On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of +On arm64, a GICv2 is created. Any other GIC versions require the usage of KVM_CREATE_DEVICE, which also supports creating a GICv2. Using KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2. On s390, a dummy irq routing table is created. @@ -826,7 +826,7 @@ before KVM_CREATE_IRQCHIP can be used. 
----------------- :Capability: KVM_CAP_IRQCHIP -:Architectures: x86, arm, arm64 +:Architectures: x86, arm64 :Type: vm ioctl :Parameters: struct kvm_irq_level :Returns: 0 on success, -1 on error @@ -850,7 +850,7 @@ capability is present (or unless it is not using the in-kernel irqchip, of course). -ARM/arm64 can signal an interrupt either at the CPU level, or at the +arm64 can signal an interrupt either at the CPU level, or at the in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to use PPIs designated for specific cpus. The irq field is interpreted like this:: @@ -876,7 +876,7 @@ When KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 is supported, the target vcpu is identified as (256 * vcpu2_index + vcpu_index). Otherwise, vcpu2_index must be zero. -Note that on arm/arm64, the KVM_CAP_IRQCHIP capability only conditions +Note that on arm64, the KVM_CAP_IRQCHIP capability only conditions injection of interrupts for the in-kernel irqchip. KVM_IRQ_LINE can always be used for a userspace interrupt controller. @@ -1037,7 +1037,7 @@ such as migration. :Capability: KVM_CAP_VCPU_EVENTS :Extended by: KVM_CAP_INTR_SHADOW -:Architectures: x86, arm, arm64 +:Architectures: x86, arm64 :Type: vcpu ioctl :Parameters: struct kvm_vcpu_event (out) :Returns: 0 on success, -1 on error @@ -1096,8 +1096,8 @@ The following bits are defined in the flags field: fields contain a valid state. This bit will be set whenever KVM_CAP_EXCEPTION_PAYLOAD is enabled. -ARM/ARM64: -^^^^^^^^^^ +ARM64: +^^^^^^ If the guest accesses a device that is being emulated by the host kernel in such a way that a real device would generate a physical SError, KVM may make @@ -1156,7 +1156,7 @@ directly to the virtual CPU). :Capability: KVM_CAP_VCPU_EVENTS :Extended by: KVM_CAP_INTR_SHADOW -:Architectures: x86, arm, arm64 +:Architectures: x86, arm64 :Type: vcpu ioctl :Parameters: struct kvm_vcpu_event (in) :Returns: 0 on success, -1 on error @@ -1191,8 +1191,8 @@ can be set in the flags field to signal that the exception_has_payload, exception_payload, and exception.pending fields contain a valid state and shall be written into the VCPU. -ARM/ARM64: -^^^^^^^^^^ +ARM64: +^^^^^^ User space may need to inject several types of events to the guest. @@ -1399,7 +1399,7 @@ for vm-wide capabilities. --------------------- :Capability: KVM_CAP_MP_STATE -:Architectures: x86, s390, arm, arm64 +:Architectures: x86, s390, arm64 :Type: vcpu ioctl :Parameters: struct kvm_mp_state (out) :Returns: 0 on success; -1 on error @@ -1416,7 +1416,7 @@ uniprocessor guests). 
Possible values are: ========================== =============================================== - KVM_MP_STATE_RUNNABLE the vcpu is currently running [x86,arm/arm64] + KVM_MP_STATE_RUNNABLE the vcpu is currently running [x86,arm64] KVM_MP_STATE_UNINITIALIZED the vcpu is an application processor (AP) which has not yet received an INIT signal [x86] KVM_MP_STATE_INIT_RECEIVED the vcpu has received an INIT signal, and is @@ -1425,29 +1425,52 @@ Possible values are: is waiting for an interrupt [x86] KVM_MP_STATE_SIPI_RECEIVED the vcpu has just received a SIPI (vector accessible via KVM_GET_VCPU_EVENTS) [x86] - KVM_MP_STATE_STOPPED the vcpu is stopped [s390,arm/arm64] + KVM_MP_STATE_STOPPED the vcpu is stopped [s390,arm64] KVM_MP_STATE_CHECK_STOP the vcpu is in a special error state [s390] KVM_MP_STATE_OPERATING the vcpu is operating (running or halted) [s390] KVM_MP_STATE_LOAD the vcpu is in a special load/startup state [s390] + KVM_MP_STATE_SUSPENDED the vcpu is in a suspend state and is waiting + for a wakeup event [arm64] ========================== =============================================== On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel irqchip, the multiprocessing state must be maintained by userspace on these architectures. -For arm/arm64: -^^^^^^^^^^^^^^ +For arm64: +^^^^^^^^^^ -The only states that are valid are KVM_MP_STATE_STOPPED and -KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. +If a vCPU is in the KVM_MP_STATE_SUSPENDED state, KVM will emulate the +architectural execution of a WFI instruction. + +If a wakeup event is recognized, KVM will exit to userspace with a +KVM_SYSTEM_EVENT exit, where the event type is KVM_SYSTEM_EVENT_WAKEUP. If +userspace wants to honor the wakeup, it must set the vCPU's MP state to +KVM_MP_STATE_RUNNABLE. If it does not, KVM will continue to await a wakeup +event in subsequent calls to KVM_RUN. + +.. warning:: + + If userspace intends to keep the vCPU in a SUSPENDED state, it is + strongly recommended that userspace take action to suppress the + wakeup event (such as masking an interrupt). Otherwise, subsequent + calls to KVM_RUN will immediately exit with a KVM_SYSTEM_EVENT_WAKEUP + event and inadvertently waste CPU cycles. + + Additionally, if userspace takes action to suppress a wakeup event, + it is strongly recommended that it also restores the vCPU to its + original state when the vCPU is made RUNNABLE again. For example, + if userspace masked a pending interrupt to suppress the wakeup, + the interrupt should be unmasked before returning control to the + guest. 4.39 KVM_SET_MP_STATE --------------------- :Capability: KVM_CAP_MP_STATE -:Architectures: x86, s390, arm, arm64 +:Architectures: x86, s390, arm64 :Type: vcpu ioctl :Parameters: struct kvm_mp_state (in) :Returns: 0 on success; -1 on error @@ -1459,8 +1482,8 @@ On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel irqchip, the multiprocessing state must be maintained by userspace on these architectures. -For arm/arm64: -^^^^^^^^^^^^^^ +For arm64: +^^^^^^^^^^ The only states that are valid are KVM_MP_STATE_STOPPED and KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not. 
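+ +To make the SUSPENDED/wakeup flow concrete, a VMM run loop might handle the +exit as sketched below. This is a minimal sketch built on the UAPI names +documented above (error handling and the surrounding run loop are elided); it +is illustrative, not a reference VMM:: + +    /* Honor a wakeup so the next KVM_RUN resumes the guest. */ +    #include <linux/kvm.h> +    #include <sys/ioctl.h> + +    static void handle_system_event(int vcpu_fd, struct kvm_run *run) +    { +            if (run->exit_reason == KVM_EXIT_SYSTEM_EVENT && +                run->system_event.type == KVM_SYSTEM_EVENT_WAKEUP) { +                    struct kvm_mp_state state = { +                            .mp_state = KVM_MP_STATE_RUNNABLE, +                    }; + +                    /* Without this, the next KVM_RUN exits with another +                     * KVM_SYSTEM_EVENT_WAKEUP immediately. */ +                    ioctl(vcpu_fd, KVM_SET_MP_STATE, &state); +            } +    }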
@@ -1715,14 +1738,14 @@ The flags bitmap is defined as:: ------------------------ :Capability: KVM_CAP_IRQ_ROUTING -:Architectures: x86 s390 arm arm64 +:Architectures: x86 s390 arm64 :Type: vm ioctl :Parameters: struct kvm_irq_routing (in) :Returns: 0 on success, -1 on error Sets the GSI routing table entries, overwriting any previously set entries. -On arm/arm64, GSI routing has the following limitation: +On arm64, GSI routing has the following limitation: - GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD. @@ -2526,6 +2549,24 @@ EINVAL. After the vcpu's SVE configuration is finalized, further attempts to write this register will fail with EPERM. +arm64 bitmap feature firmware pseudo-registers have the following bit pattern:: + + 0x6030 0000 0016 <regno:16> + +The bitmap feature firmware registers expose the hypercall services that +are available for userspace to configure. The set bits correspond to the +services that are available for the guests to access. By default, KVM +sets all the supported bits during VM initialization. Userspace can +discover the available services via KVM_GET_ONE_REG, and write back the +bitmap corresponding to the features that it wishes guests to see via +KVM_SET_ONE_REG. + +Note: These registers are immutable once any of the vCPUs of the VM has +run at least once. A KVM_SET_ONE_REG in such a scenario will return +-EBUSY to userspace. + +(See Documentation/virt/kvm/arm/hypercalls.rst for more details.) + MIPS registers are mapped using the lower 32 bits. The upper 16 of that is the register group type: @@ -2636,7 +2677,7 @@ after pausing the vcpu, but before it is resumed. ------------------- :Capability: KVM_CAP_SIGNAL_MSI -:Architectures: x86 arm arm64 +:Architectures: x86 arm64 :Type: vm ioctl :Parameters: struct kvm_msi (in) :Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error @@ -2824,7 +2865,7 @@ into the hash PTE second double word). -------------- :Capability: KVM_CAP_IRQFD -:Architectures: x86 s390 arm arm64 +:Architectures: x86 s390 arm64 :Type: vm ioctl :Parameters: struct kvm_irqfd (in) :Returns: 0 on success, -1 on error @@ -2850,7 +2891,7 @@ Note that closing the resamplefd is not sufficient to disable the irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. -On arm/arm64, gsi routing being supported, the following can happen: +On arm64, since GSI routing is supported, the following can happen: - in case no routing entry is associated to this gsi, injection fails - in case the gsi is associated to an irqchip routing entry, @@ -3104,7 +3145,7 @@ current state. "addr" is ignored. ---------------------- :Capability: basic -:Architectures: arm, arm64 +:Architectures: arm64 :Type: vcpu ioctl :Parameters: struct kvm_vcpu_init (in) :Returns: 0 on success; -1 on error @@ -3202,7 +3243,7 @@ Possible features: ----------------------------- :Capability: basic -:Architectures: arm, arm64 +:Architectures: arm64 :Type: vm ioctl :Parameters: struct kvm_vcpu_init (out) :Returns: 0 on success; -1 on error @@ -3231,7 +3272,7 @@ VCPU matching underlying host. --------------------- :Capability: basic -:Architectures: arm, arm64, mips +:Architectures: arm64, mips :Type: vcpu ioctl :Parameters: struct kvm_reg_list (in/out) :Returns: 0 on success; -1 on error @@ -3258,7 +3299,7 @@ KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
----------------------------------------- :Capability: KVM_CAP_ARM_SET_DEVICE_ADDR -:Architectures: arm, arm64 +:Architectures: arm64 :Type: vm ioctl :Parameters: struct kvm_arm_device_address (in) :Returns: 0 on success, -1 on error @@ -3285,13 +3326,13 @@ can access emulated or directly exposed devices, which the host kernel needs to know about. The id field is an architecture specific identifier for a specific device. -ARM/arm64 divides the id field into two parts, a device id and an +arm64 divides the id field into two parts, a device id and an address type id specific to the individual device:: bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 | field: | 0x00000000 | device id | addr type id | -ARM/arm64 currently only require this when using the in-kernel GIC +arm64 currently only require this when using the in-kernel GIC support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 as the device id. When setting the base address for the guest's mapping of the VGIC virtual CPU and distributor interface, the ioctl @@ -4505,7 +4546,7 @@ to I/O ports. ------------------------------------ :Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 -:Architectures: x86, arm, arm64, mips +:Architectures: x86, arm64, mips :Type: vm ioctl :Parameters: struct kvm_clear_dirty_log (in) :Returns: 0 on success, -1 on error @@ -4617,7 +4658,7 @@ version has the following quirks: 4.119 KVM_ARM_VCPU_FINALIZE --------------------------- -:Architectures: arm, arm64 +:Architectures: arm64 :Type: vcpu ioctl :Parameters: int feature (in) :Returns: 0 on success, -1 on error @@ -5656,13 +5697,15 @@ should put the acknowledged interrupt vector into the 'epr' field. #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 + #define KVM_SYSTEM_EVENT_WAKEUP 4 + #define KVM_SYSTEM_EVENT_SUSPEND 5 __u32 type; __u64 flags; } system_event; If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered a system-level event using some architecture specific mechanism (hypercall -or some special instruction). In case of ARM/ARM64, this is triggered using +or some special instruction). In case of ARM64, this is triggered using HVC instruction based PSCI call from the vcpu. The 'type' field describes the system-level event type. The 'flags' field describes architecture specific flags for the system-level event. @@ -5680,6 +5723,42 @@ Valid values for 'type' are: has requested a crash condition maintenance. Userspace can choose to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + - KVM_SYSTEM_EVENT_WAKEUP -- the exiting vCPU is in a suspended state and + KVM has recognized a wakeup event. Userspace may honor this event by + marking the exiting vCPU as runnable, or deny it and call KVM_RUN again. + - KVM_SYSTEM_EVENT_SUSPEND -- the guest has requested a suspension of + the VM. + +For arm/arm64: +-------------- + + KVM_SYSTEM_EVENT_SUSPEND exits are enabled with the + KVM_CAP_ARM_SYSTEM_SUSPEND VM capability. If a guest invokes the PSCI + SYSTEM_SUSPEND function, KVM will exit to userspace with this event + type. + + It is the sole responsibility of userspace to implement the PSCI + SYSTEM_SUSPEND call according to ARM DEN0022D.b 5.19 "SYSTEM_SUSPEND". + KVM does not change the vCPU's state before exiting to userspace, so + the call parameters are left in-place in the vCPU registers. + + Userspace is _required_ to take action for such an exit. It must + either: + + - Honor the guest request to suspend the VM. 
Userspace can request + in-kernel emulation of suspension by setting the calling vCPU's + state to KVM_MP_STATE_SUSPENDED. Userspace must configure the vCPU's + state according to the parameters passed to the PSCI function when + the calling vCPU is resumed. See ARM DEN0022D.b 5.19.1 "Intended use" + for details on the function parameters. + + - Deny the guest request to suspend the VM. See ARM DEN0022D.b 5.19.2 + "Caller responsibilities" for possible return values. + +Valid flags are: + + - KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued + a SYSTEM_RESET2 call according to v1.1 of the PSCI specification. :: @@ -5755,7 +5834,7 @@ in send_page or recv a buffer to recv_page). __u64 fault_ipa; } arm_nisv; -Used on arm and arm64 systems. If a guest accesses memory not in a memslot, +Used on arm64 systems. If a guest accesses memory not in a memslot, KVM will typically return to userspace and ask it to do MMIO emulation on its behalf. However, for certain classes of instructions, no instruction decode (direction, length of memory access) is provided, and fetching and decoding @@ -5772,16 +5851,22 @@ did not fall within an I/O window. Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable this capability at VM creation. Once this is done, these types of errors will instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from -the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting IPA -in the fault_ipa field. Userspace can either fix up the access if it's -actually an I/O access by decoding the instruction from guest memory (if it's -very brave) and continue executing the guest, or it can decide to suspend, -dump, or restart the guest. +the ESR_EL2 in the esr_iss field, and the faulting IPA in the fault_ipa field. +Userspace can either fix up the access if it's actually an I/O access by +decoding the instruction from guest memory (if it's very brave) and continue +executing the guest, or it can decide to suspend, dump, or restart the guest. Note that KVM does not skip the faulting instruction as it does for KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state if it decides to decode and emulate the instruction. +This feature isn't available to protected VMs, as userspace does not +have access to the state that is required to perform the emulation. +Instead, a data abort exception is directly injected in the guest. +Note that although KVM_CAP_ARM_NISV_TO_USER will be reported if +queried outside of a protected VM context, the feature will not be +exposed if queried on a protected VM file descriptor. + :: /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ @@ -6464,7 +6549,7 @@ and injected exceptions. 7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 -:Architectures: x86, arm, arm64, mips +:Architectures: x86, arm64, mips :Parameters: args[0] whether feature should be enabled or not Valid flags are:: @@ -6833,7 +6918,7 @@ reserved. 8.9 KVM_CAP_ARM_USER_IRQ ------------------------ -:Architectures: arm, arm64 +:Architectures: arm64 This capability, if KVM_CHECK_EXTENSION indicates that it is available, means that if userspace creates a VM without an in-kernel interrupt controller, it @@ -6960,7 +7045,7 @@ HvFlushVirtualAddressList, HvFlushVirtualAddressListEx. 
8.19 KVM_CAP_ARM_INJECT_SERROR_ESR ---------------------------------- -:Architectures: arm, arm64 +:Architectures: arm64 This capability indicates that userspace can specify (via the KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it @@ -7266,6 +7351,16 @@ of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace the hypercalls whose corresponding bit is in the argument, and return ENOSYS for the others. +8.36 KVM_CAP_ARM_SYSTEM_SUSPEND +------------------------------- + +:Capability: KVM_CAP_ARM_SYSTEM_SUSPEND +:Architectures: arm64 +:Type: vm + +When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of +type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request. + 9. Known KVM API problems ========================= diff --git a/Documentation/virt/kvm/arm/fw-pseudo-registers.rst b/Documentation/virt/kvm/arm/fw-pseudo-registers.rst new file mode 100644 index 000000000000..b90fd0b0fa66 --- /dev/null +++ b/Documentation/virt/kvm/arm/fw-pseudo-registers.rst @@ -0,0 +1,138 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +ARM firmware pseudo-registers interface +======================================= + +KVM handles the hypercall services as requested by the guests. New hypercall +services are regularly made available by the ARM specification or by KVM (as +vendor services) if they make sense from a virtualization point of view. + +This means that a guest booted on two different versions of KVM can observe +two different "firmware" revisions. This could cause issues if a given guest +is tied to a particular version of a hypercall service, or if a migration +causes a different version to be exposed out of the blue to an unsuspecting +guest. + +In order to remedy this situation, KVM exposes a set of "firmware +pseudo-registers" that can be manipulated using the GET/SET_ONE_REG +interface. These registers can be saved/restored by userspace, and set +to a convenient value as required. + +The following registers are defined: + +* KVM_REG_ARM_PSCI_VERSION: + + KVM implements the PSCI (Power State Coordination Interface) + specification in order to provide services such as CPU on/off, reset + and power-off to the guest. + + - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set + (and thus has already been initialized) + - Returns the current PSCI version on GET_ONE_REG (defaulting to the + highest PSCI version implemented by KVM and compatible with v0.2) + - Allows any PSCI version implemented by KVM and compatible with + v0.2 to be set with SET_ONE_REG + - Affects the whole VM (even if the register view is per-vcpu) + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + Holds the state of the firmware support to mitigate CVE-2017-5715, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_1 in [1]. + + Accepted values are: + + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: + KVM does not offer + firmware support for the workaround. The mitigation status for the + guest is unknown. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: + The workaround HVC call is + available to the guest and required for the mitigation. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: + The workaround HVC call + is available to the guest, but it is not needed on this VCPU. + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + Holds the state of the firmware support to mitigate CVE-2018-3639, as + offered by KVM to the guest via a HVC call. 
The workaround is described + under SMCCC_ARCH_WORKAROUND_2 in [1]_. + + Accepted values are: + + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: + A workaround is not + available. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: + The workaround state is + unknown. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: + The workaround is available, + and can be disabled by a vCPU. If + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for + this vCPU. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: + The workaround is always active on this vCPU or it is not needed. + + +Bitmap Feature Firmware Registers +--------------------------------- + +Contrary to the above registers, the following registers expose the +hypercall services in the form of a feature bitmap to userspace. This +bitmap is translated to the services that are available to the guest. +One register is defined per service call owner, and it can be accessed via +the GET/SET_ONE_REG interface. + +By default, these registers are set with the upper limit of the features +that are supported. This way userspace can discover all the usable +hypercall services via GET_ONE_REG. Userspace can then write the +desired bitmap back via SET_ONE_REG. The features for the registers that +are left untouched, probably because userspace isn't aware of them, will be +exposed as is to the guest. + +Note that KVM will not allow userspace to configure the registers +once any of the vCPUs has run at least once. Instead, it will +return -EBUSY. + +The pseudo-firmware bitmap registers are as follows: + +* KVM_REG_ARM_STD_BMAP: + Controls the bitmap of the ARM Standard Secure Service Calls. + + The following bits are accepted: + + Bit-0: KVM_REG_ARM_STD_BIT_TRNG_V1_0: + The bit represents the services offered under v1.0 of ARM True Random + Number Generator (TRNG) specification, ARM DEN0098. + +* KVM_REG_ARM_STD_HYP_BMAP: + Controls the bitmap of the ARM Standard Hypervisor Service Calls. + + The following bits are accepted: + + Bit-0: KVM_REG_ARM_STD_HYP_BIT_PV_TIME: + The bit represents the Paravirtualized Time service as represented by + ARM DEN0057A. + +* KVM_REG_ARM_VENDOR_HYP_BMAP: + Controls the bitmap of the Vendor specific Hypervisor Service Calls. + + The following bits are accepted: + + Bit-0: KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT: + The bit represents the ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID + and ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID function-ids. + + Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_PTP: + The bit represents the Precision Time Protocol KVM service. + +Errors: + + ======= ============================================================= + -ENOENT Unknown register accessed. + -EBUSY Attempt a 'write' to the register after the VM has started. + -EINVAL Invalid bitmap written to the register. + ======= ============================================================= + +.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst new file mode 100644 index 000000000000..75a28b803ec6 --- /dev/null +++ b/Documentation/virt/kvm/arm/hypercalls.rst @@ -0,0 +1,152 @@ +..
SPDX-License-Identifier: GPL-2.0 + +=============================================== +KVM/arm64-specific hypercalls exposed to guests +=============================================== + +This file documents the KVM/arm64-specific hypercalls which may be +exposed by KVM/arm64 to guest operating systems. These hypercalls are +issued using the HVC instruction according to version 1.1 of the Arm SMC +Calling Convention (DEN0028/C): + +https://developer.arm.com/docs/den0028/c + +All KVM/arm64-specific hypercalls are allocated within the "Vendor +Specific Hypervisor Service Call" range with a UID of +``28b46fb6-2ec5-11e9-a9ca-4b564d003a74``. This UID should be queried by the +guest using the standard "Call UID" function for the service range in +order to determine that the KVM/arm64-specific hypercalls are available. + +``ARM_SMCCC_KVM_FUNC_FEATURES`` +--------------------------------------------- + +Provides a discovery mechanism for other KVM/arm64 hypercalls. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Mandatory for the KVM/arm64 UID | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC32 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0x86000000 | ++---------------------+----------+--------------------------------------------------+ +| Arguments: | None | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (uint32) | R0 | Bitmap of available function numbers 0-31 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R1 | Bitmap of available function numbers 32-63 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R2 | Bitmap of available function numbers 64-95 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R3 | Bitmap of available function numbers 96-127 | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_PTP`` +---------------------------------------- + +See ptp_kvm.rst + +``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO`` +---------------------------------- + +Query the memory protection parameters for a protected virtual machine. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; protected guests only. 
| ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000002 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``INVALID_PARAMETER (-3)`` on error, else | +| | | | memory protection granule in bytes | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MEM_SHARE`` +-------------------------------- + +Share a region of memory with the KVM host, granting it read, write and execute +permissions. The size of the region is equal to the memory protection granule +advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; protected guests only. | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000003 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Base IPA of memory region to share | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MEM_UNSHARE`` +---------------------------------- + +Revoke access permission from the KVM host to a memory region previously shared +with ``ARM_SMCCC_KVM_FUNC_MEM_SHARE``. The size of the region is equal to the +memory protection granule advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; protected guests only. 
| ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000004 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Base IPA of memory region to unshare | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH`` +-------------------------------------- + +Cooperatively relinquish ownership of a memory region. The size of the +region is equal to the memory protection granule advertised by +``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. If this hypercall is advertised +then it is mandatory to call it before freeing memory via, for +example, virtio balloon. If the caller is a protected VM, it is +guaranteed that the memory region will be completely cleared before +becoming visible to another VM. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional. | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000009 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Base IPA of memory region to relinquish | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MMIO_GUARD_*`` +----------------------------------- + +See mmio-guard.rst diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 78a9b670aafe..4d795a7da2c8 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -7,7 +7,10 @@ ARM .. toctree:: :maxdepth: 2 + fw-pseudo-registers hyp-abi - psci + hypercalls + pkvm pvtime ptp_kvm + mmio-guard diff --git a/Documentation/virt/kvm/arm/mmio-guard.rst b/Documentation/virt/kvm/arm/mmio-guard.rst new file mode 100644 index 000000000000..c1ba749c79df --- /dev/null +++ b/Documentation/virt/kvm/arm/mmio-guard.rst @@ -0,0 +1,74 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +KVM MMIO guard +============== + +KVM implements device emulation by handling translation faults to any +IPA range that is not contained in a memory slot. 
Such a translation +fault is in most cases passed on to userspace (or in rare cases to the +host kernel) with the address, size and possibly data of the access +for emulation. + +Should the guest exit with an address that is not one that corresponds +to an emulatable device, userspace may take measures that are not the +most graceful as far as the guest is concerned (such as terminating it +or delivering a fatal exception). + +There is also an element of trust: by forwarding the request to +userspace, the kernel assumes that the guest trusts userspace to do +the right thing. + +The KVM MMIO guard offers a way to mitigate this last point: a guest +can request that only certain regions of the IPA space are valid as +MMIO. Only these regions will be handled as an MMIO, and any other +will result in an exception being delivered to the guest. + +This relies on a set of hypercalls defined in the KVM-specific range, +using the HVC64 calling convention. + +* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO + + ============== ======== ================================ + Function ID: (uint32) 0xC6000005 + Arguments: r1-r3 Reserved / Must be zero + Return Values: (int64) NOT_SUPPORTED(-1) on error, or + (uint64) Protection Granule (PG) size in + bytes (r0) + ============== ======== ================================ + +* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL + + ============== ======== ============================== + Function ID: (uint32) 0xC6000006 + Arguments: none + Return Values: (int64) NOT_SUPPORTED(-1) on error, or + RET_SUCCESS(0) (r0) + ============== ======== ============================== + +* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP + + ============== ======== ==================================== + Function ID: (uint32) 0xC6000007 + Arguments: (uint64) The base of the PG-sized IPA range + that is allowed to be accessed as + MMIO. Must be aligned to the PG size + (r1) + (uint64) Index in the MAIR_EL1 register + providing the memory attribute that + is used by the guest (r2) + Return Values: (int64) NOT_SUPPORTED(-1) on error, or + RET_SUCCESS(0) (r0) + ============== ======== ==================================== + +* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP + + ============== ======== ====================================== + Function ID: (uint32) 0xC6000008 + Arguments: (uint64) PG-sized IPA range aligned to the PG + size which has been previously mapped. + Must be aligned to the PG size and + have been previously mapped (r1) + Return Values: (int64) NOT_SUPPORTED(-1) on error, or + RET_SUCCESS(0) (r0) + ============== ======== ====================================== diff --git a/Documentation/virt/kvm/arm/pkvm.rst b/Documentation/virt/kvm/arm/pkvm.rst new file mode 100644 index 000000000000..64f099a5ac2e --- /dev/null +++ b/Documentation/virt/kvm/arm/pkvm.rst @@ -0,0 +1,96 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Protected virtual machines (pKVM) +================================= + +Introduction +------------ + +Protected KVM (pKVM) is a KVM/arm64 extension which uses the two-stage +translation capability of the Armv8 MMU to isolate guest memory from the host +system. This allows for the creation of a confidential computing environment +without relying on whizz-bang features in hardware, but still allowing room for +complementary technologies such as memory encryption and hardware-backed +attestation. 
+ +The major implementation change brought about by pKVM is that the hypervisor +code running at EL2 is now largely independent of (and isolated from) the rest +of the host kernel running at EL1 and therefore additional hypercalls are +introduced to manage manipulation of guest stage-2 page tables, creation of VM +data structures and reclamation of memory on teardown. An immediate consequence +of this change is that the host itself runs with an identity mapping enabled +at stage-2, providing the hypervisor code with a mechanism to restrict host +access to an arbitrary physical page. + +Enabling pKVM +------------- + +The pKVM hypervisor is enabled by booting the host kernel at EL2 with +"``kvm-arm.mode=protected``" on the command-line. Once enabled, VMs can be spawned +in either protected or non-protected state, although the hypervisor is still +responsible for managing most of the VM metadata in either case. + +Limitations +----------- + +Enabling pKVM places some significant limitations on KVM guests, regardless of +whether they are spawned in protected state. It is therefore recommended only +to enable pKVM if protected VMs are required, with non-protected state acting +primarily as a debug and development aid. + +If you're still keen, then here is an incomplete list of caveats that apply +to all VMs running under pKVM: + +- Guest memory cannot be file-backed (with the exception of shmem/memfd) and is + pinned as it is mapped into the guest. This prevents the host from + swapping-out, migrating, merging or generally doing anything useful with the + guest pages. It also requires that the VMM has either ``CAP_IPC_LOCK`` or + sufficient ``RLIMIT_MEMLOCK`` to account for this pinned memory. + +- GICv2 is not supported and therefore GICv3 hardware is required in order + to expose a virtual GICv3 to the guest. + +- Read-only memslots are unsupported and therefore dirty logging cannot be + enabled. + +- Memslot configuration is fixed once a VM has started running, with subsequent + move or deletion requests being rejected with ``-EPERM``. + +- There are probably many others. + +Since the host is unable to tear down the hypervisor when pKVM is enabled, +hibernation (``CONFIG_HIBERNATION``) and kexec (``CONFIG_KEXEC``) will fail +with ``-EBUSY``. + +If you are not happy with these limitations, then please don't enable pKVM :) + +VM creation +----------- + +When pKVM is enabled, protected VMs can be created by specifying the +``KVM_VM_TYPE_ARM_PROTECTED`` flag in the machine type identifier parameter +passed to ``KVM_CREATE_VM``. + +Protected VMs are instantiated according to a fixed vCPU configuration +described by the ID register definitions in +``arch/arm64/include/asm/kvm_pkvm.h``. Only a subset of the architectural +features that may be available to the host are exposed to the guest and the +capabilities advertised by ``KVM_CHECK_EXTENSION`` are limited accordingly, +with the vCPU registers being initialised to their architecturally-defined +values. + +Where not defined by the architecture, the registers of a protected vCPU +are reset to zero with the exception of the PC and X0 which can be set +either by the ``KVM_SET_ONE_REG`` interface or by a call to PSCI ``CPU_ON``. + +VM runtime +---------- + +By default, memory pages mapped into a protected guest are inaccessible to the +host and any attempt by the host to access such a page will result in the +injection of an abort at EL1 by the hypervisor. 
For accesses originating from +EL0, the host will then terminate the current task with a ``SIGSEGV``. + +pKVM exposes additional hypercalls to protected guests, primarily for the +purpose of establishing shared-memory regions with the host for communication +and I/O. These hypercalls are documented in hypercalls.rst. diff --git a/Documentation/virt/kvm/arm/psci.rst b/Documentation/virt/kvm/arm/psci.rst deleted file mode 100644 index d52c2e83b5b8..000000000000 --- a/Documentation/virt/kvm/arm/psci.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================================= -Power State Coordination Interface (PSCI) -========================================= - -KVM implements the PSCI (Power State Coordination Interface) -specification in order to provide services such as CPU on/off, reset -and power-off to the guest. - -The PSCI specification is regularly updated to provide new features, -and KVM implements these updates if they make sense from a virtualization -point of view. - -This means that a guest booted on two different versions of KVM can -observe two different "firmware" revisions. This could cause issues if -a given guest is tied to a particular PSCI revision (unlikely), or if -a migration causes a different PSCI version to be exposed out of the -blue to an unsuspecting guest. - -In order to remedy this situation, KVM exposes a set of "firmware -pseudo-registers" that can be manipulated using the GET/SET_ONE_REG -interface. These registers can be saved/restored by userspace, and set -to a convenient value if required. - -The following register is defined: - -* KVM_REG_ARM_PSCI_VERSION: - - - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set - (and thus has already been initialized) - - Returns the current PSCI version on GET_ONE_REG (defaulting to the - highest PSCI version implemented by KVM and compatible with v0.2) - - Allows any PSCI version implemented by KVM and compatible with - v0.2 to be set with SET_ONE_REG - - Affects the whole VM (even if the register view is per-vcpu) - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - Holds the state of the firmware support to mitigate CVE-2017-5715, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_1 in [1]. - - Accepted values are: - - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: - KVM does not offer - firmware support for the workaround. The mitigation status for the - guest is unknown. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: - The workaround HVC call is - available to the guest and required for the mitigation. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: - The workaround HVC call - is available to the guest, but it is not needed on this VCPU. - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - Holds the state of the firmware support to mitigate CVE-2018-3639, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_2 in [1]_. - - Accepted values are: - - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: - A workaround is not - available. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: - The workaround state is - unknown. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: - The workaround is available, - and can be disabled by a vCPU. If - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for - this vCPU. 
- KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: - The workaround is always active on this vCPU or it is not needed. - -.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst index aecdc80ddcd8..c8b01b7e53cc 100644 --- a/Documentation/virt/kvm/arm/ptp_kvm.rst +++ b/Documentation/virt/kvm/arm/ptp_kvm.rst @@ -7,19 +7,29 @@ PTP_KVM is used for high precision time sync between host and guests. It relies on transferring the wall clock and counter value from the host to the guest using a KVM-specific hypercall. -* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001 +``ARM_SMCCC_KVM_FUNC_PTP`` +---------------------------------------- -This hypercall uses the SMC32/HVC32 calling convention: +Retrieve current time information for the specific counter. There are no +endianness restrictions. -ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID - ============== ======== ===================================== - Function ID: (uint32) 0x86000001 - Arguments: (uint32) KVM_PTP_VIRT_COUNTER(0) - KVM_PTP_PHYS_COUNTER(1) - Return Values: (int32) NOT_SUPPORTED(-1) on error, or - (uint32) Upper 32 bits of wall clock time (r0) - (uint32) Lower 32 bits of wall clock time (r1) - (uint32) Upper 32 bits of counter (r2) - (uint32) Lower 32 bits of counter (r3) - Endianness: No Restrictions. - ============== ======== ===================================== ++---------------------+-------------------------------------------------------+ +| Presence: | Optional | ++---------------------+-------------------------------------------------------+ +| Calling convention: | HVC32 | ++---------------------+----------+--------------------------------------------+ +| Function ID: | (uint32) | 0x86000001 | ++---------------------+----------+----+---------------------------------------+ +| Arguments: | (uint32) | R1 | ``KVM_PTP_VIRT_COUNTER (0)`` | +| | | +---------------------------------------+ +| | | | ``KVM_PTP_PHYS_COUNTER (1)`` | ++---------------------+----------+----+---------------------------------------+ +| Return Values: | (int32) | R0 | ``NOT_SUPPORTED (-1)`` on error, else | +| | | | upper 32 bits of wall clock time | +| +----------+----+---------------------------------------+ +| | (uint32) | R1 | Lower 32 bits of wall clock time | +| +----------+----+---------------------------------------+ +| | (uint32) | R2 | Upper 32 bits of counter | +| +----------+----+---------------------------------------+ +| | (uint32) | R3 | Lower 32 bits of counter | ++---------------------+----------+----+---------------------------------------+ diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 2acec3b9ef65..7e20a4a73296 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -70,7 +70,7 @@ irqchip. -ENODEV PMUv3 not supported or GIC not initialized -ENXIO PMUv3 not properly configured or in-kernel irqchip not configured as required prior to calling this attribute - -EBUSY PMUv3 already initialized + -EBUSY PMUv3 already initialized or a VCPU has already run -EINVAL Invalid filter range ======= ====================================================== @@ -104,11 +104,43 @@ hardware event. Filtering event 0x1E (CHAIN) has no effect either, as it isn't strictly speaking an event. Filtering the cycle counter is possible using event 0x11 (CPU_CYCLES). 
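For context, the event filter attribute described above is driven from
userspace through the vCPU device-attribute interface. A minimal sketch,
assuming a ``vcpu_fd`` obtained from ``KVM_CREATE_VCPU`` and eliding error
handling; the struct and constants are from the arm64 KVM uapi headers::

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    /*
     * Deny one event; since the action of the first filter installed
     * determines the default treatment of unlisted events, everything
     * else remains allowed.
     */
    static int pmu_deny_event(int vcpu_fd, __u16 event)
    {
            struct kvm_pmu_event_filter filter = {
                    .base_event = event,
                    .nevents    = 1,
                    .action     = KVM_PMU_EVENT_DENY,
            };
            struct kvm_device_attr attr = {
                    .group = KVM_ARM_VCPU_PMU_V3_CTRL,
                    .attr  = KVM_ARM_VCPU_PMU_V3_FILTER,
                    .addr  = (__u64)(unsigned long)&filter,
            };

            return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
    }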
+1.4 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_SET_PMU
+------------------------------------------
+
+:Parameters: in kvm_device_attr.addr the address of an int representing the PMU
+             identifier.
+
+:Returns:
+
+   ======= ====================================================
+   -EBUSY  PMUv3 already initialized, a VCPU has already run or
+           an event filter has already been set
+   -EFAULT Error accessing the PMU identifier
+   -ENXIO  PMU not found
+   -ENODEV PMUv3 not supported or GIC not initialized
+   -ENOMEM Could not allocate memory
+   ======= ====================================================
+
+Request that the VCPU use the specified hardware PMU when creating guest events
+for the purpose of PMU emulation. The PMU identifier can be read from the "type"
+file for the desired PMU instance under /sys/devices (or, equivalently,
+/sys/bus/event_source). This attribute is particularly useful on heterogeneous
+systems with at least two CPU PMUs. The PMU that is set for one VCPU will be
+used by all the other VCPUs. It isn't possible to set a PMU if a PMU event
+filter is already present.
+
+Note that KVM will not make any attempt to run the VCPU on the physical CPUs
+associated with the PMU specified by this attribute. This is entirely left to
+userspace. However, attempting to run the VCPU on a physical CPU not supported
+by the PMU will fail: KVM_RUN will return with
+exit_reason = KVM_EXIT_FAIL_ENTRY and populate the fail_entry struct by setting
+the hardware_entry_failure_reason field to KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED
+and the cpu field to the processor id.
 
 2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
 =================================
 
-:Architectures: ARM, ARM64
+:Architectures: ARM64
 
 2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER
 -----------------------------------------------------------------------------
diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
index b51f0d8992f8..1d1e89af555c 100644
--- a/Documentation/vm/index.rst
+++ b/Documentation/vm/index.rst
@@ -42,6 +42,7 @@ descriptions of data structures and algorithms.
    ksm
    memory-model
    mmu_notifier
+   multigen_lru
    numa
    overcommit-accounting
    page_migration
diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst
new file mode 100644
index 000000000000..d7062c6a8946
--- /dev/null
+++ b/Documentation/vm/multigen_lru.rst
@@ -0,0 +1,159 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Design overview
+===============
+Objectives
+----------
+The design objectives are:
+
+* Good representation of access recency
+* Try to profit from spatial locality
+* Fast paths to make obvious choices
+* Simple self-correcting heuristics
+
+The representation of access recency is at the core of all LRU
+implementations. In the multi-gen LRU, each generation represents a
+group of pages with similar access recency. Generations establish a
+(time-based) common frame of reference and therefore help make better
+choices, e.g., between different memcgs on a computer or different
+computers in a data center (for job scheduling).
+
+Exploiting spatial locality improves efficiency when gathering the
+accessed bit.
A rmap walk targets a single page and does not try to +profit from discovering a young PTE. A page table walk can sweep all +the young PTEs in an address space, but the address space can be too +sparse to make a profit. The key is to optimize both methods and use +them in combination. + +Fast paths reduce code complexity and runtime overhead. Unmapped pages +do not require TLB flushes; clean pages do not require writeback. +These facts are only helpful when other conditions, e.g., access +recency, are similar. With generations as a common frame of reference, +additional factors stand out. But obvious choices might not be good +choices; thus self-correction is necessary. + +The benefits of simple self-correcting heuristics are self-evident. +Again, with generations as a common frame of reference, this becomes +attainable. Specifically, pages in the same generation can be +categorized based on additional factors, and a feedback loop can +statistically compare the refault percentages across those categories +and infer which of them are better choices. + +Assumptions +----------- +The protection of hot pages and the selection of cold pages are based +on page access channels and patterns. There are two access channels: + +* Accesses through page tables +* Accesses through file descriptors + +The protection of the former channel is by design stronger because: + +1. The uncertainty in determining the access patterns of the former + channel is higher due to the approximation of the accessed bit. +2. The cost of evicting the former channel is higher due to the TLB + flushes required and the likelihood of encountering the dirty bit. +3. The penalty of underprotecting the former channel is higher because + applications usually do not prepare themselves for major page + faults like they do for blocked I/O. E.g., GUI applications + commonly use dedicated I/O threads to avoid blocking rendering + threads. + +There are also two access patterns: + +* Accesses exhibiting temporal locality +* Accesses not exhibiting temporal locality + +For the reasons listed above, the former channel is assumed to follow +the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is +present, and the latter channel is assumed to follow the latter +pattern unless outlying refaults have been observed. + +Workflow overview +================= +Evictable pages are divided into multiple generations for each +``lruvec``. The youngest generation number is stored in +``lrugen->max_seq`` for both anon and file types as they are aged on +an equal footing. The oldest generation numbers are stored in +``lrugen->min_seq[]`` separately for anon and file types as clean file +pages can be evicted regardless of swap constraints. These three +variables are monotonically increasing. + +Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` +bits in order to fit into the gen counter in ``folio->flags``. Each +truncated generation number is an index to ``lrugen->lists[]``. The +sliding window technique is used to track at least ``MIN_NR_GENS`` and +at most ``MAX_NR_GENS`` generations. The gen counter stores a value +within ``[1, MAX_NR_GENS]`` while a page is on one of +``lrugen->lists[]``; otherwise it stores zero. + +Each generation is divided into multiple tiers. A page accessed ``N`` +times through file descriptors is in tier ``order_base_2(N)``. Unlike +generations, tiers do not have dedicated ``lrugen->lists[]``. 
In +contrast to moving across generations, which requires the LRU lock, +moving across tiers only involves atomic operations on +``folio->flags`` and therefore has a negligible cost. A feedback loop +modeled after the PID controller monitors refaults over all the tiers +from anon and file types and decides which tiers from which types to +evict or protect. + +There are two conceptually independent procedures: the aging and the +eviction. They form a closed-loop system, i.e., the page reclaim. + +Aging +----- +The aging produces young generations. Given an ``lruvec``, it +increments ``max_seq`` when ``max_seq-min_seq+1`` approaches +``MIN_NR_GENS``. The aging promotes hot pages to the youngest +generation when it finds them accessed through page tables; the +demotion of cold pages happens consequently when it increments +``max_seq``. The aging uses page table walks and rmap walks to find +young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list`` +and calls ``walk_page_range()`` with each ``mm_struct`` on this list +to scan PTEs, and after each iteration, it increments ``max_seq``. For +the latter, when the eviction walks the rmap and finds a young PTE, +the aging scans the adjacent PTEs. For both, on finding a young PTE, +the aging clears the accessed bit and updates the gen counter of the +page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. + +Eviction +-------- +The eviction consumes old generations. Given an ``lruvec``, it +increments ``min_seq`` when ``lrugen->lists[]`` indexed by +``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to +evict from, it first compares ``min_seq[]`` to select the older type. +If both types are equally old, it selects the one whose first tier has +a lower refault percentage. The first tier contains single-use +unmapped clean pages, which are the best bet. The eviction sorts a +page according to its gen counter if the aging has found this page +accessed through page tables and updated its gen counter. It also +moves a page to the next generation, i.e., ``min_seq+1``, if this page +was accessed multiple times through file descriptors and the feedback +loop has detected outlying refaults from the tier this page is in. To +this end, the feedback loop uses the first tier as the baseline, for +the reason stated earlier. + +Summary +------- +The multi-gen LRU can be disassembled into the following parts: + +* Generations +* Rmap walks +* Page table walks +* Bloom filters +* PID controller + +The aging and the eviction form a producer-consumer model; +specifically, the latter drives the former by the sliding window over +generations. Within the aging, rmap walks drive page table walks by +inserting hot densely populated page tables to the Bloom filters. +Within the eviction, the PID controller uses refaults as the feedback +to select types to evict and tiers to protect. diff --git a/Kconfig b/Kconfig index 745bc773f567..57a142d8d8b4 100644 --- a/Kconfig +++ b/Kconfig @@ -30,3 +30,6 @@ source "lib/Kconfig" source "lib/Kconfig.debug" source "Documentation/Kconfig" + +# ANDROID: Set KCONFIG_EXT_PREFIX to decend into an external project. +source "$(KCONFIG_EXT_PREFIX)Kconfig.ext" diff --git a/Kconfig.ext b/Kconfig.ext new file mode 100644 index 000000000000..48d805f0bfc0 --- /dev/null +++ b/Kconfig.ext @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +# This file is intentionally empty. It's used as a placeholder for when +# KCONFIG_EXT_PREFIX isn't defined. 
diff --git a/MAINTAINERS b/MAINTAINERS
index 4b62a7a454c1..6883d056b29f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2424,7 +2424,7 @@ F: drivers/pci/controller/dwc/pcie-qcom.c
 F: drivers/phy/qualcomm/
 F: drivers/power/*/msm*
 F: drivers/reset/reset-qcom-*
-F: drivers/scsi/ufs/ufs-qcom*
+F: drivers/ufs/host/ufs-qcom*
 F: drivers/spi/spi-geni-qcom.c
 F: drivers/spi/spi-qcom-qspi.c
 F: drivers/spi/spi-qup.c
@@ -7176,6 +7176,7 @@ M: Chao Yu
 L: linux-f2fs-devel@lists.sourceforge.net
 S: Maintained
 W: https://f2fs.wiki.kernel.org/
+B: https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git
 F: Documentation/ABI/testing/sysfs-fs-f2fs
 F: Documentation/filesystems/f2fs.rst
@@ -9224,6 +9225,13 @@ F: Documentation/hwmon/ina2xx.rst
 F: drivers/hwmon/ina2xx.c
 F: include/linux/platform_data/ina2xx.h
 
+INCREMENTAL FILE SYSTEM
+M: Paul Lawrence
+L: linux-unionfs@vger.kernel.org
+S: Supported
+F: fs/incfs/
+F: tools/testing/selftests/filesystems/incfs/
+
 INDUSTRY PACK SUBSYSTEM (IPACK)
 M: Samuel Iglesias Gonsalvez
 M: Jens Taprogge
@@ -10136,7 +10144,6 @@ F: arch/*/include/asm/*kasan.h
 F: arch/*/mm/kasan_init*
 F: include/linux/kasan*.h
 F: lib/Kconfig.kasan
-F: lib/test_kasan*.c
 F: mm/kasan/
 F: scripts/Makefile.kasan
@@ -10305,8 +10312,10 @@ M: Marc Zyngier
 R: James Morse
 R: Alexandru Elisei
 R: Suzuki K Poulose
+R: Oliver Upton
 L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-L: kvmarm@lists.cs.columbia.edu (moderated for non-subscribers)
+L: kvmarm@lists.linux.dev
+L: kvmarm@lists.cs.columbia.edu (deprecated, moderated for non-subscribers)
 S: Maintained
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
 F: arch/arm64/include/asm/kvm*
@@ -13322,6 +13331,12 @@ W: http://www.netlab.is.tsukuba.ac.jp/~yokota/izumi/ninja/
 F: Documentation/scsi/NinjaSCSI.rst
 F: drivers/scsi/nsp32*
 
+NINTENDO HID DRIVER
+M: Daniel J. Ogorchock
+L: linux-input@vger.kernel.org
+S: Maintained
+F: drivers/hid/hid-nintendo*
+
 NIOS2 ARCHITECTURE
 M: Dinh Nguyen
 S: Maintained
@@ -16757,6 +16772,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git
 F: Documentation/devicetree/bindings/scsi/
 F: drivers/scsi/
+F: drivers/ufs/
 F: include/scsi/
 
 SCSI TAPE DRIVER
@@ -19312,23 +19328,24 @@ F: include/linux/visorbus.h
 
 UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER
 R: Alim Akhtar
 R: Avri Altman
+R: Bart Van Assche
 L: linux-scsi@vger.kernel.org
 S: Supported
 F: Documentation/scsi/ufs.rst
-F: drivers/scsi/ufs/
+F: drivers/ufs/core/
 
 UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER DWC HOOKS
 M: Pedro Sousa
 L: linux-scsi@vger.kernel.org
 S: Supported
-F: drivers/scsi/ufs/*dwc*
+F: drivers/ufs/host/*dwc*
 
 UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER MEDIATEK HOOKS
 M: Stanley Chu
 L: linux-scsi@vger.kernel.org
 L: linux-mediatek@lists.infradead.org (moderated for non-subscribers)
 S: Maintained
-F: drivers/scsi/ufs/ufs-mediatek*
+F: drivers/ufs/host/ufs-mediatek*
 
 UNSORTED BLOCK IMAGES (UBI)
 M: Richard Weinberger
diff --git a/Makefile b/Makefile
index 13f41e446294..f75dd9ba2b0c 100644
--- a/Makefile
+++ b/Makefile
@@ -136,6 +136,24 @@ endif
 export KBUILD_EXTMOD
 
+# ANDROID: set up mixed-build support. mixed-build allows device kernel modules
+# to be compiled against a GKI kernel. This approach still uses the headers and
+# Kbuild from the device kernel, so care must be taken to ensure that those headers match.
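+#
+# Example invocation (illustrative paths only):
+#   make O=out KBUILD_MIXED_TREE=/path/to/gki/out modules
+# The mixed tree must already contain vmlinux.symvers and System.map from the
+# GKI build; this is checked below.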
+ifdef KBUILD_MIXED_TREE
+# Need vmlinux.symvers for modpost and System.map for depmod; check whether
+# they exist in KBUILD_MIXED_TREE.
+required_mixed_files=vmlinux.symvers System.map
+$(if $(filter-out $(words $(required_mixed_files)), \
+	$(words $(wildcard $(addprefix $(KBUILD_MIXED_TREE)/,$(required_mixed_files))))),,\
+	$(error KBUILD_MIXED_TREE=$(KBUILD_MIXED_TREE) doesn't contain $(required_mixed_files)))
+endif
+
+mixed-build-prefix = $(if $(KBUILD_MIXED_TREE),$(KBUILD_MIXED_TREE)/)
+export KBUILD_MIXED_TREE
+# This is a hack for kleaf to set mixed-build-prefix within the execution of a
+# make rule, e.g. within __modinst_pre.
+# TODO(b/205893923): Revert this hack once it is properly handled.
+export mixed-build-prefix
+
 # Kbuild will save output files in the current working directory.
 # This does not need to match to the root of the kernel source tree.
 #
@@ -432,11 +450,12 @@ HOSTCXX = g++
 endif
 HOSTPKG_CONFIG = pkg-config
 
-export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
-	-O2 -fomit-frame-pointer -std=gnu89
-export KBUILD_USERLDFLAGS :=
+KBUILD_USERHOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
+	-O2 -fomit-frame-pointer -std=gnu89
+KBUILD_USERCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(USERCFLAGS)
+KBUILD_USERLDFLAGS := $(USERLDFLAGS)
 
-KBUILD_HOSTCFLAGS := $(KBUILD_USERCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS)
+KBUILD_HOSTCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS)
 KBUILD_HOSTCXXFLAGS := -Wall -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
 KBUILD_HOSTLDFLAGS := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS)
 KBUILD_HOSTLDLIBS := $(HOST_LFS_LIBS) $(HOSTLDLIBS)
@@ -477,7 +496,7 @@ KGZIP = gzip
 KBZIP2 = bzip2
 KLZOP = lzop
 LZMA = lzma
-LZ4 = lz4c
+LZ4 = lz4
 XZ = xz
 ZSTD = zstd
@@ -531,6 +550,7 @@ export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AW
 export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
 export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ ZSTD
 export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE
+export KBUILD_USERCFLAGS KBUILD_USERLDFLAGS
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS KBUILD_LDFLAGS
 export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE
@@ -672,11 +692,13 @@ drivers-y += virt/
 libs-y := lib/
 endif # KBUILD_EXTMOD
 
+ifndef KBUILD_MIXED_TREE
 # The all: target is the default when no target is given on the
 # command line.
# This allow a user to issue only 'make' to build a kernel including modules # Defaults to vmlinux, but the arch makefile usually adds further targets all: vmlinux +endif CFLAGS_GCOV := -fprofile-arcs -ftest-coverage ifdef CONFIG_CC_IS_GCC @@ -955,7 +977,13 @@ KBUILD_LDFLAGS += --thinlto-cache-dir=$(extmod_prefix).thinlto-cache else CC_FLAGS_LTO := -flto endif + +ifeq ($(SRCARCH),x86) +# Workaround for compiler / linker bug CC_FLAGS_LTO += -fvisibility=hidden +else +CC_FLAGS_LTO += -fvisibility=default +endif # Limit inlining across translation units to reduce binary size KBUILD_LDFLAGS += -mllvm -import-instr-limit=5 @@ -1150,6 +1178,40 @@ export extmod_prefix = $(if $(KBUILD_EXTMOD),$(KBUILD_EXTMOD)/) export MODORDER := $(extmod_prefix)modules.order export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps +# --------------------------------------------------------------------------- +# Kernel headers + +PHONY += headers + +#Default location for installed headers +ifeq ($(KBUILD_EXTMOD),) +PHONY += archheaders archscripts +hdr-inst := -f $(srctree)/scripts/Makefile.headersinst obj +headers: $(version_h) scripts_unifdef uapi-asm-generic archheaders archscripts +else +hdr-prefix = $(KBUILD_EXTMOD)/ +hdr-inst := -f $(srctree)/scripts/Makefile.headersinst dst=$(KBUILD_EXTMOD)/usr/include objtree=$(objtree)/$(KBUILD_EXTMOD) obj +endif + +export INSTALL_HDR_PATH = $(objtree)/$(hdr-prefix)usr + +quiet_cmd_headers_install = INSTALL $(INSTALL_HDR_PATH)/include + cmd_headers_install = \ + mkdir -p $(INSTALL_HDR_PATH); \ + rsync -mrl --include='*/' --include='*\.h' --exclude='*' \ + $(hdr-prefix)usr/include $(INSTALL_HDR_PATH); + +PHONY += headers_install +headers_install: headers + $(call cmd,headers_install) + +headers: +ifeq ($(KBUILD_EXTMOD),) + $(if $(filter um, $(SRCARCH)), $(error Headers not exportable for UML)) +endif + $(Q)$(MAKE) $(hdr-inst)=$(hdr-prefix)include/uapi + $(Q)$(MAKE) $(hdr-inst)=$(hdr-prefix)arch/$(SRCARCH)/include/uapi + ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ core-$(CONFIG_BLOCK) += block/ @@ -1215,8 +1277,10 @@ cmd_link-vmlinux = \ $(CONFIG_SHELL) $< "$(LD)" "$(KBUILD_LDFLAGS)" "$(LDFLAGS_vmlinux)"; \ $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true) +ifndef KBUILD_MIXED_TREE vmlinux: scripts/link-vmlinux.sh autoksyms_recursive $(vmlinux-deps) FORCE +$(call if_changed_dep,link-vmlinux) +endif targets := vmlinux @@ -1225,7 +1289,8 @@ targets := vmlinux $(sort $(vmlinux-deps) $(subdir-modorder)): descend ; filechk_kernel.release = \ - echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))" + echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion \ + $(srctree) $(BRANCH) $(KMI_GENERATION))" # Store (new) KERNELRELEASE string in include/config/kernel.release include/config/kernel.release: FORCE @@ -1315,32 +1380,6 @@ headerdep: $(Q)find $(srctree)/include/ -name '*.h' | xargs --max-args 1 \ $(srctree)/scripts/headerdep.pl -I$(srctree)/include -# --------------------------------------------------------------------------- -# Kernel headers - -#Default location for installed headers -export INSTALL_HDR_PATH = $(objtree)/usr - -quiet_cmd_headers_install = INSTALL $(INSTALL_HDR_PATH)/include - cmd_headers_install = \ - mkdir -p $(INSTALL_HDR_PATH); \ - rsync -mrl --include='*/' --include='*\.h' --exclude='*' \ - usr/include $(INSTALL_HDR_PATH) - -PHONY += headers_install -headers_install: headers - $(call cmd,headers_install) - -PHONY += archheaders archscripts - -hdr-inst := 
-f $(srctree)/scripts/Makefile.headersinst obj - -PHONY += headers -headers: $(version_h) scripts_unifdef uapi-asm-generic archheaders archscripts - $(if $(filter um, $(SRCARCH)), $(error Headers not exportable for UML)) - $(Q)$(MAKE) $(hdr-inst)=include/uapi - $(Q)$(MAKE) $(hdr-inst)=arch/$(SRCARCH)/include/uapi - # Deprecated. It is no-op now. PHONY += headers_check headers_check: @@ -1426,7 +1465,9 @@ kselftest-merge: # Devicetree files ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/boot/dts/),) -dtstree := arch/$(SRCARCH)/boot/dts +# ANDROID: allow this to be overridden by the build environment. This allows +# one to compile a device tree that is located out-of-tree. +dtstree ?= arch/$(SRCARCH)/boot/dts endif ifneq ($(dtstree),) @@ -1492,7 +1533,9 @@ endif # using awk while concatenating to the final file. PHONY += modules -modules: $(if $(KBUILD_BUILTIN),vmlinux) modules_check modules_prepare +# if KBUILD_BUILTIN && !KBUILD_MIXED_TREE, depend on vmlinux +modules: $(if $(KBUILD_BUILTIN), $(if $(KBUILD_MIXED_TREE),,vmlinux)) +modules: modules_check modules_prepare cmd_modules_order = $(AWK) '!x[$$0]++' $(real-prereqs) > $@ @@ -1537,8 +1580,8 @@ __modinst_pre: ln -s $(CURDIR) $(MODLIB)/build ; \ fi @sed 's:^:kernel/:' modules.order > $(MODLIB)/modules.order - @cp -f modules.builtin $(MODLIB)/ - @cp -f $(objtree)/modules.builtin.modinfo $(MODLIB)/ + @cp -f $(mixed-build-prefix)modules.builtin $(MODLIB)/ + @cp -f $(or $(mixed-build-prefix),$(objtree)/)modules.builtin.modinfo $(MODLIB)/ endif # CONFIG_MODULES @@ -1799,6 +1842,8 @@ help: @echo '' @echo ' modules - default target, build the module(s)' @echo ' modules_install - install the module' + @echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH' + @echo ' (default: $(abspath $(INSTALL_HDR_PATH)))' @echo ' clean - remove generated files in module directory only' @echo '' @@ -1823,7 +1868,7 @@ modules_check: $(MODORDER) quiet_cmd_depmod = DEPMOD $(MODLIB) cmd_depmod = $(CONFIG_SHELL) $(srctree)/scripts/depmod.sh $(DEPMOD) \ - $(KERNELRELEASE) + $(KERNELRELEASE) $(mixed-build-prefix) modules_install: $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst @@ -1860,7 +1905,8 @@ ifdef single-build # .ko is special because modpost is needed single-ko := $(sort $(filter %.ko, $(MAKECMDGOALS))) -single-no-ko := $(sort $(patsubst %.ko,%.mod, $(MAKECMDGOALS))) +single-no-ko := $(filter-out $(single-ko), $(MAKECMDGOALS)) \ + $(foreach x, o mod, $(patsubst %.ko, %.$x, $(single-ko))) $(single-ko): single_modpost @: @@ -1902,7 +1948,7 @@ descend: $(build-dirs) $(build-dirs): prepare $(Q)$(MAKE) $(build)=$@ \ single-build=$(if $(filter-out $@/, $(filter $@/%, $(KBUILD_SINGLE_TARGETS))),1) \ - need-builtin=1 need-modorder=1 + $(if $(KBUILD_MIXED_TREE),,need-builtin=1) need-modorder=1 clean-dirs := $(addprefix _clean_, $(clean-dirs)) PHONY += $(clean-dirs) clean @@ -1911,12 +1957,14 @@ $(clean-dirs): clean: $(clean-dirs) $(call cmd,rmfiles) - @find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \ + @find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) 
\ + $(if $(filter-out arch/$(SRCARCH)/boot/dts, $(dtstree)), $(dtstree)) \ + $(RCS_FIND_IGNORE) \ \( -name '*.[aios]' -o -name '*.ko' -o -name '.*.cmd' \ -o -name '*.ko.*' \ -o -name '*.dtb' -o -name '*.dtbo' -o -name '*.dtb.S' -o -name '*.dt.yaml' \ -o -name '*.dwo' -o -name '*.lst' \ - -o -name '*.su' -o -name '*.mod' \ + -o -name '*.su' -o -name '*.mod' -o -name '*.usyms' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.lex.c' -o -name '*.tab.[ch]' \ -o -name '*.asn1.[ch]' \ @@ -2007,7 +2055,8 @@ checkstack: $(PERL) $(srctree)/scripts/checkstack.pl $(CHECKSTACK_ARCH) kernelrelease: - @echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))" + @echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion \ + $(srctree) $(BRANCH) $(KMI_GENERATION))" kernelversion: @echo $(KERNELVERSION) diff --git a/OWNERS b/OWNERS new file mode 100644 index 000000000000..2b5ccceaf513 --- /dev/null +++ b/OWNERS @@ -0,0 +1,2 @@ +# include OWNERS from the authoritative android-mainline branch +include kernel/common:android-mainline:/OWNERS diff --git a/android/OWNERS b/android/OWNERS new file mode 100644 index 000000000000..3a9d87457d81 --- /dev/null +++ b/android/OWNERS @@ -0,0 +1,8 @@ +# If we ever add another OWNERS above this directory, it's likely to be +# more permissive, so don't inherit from it +set noparent +include kernel/common:android-mainline:/OWNERS_DrNo + +# Downstream boards maintained directly in this manifest branch +per-file abi_gki_aarch64_cuttlefish = adelva@google.com, rammuthiah@google.com +per-file abi_gki_aarch64_goldfish = rkir@google.com diff --git a/android/abi_gki_aarch64 b/android/abi_gki_aarch64 new file mode 100644 index 000000000000..57914f291962 --- /dev/null +++ b/android/abi_gki_aarch64 @@ -0,0 +1,4 @@ +[abi_symbol_list] +# commonly used symbols + module_layout + __put_task_struct diff --git a/android/abi_gki_aarch64_db845c b/android/abi_gki_aarch64_db845c new file mode 100644 index 000000000000..f351f70b632e --- /dev/null +++ b/android/abi_gki_aarch64_db845c @@ -0,0 +1,1877 @@ +[abi_symbol_list] +# commonly used symbols + add_uevent_var + alloc_io_pgtable_ops + alloc_workqueue + __arch_copy_from_user + __arch_copy_to_user + arm64_const_caps_ready + arm64_use_ng_mappings + bcmp + blocking_notifier_call_chain + blocking_notifier_chain_register + blocking_notifier_chain_unregister + bpf_trace_run1 + bpf_trace_run2 + bpf_trace_run3 + bpf_trace_run4 + bpf_trace_run5 + bus_register + bus_unregister + cancel_delayed_work + cancel_delayed_work_sync + cancel_work_sync + capable + cdev_device_add + cdev_device_del + cdev_init + cfg80211_find_elem_match + cfg80211_find_vendor_elem + cfg80211_get_bss + cfg80211_put_bss + __cfi_slowpath_diag + __check_object_size + clk_bulk_disable + clk_bulk_enable + clk_bulk_prepare + clk_bulk_unprepare + clk_disable + clk_enable + clk_fixed_rate_ops + clk_get + __clk_get_name + clk_get_rate + clk_hw_get_name + clk_hw_get_parent + clk_hw_get_rate + clk_hw_register + clk_prepare + clk_put + clk_round_rate + clk_set_rate + clk_sync_state + clk_unprepare + complete + complete_all + completion_done + __const_udelay + consume_skb + _copy_from_iter + cpu_hwcap_keys + cpu_hwcaps + cpumask_next + cpu_number + __cpu_online_mask + debugfs_create_dir + debugfs_create_file + debugfs_create_u32 + debugfs_create_x32 + debugfs_remove + default_llseek + delayed_work_timer_fn + del_timer_sync + destroy_workqueue + dev_coredumpv + dev_driver_string + _dev_err + dev_err_probe + dev_get_regmap + 
device_find_child + device_for_each_child + device_get_match_data + device_get_named_child_node + device_initialize + device_init_wakeup + device_property_present + device_property_read_u32_array + device_register + device_set_wakeup_capable + device_unregister + device_wakeup_enable + _dev_info + __dev_kfree_skb_any + devm_add_action + devm_clk_bulk_get + devm_clk_bulk_get_all + devm_clk_get + devm_clk_get_optional + devm_clk_hw_register + devm_free_irq + devm_gpiochip_add_data_with_key + devm_gpiod_get + devm_gpiod_get_optional + devm_ioremap + devm_ioremap_resource + devm_ioremap_wc + devm_iounmap + devm_kasprintf + devm_kfree + devm_kmalloc + devm_mbox_controller_register + devm_memremap + devm_of_clk_add_hw_provider + devm_of_icc_get + __devm_of_phy_provider_register + devm_of_platform_populate + devm_phy_create + devm_phy_get + devm_pinctrl_register + devm_platform_get_and_ioremap_resource + devm_platform_ioremap_resource + devm_pm_opp_of_add_table + devm_pm_opp_set_clkname + devm_regmap_add_irq_chip + devm_regmap_field_alloc + __devm_regmap_init + __devm_regmap_init_i2c + __devm_regmap_init_mmio_clk + devm_regulator_bulk_get + devm_regulator_get + devm_regulator_get_optional + devm_regulator_register + devm_request_threaded_irq + __devm_reset_control_get + devm_reset_controller_register + devm_snd_soc_register_card + devm_snd_soc_register_component + __devm_spi_alloc_controller + devm_spi_register_controller + devm_thermal_zone_of_sensor_register + devm_watchdog_register_device + _dev_notice + dev_pm_domain_attach_by_name + dev_pm_domain_detach + dev_pm_genpd_set_performance_state + dev_pm_opp_add + dev_pm_opp_find_freq_ceil + dev_pm_opp_find_freq_exact + dev_pm_opp_find_freq_floor + dev_pm_opp_get_level + dev_pm_opp_get_opp_count + dev_pm_opp_of_find_icc_paths + dev_pm_opp_put + dev_pm_opp_set_opp + dev_pm_opp_set_rate + devres_add + __devres_alloc_node + devres_free + devres_release + dev_set_name + _dev_warn + disable_irq + disable_irq_nosync + divider_get_val + divider_recalc_rate + divider_round_rate_parent + dma_alloc_attrs + dma_buf_export + dma_fence_context_alloc + dma_fence_init + dma_fence_release + dma_fence_signal + dma_fence_wait_timeout + dma_free_attrs + dmam_alloc_attrs + dma_map_page_attrs + dma_map_sg_attrs + dma_map_sgtable + dma_mmap_attrs + dma_release_channel + dma_request_chan + dma_set_coherent_mask + dma_set_mask + dma_sync_single_for_cpu + dma_sync_single_for_device + dma_unmap_page_attrs + dma_unmap_sg_attrs + driver_register + driver_unregister + drm_add_edid_modes + drm_atomic_helper_connector_destroy_state + drm_atomic_helper_connector_duplicate_state + drm_atomic_helper_connector_reset + drm_bridge_add + drm_bridge_hpd_notify + drm_bridge_remove + drm_connector_attach_encoder + drm_connector_cleanup + drm_connector_init + drm_connector_update_edid_property + drm_do_get_edid + __drm_err + drm_get_edid + drm_hdmi_avi_infoframe_from_display_mode + drm_helper_probe_single_connector_modes + drm_kms_helper_hotplug_event + drm_mode_vrefresh + enable_irq + event_triggers_call + _find_first_bit + _find_first_zero_bit + _find_next_bit + finish_wait + firmware_request_nowarn + flush_work + flush_workqueue + free_io_pgtable_ops + free_irq + generic_handle_domain_irq + generic_handle_irq + geni_icc_disable + geni_icc_enable + geni_icc_get + geni_icc_set_bw + geni_se_config_packing + geni_se_init + geni_se_resources_off + geni_se_resources_on + geni_se_select_mode + get_device + get_random_bytes + gic_nonsecure_priorities + gpiochip_add_data_with_key + 
gpiochip_add_pin_range + gpiochip_generic_free + gpiochip_generic_request + gpiochip_get_data + gpiochip_remove + gpiod_direction_output + gpiod_direction_output_raw + gpiod_get_value_cansleep + gpiod_set_consumer_name + gpiod_set_raw_value + gpiod_set_value + gpiod_set_value_cansleep + gpiod_to_irq + gpio_to_desc + handle_edge_irq + handle_level_irq + handle_nested_irq + handle_simple_irq + hdmi_audio_infoframe_init + i2c_adapter_type + i2c_add_adapter + i2c_add_numbered_adapter + i2c_del_adapter + i2c_del_driver + i2c_parse_fw_timings + i2c_put_adapter + i2c_register_driver + __i2c_smbus_xfer + i2c_smbus_xfer + i2c_transfer + icc_link_create + icc_node_add + icc_node_create + icc_nodes_remove + icc_provider_add + icc_provider_del + icc_set_bw + icc_sync_state + ida_alloc_range + ida_free + idr_alloc + idr_alloc_cyclic + idr_destroy + idr_find + idr_for_each + idr_get_next + idr_remove + ieee80211_alloc_hw_nm + ieee80211_beacon_get_template + ieee80211_beacon_loss + ieee80211_bss_get_elem + ieee80211_channel_to_freq_khz + ieee80211_connection_loss + ieee80211_csa_finish + ieee80211_find_sta + ieee80211_find_sta_by_ifaddr + ieee80211_free_hw + ieee80211_free_txskb + ieee80211_get_channel_khz + ieee80211_hdrlen + ieee80211_iterate_active_interfaces_atomic + ieee80211_iterate_stations_atomic + ieee80211_queue_delayed_work + ieee80211_queue_work + ieee80211_radar_detected + ieee80211_register_hw + ieee80211_remain_on_channel_expired + ieee80211_report_low_ack + ieee80211_restart_hw + ieee80211_rx_napi + ieee80211_scan_completed + ieee80211_stop_queues + ieee80211_tx_status + ieee80211_tx_status_irqsafe + ieee80211_unregister_hw + ieee80211_wake_queues + iio_read_channel_processed + init_dummy_netdev + init_net + __init_swait_queue_head + init_timer_key + init_wait_entry + __init_waitqueue_head + iomem_resource + iommu_attach_device + iommu_detach_device + iommu_domain_alloc + iommu_domain_free + iommu_present + iommu_unmap + __ioread32_copy + __ioremap + iounmap + iov_iter_revert + __iowrite32_copy + irq_chip_disable_parent + irq_chip_enable_parent + irq_chip_eoi_parent + irq_chip_mask_parent + irq_chip_set_affinity_parent + irq_chip_set_parent_state + irq_chip_set_type_parent + irq_chip_set_vcpu_affinity_parent + irq_chip_set_wake_parent + irq_chip_unmask_parent + __irq_domain_add + irq_domain_free_irqs_common + irq_domain_remove + irq_domain_xlate_onecell + irq_domain_xlate_twocell + irq_find_matching_fwspec + irq_get_irq_data + irq_modify_status + irq_of_parse_and_map + __irq_resolve_mapping + irq_set_chained_handler_and_data + irq_set_chip_and_handler_name + irq_set_chip_data + irq_set_irq_wake + is_vmalloc_addr + jiffies + jiffies_to_msecs + jiffies_to_usecs + kasan_flag_enabled + kasprintf + kernel_connect + kernel_getsockname + kernel_recvmsg + kernel_sendmsg + kfree + kfree_const + kfree_skb_reason + __kmalloc + kmalloc_caches + kmalloc_order_trace + kmem_cache_alloc_trace + kmemdup + kstrdup + kstrdup_const + kstrtoint + kstrtouint + kthread_create_on_node + kthread_should_stop + kthread_stop + ktime_get + ktime_get_mono_fast_ns + ktime_get_real_ts64 + ktime_get_with_offset + __list_add_valid + __list_del_entry_valid + __local_bh_enable_ip + mbox_client_txdone + mbox_free_channel + mbox_request_channel + mbox_send_message + memcpy + __memcpy_fromio + __memcpy_toio + memmove + memremap + memset + __memset_io + memstart_addr + memunmap + mipi_dsi_attach + mipi_dsi_detach + mipi_dsi_device_register_full + mipi_dsi_device_unregister + misc_deregister + misc_register + mod_delayed_work_on 
+ mod_timer + module_layout + __msecs_to_jiffies + msleep + __mutex_init + mutex_is_locked + mutex_lock + mutex_unlock + napi_complete_done + napi_disable + napi_enable + __napi_schedule + napi_schedule_prep + __netdev_alloc_skb + netif_napi_add + __netif_napi_del + no_llseek + nr_cpu_ids + nvmem_cell_get + nvmem_cell_put + nvmem_cell_read + of_address_to_resource + of_alias_get_id + of_clk_add_hw_provider + of_clk_del_provider + of_clk_hw_onecell_get + of_clk_hw_simple_get + of_clk_set_defaults + of_device_get_match_data + of_device_is_compatible + of_device_uevent_modalias + of_dma_configure_id + of_find_device_by_node + of_find_mipi_dsi_host_by_node + of_find_property + of_fwnode_ops + of_genpd_add_provider_onecell + of_genpd_del_provider + of_get_child_by_name + of_get_named_gpio_flags + of_get_next_available_child + of_get_next_child + of_get_property + of_get_regulator_init_data + of_graph_get_remote_node + of_graph_parse_endpoint + of_icc_xlate_onecell + of_iomap + of_irq_get + of_irq_get_byname + of_match_device + of_match_node + of_node_name_eq + of_parse_phandle + of_parse_phandle_with_args + of_parse_phandle_with_fixed_args + of_phy_simple_xlate + of_platform_depopulate + of_platform_populate + of_property_count_elems_of_size + of_property_read_string + of_property_read_string_helper + of_property_read_u32_index + of_property_read_variable_u32_array + of_property_read_variable_u8_array + of_reserved_mem_lookup + param_ops_bool + param_ops_uint + pci_clear_master + pci_disable_device + pcie_capability_read_word + pcie_capability_write_word + pci_enable_device + pci_iomap + pci_iounmap + pci_read_config_dword + __pci_register_driver + pci_release_region + pci_request_region + pci_set_master + pci_unregister_driver + perf_trace_buf_alloc + perf_trace_run_bpf_submit + phy_exit + phy_init + phy_power_off + phy_power_on + pinconf_generic_dt_node_to_map + pinctrl_dev_get_drvdata + pinctrl_pm_select_default_state + pinctrl_pm_select_sleep_state + pinctrl_utils_free_map + platform_bus_type + platform_device_register_full + platform_device_unregister + __platform_driver_register + platform_driver_unregister + platform_get_irq + platform_get_irq_byname + platform_get_resource + platform_get_resource_byname + pm_genpd_add_subdomain + pm_genpd_init + __pm_runtime_disable + pm_runtime_enable + pm_runtime_forbid + pm_runtime_force_resume + pm_runtime_force_suspend + __pm_runtime_idle + __pm_runtime_resume + pm_runtime_set_autosuspend_delay + __pm_runtime_set_status + __pm_runtime_suspend + __pm_runtime_use_autosuspend + preempt_schedule + preempt_schedule_notrace + prepare_to_wait_event + _printk + pskb_expand_head + __pskb_pull_tail + put_device + __put_task_struct + qcom_smem_state_register + qcom_smem_state_unregister + queue_delayed_work_on + queue_work_on + radix_tree_insert + radix_tree_lookup + radix_tree_next_chunk + ___ratelimit + rational_best_approximation + _raw_spin_lock + _raw_spin_lock_bh + _raw_spin_lock_irq + _raw_spin_lock_irqsave + _raw_spin_unlock + _raw_spin_unlock_bh + _raw_spin_unlock_irq + _raw_spin_unlock_irqrestore + _raw_write_lock_bh + _raw_write_unlock_bh + __rcu_read_lock + __rcu_read_unlock + rdev_get_drvdata + refcount_warn_saturate + regcache_cache_only + regcache_mark_dirty + regcache_sync + register_reboot_notifier + __register_rpmsg_driver + regmap_bulk_read + regmap_bulk_write + regmap_field_read + regmap_field_update_bits_base + __regmap_init + regmap_irq_get_virq + regmap_multi_reg_write + regmap_read + regmap_register_patch + regmap_update_bits_base + 
regmap_write + regulator_bulk_disable + regulator_bulk_enable + regulator_bulk_get + regulator_disable + regulator_disable_regmap + regulator_enable + regulator_enable_regmap + regulator_is_enabled_regmap + regulator_set_load + regulator_set_voltage + release_firmware + __release_region + remap_pfn_range + request_firmware + request_firmware_direct + request_firmware_into_buf + __request_region + request_threaded_irq + reset_control_assert + reset_control_deassert + reset_control_put + reset_control_reset + rpmsg_register_device + rpmsg_send + rpmsg_unregister_device + rproc_add + rproc_add_subdev + rproc_alloc + rproc_coredump_add_custom_segment + rproc_coredump_set_elf_info + rproc_del + rproc_free + rproc_remove_subdev + sched_set_fifo_low + schedule + schedule_timeout + scnprintf + seq_lseek + seq_printf + seq_puts + seq_read + sg_alloc_table + sg_free_table + sg_init_table + sg_next + __sg_page_iter_start + simple_read_from_buffer + single_open + single_release + skb_dequeue + skb_pull + skb_push + skb_put + skb_queue_purge + skb_queue_tail + skb_trim + sk_free + snd_pcm_format_width + snd_soc_add_component_controls + snd_soc_component_init_regmap + snd_soc_component_read + snd_soc_component_read_field + snd_soc_component_update_bits + snd_soc_component_write + snd_soc_component_write_field + snd_soc_dai_set_fmt + snd_soc_dai_set_sysclk + snd_soc_dapm_add_routes + snd_soc_dapm_get_enum_double + snd_soc_dapm_get_volsw + snd_soc_dapm_kcontrol_dapm + snd_soc_dapm_kcontrol_widget + snd_soc_dapm_mixer_update_power + snd_soc_dapm_mux_update_power + snd_soc_dapm_put_enum_double + snd_soc_dapm_put_volsw + snd_soc_get_enum_double + snd_soc_get_volsw + snd_soc_info_enum_double + snd_soc_info_volsw + snd_soc_jack_report + snd_soc_put_enum_double + snd_soc_put_volsw + snprintf + sock_create_kern + sock_release + sort + __spi_alloc_controller + spi_controller_resume + spi_controller_suspend + spi_finalize_current_transfer + spi_register_controller + spi_unregister_controller + sprintf + sscanf + __stack_chk_fail + strcmp + strcpy + strlcpy + strlen + strncmp + strncpy + strnlen + strpbrk + strscpy + strscpy_pad + strsep + __sw_hweight16 + __sw_hweight32 + __sw_hweight64 + __sw_hweight8 + synchronize_irq + synchronize_net + synchronize_rcu + syscon_node_to_regmap + syscon_regmap_lookup_by_phandle + sysfs_create_link + sysfs_emit + sysfs_remove_link + system_wq + tasklet_init + tasklet_kill + __tasklet_schedule + tasklet_setup + thermal_cooling_device_register + thermal_cooling_device_unregister + thermal_zone_device_update + trace_event_buffer_commit + trace_event_buffer_reserve + trace_event_ignore_this_pid + trace_event_printf + trace_event_raw_init + trace_event_reg + trace_handle_return + trace_raw_output_prep + __ubsan_handle_cfi_check_fail_abort + __udelay + unregister_chrdev_region + unregister_reboot_notifier + unregister_rpmsg_driver + usb_disabled + usleep_range_state + vabits_actual + vfree + vmalloc + vmap + vunmap + vzalloc + wait_for_completion_interruptible + wait_for_completion_timeout + __wake_up + wake_up_process + __warn_printk + watchdog_init_timeout + wiphy_to_ieee80211_hw + xa_erase + xa_find + xa_find_after + +# required by apr.ko + rpmsg_trysend + +# required by arm_smmu.ko + amba_bustype + bus_set_iommu + device_link_add + device_match_fwnode + devm_krealloc + driver_find_device + generic_device_group + generic_iommu_put_resv_regions + iommu_alloc_resv_region + iommu_device_register + iommu_device_sysfs_add + iommu_device_sysfs_remove + iommu_device_unregister + 
iommu_dma_get_resv_regions + iommu_fwspec_add_ids + iommu_fwspec_free + iommu_group_ref_get + of_dma_is_coherent + param_ops_int + pci_bus_type + pci_device_group + report_iommu_fault + +# required by ath.ko + freq_reg_info + reg_initiator_name + wiphy_apply_custom_regulatory + +# required by ath10k_core.ko + cfg80211_calculate_bitrate + cpu_latency_qos_add_request + cpu_latency_qos_remove_request + crc32_le + device_get_mac_address + device_set_wakeup_enable + guid_gen + ieee80211_beacon_cntdwn_is_complete + ieee80211_beacon_get_tim + ieee80211_beacon_update_cntdwn + ieee80211_iter_chan_contexts_atomic + ieee80211_manage_rx_ba_offl + ieee80211_next_txq + ieee80211_proberesp_get + ieee80211_ready_on_channel + ieee80211_return_txq + ieee80211_sta_register_airtime + ieee80211_stop_queue + ieee80211_tdls_oper_request + ieee80211_tx_dequeue + ieee80211_txq_get_depth + ieee80211_txq_may_transmit + ieee80211_txq_schedule_start + ieee80211_tx_rate_update + ieee80211_wake_queue + init_uts_ns + __kfifo_alloc + __kfifo_free + param_ops_ulong + regulatory_hint + rfc1042_header + skb_copy + skb_dequeue_tail + skb_queue_head + skb_realloc_headroom + strlcat + wiphy_read_of_freq_limits + wiphy_rfkill_set_hw_state_reason + +# required by ath10k_pci.ko + pci_disable_msi + pci_enable_msi + pci_write_config_dword + +# required by ath10k_snoc.ko + __bitmap_clear + devm_clk_bulk_get_optional + iommu_map + +# required by ath11k.ko + crypto_alloc_shash + crypto_destroy_tfm + crypto_shash_final + crypto_shash_setkey + crypto_shash_update + ieee80211_freq_khz_to_channel + ieee80211_get_fils_discovery_tmpl + ieee80211_get_hdrlen_from_skb + ieee80211_get_unsol_bcast_probe_resp_tmpl + memcpy_and_pad + regulatory_set_wiphy_regd + +# required by ath11k_ahb.ko + rproc_boot + rproc_get_by_phandle + rproc_shutdown + +# required by ath11k_pci.ko + pci_alloc_irq_vectors_affinity + pci_assign_resource + pci_free_irq_vectors + pci_irq_vector + pci_read_config_word + +# required by bam_dma.ko + dma_async_device_register + dma_async_device_unregister + dma_async_tx_descriptor_init + dma_get_slave_channel + of_dma_controller_free + of_dma_controller_register + pm_runtime_irq_safe + vchan_dma_desc_free_list + vchan_find_desc + vchan_init + vchan_tx_desc_free + vchan_tx_submit + +# required by clk-qcom.ko + __clk_determine_rate + clk_fixed_factor_ops + clk_hw_get_flags + clk_hw_get_num_parents + clk_hw_get_parent_by_index + clk_hw_is_enabled + clk_hw_round_rate + __clk_is_enabled + __clk_mux_determine_rate_closest + divider_ro_round_rate_parent + of_find_node_opts_by_path + of_prop_next_u32 + pm_genpd_remove_subdomain + +# required by clk-rpmh.ko + clk_hw_is_prepared + +# required by clk-spmi-pmic-div.ko + __ndelay + of_clk_get_parent_name + +# required by cmd-db.ko + seq_putc + +# required by cqhci.ko + devm_blk_crypto_profile_init + dmam_free_coherent + mmc_cqe_request_done + +# required by display-connector.ko + drm_atomic_get_new_bridge_state + drm_atomic_helper_bridge_destroy_state + drm_atomic_helper_bridge_duplicate_state + drm_atomic_helper_bridge_reset + drm_probe_ddc + of_get_i2c_adapter_by_node + +# required by extcon-usb-gpio.ko + devm_extcon_dev_allocate + devm_extcon_dev_register + extcon_set_state_sync + gpiod_set_debounce + system_power_efficient_wq + +# required by fastrpc.ko + dma_buf_attach + dma_buf_detach + dma_buf_fd + dma_buf_get + dma_buf_map_attachment + dma_buf_put + dma_buf_unmap_attachment + dma_get_sgtable_attrs + down_read + __find_vma + __mmap_lock_do_trace_acquire_returned + 
__mmap_lock_do_trace_released + __mmap_lock_do_trace_start_locking + __traceiter_mmap_lock_acquire_returned + __traceiter_mmap_lock_released + __traceiter_mmap_lock_start_locking + __tracepoint_mmap_lock_acquire_returned + __tracepoint_mmap_lock_released + __tracepoint_mmap_lock_start_locking + up_read + +# required by gpio-regulator.ko + devm_gpiod_get_index + devm_kmemdup + devm_kstrdup + gpiod_count + gpiod_get_optional + +# required by gpu-sched.ko + call_rcu + dma_fence_add_callback + dma_fence_remove_callback + kmem_cache_alloc + kmem_cache_create + kmem_cache_destroy + kmem_cache_free + kthread_park + kthread_parkme + kthread_should_park + kthread_unpark + rcu_barrier + wait_for_completion + +# required by i2c-designware-core.ko + i2c_generic_scl_recovery + i2c_recover_bus + +# required by i2c-designware-platform.ko + pm_suspend_global_flags + +# required by i2c-dev.ko + bus_register_notifier + bus_unregister_notifier + __class_create + class_destroy + i2c_bus_type + i2c_for_each_dev + i2c_get_adapter + i2c_transfer_buffer_flags + i2c_verify_client + memdup_user + register_chrdev_region + +# required by i2c-mux-pca954x.ko + device_create_file + device_remove_file + i2c_get_device_id + i2c_smbus_read_byte + i2c_smbus_write_byte + irq_create_mapping_affinity + irq_dispose_mapping + irq_domain_simple_ops + +# required by i2c-mux.ko + __i2c_transfer + rt_mutex_lock + rt_mutex_trylock + rt_mutex_unlock + +# required by i2c-qcom-geni.ko + geni_se_rx_dma_prep + geni_se_rx_dma_unprep + geni_se_tx_dma_prep + geni_se_tx_dma_unprep + i2c_get_dma_safe_msg_buf + i2c_put_dma_safe_msg_buf + +# required by i2c-qup.ko + __usecs_to_jiffies + +# required by i2c-rk3x.ko + clk_notifier_register + clk_notifier_unregister + +# required by icc-bcm-voter.ko + list_sort + of_property_match_string + +# required by icc-osm-l3.ko + icc_std_aggregate + +# required by led-class-multicolor.ko + led_classdev_register_ext + led_classdev_unregister + led_colors + led_set_brightness + stpcpy + +# required by llcc-qcom.ko + devm_platform_ioremap_resource_byname + +# required by lmh.ko + of_cpu_node_to_id + +# required by lontium-lt9611uxc.ko + print_hex_dump + regmap_noinc_read + regmap_noinc_write + +# required by lpass-gfm-sm8250.ko + __clk_mux_determine_rate + devm_pm_clk_create + devm_pm_runtime_enable + of_pm_clk_add_clks + pm_clk_resume + pm_clk_suspend + +# required by mcp251xfd.ko + alloc_candev_mqs + alloc_can_err_skb + alloc_canfd_skb + alloc_can_skb + can_bus_off + can_change_mtu + can_change_state + can_fd_dlc2len + can_fd_len2dlc + can_put_echo_skb + can_rx_offload_add_manual + can_rx_offload_del + can_rx_offload_enable + can_rx_offload_get_echo_skb + can_rx_offload_queue_sorted + can_rx_offload_threaded_irq_finish + can_skb_get_frame_len + close_candev + devm_gpiod_put + dql_completed + dql_reset + free_candev + netdev_err + netdev_info + netdev_notice + netdev_warn + netif_schedule_queue + netif_tx_wake_queue + open_candev + register_candev + regmap_get_val_bytes + regmap_raw_write + spi_async + spi_get_device_id + __spi_register_driver + spi_setup + spi_sync + spi_write_then_read + timecounter_cyc2time + timecounter_init + timecounter_read + unregister_candev + __vmalloc + +# required by mhi.ko + device_add + device_del + pm_wakeup_dev_event + prandom_u32 + _raw_read_lock_bh + _raw_read_lock_irq + _raw_read_lock_irqsave + _raw_read_unlock_bh + _raw_read_unlock_irq + _raw_read_unlock_irqrestore + _raw_write_lock_irq + _raw_write_lock_irqsave + _raw_write_unlock_irq + _raw_write_unlock_irqrestore + +# 
required by michael_mic.ko + crypto_register_shash + crypto_unregister_shash + +# required by msm.ko + __bitmap_andnot + __bitmap_weight + bpf_trace_run6 + bpf_trace_run8 + clk_get_parent + clk_set_parent + component_add + component_bind_all + component_del + component_master_add_with_match + component_master_del + component_unbind_all + _ctype + debugfs_create_bool + debugfs_create_u64 + del_timer + dev_coredumpm + devfreq_cooling_unregister + devfreq_recommended_opp + devfreq_resume_device + devfreq_suspend_device + __devm_clk_hw_register_divider + devm_clk_hw_register_fixed_factor + __devm_clk_hw_register_mux + devm_clk_register + devm_devfreq_add_device + devm_pm_opp_set_supported_hw + devm_regulator_get_exclusive + dev_pm_opp_get_freq + dev_pm_opp_get_voltage + dma_resv_add_excl_fence + dma_resv_add_shared_fence + dma_resv_reserve_shared + dma_resv_wait_timeout + drm_atomic_get_private_obj_state + drm_atomic_helper_check + drm_atomic_helper_check_plane_state + drm_atomic_helper_cleanup_planes + drm_atomic_helper_commit + drm_atomic_helper_commit_hw_done + drm_atomic_helper_commit_modeset_disables + drm_atomic_helper_commit_modeset_enables + drm_atomic_helper_commit_planes + __drm_atomic_helper_crtc_destroy_state + drm_atomic_helper_crtc_destroy_state + __drm_atomic_helper_crtc_duplicate_state + drm_atomic_helper_crtc_duplicate_state + __drm_atomic_helper_crtc_reset + drm_atomic_helper_crtc_reset + drm_atomic_helper_dirtyfb + drm_atomic_helper_disable_plane + drm_atomic_helper_duplicate_state + drm_atomic_helper_page_flip + __drm_atomic_helper_plane_destroy_state + drm_atomic_helper_plane_destroy_state + __drm_atomic_helper_plane_duplicate_state + drm_atomic_helper_plane_duplicate_state + __drm_atomic_helper_plane_reset + drm_atomic_helper_plane_reset + __drm_atomic_helper_private_obj_duplicate_state + drm_atomic_helper_set_config + drm_atomic_helper_shutdown + drm_atomic_helper_update_plane + drm_atomic_print_new_state + drm_atomic_private_obj_fini + drm_atomic_private_obj_init + __drm_atomic_state_free + drm_bridge_attach + drm_bridge_connector_enable_hpd + drm_bridge_connector_init + drm_bridge_detect + drm_compat_ioctl + drm_connector_has_possible_encoder + drm_connector_list_iter_begin + drm_connector_list_iter_end + drm_connector_list_iter_next + drm_crtc_cleanup + drm_crtc_enable_color_mgmt + drm_crtc_handle_vblank + drm_crtc_init_with_planes + drm_crtc_send_vblank_event + drm_crtc_set_max_vblank_count + drm_crtc_vblank_get + drm_crtc_vblank_helper_get_vblank_timestamp + drm_crtc_vblank_off + drm_crtc_vblank_on + drm_crtc_vblank_put + __drm_dbg + __drm_debug + drm_debugfs_create_files + drm_detect_hdmi_monitor + drm_detect_monitor_audio + drm_dev_alloc + drm_dev_dbg + drm_dev_printk + drm_dev_put + drm_dev_register + drm_dev_unregister + drm_dp_aux_register + drm_dp_aux_unregister + drm_dp_bw_code_to_link_rate + drm_dp_channel_eq_ok + drm_dp_clock_recovery_ok + drm_dp_dpcd_read + drm_dp_dpcd_read_link_status + drm_dp_dpcd_write + drm_dp_get_adjust_request_pre_emphasis + drm_dp_get_adjust_request_voltage + drm_dp_link_rate_to_bw_code + drm_dp_link_train_channel_eq_delay + drm_dp_link_train_clock_recovery_delay + drm_dp_read_sink_count + drm_edid_block_valid + drm_encoder_cleanup + drm_encoder_init + drm_flip_work_cleanup + drm_flip_work_commit + drm_flip_work_init + drm_flip_work_queue + drm_format_info + drm_framebuffer_init + drm_gem_create_mmap_offset + drm_gem_fb_create_handle + drm_gem_fb_destroy + drm_gem_fb_get_obj + drm_gem_fence_array_add + 
drm_gem_fence_array_add_implicit + drm_gem_free_mmap_offset + drm_gem_get_pages + drm_gem_handle_create + drm_gem_mmap + drm_gem_object_free + drm_gem_object_init + drm_gem_object_lookup + drm_gem_object_release + drm_gem_plane_helper_prepare_fb + drm_gem_prime_fd_to_handle + drm_gem_prime_handle_to_fd + drm_gem_prime_mmap + drm_gem_private_object_init + drm_gem_put_pages + drm_gem_vm_close + drm_gem_vm_open + drm_get_format_info + drm_handle_vblank + drm_helper_hpd_irq_event + drm_helper_mode_fill_fb_struct + drm_ioctl + drm_kms_helper_poll_fini + drm_kms_helper_poll_init + drm_mm_init + drm_mm_insert_node_in_range + drmm_mode_config_init + drm_mm_print + drm_mm_remove_node + drm_mm_takedown + drm_mode_config_cleanup + drm_mode_config_helper_resume + drm_mode_config_helper_suspend + drm_mode_config_reset + drm_mode_copy + drm_mode_debug_printmodeline + drm_mode_destroy + drm_mode_duplicate + drm_mode_object_find + drm_mode_object_put + drm_mode_probed_add + drm_modeset_acquire_fini + drm_modeset_acquire_init + drm_modeset_backoff + drm_modeset_drop_locks + drm_modeset_lock + drm_modeset_lock_all + drm_modeset_lock_all_ctx + drm_modeset_lock_init + drm_modeset_unlock + drm_modeset_unlock_all + drm_object_attach_property + drm_of_component_match_add + drm_open + drm_panel_disable + drm_panel_enable + drm_panel_get_modes + drm_panel_prepare + drm_panel_unprepare + drm_plane_cleanup + drm_plane_create_alpha_property + drm_plane_create_blend_mode_property + drm_plane_create_rotation_property + drm_plane_create_zpos_property + drm_plane_enable_fb_damage_clips + drm_poll + drm_prime_gem_destroy + drm_prime_pages_to_sg + drm_prime_sg_to_page_array + drm_printf + __drm_printfn_coredump + __drm_printfn_info + __drm_printfn_seq_file + drm_puts + __drm_puts_coredump + __drm_puts_seq_file + drm_read + drm_rect_calc_hscale + drm_rect_calc_vscale + drm_rect_intersect + drm_release + drm_rotation_simplify + drm_state_dump + drm_syncobj_add_point + drm_syncobj_find + drm_syncobj_find_fence + drm_syncobj_free + drm_syncobj_replace_fence + drm_universal_plane_init + drm_vblank_init + fd_install + generic_file_llseek + get_pid_task + get_unused_fd_flags + gpiod_direction_input + gpiod_get_value + hdmi_audio_infoframe_pack + hdmi_infoframe_pack + hrtimer_init + hrtimer_start_range_ns + invalidate_mapping_pages + iommu_map_sg + iommu_set_fault_handler + iommu_set_pgtable_quirks + kstrdup_quotable_cmdline + kstrtouint_from_user + kthread_create_worker + kthread_destroy_worker + kthread_queue_work + kvfree + kvmalloc_node + memdup_user_nul + memparse + mipi_dsi_create_packet + mipi_dsi_host_register + mipi_dsi_host_unregister + mipi_dsi_packet_format_is_long + mutex_lock_interruptible + mutex_trylock + noop_llseek + nr_swap_pages + nsecs_to_jiffies + nvmem_cell_read_variable_le_u32 + of_devfreq_cooling_register + of_device_is_available + of_drm_find_bridge + of_drm_find_panel + of_find_matching_node_and_match + of_get_compatible_child + of_graph_get_endpoint_by_regs + of_graph_get_next_endpoint + of_graph_get_remote_port_parent + of_icc_get + param_ops_charp + phy_calibrate + phy_configure + pid_task + pm_runtime_get_if_active + put_pid + put_unused_fd + _raw_read_lock + _raw_read_unlock + _raw_write_lock + _raw_write_unlock + register_shrinker + register_vmap_purge_notifier + reservation_ww_class + round_jiffies_up + sched_set_fifo + schedule_timeout_interruptible + __sg_page_iter_dma_next + shmem_truncate_range + simple_attr_open + simple_attr_read + simple_attr_release + simple_attr_write + simple_open + 
strstr + sync_file_create + sync_file_get_fence + unmap_mapping_range + unregister_shrinker + unregister_vmap_purge_notifier + vmf_insert_mixed + vm_get_page_prot + vscnprintf + vsnprintf + ww_mutex_lock + ww_mutex_lock_interruptible + ww_mutex_unlock + xa_destroy + +# required by msm_serial.ko + do_SAK + handle_sysrq + sysrq_mask + tty_flip_buffer_push + __tty_insert_flip_char + tty_termios_baud_rate + tty_termios_encode_baud_rate + uart_add_one_port + uart_get_baud_rate + uart_register_driver + uart_remove_one_port + uart_resume_port + uart_suspend_port + uart_try_toggle_sysrq + uart_unregister_driver + uart_update_timeout + uart_write_wakeup + +# required by ns.ko + kernel_bind + radix_tree_delete + radix_tree_iter_resume + +# required by nvmem_qfprom.ko + devm_nvmem_register + +# required by ohci-hcd.ko + dma_pool_alloc + dma_pool_create + dma_pool_destroy + dma_pool_free + free_pages + gen_pool_dma_alloc_align + gen_pool_dma_zalloc_align + gen_pool_free_owner + get_zeroed_page + sb800_prefetch + schedule_timeout_uninterruptible + usb_amd_dev_put + usb_amd_quirk_pll_disable + usb_amd_quirk_pll_enable + usb_calc_bus_time + usb_debug_root + usb_hcd_check_unlink_urb + usb_hcd_giveback_urb + usb_hc_died + usb_hcd_link_urb_to_ep + usb_hcd_poll_rh_status + usb_hcd_resume_root_hub + usb_hcds_loaded + usb_hcd_unlink_urb_from_ep + usb_root_hub_lost_power + +# required by ohci-pci.ko + pci_dev_put + pci_get_slot + pci_match_id + usb_amd_prefetch_quirk + usb_amd_quirk_pll_check + usb_hcd_pci_pm_ops + usb_hcd_pci_probe + usb_hcd_pci_remove + usb_hcd_pci_shutdown + +# required by ohci-platform.ko + devm_reset_control_array_get + of_clk_get + usb_add_hcd + usb_create_hcd + usb_hcd_platform_shutdown + usb_put_hcd + usb_remove_hcd + +# required by phy-qcom-qmp.ko + devm_get_clk_from_child + __of_reset_control_get + +# required by phy-qcom-qusb2.ko + devm_nvmem_cell_get + +# required by phy-qcom-usb-hs.ko + extcon_get_edev_by_phandle + extcon_get_state + extcon_register_notifier + extcon_unregister_notifier + +# required by pinctrl-msm.ko + device_property_read_u16_array + gpiochip_line_is_valid + gpiochip_lock_as_irq + gpiochip_unlock_as_irq + handle_bad_irq + handle_fasteoi_ack_irq + handle_fasteoi_irq + module_put + pinctrl_force_default + pinctrl_force_sleep + pm_power_off + register_restart_handler + try_module_get + unregister_restart_handler + +# required by pinctrl-spmi-gpio.ko + irq_chip_ack_parent + of_irq_find_parent + +# required by pinctrl-spmi-mpp.ko + platform_irq_count + +# required by pm8941-pwrkey.ko + devm_input_allocate_device + input_event + input_register_device + input_set_capability + +# required by q6asm-dai.ko + snd_dma_alloc_pages + snd_dma_free_pages + snd_pcm_hw_constraint_integer + snd_pcm_hw_constraint_list + snd_pcm_hw_constraint_minmax + snd_pcm_hw_constraint_step + snd_pcm_period_elapsed + snd_pcm_set_managed_buffer_all + snd_soc_new_compress + snd_soc_set_runtime_hwparams + +# required by qcom-cpufreq-hw.ko + cpufreq_cpu_get_raw + cpufreq_enable_boost_support + cpufreq_freq_attr_scaling_available_freqs + cpufreq_freq_attr_scaling_boost_freqs + cpufreq_generic_frequency_table_verify + cpufreq_get_driver_data + cpufreq_register_driver + cpufreq_unregister_driver + __cpu_possible_mask + cpu_scale + dev_pm_opp_adjust_voltage + dev_pm_opp_disable + dev_pm_opp_enable + dev_pm_opp_of_add_table + dev_pm_opp_of_cpumask_remove_table + dev_pm_opp_of_register_em + dev_pm_opp_remove_all_dynamic + dev_pm_opp_set_sharing_cpus + get_cpu_device + of_get_cpu_node + __per_cpu_offset + 
policy_has_boost_freq + system_highpri_wq + topology_set_thermal_pressure + +# required by qcom-pdc.ko + irq_chip_get_parent_state + irq_chip_retrigger_hierarchy + irq_domain_alloc_irqs_parent + irq_domain_create_hierarchy + irq_domain_disconnect_hierarchy + irq_domain_set_hwirq_and_chip + irq_domain_update_bus_token + platform_irqchip_probe + +# required by qcom-pmic-typec.ko + dev_fwnode + fwnode_handle_put + fwnode_property_read_string + fwnode_usb_role_switch_get + typec_find_port_data_role + typec_find_port_power_role + typec_register_port + typec_set_orientation + typec_unregister_port + usb_role_switch_put + usb_role_switch_set_role + +# required by qcom-rpmh-regulator.ko + regulator_list_voltage_linear_range + +# required by qcom-scm.ko + __arm_smccc_smc + __cpu_present_mask + kimage_voffset + +# required by qcom-spmi-adc-tm5.ko + devm_of_iio_channel_get_by_name + +# required by qcom-spmi-adc5.ko + devm_iio_device_alloc + __devm_iio_device_register + +# required by qcom-spmi-pmic.ko + __devm_regmap_init_spmi_ext + __spmi_driver_register + +# required by qcom-spmi-temp-alarm.ko + devm_iio_channel_get + of_thermal_get_ntrips + of_thermal_get_trip_points + of_thermal_is_trip_valid + +# required by qcom-wdt.ko + platform_get_irq_optional + +# required by qcom_aoss.ko + clk_hw_unregister + devm_thermal_of_cooling_device_register + pm_genpd_remove + +# required by qcom_common.ko + rproc_coredump_add_segment + rproc_coredump_using_sections + srcu_init_notifier_head + srcu_notifier_call_chain + srcu_notifier_chain_register + srcu_notifier_chain_unregister + +# required by qcom_glink.ko + device_add_groups + +# required by qcom_hwspinlock.ko + devm_hwspin_lock_register + +# required by qcom_pil_info.ko + of_find_compatible_node + +# required by qcom_q6v5.ko + devm_qcom_smem_state_get + qcom_smem_state_update_bits + rproc_report_crash + +# required by qcom_q6v5_wcss.ko + rproc_elf_get_boot_addr + +# required by qcom_rpmh.ko + bitmap_find_next_zero_area_off + __bitmap_set + cpu_pm_register_notifier + __num_online_cpus + _raw_spin_trylock + +# required by qcom_spmi-regulator.ko + smp_call_function_single + +# required by qcom_sysmon.ko + rproc_get_by_child + try_wait_for_completion + +# required by qcom_tsens.ko + debugfs_lookup + +# required by qcom_usb_vbus-regulator.ko + regulator_get_current_limit_regmap + regulator_set_current_limit_regmap + +# required by qrtr-tun.ko + _copy_to_iter + +# required by qrtr.ko + __alloc_skb + autoremove_wake_function + datagram_poll + do_wait_intr_irq + get_user_ifreq + lock_sock_nested + proto_register + proto_unregister + put_user_ifreq + radix_tree_iter_delete + refcount_dec_and_mutex_lock + release_sock + sk_alloc + skb_clone + skb_copy_bits + skb_copy_datagram_iter + skb_free_datagram + __skb_pad + skb_recv_datagram + skb_set_owner_w + sk_error_report + sock_alloc_send_skb + sock_gettstamp + sock_init_data + sock_no_accept + sock_no_listen + sock_no_mmap + sock_no_sendpage + sock_no_shutdown + sock_no_socketpair + sock_queue_rcv_skb + sock_register + sock_unregister + __xa_alloc + __xa_insert + xa_load + +# required by rmtfs_mem.ko + alloc_chrdev_region + __class_register + class_unregister + +# required by rpmsg_ns.ko + rpmsg_create_channel + rpmsg_create_ept + rpmsg_release_channel + +# required by rtc-pm8xxx.ko + devm_request_any_context_irq + devm_rtc_allocate_device + __devm_rtc_register_device + rtc_time64_to_tm + rtc_tm_to_time64 + rtc_update_irq + +# required by sdhci-msm.ko + mmc_of_parse + mmc_regulator_get_supply + mmc_regulator_set_ocr + 
mmc_regulator_set_vqmmc + mmc_send_tuning + regulator_is_supported_voltage + __reset_control_get + __sdhci_add_host + sdhci_add_host + sdhci_cleanup_host + sdhci_cqe_disable + sdhci_cqe_enable + sdhci_cqe_irq + sdhci_enable_clk + sdhci_get_property + sdhci_pltfm_free + sdhci_pltfm_init + sdhci_remove_host + sdhci_reset + sdhci_set_bus_width + sdhci_set_power_noreg + __sdhci_set_timeout + sdhci_setup_host + +# required by slim-qcom-ngd-ctrl.ko + platform_device_add + platform_device_alloc + +# required by smem.ko + hwspin_lock_free + hwspin_lock_request_specific + __hwspin_lock_timeout + __hwspin_unlock + of_hwspin_lock_get_id + +# required by snd-soc-hdmi-codec.ko + snd_ctl_add + snd_ctl_new1 + snd_pcm_add_chmap_ctls + snd_pcm_create_iec958_consumer_default + snd_pcm_fill_iec958_consumer + snd_pcm_fill_iec958_consumer_hw_params + snd_pcm_hw_constraint_eld + +# required by snd-soc-lpass-va-macro.ko + dapm_regulator_event + regcache_sync_region + +# required by snd-soc-qcom-common.ko + snd_soc_dai_link_set_capabilities + snd_soc_of_get_dai_link_codecs + snd_soc_of_get_dai_name + snd_soc_of_parse_audio_routing + snd_soc_of_parse_aux_devs + snd_soc_of_parse_card_name + +# required by snd-soc-rl6231.ko + gcd + +# required by snd-soc-rt5663.ko + regcache_cache_bypass + snd_soc_dapm_disable_pin + snd_soc_dapm_force_enable_pin + snd_soc_dapm_new_controls + snd_soc_dapm_sync + +# required by snd-soc-sdm845.ko + snd_jack_set_key + snd_soc_card_jack_new + snd_soc_component_set_jack + snd_soc_dai_get_channel_map + snd_soc_dai_set_channel_map + snd_soc_dai_set_tdm_slot + +# required by snd-soc-wcd9335.ko + kmemdup_nul + strnstr + +# required by snd-soc-wcd934x.ko + kstrndup + +# required by socinfo.ko + add_device_randomness + soc_device_register + soc_device_unregister + +# required by soundwire-bus.ko + devm_device_add_group + devm_device_add_groups + dev_pm_domain_attach + fwnode_property_present + fwnode_property_read_u32_array + pm_generic_runtime_resume + pm_generic_runtime_suspend + +# required by spi-geni-qcom.ko + geni_se_clk_freq_match + geni_se_get_qup_hw_version + +# required by spi-pl022.ko + amba_driver_register + amba_driver_unregister + amba_release_regions + amba_request_regions + __dma_request_channel + loops_per_jiffy + pinctrl_pm_select_idle_state + spi_delay_exec + spi_finalize_current_message + spi_get_next_queued_message + tasklet_unlock_wait + +# required by spi-qcom-qspi.ko + icc_disable + icc_enable + +# required by spmi-pmic-arb.ko + irq_domain_set_info + spmi_controller_add + spmi_controller_alloc + spmi_controller_remove + +# required by system_heap.ko + dmabuf_page_pool_alloc + dmabuf_page_pool_create + dmabuf_page_pool_destroy + dmabuf_page_pool_free + dma_heap_add + dma_heap_get_dev + dma_heap_get_name + dma_sync_sg_for_cpu + dma_sync_sg_for_device + __free_pages + __sg_page_iter_next + +# required by ufs_qcom.ko + phy_set_mode_ext + ufshcd_dme_configure_adapt + ufshcd_dme_get_attr + ufshcd_dme_set_attr + ufshcd_dump_regs + ufshcd_get_local_unipro_ver + ufshcd_get_pwr_dev_param + ufshcd_init_pwr_dev_param + ufshcd_pltfrm_init + ufshcd_pltfrm_shutdown + ufshcd_remove + ufshcd_resume_complete + ufshcd_runtime_resume + ufshcd_runtime_suspend + ufshcd_suspend_prepare + ufshcd_system_resume + ufshcd_system_suspend + ufshcd_uic_hibern8_enter + ufshcd_uic_hibern8_exit + +# required by ulpi.ko + of_device_modalias + of_device_request_module + __request_module + +# required by wcd934x.ko + mfd_add_devices + mfd_remove_devices + +# preserved by --additions-only + __nla_parse + 
nla_put + rtnl_lock + rtnl_unlock + snd_soc_get_volsw_sx + snd_soc_info_volsw_sx + snd_soc_put_volsw_sx + spmi_ext_register_read + spmi_ext_register_readl + spmi_ext_register_write + spmi_ext_register_writel + spmi_register_read + spmi_register_write + spmi_register_zero_write diff --git a/android/abi_gki_aarch64_exynos b/android/abi_gki_aarch64_exynos new file mode 100644 index 000000000000..8ae395d9d5d6 --- /dev/null +++ b/android/abi_gki_aarch64_exynos @@ -0,0 +1,1980 @@ +[abi_symbol_list] + activate_task + add_cpu + add_timer + add_timer_on + adjust_managed_page_count + alarm_cancel + alarm_init + alarm_start_relative + alloc_anon_inode + alloc_chrdev_region + alloc_netdev_mqs + __alloc_pages + __alloc_percpu + __alloc_skb + alloc_workqueue + amba_driver_register + amba_driver_unregister + android_debug_symbol + android_rvh_probe_register + anon_inode_getfile + __arch_copy_from_user + __arch_copy_to_user + arch_freq_scale + arm64_const_caps_ready + arm64_use_ng_mappings + __arm_smccc_hvc + __arm_smccc_smc + atomic_notifier_call_chain + atomic_notifier_chain_register + atomic_notifier_chain_unregister + autoremove_wake_function + available_idle_cpu + backlight_device_register + backlight_device_unregister + balance_push_callback + bcmp + bio_endio + bio_end_io_acct_remapped + bio_start_io_acct + __bitmap_clear + __bitmap_complement + bitmap_parse + bitmap_parselist + bitmap_print_to_pagebuf + __bitmap_set + bitmap_to_arr32 + __bitmap_weight + __blk_alloc_disk + blk_cleanup_disk + blk_queue_flag_clear + blk_queue_flag_set + blk_queue_io_min + blk_queue_io_opt + blk_queue_logical_block_size + blk_queue_max_discard_sectors + blk_queue_max_write_zeroes_sectors + blk_queue_physical_block_size + blocking_notifier_call_chain + blocking_notifier_chain_register + blocking_notifier_chain_unregister + bpf_trace_run1 + bpf_trace_run10 + bpf_trace_run11 + bpf_trace_run12 + bpf_trace_run2 + bpf_trace_run3 + bpf_trace_run4 + bpf_trace_run5 + bpf_trace_run6 + bpf_trace_run7 + bpf_trace_run8 + bpf_trace_run9 + bus_find_device + bus_for_each_dev + bus_register + bus_register_notifier + bus_set_iommu + bus_unregister + bus_unregister_notifier + call_rcu + cancel_delayed_work + cancel_delayed_work_sync + cancel_work_sync + capable + cdev_add + cdev_alloc + cdev_del + cdev_device_add + cdev_device_del + cdev_init + __cfi_slowpath_diag + cgroup_taskset_first + cgroup_taskset_next + __check_object_size + check_preempt_curr + __class_create + class_create_file_ns + class_destroy + class_find_device + __class_register + class_unregister + __ClearPageMovable + clk_bulk_disable + clk_bulk_enable + clk_bulk_prepare + clk_bulk_unprepare + clk_disable + clk_enable + clk_get + __clk_get_hw + clk_get_rate + clk_hw_get_name + clk_hw_get_parent + __clk_is_enabled + clk_prepare + clk_put + clk_register + clk_register_clkdev + clk_register_fixed_factor + clk_register_fixed_rate + clk_register_gate + clk_set_parent + clk_set_rate + clk_unprepare + clockevents_config_and_register + clocks_calc_mult_shift + __clocksource_register_scale + cma_alloc + cma_release + complete + complete_all + complete_and_exit + completion_done + component_add + component_bind_all + component_del + component_master_add_with_match + component_master_del + component_match_add_release + component_unbind_all + config_ep_by_speed + config_group_init_type_name + console_lock + console_printk + console_stop + console_suspend_enabled + console_trylock + console_unlock + __const_udelay + contig_page_data + __cpu_active_mask + cpu_all_bits + 
cpu_bit_bitmap + cpufreq_add_update_util_hook + cpufreq_cpu_get + cpufreq_cpu_put + cpufreq_disable_fast_switch + cpufreq_driver_fast_switch + __cpufreq_driver_target + cpufreq_enable_fast_switch + cpufreq_frequency_table_get_index + cpufreq_generic_attr + cpufreq_get_policy + cpufreq_quick_get + cpufreq_quick_get_max + cpufreq_register_driver + cpufreq_register_governor + cpufreq_register_notifier + cpufreq_remove_update_util_hook + cpufreq_this_cpu_can_update + cpufreq_unregister_notifier + cpu_hotplug_disable + cpu_hotplug_enable + __cpuhp_remove_state + __cpuhp_setup_state + __cpuhp_setup_state_cpuslocked + __cpuhp_state_add_instance + __cpuhp_state_remove_instance + cpuhp_tasks_frozen + cpu_hwcap_keys + cpu_hwcaps + cpuidle_get_cpu_driver + cpuidle_governor_latency_req + cpuidle_pause_and_lock + cpuidle_register_governor + cpuidle_resume_and_unlock + cpumask_next + cpumask_next_and + cpu_number + __cpu_online_mask + cpu_pm_register_notifier + __cpu_possible_mask + cpu_scale + cpus_read_lock + cpus_read_unlock + cpu_subsys + cpu_topology + crypto_alloc_base + crypto_alloc_shash + crypto_comp_compress + crypto_comp_decompress + crypto_destroy_tfm + crypto_has_alg + crypto_shash_final + crypto_shash_update + _ctype + dapm_pinctrl_event + dapm_regulator_event + deactivate_task + debugfs_attr_read + debugfs_attr_write + debugfs_create_blob + debugfs_create_bool + debugfs_create_dir + debugfs_create_file + debugfs_create_file_size + debugfs_create_symlink + debugfs_create_u32 + debugfs_create_x32 + debugfs_remove + dec_zone_page_state + default_llseek + deferred_free + delayed_work_timer_fn + del_gendisk + del_timer + del_timer_sync + desc_to_gpio + destroy_workqueue + _dev_alert + _dev_crit + dev_driver_string + _dev_emerg + _dev_err + dev_err_probe + devfreq_add_device + devfreq_add_governor + devfreq_get_devfreq_by_phandle + devfreq_monitor_resume + devfreq_monitor_start + devfreq_monitor_stop + devfreq_monitor_suspend + devfreq_recommended_opp + devfreq_register_opp_notifier + devfreq_remove_device + devfreq_remove_governor + devfreq_resume_device + devfreq_suspend_device + devfreq_unregister_opp_notifier + devfreq_update_interval + dev_get_by_name + dev_get_regmap + device_add_disk + device_create + device_create_bin_file + device_create_file + device_create_managed_software_node + device_destroy + device_for_each_child + device_get_child_node_count + device_get_next_child_node + device_initialize + device_init_wakeup + device_link_add + device_link_del + device_property_present + device_property_read_u32_array + device_register + device_remove_file + device_show_bool + device_show_int + device_store_bool + device_store_int + device_unregister + _dev_info + __dev_kfree_skb_any + devm_add_action + devm_backlight_device_register + devm_backlight_device_unregister + devm_clk_bulk_get_all + devm_clk_get + devm_clk_put + devm_devfreq_register_notifier + devm_devfreq_unregister_notifier + __devm_drm_dev_alloc + devm_drm_panel_bridge_add_typed + devm_free_irq + devm_gen_pool_create + devm_gpiochip_add_data_with_key + devm_gpiod_get + devm_gpiod_get_index + devm_gpiod_get_optional + devm_gpio_request_one + devm_hwrng_register + devm_i2c_new_dummy_device + devm_iio_device_alloc + __devm_iio_device_register + devm_input_allocate_device + devm_ioremap + devm_ioremap_resource + devm_iounmap + __devm_irq_alloc_descs + devm_kasprintf + devm_kfree + devm_kmalloc + devm_kmemdup + devm_krealloc + devm_kstrdup + devm_led_classdev_register_ext + devm_mfd_add_devices + __devm_of_phy_provider_register + 
devm_phy_create + devm_phy_get + devm_pinctrl_get + devm_pinctrl_register + devm_platform_ioremap_resource + __devm_regmap_init_i2c + __devm_regmap_init_mmio_clk + devm_regulator_bulk_get + devm_regulator_get + devm_regulator_register + __devm_request_region + devm_request_threaded_irq + __devm_reset_control_get + devm_rtc_device_register + devm_snd_dmaengine_pcm_register + devm_snd_soc_register_card + devm_snd_soc_register_component + devm_thermal_zone_of_sensor_register + _dev_notice + dev_pm_opp_add + dev_pm_opp_disable + dev_pm_opp_find_freq_ceil + dev_pm_opp_find_freq_exact + dev_pm_opp_find_freq_floor + dev_pm_opp_get_freq + dev_pm_opp_get_opp_count + dev_pm_opp_get_voltage + dev_pm_opp_of_register_em + dev_pm_opp_put + dev_pm_qos_add_notifier + dev_pm_qos_add_request + dev_pm_qos_read_value + dev_pm_qos_remove_request + dev_pm_qos_update_request + devres_add + __devres_alloc_node + devres_free + devres_release + dev_set_name + _dev_warn + disable_irq + disable_irq_nosync + disk_end_io_acct + disk_start_io_acct + dma_alloc_attrs + dma_async_device_register + dma_async_device_unregister + dma_async_tx_descriptor_init + dma_buf_attach + dma_buf_begin_cpu_access + dma_buf_begin_cpu_access_partial + dma_buf_detach + dma_buf_dynamic_attach + dma_buf_end_cpu_access + dma_buf_end_cpu_access_partial + dma_buf_export + dma_buf_fd + dma_buf_get + dma_buf_get_flags + dma_buf_map_attachment + dma_buf_mmap + dma_buf_move_notify + dmabuf_page_pool_alloc + dmabuf_page_pool_create + dmabuf_page_pool_destroy + dmabuf_page_pool_free + dma_buf_pin + dma_buf_put + dma_buf_unmap_attachment + dma_buf_unpin + dma_buf_vmap + dma_buf_vunmap + dmaengine_unmap_put + dma_fence_add_callback + dma_fence_array_create + dma_fence_chain_init + dma_fence_chain_ops + dma_fence_chain_walk + dma_fence_context_alloc + dma_fence_default_wait + dma_fence_enable_sw_signaling + dma_fence_get_status + dma_fence_get_stub + dma_fence_init + dma_fence_release + dma_fence_remove_callback + dma_fence_signal + dma_fence_wait_any_timeout + dma_fence_wait_timeout + dma_free_attrs + dma_get_required_mask + dma_get_slave_caps + dma_get_slave_channel + dma_heap_add + dma_heap_buffer_alloc + dma_heap_buffer_free + dma_heap_find + dma_heap_get_dev + dma_heap_get_drvdata + dma_heap_get_name + dma_heap_put + dmam_alloc_attrs + dma_map_page_attrs + dma_map_resource + dma_map_sgtable + dma_mmap_attrs + dma_release_channel + dma_request_chan + dma_resv_add_excl_fence + dma_resv_add_shared_fence + dma_resv_get_fences + dma_resv_reserve_shared + dma_resv_test_signaled + dma_resv_wait_timeout + dma_set_coherent_mask + dma_set_mask + dma_sync_sg_for_cpu + dma_sync_sg_for_device + dma_sync_single_for_cpu + dma_sync_single_for_device + dma_unmap_page_attrs + dma_unmap_resource + dma_unmap_sg_attrs + __do_once_done + __do_once_start + do_SAK + double_rq_lock + do_wait_intr + down + down_read + down_read_killable + down_read_trylock + down_write + down_write_trylock + d_path + driver_register + driver_unregister + drm_add_edid_modes + drm_any_plane_has_format + drm_aperture_remove_conflicting_pci_framebuffers + drm_atomic_add_affected_connectors + drm_atomic_add_affected_planes + drm_atomic_bridge_chain_disable + drm_atomic_commit + drm_atomic_get_connector_state + drm_atomic_get_crtc_state + drm_atomic_get_plane_state + drm_atomic_get_private_obj_state + drm_atomic_helper_check + drm_atomic_helper_check_modeset + drm_atomic_helper_check_planes + drm_atomic_helper_check_plane_state + drm_atomic_helper_cleanup_planes + drm_atomic_helper_commit + 
drm_atomic_helper_commit_cleanup_done + drm_atomic_helper_commit_duplicated_state + drm_atomic_helper_commit_hw_done + drm_atomic_helper_commit_modeset_disables + drm_atomic_helper_commit_modeset_enables + drm_atomic_helper_commit_planes + drm_atomic_helper_commit_tail + __drm_atomic_helper_connector_destroy_state + drm_atomic_helper_connector_destroy_state + __drm_atomic_helper_connector_duplicate_state + drm_atomic_helper_connector_duplicate_state + drm_atomic_helper_connector_reset + __drm_atomic_helper_crtc_destroy_state + drm_atomic_helper_crtc_destroy_state + __drm_atomic_helper_crtc_duplicate_state + drm_atomic_helper_crtc_duplicate_state + drm_atomic_helper_crtc_reset + drm_atomic_helper_disable_plane + drm_atomic_helper_fake_vblank + drm_atomic_helper_page_flip + __drm_atomic_helper_plane_destroy_state + drm_atomic_helper_plane_destroy_state + __drm_atomic_helper_plane_duplicate_state + drm_atomic_helper_plane_duplicate_state + drm_atomic_helper_plane_reset + drm_atomic_helper_prepare_planes + __drm_atomic_helper_private_obj_duplicate_state + drm_atomic_helper_set_config + drm_atomic_helper_setup_commit + drm_atomic_helper_swap_state + drm_atomic_helper_update_plane + drm_atomic_helper_wait_for_dependencies + drm_atomic_normalize_zpos + drm_atomic_private_obj_fini + drm_atomic_private_obj_init + drm_atomic_set_crtc_for_plane + drm_atomic_set_fb_for_plane + drm_atomic_state_alloc + drm_atomic_state_clear + __drm_atomic_state_free + drm_bridge_add + drm_bridge_attach + drm_bridge_chain_mode_set + drm_bridge_remove + drm_calc_timestamping_constants + drm_compat_ioctl + drm_connector_attach_dp_subconnector_property + drm_connector_attach_encoder + drm_connector_cleanup + drm_connector_init + drm_connector_init_with_ddc + drm_connector_list_iter_begin + drm_connector_list_iter_end + drm_connector_list_iter_next + drm_connector_register + drm_connector_unregister + drm_connector_update_edid_property + drm_crtc_add_crc_entry + drm_crtc_arm_vblank_event + drm_crtc_cleanup + drm_crtc_enable_color_mgmt + drm_crtc_handle_vblank + drm_crtc_helper_set_config + drm_crtc_helper_set_mode + drm_crtc_init + drm_crtc_init_with_planes + drm_crtc_send_vblank_event + drm_crtc_vblank_count + drm_crtc_vblank_get + drm_crtc_vblank_helper_get_vblank_timestamp + drm_crtc_vblank_off + drm_crtc_vblank_on + drm_crtc_vblank_put + drm_crtc_wait_one_vblank + drm_cvt_mode + __drm_dbg + drm_detect_hdmi_monitor + drm_dev_alloc + drm_dev_dbg + drm_dev_enter + drm_dev_exit + drm_dev_printk + drm_dev_put + drm_dev_register + drm_dev_unplug + drm_dev_unregister + drm_display_mode_to_videomode + drm_dp_aux_init + drm_dp_aux_register + drm_dp_aux_unregister + drm_dp_bw_code_to_link_rate + drm_dp_channel_eq_ok + drm_dp_clock_recovery_ok + drm_dp_dpcd_read + drm_dp_dpcd_read_link_status + drm_dp_dpcd_write + drm_dp_get_adjust_request_pre_emphasis + drm_dp_get_adjust_request_voltage + drm_dp_link_rate_to_bw_code + drm_dp_link_train_channel_eq_delay + drm_dp_link_train_clock_recovery_delay + drm_dp_set_subconnector_property + drm_edid_header_is_valid + drm_edid_is_valid + drm_edid_to_sad + drm_edid_to_speaker_allocation + drm_encoder_cleanup + drm_encoder_init + __drm_err + drm_format_info + drm_framebuffer_cleanup + drm_framebuffer_init + drm_framebuffer_unregister_private + drm_gem_create_mmap_offset + drm_gem_dmabuf_mmap + drm_gem_dmabuf_release + drm_gem_dmabuf_vmap + drm_gem_dmabuf_vunmap + drm_gem_fb_create_handle + drm_gem_fb_destroy + drm_gem_handle_create + drm_gem_mmap + drm_gem_object_free + drm_gem_object_lookup + 
drm_gem_object_release + drm_gem_prime_export + drm_gem_prime_fd_to_handle + drm_gem_prime_handle_to_fd + drm_gem_prime_import_dev + drm_gem_prime_mmap + drm_gem_private_object_init + drm_gem_vm_close + drm_gem_vm_open + drm_get_edid + drm_get_format_info + drm_handle_vblank + drm_hdmi_avi_infoframe_from_display_mode + drm_hdmi_infoframe_set_hdr_metadata + drm_helper_connector_dpms + drm_helper_disable_unused_functions + drm_helper_force_disable_all + drm_helper_hpd_irq_event + drm_helper_mode_fill_fb_struct + drm_helper_probe_single_connector_modes + drm_helper_resume_force_mode + drm_ioctl + drm_is_current_master + drm_kms_helper_hotplug_event + drm_kms_helper_is_poll_worker + drm_kms_helper_poll_disable + drm_kms_helper_poll_enable + drm_kms_helper_poll_fini + drm_kms_helper_poll_init + drm_mm_init + drm_mm_insert_node_in_range + drmm_mode_config_init + drm_mm_print + drm_mm_remove_node + drm_mm_reserve_node + drm_mm_takedown + drm_mode_config_cleanup + drm_mode_config_helper_resume + drm_mode_config_helper_suspend + drm_mode_config_reset + drm_mode_convert_to_umode + drm_mode_create_scaling_mode_property + drm_mode_crtc_set_gamma_size + drm_mode_debug_printmodeline + drm_mode_duplicate + drm_mode_equal + drm_mode_match + drm_mode_probed_add + drm_modeset_acquire_fini + drm_modeset_acquire_init + drm_modeset_backoff + drm_mode_set_crtcinfo + drm_modeset_drop_locks + drm_modeset_lock + drm_modeset_lock_all + drm_modeset_lock_all_ctx + drm_mode_set_name + drm_modeset_unlock + drm_modeset_unlock_all + drm_mode_vrefresh + drm_object_attach_property + drm_open + drm_panel_add + drm_panel_disable + drm_panel_enable + drm_panel_get_modes + drm_panel_init + drm_panel_prepare + drm_panel_remove + drm_panel_unprepare + drm_plane_cleanup + drm_plane_create_alpha_property + drm_plane_create_blend_mode_property + drm_plane_create_rotation_property + drm_plane_create_zpos_property + drm_poll + drm_prime_gem_destroy + drm_prime_pages_to_sg + drm_prime_sg_to_dma_addr_array + drm_print_bits + drm_printf + __drm_printfn_info + __drm_printfn_seq_file + drm_property_blob_get + drm_property_blob_put + drm_property_create + drm_property_create_bitmask + drm_property_create_blob + drm_property_create_bool + drm_property_create_enum + drm_property_create_range + drm_property_create_signed_range + drm_property_lookup_blob + drm_property_replace_blob + __drm_puts_seq_file + drm_read + drm_rect_clip_scaled + drm_rect_intersect + drm_release + drm_rotation_simplify + drm_set_preferred_mode + drm_simple_encoder_init + drm_syncobj_add_point + drm_syncobj_create + drm_syncobj_find + drm_syncobj_find_fence + drm_syncobj_free + drm_syncobj_get_fd + drm_syncobj_get_handle + drm_syncobj_replace_fence + drm_universal_plane_init + drm_vblank_init + drm_wait_one_vblank + drm_writeback_connector_init + drm_writeback_queue_job + drm_writeback_signal_completion + dump_backtrace + dump_stack + dw_handle_msi_irq + dw_pcie_host_init + dw_pcie_own_conf_map_bus + dw_pcie_read + dw_pcie_setup_rc + dw_pcie_write + enable_irq + event_triggers_call + extcon_get_state + extcon_set_state_sync + fb_mode_option + __fdget + fd_install + _find_first_bit + _find_first_zero_bit + _find_next_bit + find_vpid + finish_wait + firmware_request_nowarn + flush_dcache_page + flush_delayed_work + flush_work + flush_workqueue + fput + free_irq + free_netdev + __free_pages + free_pages + free_percpu + freezing_slow_path + freq_qos_add_request + freq_qos_remove_request + freq_qos_update_request + fsync_bdev + fwnode_get_name + fwnode_property_read_string 
+ fwnode_property_read_u32_array + gcd + generic_file_llseek + generic_handle_domain_irq + generic_handle_irq + gen_pool_add_owner + gen_pool_alloc_algo_owner + gen_pool_avail + gen_pool_create + gen_pool_destroy + gen_pool_first_fit_align + gen_pool_free_owner + gen_pool_has_addr + gen_pool_size + get_cpu_device + get_cpu_idle_time + get_device + __get_free_pages + get_options + get_random_u32 + __get_task_comm + get_task_mm + get_thermal_instance + get_unused_fd_flags + get_zeroed_page + gic_nonsecure_priorities + gpiochip_add_data_with_key + gpiochip_add_pin_range + gpiochip_generic_free + gpiochip_generic_request + gpiochip_get_data + gpiochip_lock_as_irq + gpiochip_remove + gpiochip_unlock_as_irq + gpiod_cansleep + gpiod_direction_input + gpiod_direction_output + gpiod_direction_output_raw + gpiod_get_raw_value + gpiod_get_value_cansleep + gpiod_set_raw_value + gpiod_set_value + gpiod_set_value_cansleep + gpiod_to_irq + gpio_free + gpio_request + gpio_request_one + gpio_to_desc + gserial_alloc_line + gserial_connect + gserial_disconnect + handle_edge_irq + handle_level_irq + handle_nested_irq + handle_simple_irq + handle_sysrq + hdmi_avi_infoframe_pack + hdmi_drm_infoframe_pack_only + hex_dump_to_buffer + housekeeping_cpumask + hrtimer_active + hrtimer_cancel + hrtimer_forward + __hrtimer_get_remaining + hrtimer_init + hrtimer_start_range_ns + hrtimer_try_to_cancel + i2c_adapter_type + i2c_add_adapter + i2c_add_numbered_adapter + i2c_bit_add_bus + i2c_bit_add_numbered_bus + i2c_bus_type + i2c_del_adapter + i2c_del_driver + i2c_for_each_dev + i2c_get_adapter + i2c_new_dummy_device + i2c_put_adapter + i2c_register_driver + i2c_smbus_read_byte_data + i2c_smbus_read_i2c_block_data + i2c_smbus_read_word_data + i2c_smbus_write_byte_data + i2c_smbus_write_i2c_block_data + i2c_smbus_write_word_data + i2c_smbus_xfer + i2c_transfer + i2c_transfer_buffer_flags + i2c_unregister_device + i2c_verify_client + i3c_generic_ibi_alloc_pool + i3c_generic_ibi_free_pool + i3c_generic_ibi_get_free_slot + i3c_generic_ibi_recycle_slot + i3c_master_add_i3c_dev_locked + i3c_master_defslvs_locked + i3c_master_disec_locked + i3c_master_do_daa + i3c_master_enec_locked + i3c_master_entdaa_locked + i3c_master_get_free_addr + i3c_master_queue_ibi + i3c_master_register + i3c_master_set_info + i3c_master_unregister + ida_alloc_range + ida_free + idr_alloc + idr_destroy + idr_find + idr_for_each + idr_get_next + idr_remove + idr_replace + ignore_console_lock_warning + iio_device_alloc + iio_device_free + __iio_device_register + iio_device_unregister + inc_zone_page_state + init_dummy_netdev + init_net + init_pseudo + __init_rwsem + __init_swait_queue_head + init_task + init_timer_key + init_wait_entry + __init_waitqueue_head + input_allocate_device + input_close_device + input_event + input_ff_create + input_free_device + input_mt_destroy_slots + input_mt_init_slots + input_mt_report_slot_state + input_open_device + input_register_device + input_register_handle + input_register_handler + input_set_abs_params + input_set_capability + input_unregister_device + input_unregister_handle + int_pow + int_sqrt + iomem_resource + iommu_alloc_resv_region + iommu_device_register + iommu_device_sysfs_add + iommu_device_sysfs_remove + iommu_device_unlink + iommu_fwspec_add_ids + iommu_fwspec_free + iommu_get_dma_cookie + iommu_get_domain_for_dev + iommu_group_alloc + iommu_group_for_each_dev + iommu_group_get + iommu_group_get_iommudata + iommu_group_set_iommudata + iommu_group_set_name + iommu_iova_to_phys + iommu_map + 
iommu_map_sg + iommu_put_dma_cookie + iommu_register_device_fault_handler + iommu_report_device_fault + iommu_unmap + iommu_unregister_device_fault_handler + __ioremap + iounmap + iput + __irq_alloc_descs + irq_create_mapping_affinity + __irq_domain_add + irq_domain_get_irq_data + irq_domain_remove + irq_domain_set_info + irq_domain_xlate_onetwocell + irq_domain_xlate_twocell + irq_force_affinity + irq_get_irq_data + irq_modify_status + irq_of_parse_and_map + __irq_resolve_mapping + irq_set_affinity_hint + irq_set_chained_handler_and_data + irq_set_chip + irq_set_chip_and_handler_name + irq_set_chip_data + __irq_set_handler + irq_set_handler_data + irq_set_irq_wake + irq_to_desc + irq_work_queue + irq_work_sync + is_console_locked + is_dma_buf_file + is_vmalloc_addr + iterate_fd + jiffies + jiffies_64_to_clock_t + jiffies64_to_msecs + jiffies_to_msecs + jiffies_to_usecs + kasan_flag_enabled + kasprintf + kernel_kobj + kern_mount + kern_unmount + __kfifo_in + __kfifo_out + kfree + kfree_const + kill_anon_super + kimage_voffset + __kmalloc + kmalloc_caches + kmalloc_order_trace + kmem_cache_alloc + kmem_cache_alloc_trace + kmem_cache_create + kmem_cache_create_usercopy + kmem_cache_destroy + kmem_cache_free + kmemdup + kobject_add + kobject_create_and_add + kobject_get + kobject_init + kobject_init_and_add + kobject_put + kobject_uevent + kobject_uevent_env + krealloc + kstat + kstrdup + kstrndup + kstrtobool_from_user + kstrtoint + kstrtoint_from_user + kstrtol_from_user + kstrtoll + kstrtou16 + kstrtou8 + kstrtouint + kstrtouint_from_user + kstrtoull + kstrtoull_from_user + kthread_bind + kthread_bind_mask + kthread_cancel_work_sync + kthread_create_on_node + kthread_flush_work + __kthread_init_worker + kthread_park + kthread_parkme + kthread_queue_work + kthread_should_park + kthread_should_stop + kthread_stop + kthread_unpark + kthread_worker_fn + ktime_get + ktime_get_mono_fast_ns + ktime_get_raw_ts64 + ktime_get_real_seconds + ktime_get_real_ts64 + ktime_get_ts64 + ktime_get_with_offset + kvasprintf + kvfree + kvfree_call_rcu + kvmalloc_node + __list_add_valid + __list_del_entry_valid + __lock_page + loops_per_jiffy + memchr + memcpy + __memcpy_fromio + __memcpy_toio + memdup_user + memmove + memory_read_from_buffer + memparse + memset + memset64 + __memset_io + memstart_addr + mfd_add_devices + mfd_remove_devices + mipi_dsi_attach + mipi_dsi_compression_mode + mipi_dsi_dcs_read + mipi_dsi_dcs_set_column_address + mipi_dsi_dcs_set_display_brightness + mipi_dsi_dcs_set_page_address + mipi_dsi_dcs_write_buffer + mipi_dsi_detach + mipi_dsi_device_register_full + mipi_dsi_driver_register_full + mipi_dsi_driver_unregister + mipi_dsi_host_register + mipi_dsi_host_unregister + mipi_dsi_picture_parameter_set + misc_deregister + misc_register + mmput + mmu_notifier_synchronize + mod_delayed_work_on + mod_timer + module_layout + module_put + __msecs_to_jiffies + msleep + msleep_interruptible + __mutex_init + mutex_is_locked + mutex_lock + mutex_lock_interruptible + mutex_trylock + mutex_unlock + napi_complete_done + napi_enable + napi_gro_receive + __napi_schedule + napi_schedule_prep + __netdev_alloc_skb + netif_napi_add + netif_receive_skb + netif_tx_wake_queue + no_llseek + nonseekable_open + noop_llseek + nr_cpu_ids + nr_irqs + nsecs_to_jiffies + ns_to_timespec64 + __num_online_cpus + of_address_to_resource + of_alias_get_id + of_clk_add_provider + of_clk_get_by_name + of_clk_src_onecell_get + of_count_phandle_with_args + of_cpu_node_to_id + of_device_get_match_data + of_device_is_available 
+ of_device_is_compatible + of_dma_controller_free + of_dma_controller_register + of_drm_find_bridge + of_drm_find_panel + of_find_compatible_node + of_find_device_by_node + of_find_matching_node_and_match + of_find_node_by_name + of_find_node_by_type + of_find_node_opts_by_path + of_find_node_with_property + of_find_property + of_genpd_add_provider_simple + of_get_child_by_name + of_get_cpu_node + of_get_named_gpio_flags + of_get_next_available_child + of_get_next_child + of_get_property + of_get_regulator_init_data + of_get_videomode + of_iomap + of_irq_get_byname + of_irq_parse_one + of_match_device + of_match_node + of_n_addr_cells + of_node_name_eq + of_n_size_cells + of_parse_phandle + of_parse_phandle_with_fixed_args + of_phandle_iterator_init + of_phandle_iterator_next + of_platform_populate + of_property_count_elems_of_size + of_property_match_string + of_property_read_string + of_property_read_string_helper + of_property_read_u32_index + of_property_read_variable_u32_array + of_property_read_variable_u8_array + of_prop_next_string + of_prop_next_u32 + of_pwm_xlate_with_flags + of_reserved_mem_device_init_by_idx + of_reserved_mem_device_release + of_reserved_mem_lookup + of_root + of_thermal_get_ntrips + oops_in_progress + page_endio + page_mapping + panic + panic_notifier_list + param_array_ops + param_ops_bint + param_ops_bool + param_ops_charp + param_ops_hexint + param_ops_int + param_ops_long + param_ops_string + param_ops_uint + param_ops_ulong + pci_alloc_irq_vectors_affinity + pci_assign_resource + pci_assign_unassigned_bus_resources + pci_bus_resource_n + pci_clear_master + pci_disable_device + pci_enable_atomic_ops_to_root + pci_enable_device + pci_enable_pcie_error_reporting + pci_find_bus + pci_find_ext_capability + pci_free_irq_vectors + pci_generic_config_read + pci_generic_config_write + pci_get_device + pci_load_saved_state + pci_map_rom + pci_msix_vec_count + pci_read_config_dword + pci_read_config_word + pci_rebar_get_possible_sizes + pci_release_resource + pci_rescan_bus + pci_reset_function + pci_resize_resource + pci_restore_state + pci_save_state + pci_set_master + pci_store_saved_state + pci_unmap_rom + pci_unregister_driver + pci_wait_for_pending_transaction + pci_write_config_dword + pci_write_config_word + PDE_DATA + __per_cpu_offset + perf_event_update_userpage + perf_pmu_register + perf_pmu_unregister + perf_trace_buf_alloc + perf_trace_run_bpf_submit + phy_init + phy_power_off + phy_power_on + pid_task + pinconf_generic_dt_node_to_map + pinctrl_add_gpio_range + pinctrl_dev_get_drvdata + pinctrl_force_sleep + pinctrl_get + pinctrl_lookup_state + pinctrl_put + pinctrl_remove_gpio_range + pinctrl_select_state + pinctrl_utils_free_map + pin_get_name + pin_user_pages + pin_user_pages_fast + platform_bus_type + platform_device_add + platform_device_add_resources + platform_device_alloc + platform_device_del + platform_device_put + platform_device_register + platform_device_register_full + platform_device_unregister + __platform_driver_probe + __platform_driver_register + platform_driver_unregister + platform_find_device_by_driver + platform_get_irq + platform_get_irq_byname + platform_get_resource + platform_get_resource_byname + __platform_register_drivers + pm_genpd_add_subdomain + pm_genpd_init + pm_power_off + __pm_relax + pm_relax + pm_runtime_allow + pm_runtime_barrier + __pm_runtime_disable + pm_runtime_enable + pm_runtime_forbid + pm_runtime_get_if_active + __pm_runtime_idle + pm_runtime_irq_safe + pm_runtime_no_callbacks + __pm_runtime_resume + 
pm_runtime_set_autosuspend_delay + __pm_runtime_set_status + __pm_runtime_suspend + __pm_runtime_use_autosuspend + __pm_stay_awake + pm_stay_awake + pm_suspend_global_flags + pm_wakeup_dev_event + pm_wakeup_ws_event + power_supply_changed + power_supply_get_by_name + power_supply_get_drvdata + power_supply_get_property + power_supply_put + power_supply_register + power_supply_set_property + power_supply_unregister + preempt_schedule + preempt_schedule_notrace + prepare_to_wait_event + print_hex_dump + _printk + __printk_ratelimit + printk_timed_ratelimit + proc_create + proc_create_data + proc_create_seq_private + proc_mkdir + proc_remove + proc_set_size + proc_symlink + put_device + __put_page + __put_task_struct + put_unused_fd + pwmchip_add + pwmchip_remove + pwm_get_chip_data + pwm_set_chip_data + queue_delayed_work_on + queue_work_on + ___ratelimit + raw_notifier_call_chain + raw_notifier_chain_register + raw_notifier_chain_unregister + _raw_read_lock + _raw_read_lock_irqsave + _raw_read_unlock + _raw_read_unlock_irqrestore + _raw_spin_lock + _raw_spin_lock_irq + _raw_spin_lock_irqsave + raw_spin_rq_lock_nested + raw_spin_rq_unlock + _raw_spin_trylock + _raw_spin_unlock + _raw_spin_unlock_irq + _raw_spin_unlock_irqrestore + _raw_write_lock + _raw_write_lock_irqsave + _raw_write_trylock + _raw_write_unlock + _raw_write_unlock_irqrestore + __rb_erase_color + rb_first_postorder + __rb_insert_augmented + rb_insert_color + rb_next + rb_next_postorder + rcu_barrier + __rcu_read_lock + __rcu_read_unlock + rdev_get_drvdata + rdev_get_id + reciprocal_value + refcount_warn_saturate + refresh_frequency_limits + __refrigerator + regcache_cache_only + regcache_mark_dirty + regcache_sync + __register_blkdev + __register_chrdev + register_chrdev_region + register_console + register_die_notifier + register_netdev + register_pm_notifier + register_reboot_notifier + register_restart_handler + register_shrinker + register_syscore_ops + regmap_async_complete + regmap_bulk_read + regmap_bulk_write + regmap_multi_reg_write + regmap_multi_reg_write_bypassed + regmap_raw_read + regmap_raw_write + regmap_raw_write_async + regmap_read + regmap_register_patch + regmap_update_bits_base + regmap_write + regulator_bulk_disable + regulator_bulk_enable + regulator_disable + regulator_enable + regulator_get + regulator_is_enabled + regulator_list_voltage_linear + regulator_map_voltage_linear + regulator_put + regulator_set_voltage + release_firmware + release_pages + __release_region + remap_pfn_range + remove_cpu + remove_proc_entry + request_firmware + request_firmware_direct + request_firmware_into_buf + request_firmware_nowait + __request_module + __request_region + request_threaded_irq + reset_control_assert + reset_control_deassert + return_address + rps_needed + rtc_class_close + rtc_class_open + rtc_read_time + rtc_set_time + rtc_time64_to_tm + rtc_tm_to_time64 + rtc_update_irq + rtc_valid_tm + runqueues + sched_clock + sched_feat_keys + sched_set_fifo_low + sched_setscheduler + sched_setscheduler_nocheck + sched_uclamp_used + schedule + schedule_timeout + schedule_timeout_interruptible + scnprintf + scsi_dma_unmap + scsi_eh_ready_devs + seq_lseek + seq_printf + seq_puts + seq_read + seq_release + set_capacity + set_capacity_and_notify + set_cpus_allowed_ptr + set_normalized_timespec64 + __SetPageMovable + set_task_cpu + sg_alloc_table + sg_alloc_table_from_pages_segment + sg_free_table + sg_init_table + sg_miter_next + sg_miter_start + sg_miter_stop + sg_nents_for_len + sg_next + __sg_page_iter_next + 
__sg_page_iter_start + si_meminfo + simple_attr_open + simple_attr_read + simple_attr_release + simple_attr_write + simple_open + simple_read_from_buffer + simple_strtol + simple_strtoul + simple_write_to_buffer + single_open + single_open_size + single_release + skb_clone + skb_copy_expand + skb_dequeue + skb_dequeue_tail + skb_pull + skb_push + skb_put + skb_queue_head + skb_queue_purge + skb_queue_tail + skb_trim + smp_call_function + smp_call_function_many + smp_call_function_single + smp_call_on_cpu + snd_compr_stop_error + snd_ctl_add + snd_ctl_new1 + snd_ctl_notify + snd_device_free + snd_dma_alloc_pages + snd_dmaengine_pcm_prepare_slave_config + snd_dma_free_pages + snd_hwdep_new + snd_pcm_format_physical_width + snd_pcm_format_width + snd_pcm_hw_constraint_integer + snd_pcm_hw_constraint_list + snd_pcm_lib_free_pages + snd_pcm_lib_ioctl + snd_pcm_lib_malloc_pages + snd_pcm_lib_preallocate_free_for_all + snd_pcm_lib_preallocate_pages + snd_pcm_period_elapsed + snd_pcm_rate_range_to_bits + snd_soc_add_component_controls + snd_soc_bytes_info_ext + snd_soc_bytes_tlv_callback + snd_soc_card_get_kcontrol + snd_soc_component_async_complete + snd_soc_component_disable_pin + snd_soc_component_force_enable_pin + snd_soc_component_init_regmap + snd_soc_component_read + snd_soc_component_set_sysclk + snd_soc_component_update_bits + snd_soc_component_update_bits_async + snd_soc_component_write + snd_soc_daifmt_clock_provider_from_bitmap + snd_soc_daifmt_parse_clock_provider_raw + snd_soc_daifmt_parse_format + snd_soc_dai_set_bclk_ratio + snd_soc_dai_set_channel_map + snd_soc_dai_set_sysclk + snd_soc_dai_set_tdm_slot + snd_soc_dapm_add_routes + snd_soc_dapm_disable_pin + snd_soc_dapm_enable_pin + snd_soc_dapm_get_enum_double + snd_soc_dapm_get_pin_status + snd_soc_dapm_get_pin_switch + snd_soc_dapm_get_volsw + snd_soc_dapm_ignore_suspend + snd_soc_dapm_info_pin_switch + snd_soc_dapm_kcontrol_dapm + snd_soc_dapm_new_control + snd_soc_dapm_new_controls + snd_soc_dapm_put_enum_double + snd_soc_dapm_put_pin_switch + snd_soc_dapm_put_volsw + snd_soc_dapm_sync + snd_soc_dapm_weak_routes + snd_soc_find_dai + snd_soc_find_dai_with_mutex + snd_soc_get_enum_double + snd_soc_get_pcm_runtime + snd_soc_get_volsw + snd_soc_get_volsw_sx + snd_soc_get_xr_sx + snd_soc_info_enum_double + snd_soc_info_volsw + snd_soc_info_volsw_sx + snd_soc_info_xr_sx + snd_soc_lookup_component + snd_soc_new_compress + snd_soc_of_get_dai_link_codecs + snd_soc_of_get_dai_name + snd_soc_of_parse_audio_routing + snd_soc_of_put_dai_link_codecs + snd_soc_params_to_bclk + snd_soc_pm_ops + snd_soc_put_enum_double + snd_soc_put_volsw + snd_soc_put_volsw_sx + snd_soc_put_xr_sx + snd_soc_register_card + snd_soc_register_component + snd_soc_set_runtime_hwparams + snd_soc_tplg_component_load + snd_soc_tplg_component_remove + snd_soc_tplg_widget_bind_event + snd_soc_unregister_card + snd_soc_unregister_component + snprintf + soc_device_register + softnet_data + sort + __spi_alloc_controller + spi_controller_resume + spi_controller_suspend + spi_delay_exec + spi_finalize_current_message + spi_register_controller + __spi_register_driver + spi_setup + spi_sync + spi_unregister_controller + split_page + sprintf + sprint_symbol + srcu_init_notifier_head + srcu_notifier_call_chain + srcu_notifier_chain_register + srcu_notifier_chain_unregister + sscanf + __stack_chk_fail + start_backtrace + static_key_slow_dec + static_key_slow_inc + stop_one_cpu_nowait + stpcpy + strcasecmp + strcat + strchr + strcmp + strcpy + stream_open + strim + strlcat + 
strlcpy + strlen + strncasecmp + strncat + strnchr + strncmp + strncpy + strnlen + strnstr + strpbrk + strrchr + strscpy + strsep + strstr + subsys_system_register + __sw_hweight32 + __sw_hweight64 + sync_file_create + sync_file_get_fence + synchronize_rcu + syscon_regmap_lookup_by_phandle + sysfs_add_file_to_group + sysfs_create_bin_file + sysfs_create_file_ns + sysfs_create_files + sysfs_create_group + sysfs_create_groups + sysfs_create_link + sysfs_emit + __sysfs_match_string + sysfs_notify + sysfs_remove_bin_file + sysfs_remove_file_from_group + sysfs_remove_file_ns + sysfs_remove_files + sysfs_remove_group + sysfs_remove_link + sysfs_streq + sysrq_mask + system_32bit_el0_cpumask + system_freezable_wq + system_freezing_cnt + system_highpri_wq + system_state + system_unbound_wq + system_wq + sys_tz + __tasklet_hi_schedule + tasklet_init + tasklet_kill + __tasklet_schedule + __task_pid_nr_ns + task_rq_lock + tcp_register_congestion_control + tcp_reno_cong_avoid + tcp_reno_ssthresh + tcp_reno_undo_cwnd + tcp_slow_start + tcp_unregister_congestion_control + thermal_cdev_update + thermal_cooling_device_unregister + thermal_of_cooling_device_register + thermal_zone_device_disable + thermal_zone_device_enable + thermal_zone_device_update + thermal_zone_get_temp + thermal_zone_get_zone_by_name + thermal_zone_of_sensor_register + thermal_zone_of_sensor_unregister + tick_nohz_get_sleep_length + time64_to_tm + topology_set_thermal_pressure + _totalram_pages + touch_softlockup_watchdog + trace_event_buffer_commit + trace_event_buffer_reserve + trace_event_ignore_this_pid + trace_event_printf + trace_event_raw_init + trace_event_reg + trace_handle_return + __traceiter_android_rvh_can_migrate_task + __traceiter_android_rvh_cpu_cgroup_can_attach + __traceiter_android_rvh_cpufreq_transition + __traceiter_android_rvh_dequeue_task + __traceiter_android_rvh_enqueue_task + __traceiter_android_rvh_find_lowest_rq + __traceiter_android_rvh_find_new_ilb + __traceiter_android_rvh_post_init_entity_util_avg + __traceiter_android_rvh_replace_next_task_fair + __traceiter_android_rvh_sched_fork_init + __traceiter_android_rvh_sched_newidle_balance + __traceiter_android_rvh_sched_nohz_balancer_kick + __traceiter_android_rvh_sched_rebalance_domains + __traceiter_android_rvh_schedule + __traceiter_android_rvh_select_fallback_rq + __traceiter_android_rvh_select_task_rq_fair + __traceiter_android_rvh_select_task_rq_rt + __traceiter_android_rvh_set_task_cpu + __traceiter_android_vh_cpu_idle_enter + __traceiter_android_vh_cpu_idle_exit + __traceiter_android_vh_gic_set_affinity + __traceiter_android_rvh_gic_v3_set_affinity + __traceiter_android_vh_ipi_stop + __traceiter_android_vh_logbuf + __traceiter_android_vh_logbuf_pr_cont + __traceiter_android_vh_scheduler_tick + __traceiter_android_vh_ufs_check_int_errors + __traceiter_android_vh_ufs_compl_command + __traceiter_device_pm_callback_end + __traceiter_device_pm_callback_start + __traceiter_gpu_mem_total + __traceiter_hrtimer_expire_entry + __traceiter_hrtimer_expire_exit + __traceiter_ipi_entry + __traceiter_ipi_exit + __traceiter_ipi_raise + __traceiter_irq_handler_entry + __traceiter_irq_handler_exit + __traceiter_pelt_cfs_tp + __traceiter_pelt_dl_tp + __traceiter_pelt_irq_tp + __traceiter_pelt_rt_tp + __traceiter_pelt_se_tp + __traceiter_rwmmio_post_read + __traceiter_rwmmio_read + __traceiter_rwmmio_write + __traceiter_sched_overutilized_tp + __traceiter_sched_switch + __traceiter_suspend_resume + __traceiter_workqueue_execute_end + __traceiter_workqueue_execute_start 
+ __tracepoint_android_rvh_can_migrate_task + __tracepoint_android_rvh_cpu_cgroup_can_attach + __tracepoint_android_rvh_cpufreq_transition + __tracepoint_android_rvh_dequeue_task + __tracepoint_android_rvh_enqueue_task + __tracepoint_android_rvh_find_lowest_rq + __tracepoint_android_rvh_find_new_ilb + __tracepoint_android_rvh_post_init_entity_util_avg + __tracepoint_android_rvh_replace_next_task_fair + __tracepoint_android_rvh_sched_fork_init + __tracepoint_android_rvh_sched_newidle_balance + __tracepoint_android_rvh_sched_nohz_balancer_kick + __tracepoint_android_rvh_sched_rebalance_domains + __tracepoint_android_rvh_schedule + __tracepoint_android_rvh_select_fallback_rq + __tracepoint_android_rvh_select_task_rq_fair + __tracepoint_android_rvh_select_task_rq_rt + __tracepoint_android_rvh_set_task_cpu + __tracepoint_android_vh_cpu_idle_enter + __tracepoint_android_vh_cpu_idle_exit + __tracepoint_android_vh_gic_set_affinity + __tracepoint_android_rvh_gic_v3_set_affinity + __tracepoint_android_vh_ipi_stop + __tracepoint_android_vh_logbuf + __tracepoint_android_vh_logbuf_pr_cont + __tracepoint_android_vh_scheduler_tick + __tracepoint_android_vh_ufs_check_int_errors + __tracepoint_android_vh_ufs_compl_command + __tracepoint_device_pm_callback_end + __tracepoint_device_pm_callback_start + __tracepoint_gpu_mem_total + __tracepoint_hrtimer_expire_entry + __tracepoint_hrtimer_expire_exit + __tracepoint_ipi_entry + __tracepoint_ipi_exit + __tracepoint_ipi_raise + __tracepoint_irq_handler_entry + __tracepoint_irq_handler_exit + __tracepoint_pelt_cfs_tp + __tracepoint_pelt_dl_tp + __tracepoint_pelt_irq_tp + __tracepoint_pelt_rt_tp + __tracepoint_pelt_se_tp + tracepoint_probe_register + tracepoint_probe_unregister + __tracepoint_rwmmio_post_read + __tracepoint_rwmmio_read + __tracepoint_rwmmio_write + __tracepoint_sched_overutilized_tp + __tracepoint_sched_switch + __tracepoint_suspend_resume + __tracepoint_workqueue_execute_end + __tracepoint_workqueue_execute_start + trace_print_array_seq + trace_print_symbols_seq + trace_raw_output_prep + try_module_get + ttm_bo_bulk_move_lru_tail + ttm_bo_eviction_valuable + ttm_bo_init_reserved + ttm_bo_kmap + ttm_bo_kunmap + ttm_bo_lock_delayed_workqueue + ttm_bo_mem_space + ttm_bo_mmap_obj + ttm_bo_move_accel_cleanup + ttm_bo_move_memcpy + ttm_bo_move_to_lru_tail + ttm_bo_put + ttm_bo_unlock_delayed_workqueue + ttm_bo_validate + ttm_bo_vm_access + ttm_bo_vmap + ttm_bo_vm_close + ttm_bo_vm_dummy_page + ttm_bo_vm_fault_reserved + ttm_bo_vm_open + ttm_bo_vm_reserve + ttm_bo_vunmap + ttm_bo_wait + ttm_device_fini + ttm_device_init + ttm_eu_backoff_reservation + ttm_eu_fence_buffer_objects + ttm_eu_reserve_buffers + ttm_glob + ttm_pool_alloc + ttm_pool_debugfs + ttm_pool_free + ttm_range_man_fini + ttm_range_man_init + ttm_resource_free + ttm_resource_init + ttm_resource_manager_evict_all + ttm_resource_manager_init + ttm_sg_tt_init + ttm_tt_destroy_common + ttm_tt_fini + tty_flip_buffer_push + tty_insert_flip_string_fixed_flag + tty_kref_put + tty_port_tty_get + typec_get_drvdata + typec_register_partner + typec_register_port + typec_set_data_role + typec_set_pwr_opmode + typec_set_pwr_role + typec_unregister_partner + typec_unregister_port + uart_add_one_port + uart_console_write + uart_get_baud_rate + uart_parse_options + uart_register_driver + uart_remove_one_port + uart_resume_port + uart_set_options + uart_suspend_port + uart_try_toggle_sysrq + uart_unregister_driver + uart_update_timeout + uart_write_wakeup + __ubsan_handle_cfi_check_fail_abort + 
uclamp_eff_value + __udelay + ufshcd_auto_hibern8_update + ufshcd_dme_set_attr + ufshcd_hold + ufshcd_pltfrm_init + ufshcd_query_flag_retry + ufshcd_release + ufshcd_remove + ufshcd_shutdown + ufshcd_system_resume + ufshcd_system_suspend + unlock_page + unmap_mapping_range + unpin_user_page + unregister_blkdev + __unregister_chrdev + unregister_chrdev_region + unregister_netdev + unregister_pm_notifier + unregister_reboot_notifier + unregister_shrinker + up + update_devfreq + up_read + up_write + usb_add_function + usb_choose_configuration + usb_copy_descriptors + usb_enable_autosuspend + usb_ep_alloc_request + usb_ep_autoconfig + usb_ep_free_request + usb_ep_queue + usb_function_register + usb_function_unregister + usb_gadget_set_state + usb_hub_find_child + usb_interface_id + usb_otg_state_string + usb_put_function_instance + usb_register_notify + usb_speed_string + usb_string_id + usb_unregister_notify + __usecs_to_jiffies + usleep_range_state + v4l2_ctrl_handler_free + v4l2_ctrl_handler_init_class + v4l2_ctrl_handler_setup + v4l2_ctrl_log_status + v4l2_ctrl_new_custom + v4l2_ctrl_new_std + v4l2_ctrl_new_std_menu + v4l2_device_register + v4l2_device_unregister + v4l2_fh_add + v4l2_fh_del + v4l2_fh_exit + v4l2_fh_init + v4l2_m2m_buf_queue + v4l2_m2m_buf_remove + v4l2_m2m_ctx_init + v4l2_m2m_ctx_release + v4l2_m2m_dqbuf + v4l2_m2m_fop_mmap + v4l2_m2m_fop_poll + v4l2_m2m_get_curr_priv + v4l2_m2m_get_vq + v4l2_m2m_init + v4l2_m2m_ioctl_dqbuf + v4l2_m2m_ioctl_querybuf + v4l2_m2m_ioctl_reqbufs + v4l2_m2m_ioctl_streamoff + v4l2_m2m_ioctl_streamon + v4l2_m2m_job_finish + v4l2_m2m_mmap + v4l2_m2m_next_buf + v4l2_m2m_poll + v4l2_m2m_qbuf + v4l2_m2m_release + v4l2_m2m_reqbufs + v4l2_m2m_streamoff + v4l2_m2m_streamon + v4l2_m2m_try_schedule + v4l2_match_dv_timings + v4l_bound_align_image + vabits_actual + vb2_buffer_done + vb2_dma_sg_memops + vb2_dqbuf + vb2_ops_wait_finish + vb2_ops_wait_prepare + vb2_plane_cookie + vb2_plane_vaddr + vb2_poll + vb2_qbuf + vb2_querybuf + vb2_queue_init + vb2_queue_release + vb2_reqbufs + vb2_streamoff + vb2_streamon + vb2_wait_for_all_buffers + vfree + video_devdata + video_device_alloc + video_device_release + video_ioctl2 + __video_register_device + video_unregister_device + vmalloc + vmalloc_to_page + vmap + vm_get_page_prot + vm_iomap_memory + vscnprintf + vsnprintf + vsprintf + vunmap + vzalloc + wait_for_completion + wait_for_completion_interruptible + wait_for_completion_interruptible_timeout + wait_for_completion_killable + wait_for_completion_timeout + __wake_up + __wake_up_locked + __wake_up_locked_key + wake_up_process + wakeup_source_register + wakeup_source_unregister + __wake_up_sync + __warn_printk + watchdog_init_timeout + watchdog_register_device + watchdog_set_restart_priority + watchdog_unregister_device + work_busy + ww_mutex_lock + ww_mutex_lock_interruptible + ww_mutex_unlock + xa_destroy + __xa_erase + xa_load + __xa_store diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel new file mode 100644 index 000000000000..56d36371e748 --- /dev/null +++ b/android/abi_gki_aarch64_pixel @@ -0,0 +1,2419 @@ +[abi_symbol_list] + add_cpu + add_timer + add_timer_on + add_wait_queue + adjust_managed_page_count + alarm_cancel + alarm_init + alarm_start_relative + alarmtimer_get_rtcdev + alarm_try_to_cancel + alloc_anon_inode + alloc_chrdev_region + alloc_etherdev_mqs + alloc_netdev_mqs + __alloc_pages + alloc_pages_exact + __alloc_percpu + __alloc_percpu_gfp + __alloc_skb + alloc_workqueue + amba_driver_register + amba_driver_unregister 
+ android_debug_symbol + android_rvh_probe_register + anon_inode_getfd + arc4_crypt + arc4_setkey + __arch_clear_user + __arch_copy_from_user + __arch_copy_to_user + arch_freq_scale + arch_timer_read_counter + argv_free + argv_split + arm64_const_caps_ready + arm64_use_ng_mappings + __arm_smccc_smc + async_schedule_node_domain + async_synchronize_full_domain + atomic_notifier_call_chain + atomic_notifier_chain_register + atomic_notifier_chain_unregister + autoremove_wake_function + available_idle_cpu + backlight_device_set_brightness + balance_push_callback + bcmp + bin2hex + bio_add_page + bio_alloc_bioset + bio_associate_blkg + bio_chain + bio_endio + bio_end_io_acct_remapped + bio_init + bio_put + bio_start_io_acct + __bitmap_andnot + __bitmap_clear + __bitmap_equal + bitmap_find_next_zero_area_off + bitmap_from_arr32 + __bitmap_or + bitmap_parse + bitmap_parselist + bitmap_print_to_pagebuf + __bitmap_set + bitmap_to_arr32 + __bitmap_weight + __bitmap_xor + __blk_alloc_disk + blk_check_plugged + blk_cleanup_disk + blkdev_get_by_dev + blkdev_put + blk_execute_rq_nowait + blk_get_request + blk_put_request + blk_queue_flag_clear + blk_queue_flag_set + blk_queue_io_min + blk_queue_io_opt + blk_queue_logical_block_size + blk_queue_max_discard_sectors + blk_queue_max_write_zeroes_sectors + blk_queue_physical_block_size + blk_rq_map_user + blk_rq_map_user_iov + blk_rq_unmap_user + blk_status_to_errno + blocking_notifier_call_chain + blocking_notifier_chain_register + blocking_notifier_chain_unregister + bpf_trace_run1 + bpf_trace_run10 + bpf_trace_run11 + bpf_trace_run12 + bpf_trace_run2 + bpf_trace_run3 + bpf_trace_run4 + bpf_trace_run5 + bpf_trace_run6 + bpf_trace_run7 + bpf_trace_run8 + bpf_trace_run9 + build_skb + bus_find_device + bus_for_each_dev + bus_for_each_drv + bus_register + bus_register_notifier + bus_set_iommu + bus_unregister + bus_unregister_notifier + cache_line_size + call_rcu + cancel_delayed_work + cancel_delayed_work_sync + cancel_work_sync + capable + cdev_add + cdev_alloc + cdev_del + cdev_device_add + cdev_device_del + cdev_init + __cfg80211_alloc_event_skb + __cfg80211_alloc_reply_skb + cfg80211_chandef_create + cfg80211_ch_switch_notify + cfg80211_connect_done + cfg80211_del_sta_sinfo + cfg80211_disconnected + cfg80211_external_auth_request + cfg80211_find_elem_match + cfg80211_get_bss + cfg80211_ibss_joined + cfg80211_inform_bss_frame_data + cfg80211_mgmt_tx_status_ext + cfg80211_michael_mic_failure + cfg80211_new_sta + cfg80211_port_authorized + cfg80211_put_bss + cfg80211_ready_on_channel + cfg80211_register_netdevice + cfg80211_remain_on_channel_expired + cfg80211_roamed + cfg80211_rx_mgmt_ext + cfg80211_scan_done + cfg80211_sched_scan_results + cfg80211_sched_scan_stopped_locked + __cfg80211_send_event_skb + cfg80211_unlink_bss + cfg80211_unregister_wdev + cfg80211_update_owe_info_event + cfg80211_vendor_cmd_reply + __cfi_slowpath_diag + __check_object_size + __class_create + class_destroy + class_interface_unregister + __class_register + class_unregister + clear_page + __ClearPageMovable + clk_disable + clk_enable + clk_get + __clk_get_hw + __clk_get_name + clk_get_rate + clk_hw_get_name + clk_hw_get_parent + __clk_is_enabled + clk_prepare + clk_put + clk_register + clk_register_clkdev + clk_register_fixed_factor + clk_register_fixed_rate + clk_register_gate + clk_set_rate + clk_unprepare + clockevents_config_and_register + clocks_calc_mult_shift + __clocksource_register_scale + cma_alloc + cma_for_each_area + cma_get_name + cma_release + compat_ptr_ioctl + 
complete + complete_all + complete_and_exit + completion_done + component_add + component_bind_all + component_del + component_master_add_with_match + component_master_del + component_match_add_release + component_unbind_all + config_ep_by_speed + config_group_init_type_name + console_set_on_cmdline + console_suspend_enabled + console_trylock + console_unlock + __const_udelay + consume_skb + contig_page_data + _copy_from_iter + _copy_to_iter + __cpu_active_mask + cpu_all_bits + cpu_bit_bitmap + cpufreq_add_update_util_hook + cpufreq_cpu_get + cpufreq_cpu_get_raw + cpufreq_cpu_put + cpufreq_disable_fast_switch + cpufreq_driver_fast_switch + cpufreq_driver_resolve_freq + __cpufreq_driver_target + cpufreq_driver_target + cpufreq_enable_fast_switch + cpufreq_freq_transition_begin + cpufreq_freq_transition_end + cpufreq_frequency_table_verify + cpufreq_generic_attr + cpufreq_get + cpufreq_get_policy + cpufreq_policy_transition_delay_us + cpufreq_quick_get + cpufreq_register_driver + cpufreq_register_governor + cpufreq_register_notifier + cpufreq_remove_update_util_hook + cpufreq_table_index_unsorted + cpufreq_this_cpu_can_update + cpu_hotplug_disable + cpu_hotplug_enable + __cpuhp_remove_state + __cpuhp_setup_state + __cpuhp_setup_state_cpuslocked + __cpuhp_state_add_instance + __cpuhp_state_remove_instance + cpuhp_tasks_frozen + cpu_hwcap_keys + cpu_hwcaps + cpu_latency_qos_add_request + cpu_latency_qos_remove_request + cpu_latency_qos_update_request + cpumask_any_but + cpumask_local_spread + cpumask_next + cpumask_next_and + cpumask_next_wrap + cpu_number + __cpu_online_mask + cpu_pm_register_notifier + __cpu_possible_mask + cpupri_find_fitness + cpu_scale + cpus_read_lock + cpus_read_unlock + cpu_subsys + crc32_be + crc32_le + crc8 + crc8_populate_msb + crypto_aead_decrypt + crypto_aead_encrypt + crypto_aead_setauthsize + crypto_aead_setkey + crypto_alloc_aead + crypto_alloc_base + crypto_alloc_shash + crypto_alloc_skcipher + crypto_alloc_sync_skcipher + crypto_comp_compress + crypto_comp_decompress + crypto_destroy_tfm + __crypto_memneq + crypto_shash_digest + crypto_shash_finup + crypto_shash_setkey + crypto_shash_update + crypto_skcipher_decrypt + crypto_skcipher_encrypt + crypto_skcipher_setkey + __crypto_xor + csum_ipv6_magic + csum_tcpudp_nofold + _ctype + debugfs_attr_read + debugfs_attr_write + debugfs_create_atomic_t + debugfs_create_bool + debugfs_create_dir + debugfs_create_file + debugfs_create_size_t + debugfs_create_u16 + debugfs_create_u32 + debugfs_create_u64 + debugfs_create_u8 + debugfs_create_x32 + debugfs_lookup + debugfs_remove + debugfs_rename + dec_zone_page_state + default_llseek + deferred_free + delayed_work_timer_fn + del_gendisk + del_timer + del_timer_sync + desc_to_gpio + destroy_workqueue + dev_alloc_name + __dev_change_net_namespace + dev_close + _dev_crit + dev_driver_string + _dev_emerg + _dev_err + dev_err_probe + dev_fetch_sw_netstats + devfreq_add_device + devfreq_add_governor + devfreq_monitor_resume + devfreq_monitor_start + devfreq_monitor_stop + devfreq_monitor_suspend + devfreq_recommended_opp + devfreq_register_opp_notifier + devfreq_remove_device + devfreq_unregister_opp_notifier + devfreq_update_interval + dev_fwnode + __dev_get_by_index + dev_get_by_index + dev_get_by_name + dev_get_stats + device_add + device_add_disk + device_add_groups + device_create + device_create_file + device_create_managed_software_node + device_create_with_groups + device_del + device_destroy + device_find_child + device_for_each_child + device_get_child_node_count + 
device_get_dma_attr + device_get_match_data + device_get_next_child_node + device_initialize + device_init_wakeup + device_link_add + device_link_del + device_property_present + device_property_read_string + device_property_read_u32_array + device_register + device_remove_file + device_remove_groups + device_rename + device_set_wakeup_capable + device_set_wakeup_enable + device_unregister + device_wakeup_enable + _dev_info + __dev_kfree_skb_any + devm_add_action + devm_backlight_device_register + devm_backlight_device_unregister + devm_blk_crypto_profile_init + devm_clk_get + devm_clk_get_optional + devm_clk_put + devm_device_add_groups + __devm_drm_dev_alloc + devm_drm_panel_bridge_add_typed + devm_extcon_dev_allocate + devm_extcon_dev_register + devm_free_irq + devm_gen_pool_create + devm_gpiochip_add_data_with_key + devm_gpiod_get + devm_gpiod_get_array + devm_gpiod_get_optional + devm_gpiod_put_array + devm_gpio_free + devm_gpio_request + devm_gpio_request_one + devm_i2c_new_dummy_device + devm_iio_device_alloc + __devm_iio_device_register + devm_input_allocate_device + devm_ioremap + devm_ioremap_resource + devm_ioremap_wc + devm_iounmap + __devm_irq_alloc_descs + devm_kasprintf + devm_kfree + devm_kmalloc + devm_kmemdup + devm_krealloc + devm_kstrdup + devm_kstrdup_const + devm_memremap + devm_memunmap + devm_mfd_add_devices + devm_nvmem_register + __devm_of_phy_provider_register + devm_of_platform_populate + devm_phy_create + devm_phy_get + devm_phy_put + devm_pinctrl_get + devm_pinctrl_put + devm_pinctrl_register + devm_pinctrl_register_and_init + devm_platform_ioremap_resource + devm_platform_ioremap_resource_byname + devm_power_supply_register + __devm_regmap_init + __devm_regmap_init_i2c + __devm_regmap_init_spi + __devm_regmap_init_spmi_ext + devm_regulator_bulk_get + devm_regulator_get + devm_regulator_get_exclusive + devm_regulator_get_optional + devm_regulator_put + devm_regulator_register + __devm_request_region + devm_request_threaded_irq + devm_rtc_device_register + devm_snd_soc_register_component + devm_thermal_of_cooling_device_register + devm_thermal_zone_of_sensor_register + devm_thermal_zone_of_sensor_unregister + devm_usb_get_phy_by_phandle + _dev_notice + dev_pm_domain_attach_by_name + dev_pm_domain_detach + dev_pm_opp_add + dev_pm_opp_disable + dev_pm_opp_find_freq_ceil + dev_pm_opp_find_freq_exact + dev_pm_opp_find_freq_floor + dev_pm_opp_get_freq + dev_pm_opp_get_opp_count + dev_pm_opp_get_voltage + dev_pm_opp_of_add_table + dev_pm_opp_of_register_em + dev_pm_opp_of_remove_table + dev_pm_opp_put + dev_pm_opp_put_regulators + dev_pm_opp_set_regulators + dev_pm_qos_read_value + dev_pm_qos_update_request + _dev_printk + dev_printk_emit + dev_queue_xmit + devres_add + __devres_alloc_node + devres_free + dev_set_name + _dev_warn + disable_irq + disable_irq_nosync + disable_percpu_irq + disk_end_io_acct + disk_start_io_acct + dma_alloc_attrs + dma_async_device_register + dma_async_device_unregister + dma_async_tx_descriptor_init + dma_buf_attach + dma_buf_begin_cpu_access + dma_buf_begin_cpu_access_partial + dma_buf_detach + dma_buf_end_cpu_access + dma_buf_end_cpu_access_partial + dma_buf_export + dma_buf_fd + dma_buf_get + dma_buf_map_attachment + dma_buf_mmap + dmabuf_page_pool_alloc + dmabuf_page_pool_create + dmabuf_page_pool_destroy + dmabuf_page_pool_free + dma_buf_put + dma_buf_unmap_attachment + dma_buf_vmap + dma_buf_vunmap + dmaengine_unmap_put + dma_fence_add_callback + dma_fence_context_alloc + dma_fence_default_wait + dma_fence_get_status + 
dma_fence_init + dma_fence_release + dma_fence_remove_callback + dma_fence_signal + dma_fence_signal_locked + dma_fence_wait_timeout + dma_free_attrs + dma_get_slave_caps + dma_get_slave_channel + dma_heap_add + dma_heap_buffer_alloc + dma_heap_find + dma_heap_get_dev + dma_heap_get_drvdata + dma_heap_get_name + dma_heap_put + dmam_alloc_attrs + dma_map_page_attrs + dma_map_resource + dma_map_sg_attrs + dma_map_sgtable + dma_mmap_attrs + dma_release_channel + dma_request_chan + dma_set_coherent_mask + dma_set_mask + dma_sync_sg_for_cpu + dma_sync_sg_for_device + dma_sync_single_for_cpu + dma_sync_single_for_device + dma_unmap_page_attrs + dma_unmap_resource + dma_unmap_sg_attrs + do_SAK + do_trace_netlink_extack + do_wait_intr_irq + down + downgrade_write + down_interruptible + down_read + down_read_trylock + down_trylock + down_write + drain_workqueue + driver_create_file + driver_register + driver_remove_file + driver_unregister + drm_add_modes_noedid + drm_atomic_add_affected_connectors + drm_atomic_add_affected_planes + drm_atomic_commit + drm_atomic_get_connector_state + drm_atomic_get_crtc_state + drm_atomic_get_new_connector_for_encoder + drm_atomic_get_old_connector_for_encoder + drm_atomic_get_plane_state + drm_atomic_get_private_obj_state + drm_atomic_helper_bridge_destroy_state + drm_atomic_helper_bridge_duplicate_state + drm_atomic_helper_bridge_reset + drm_atomic_helper_check_modeset + drm_atomic_helper_check_planes + drm_atomic_helper_check_plane_state + drm_atomic_helper_cleanup_planes + drm_atomic_helper_commit_cleanup_done + drm_atomic_helper_commit_duplicated_state + drm_atomic_helper_commit_hw_done + drm_atomic_helper_commit_modeset_disables + drm_atomic_helper_commit_modeset_enables + drm_atomic_helper_commit_planes + drm_atomic_helper_commit_tail + __drm_atomic_helper_connector_destroy_state + __drm_atomic_helper_connector_duplicate_state + __drm_atomic_helper_crtc_destroy_state + __drm_atomic_helper_crtc_duplicate_state + __drm_atomic_helper_crtc_reset + drm_atomic_helper_disable_plane + drm_atomic_helper_duplicate_state + drm_atomic_helper_fake_vblank + drm_atomic_helper_page_flip + __drm_atomic_helper_plane_destroy_state + __drm_atomic_helper_plane_duplicate_state + drm_atomic_helper_prepare_planes + __drm_atomic_helper_private_obj_duplicate_state + drm_atomic_helper_set_config + drm_atomic_helper_setup_commit + drm_atomic_helper_shutdown + drm_atomic_helper_swap_state + drm_atomic_helper_update_plane + drm_atomic_helper_wait_for_dependencies + drm_atomic_helper_wait_for_fences + drm_atomic_helper_wait_for_flip_done + drm_atomic_nonblocking_commit + drm_atomic_normalize_zpos + drm_atomic_private_obj_fini + drm_atomic_private_obj_init + drm_atomic_set_crtc_for_connector + drm_atomic_set_crtc_for_plane + drm_atomic_set_fb_for_plane + drm_atomic_set_mode_prop_for_crtc + drm_atomic_state_alloc + drm_atomic_state_clear + __drm_atomic_state_free + drm_bridge_add + drm_bridge_attach + drm_bridge_chain_mode_set + drm_bridge_remove + drm_compat_ioctl + drm_connector_attach_encoder + drm_connector_cleanup + drm_connector_init + drm_connector_list_iter_begin + drm_connector_list_iter_end + drm_connector_list_iter_next + drm_connector_register + drm_connector_unregister + drm_crtc_arm_vblank_event + drm_crtc_cleanup + __drm_crtc_commit_free + drm_crtc_enable_color_mgmt + drm_crtc_handle_vblank + drm_crtc_init_with_planes + drm_crtc_send_vblank_event + drm_crtc_vblank_count_and_time + drm_crtc_vblank_get + drm_crtc_vblank_off + drm_crtc_vblank_on + drm_crtc_vblank_put + 
drm_crtc_wait_one_vblank + __drm_dbg + drm_dev_put + drm_dev_register + drm_dev_unregister + drm_display_mode_to_videomode + drm_dsc_pps_payload_pack + drm_encoder_cleanup + drm_encoder_init + __drm_err + drm_event_cancel_free + drm_event_reserve_init + drm_format_info + drm_framebuffer_init + drm_gem_create_mmap_offset + drm_gem_fb_create_handle + drm_gem_fb_destroy + drm_gem_handle_create + drm_gem_mmap + drm_gem_object_free + drm_gem_object_lookup + drm_gem_object_release + drm_gem_prime_fd_to_handle + drm_gem_prime_handle_to_fd + drm_gem_prime_import_dev + drm_gem_private_object_init + drm_gem_vm_close + drm_gem_vm_open + drm_get_format_info + drm_helper_mode_fill_fb_struct + drm_helper_probe_single_connector_modes + drm_ioctl + drm_kms_helper_hotplug_event + drm_kms_helper_poll_fini + drm_kms_helper_poll_init + drmm_kmalloc + drmm_mode_config_init + drm_mode_config_reset + drm_mode_convert_to_umode + drm_mode_duplicate + drm_mode_equal + drm_mode_equal_no_clocks + drm_mode_object_find + drm_mode_object_get + drm_mode_object_put + drm_mode_probed_add + drm_modeset_acquire_fini + drm_modeset_acquire_init + drm_modeset_backoff + drm_modeset_drop_locks + drm_modeset_lock + drm_modeset_lock_all_ctx + drm_modeset_unlock + drm_mode_vrefresh + drm_object_attach_property + drm_open + drm_panel_add + drm_panel_disable + drm_panel_enable + drm_panel_get_modes + drm_panel_init + drm_panel_prepare + drm_panel_remove + drm_panel_unprepare + drm_plane_cleanup + drm_plane_create_alpha_property + drm_plane_create_blend_mode_property + drm_plane_create_rotation_property + drm_plane_create_zpos_immutable_property + drm_plane_create_zpos_property + drm_poll + drm_prime_gem_destroy + drm_printf + __drm_printfn_debug + __drm_printfn_info + __drm_printfn_seq_file + drm_property_blob_get + drm_property_blob_put + drm_property_create + drm_property_create_bitmask + drm_property_create_blob + drm_property_create_bool + drm_property_create_enum + drm_property_create_range + drm_property_create_signed_range + drm_property_lookup_blob + drm_property_replace_blob + drm_puts + __drm_puts_seq_file + drm_read + drm_rect_clip_scaled + drm_rect_intersect + drm_release + drm_rotation_simplify + drm_self_refresh_helper_alter_state + drm_send_event + drm_send_event_locked + drm_universal_plane_init + drm_vblank_init + drm_writeback_connector_init + drm_writeback_queue_job + drm_writeback_signal_completion + dump_backtrace + dump_stack + dw_handle_msi_irq + dw_pcie_host_init + dw_pcie_read + dw_pcie_setup_rc + dw_pcie_write + __dynamic_dev_dbg + __dynamic_pr_debug + em_cpu_get + emergency_restart + enable_irq + enable_percpu_irq + ether_setup + eth_mac_addr + ethtool_op_get_link + eth_type_trans + eventfd_ctx_fdget + eventfd_ctx_put + eventfd_signal + event_triggers_call + extcon_get_edev_by_phandle + extcon_get_property + extcon_get_state + extcon_register_notifier + extcon_set_property + extcon_set_property_capability + extcon_set_state_sync + extcon_unregister_notifier + fasync_helper + __fdget + fd_install + fget + file_path + filp_close + filp_open_block + find_extend_vma + _find_first_bit + _find_first_zero_bit + find_get_pid + _find_next_bit + find_pid_ns + find_task_by_vpid + find_vm_area + __find_vma + finish_wait + flush_dcache_page + flush_delayed_work + flush_work + flush_workqueue + fput + frame_vector_create + frame_vector_destroy + frame_vector_to_pages + free_irq + free_netdev + __free_pages + free_pages + free_pages_exact + free_percpu + free_percpu_irq + freezing_slow_path + freq_qos_add_request + 
freq_qos_remove_request + freq_qos_update_request + fs_bio_set + fsnotify + __fsnotify_parent + fsync_bdev + full_name_hash + fwnode_get_name + fwnode_gpiod_get_index + fwnode_property_read_string + fwnode_property_read_u32_array + gcd + generic_file_llseek + generic_handle_domain_irq + generic_iommu_put_resv_regions + genlmsg_multicast_allns + genlmsg_put + genl_register_family + genl_unregister_family + gen_pool_add_owner + gen_pool_alloc_algo_owner + gen_pool_avail + gen_pool_create + gen_pool_destroy + gen_pool_first_fit_align + gen_pool_free_owner + gen_pool_size + get_cpu_device + get_cpu_idle_time + get_cpu_idle_time_us + get_cpu_iowait_time_us + get_device + __get_free_pages + get_governor_parent_kobj + gether_cleanup + gether_connect + gether_disconnect + gether_get_dev_addr + gether_get_host_addr + gether_get_host_addr_u8 + gether_get_ifname + gether_get_qmult + gether_register_netdev + gether_set_dev_addr + gether_set_gadget + gether_set_host_addr + gether_set_ifname + gether_set_qmult + gether_setup_name_default + get_net_ns_by_fd + get_net_ns_by_pid + get_pid_task + get_random_bytes + get_random_u32 + get_sg_io_hdr + get_task_cred + get_thermal_instance + get_unused_fd_flags + get_user_pages + get_user_pages_fast + get_vaddr_frames + gic_nonsecure_priorities + glob_match + gov_attr_set_get + gov_attr_set_init + gov_attr_set_put + governor_sysfs_ops + gpiochip_generic_config + gpiochip_generic_free + gpiochip_generic_request + gpiochip_get_data + gpiochip_lock_as_irq + gpiochip_unlock_as_irq + gpiod_count + gpiod_direction_input + gpiod_direction_output + gpiod_direction_output_raw + gpiod_get_raw_value + gpiod_get_raw_value_cansleep + gpiod_get_value + gpiod_get_value_cansleep + gpiod_set_raw_value + gpiod_set_raw_value_cansleep + gpiod_set_value + gpiod_set_value_cansleep + gpiod_to_irq + gpio_free + gpio_request + gpio_request_one + gpio_to_desc + gserial_alloc_line + gserial_connect + gserial_disconnect + handle_edge_irq + handle_level_irq + handle_nested_irq + handle_simple_irq + handle_sysrq + have_governor_per_policy + hex2bin + hex_dump_to_buffer + hex_to_bin + hrtimer_active + hrtimer_cancel + hrtimer_forward + hrtimer_init + hrtimer_init_sleeper + hrtimer_sleeper_start_expires + hrtimer_start_range_ns + hrtimer_try_to_cancel + __hw_addr_init + __hw_addr_sync + __hw_addr_unsync + hwrng_register + hwrng_unregister + i2c_adapter_type + i2c_add_numbered_adapter + i2c_bus_type + i2c_del_adapter + i2c_del_driver + i2c_for_each_dev + i2c_get_adapter + i2c_match_id + i2c_new_ancillary_device + i2c_new_client_device + i2c_new_dummy_device + i2c_put_adapter + i2c_register_driver + i2c_smbus_read_byte_data + i2c_smbus_write_byte_data + i2c_smbus_xfer + i2c_transfer + i2c_transfer_buffer_flags + i2c_unregister_device + i2c_verify_client + ida_alloc_range + ida_destroy + ida_free + idr_alloc + idr_destroy + idr_find + idr_for_each + idr_get_next + idr_preload + idr_remove + ieee80211_channel_to_freq_khz + ieee80211_freq_khz_to_channel + ieee80211_get_channel_khz + ieee802154_alloc_hw + ieee802154_free_hw + ieee802154_register_hw + ieee802154_rx_irqsafe + ieee802154_unregister_hw + ieee802154_wake_queue + ieee802154_xmit_complete + iio_device_unregister + import_iovec + in6_pton + in_aton + inc_zone_page_state + inet_csk_get_port + init_dummy_netdev + init_net + init_pid_ns + init_pseudo + __init_rwsem + __init_swait_queue_head + init_task + init_timer_key + init_uts_ns + init_wait_entry + __init_waitqueue_head + input_alloc_absinfo + input_allocate_device + input_close_device + 
input_event + input_ff_create + input_free_device + input_mt_destroy_slots + input_mt_init_slots + input_mt_report_slot_state + input_open_device + input_register_device + input_register_handle + input_register_handler + input_set_abs_params + input_set_capability + input_set_timestamp + input_unregister_device + input_unregister_handle + input_unregister_handler + interval_tree_insert + interval_tree_iter_first + interval_tree_iter_next + interval_tree_remove + int_pow + int_sqrt + iomem_resource + iommu_alloc_resv_region + iommu_attach_group + iommu_aux_attach_device + iommu_aux_detach_device + iommu_aux_get_pasid + iommu_dev_enable_feature + iommu_dev_feature_enabled + iommu_device_register + iommu_device_sysfs_add + iommu_device_sysfs_remove + iommu_device_unlink + iommu_device_unregister + iommu_dma_enable_best_fit_algo + iommu_domain_alloc + iommu_domain_free + iommu_fwspec_add_ids + iommu_fwspec_free + iommu_get_dma_cookie + iommu_get_domain_for_dev + iommu_group_alloc + iommu_group_for_each_dev + iommu_group_get + iommu_group_get_iommudata + iommu_group_put + iommu_group_set_iommudata + iommu_group_set_name + iommu_iova_to_phys + iommu_map + iommu_map_sg + iommu_put_dma_cookie + iommu_register_device_fault_handler + iommu_report_device_fault + iommu_set_fault_handler + iommu_unmap + iommu_unregister_device_fault_handler + __ioremap + io_schedule_timeout + iounmap + iov_iter_bvec + ip_send_check + iput + __irq_alloc_descs + irq_create_mapping_affinity + irq_create_of_mapping + __irq_domain_add + irq_domain_get_irq_data + irq_domain_remove + irq_domain_set_info + irq_domain_xlate_twocell + irq_force_affinity + irq_get_irq_data + irq_modify_status + irq_of_parse_and_map + __irq_resolve_mapping + irq_set_affinity + irq_set_affinity_hint + irq_set_chained_handler_and_data + irq_set_chip_and_handler_name + irq_set_chip_data + irq_set_irq_type + irq_set_irq_wake + irq_to_desc + irq_work_queue + irq_work_sync + is_vmalloc_addr + jiffies + jiffies64_to_msecs + jiffies_to_msecs + jiffies_to_usecs + kasan_flag_enabled + kasprintf + kernel_cpustat + kernel_kobj + kernel_param_lock + kernel_param_unlock + kernel_restart + kern_mount + kern_unmount + key_create_or_update + key_put + keyring_alloc + __kfifo_alloc + __kfifo_free + __kfifo_in + __kfifo_init + __kfifo_out + __kfifo_out_peek + kfree + kfree_sensitive + kfree_skb_list + kfree_skb_reason + kill_anon_super + kill_fasync + kimage_voffset + __kmalloc + kmalloc_caches + kmalloc_order + kmalloc_order_trace + kmem_cache_alloc + kmem_cache_alloc_trace + kmem_cache_create + kmem_cache_destroy + kmem_cache_free + kmemdup + kobject_add + kobject_create_and_add + kobject_del + kobject_init + kobject_init_and_add + kobject_put + kobject_uevent + kobject_uevent_env + kobj_sysfs_ops + krealloc + kstat + kstrdup + kstrndup + kstrtobool + kstrtobool_from_user + kstrtoint + kstrtoint_from_user + kstrtol_from_user + kstrtoll + kstrtos8 + kstrtou16 + kstrtou8 + kstrtou8_from_user + kstrtouint + kstrtouint_from_user + kstrtoull + kstrtoull_from_user + kthread_bind + kthread_bind_mask + kthread_cancel_delayed_work_sync + kthread_cancel_work_sync + kthread_create_on_node + kthread_create_worker + kthread_delayed_work_timer_fn + kthread_destroy_worker + kthread_flush_work + kthread_flush_worker + __kthread_init_worker + kthread_mod_delayed_work + kthread_queue_work + kthread_should_stop + kthread_stop + kthread_worker_fn + ktime_get + ktime_get_coarse_with_offset + ktime_get_mono_fast_ns + ktime_get_raw + ktime_get_raw_ts64 + ktime_get_real_seconds + 
ktime_get_real_ts64 + ktime_get_seconds + ktime_get_snapshot + ktime_get_ts64 + ktime_get_with_offset + kvfree + kvfree_call_rcu + kvmalloc_node + led_classdev_register_ext + led_classdev_unregister + __list_add_valid + __list_del_entry_valid + list_sort + __local_bh_enable_ip + __lock_page + log_abnormal_wakeup_reason + log_threaded_irq_wakeup_reason + loops_per_jiffy + mac_pton + mbox_chan_received_data + mbox_controller_register + mbox_controller_unregister + mbox_free_channel + mbox_request_channel + mbox_send_message + memchr + memcmp + memcpy + __memcpy_fromio + __memcpy_toio + memdup_user + memdup_user_nul + memmove + memparse + memremap + mem_section + memset + memset64 + __memset_io + memstart_addr + memunmap + mfd_add_devices + mfd_remove_devices + mipi_dsi_attach + mipi_dsi_compression_mode + mipi_dsi_create_packet + mipi_dsi_dcs_read + mipi_dsi_dcs_set_column_address + mipi_dsi_dcs_set_display_brightness + mipi_dsi_dcs_set_page_address + mipi_dsi_dcs_write_buffer + mipi_dsi_detach + mipi_dsi_device_register_full + mipi_dsi_driver_register_full + mipi_dsi_driver_unregister + mipi_dsi_host_register + mipi_dsi_host_unregister + mipi_dsi_packet_format_is_long + mipi_dsi_picture_parameter_set + misc_deregister + misc_register + __mmap_lock_do_trace_acquire_returned + __mmap_lock_do_trace_released + __mmap_lock_do_trace_start_locking + __mmdrop + mod_delayed_work_on + mod_timer + __module_get + module_layout + module_put + __msecs_to_jiffies + msleep + msleep_interruptible + __mutex_init + mutex_is_locked + mutex_lock + mutex_lock_interruptible + mutex_trylock + mutex_unlock + __napi_alloc_skb + napi_complete_done + napi_disable + napi_enable + napi_gro_receive + __napi_schedule + napi_schedule_prep + __netdev_alloc_skb + netdev_err + netdev_info + netdev_set_default_ethtool_ops + netdev_state_change + netdev_update_features + netif_carrier_off + netif_carrier_on + netif_napi_add + __netif_napi_del + netif_receive_skb + netif_receive_skb_list + netif_rx + netif_rx_ni + netif_tx_stop_all_queues + netif_tx_wake_queue + netlink_broadcast + __netlink_kernel_create + netlink_kernel_release + netlink_register_notifier + netlink_unicast + netlink_unregister_notifier + net_ns_type_operations + net_ratelimit + nla_find + nla_memcpy + __nla_parse + nla_put + nla_put_64bit + nla_put_nohdr + nla_reserve + nla_strscpy + __nla_validate + __nlmsg_put + no_llseek + nonseekable_open + noop_llseek + nr_cpu_ids + nr_irqs + ns_capable + nsec_to_clock_t + ns_to_timespec64 + __num_online_cpus + nvmem_device_put + nvmem_device_read + nvmem_device_write + of_address_to_resource + of_alias_get_id + of_clk_add_provider + of_clk_get + of_clk_get_by_name + of_clk_src_onecell_get + of_count_phandle_with_args + of_cpu_node_to_id + of_devfreq_cooling_register_power + of_device_get_match_data + of_device_is_available + of_device_is_compatible + of_dma_controller_free + of_dma_controller_register + of_dma_is_coherent + of_drm_find_bridge + of_drm_find_panel + of_find_backlight_by_node + of_find_compatible_node + of_find_device_by_node + of_find_i2c_adapter_by_node + of_find_i2c_device_by_node + of_find_matching_node_and_match + of_find_node_by_name + of_find_node_by_phandle + of_find_node_by_type + of_find_node_opts_by_path + of_find_node_with_property + of_find_property + of_fwnode_ops + of_genpd_add_provider_simple + of_get_child_by_name + of_get_cpu_node + of_get_named_gpio_flags + of_get_next_available_child + of_get_next_child + of_get_property + of_get_regulator_init_data + of_iomap + of_irq_find_parent + 
of_irq_get + of_irq_get_byname + of_irq_parse_one + of_machine_is_compatible + of_match_device + of_match_node + of_n_addr_cells + of_node_name_eq + of_n_size_cells + of_nvmem_device_get + of_parse_phandle + of_parse_phandle_with_args + of_parse_phandle_with_fixed_args + of_phandle_iterator_init + of_phandle_iterator_next + of_platform_device_create + of_platform_device_destroy + of_platform_populate + of_property_count_elems_of_size + of_property_match_string + of_property_read_string + of_property_read_string_helper + of_property_read_u32_index + of_property_read_u64 + of_property_read_u64_index + of_property_read_variable_u16_array + of_property_read_variable_u32_array + of_property_read_variable_u64_array + of_property_read_variable_u8_array + of_prop_next_string + of_prop_next_u32 + of_reserved_mem_device_init_by_idx + of_reserved_mem_device_release + of_reserved_mem_lookup + of_root + of_thermal_get_ntrips + of_thermal_get_trip_points + of_thermal_is_trip_valid + of_translate_address + of_usb_host_tpl_support + page_endio + page_mapping + panic + panic_notifier_list + param_array_ops + param_ops_bool + param_ops_byte + param_ops_charp + param_ops_int + param_ops_long + param_ops_string + param_ops_uint + param_ops_ulong + pci_alloc_irq_vectors_affinity + pci_assign_resource + pci_clear_master + pci_disable_device + pci_enable_device + pci_enable_wake + pci_find_bus + pci_find_capability + pci_find_ext_capability + pci_free_irq_vectors + pci_get_device + pci_load_and_free_saved_state + pci_load_saved_state + pci_msi_mask_irq + pci_msi_unmask_irq + pci_read_config_dword + pci_read_config_word + __pci_register_driver + pci_release_regions + pci_rescan_bus + pci_restore_msi_state + pci_restore_state + pci_save_state + pci_set_master + pci_set_power_state + pci_store_saved_state + pci_unregister_driver + pci_wake_from_d3 + pci_write_config_dword + pci_write_config_word + PDE_DATA + __per_cpu_offset + perf_event_create_kernel_counter + perf_event_disable + perf_event_enable + perf_event_pause + perf_event_read_local + perf_event_read_value + perf_event_release_kernel + perf_event_update_userpage + perf_pmu_migrate_context + perf_pmu_register + perf_pmu_unregister + perf_trace_buf_alloc + perf_trace_run_bpf_submit + pfn_is_map_memory + phy_init + phy_power_off + phy_power_on + pid_task + pinconf_generic_dt_free_map + pinconf_generic_dt_node_to_map + pinctrl_add_gpio_range + pinctrl_dev_get_drvdata + pinctrl_enable + pinctrl_force_sleep + pinctrl_lookup_state + pinctrl_remove_gpio_range + pinctrl_select_state + pin_get_name + pin_user_pages + pin_user_pages_fast + pin_user_pages_remote + pktgen_xfrm_outer_mode_output + pkvm_iommu_driver_init + pkvm_iommu_register + pkvm_iommu_resume + pkvm_iommu_s2mpu_init + pkvm_iommu_s2mpu_register + pkvm_iommu_suspend + __pkvm_load_el2_module + __pkvm_register_el2_call + platform_bus_type + platform_device_add + platform_device_add_data + platform_device_add_resources + platform_device_alloc + platform_device_del + platform_device_put + platform_device_register + platform_device_register_full + platform_device_unregister + __platform_driver_probe + __platform_driver_register + platform_driver_unregister + platform_find_device_by_driver + platform_get_irq + platform_get_irq_byname + platform_get_irq_optional + platform_get_resource + platform_get_resource_byname + platform_irq_count + pm_genpd_add_subdomain + pm_genpd_init + pm_power_off + __pm_relax + pm_relax + pm_runtime_allow + __pm_runtime_disable + pm_runtime_enable + pm_runtime_forbid + 
pm_runtime_force_resume + pm_runtime_force_suspend + pm_runtime_get_if_active + __pm_runtime_idle + pm_runtime_irq_safe + __pm_runtime_resume + pm_runtime_set_autosuspend_delay + __pm_runtime_set_status + __pm_runtime_suspend + __pm_runtime_use_autosuspend + __pm_stay_awake + pm_stay_awake + pm_wakeup_dev_event + pm_wakeup_ws_event + power_supply_changed + power_supply_get_by_name + power_supply_get_by_phandle_array + power_supply_get_drvdata + power_supply_get_property + power_supply_put + power_supply_register + power_supply_reg_notifier + power_supply_set_property + power_supply_unregister + power_supply_unreg_notifier + pps_event + pps_register_source + pps_unregister_source + prandom_bytes + prandom_u32 + preempt_schedule + preempt_schedule_notrace + prepare_to_wait + prepare_to_wait_event + print_hex_dump + _printk + _printk_deferred + proc_create + proc_create_data + proc_create_single_data + proc_dointvec + proc_dostring + proc_douintvec_minmax + proc_mkdir + proc_mkdir_data + proc_remove + proc_set_size + pskb_expand_head + __pskb_pull_tail + ___pskb_trim + __put_cred + put_device + __put_net + __put_page + put_pid + put_sg_io_hdr + __put_task_struct + put_unused_fd + put_vaddr_frames + queue_delayed_work_on + queue_work_on + ___ratelimit + raw_notifier_call_chain + raw_notifier_chain_register + _raw_read_lock + _raw_read_lock_bh + _raw_read_lock_irqsave + _raw_read_unlock + _raw_read_unlock_bh + _raw_read_unlock_irqrestore + _raw_spin_lock + _raw_spin_lock_bh + _raw_spin_lock_irq + _raw_spin_lock_irqsave + raw_spin_rq_lock_nested + raw_spin_rq_unlock + _raw_spin_trylock + _raw_spin_unlock + _raw_spin_unlock_bh + _raw_spin_unlock_irq + _raw_spin_unlock_irqrestore + _raw_write_lock + _raw_write_lock_irq + _raw_write_lock_irqsave + _raw_write_unlock + _raw_write_unlock_irq + _raw_write_unlock_irqrestore + rb_erase + rb_first + rb_insert_color + rb_next + rb_prev + rb_replace_node + rcu_barrier + __rcu_read_lock + __rcu_read_unlock + rdev_get_drvdata + rdev_get_id + reboot_mode + refcount_warn_saturate + __refrigerator + regcache_cache_only + regcache_drop_region + regcache_mark_dirty + regcache_sync + __register_blkdev + __register_chrdev + register_chrdev_region + register_console + register_die_notifier + register_inet6addr_notifier + register_inetaddr_notifier + register_kernel_break_hook + register_netdev + register_netdevice + register_netdevice_notifier + register_oom_notifier + register_pernet_device + register_pernet_subsys + register_pm_notifier + register_reboot_notifier + register_restart_handler + register_shrinker + register_syscore_ops + register_sysctl + register_sysctl_table + register_virtio_device + register_virtio_driver + regmap_async_complete + regmap_bulk_read + regmap_bulk_write + regmap_get_device + regmap_multi_reg_write + regmap_multi_reg_write_bypassed + regmap_raw_read + regmap_raw_write + regmap_raw_write_async + regmap_read + regmap_register_patch + regmap_update_bits_base + regmap_write + regulator_bulk_disable + regulator_bulk_enable + regulator_disable + regulator_disable_regmap + regulator_enable + regulator_enable_regmap + regulator_get + regulator_get_optional + regulator_get_voltage_sel_regmap + regulator_is_enabled + regulator_is_enabled_regmap + regulator_list_voltage_linear + regulator_map_voltage_linear + regulator_notifier_call_chain + regulator_put + regulator_register + regulator_set_voltage + regulator_set_voltage_sel_regmap + regulator_unregister + regulatory_hint + regulatory_set_wiphy_regd + regulatory_set_wiphy_regd_sync + 
release_firmware + __release_region + remap_pfn_range + remap_vmalloc_range + remove_cpu + remove_proc_entry + remove_wait_queue + request_firmware + request_firmware_nowait + __request_percpu_irq + __request_region + request_threaded_irq + return_address + rfkill_alloc + rfkill_blocked + rfkill_destroy + rfkill_init_sw_state + rfkill_register + rfkill_resume_polling + rfkill_set_hw_state_reason + rfkill_unregister + rhashtable_free_and_destroy + rhashtable_insert_slow + rhltable_init + __rht_bucket_nested + rht_bucket_nested + rht_bucket_nested_insert + root_task_group + round_jiffies + round_jiffies_relative + round_jiffies_up + rps_needed + rtc_class_close + rtc_class_open + rtc_read_time + rtc_time64_to_tm + rtc_tm_to_time64 + rtc_update_irq + rtc_valid_tm + __rt_mutex_init + rt_mutex_lock + rt_mutex_trylock + rt_mutex_unlock + rtnl_is_locked + rtnl_lock + rtnl_trylock + rtnl_unlock + runqueues + sched_clock + sched_feat_keys + sched_setattr_nocheck + sched_set_fifo + sched_set_normal + sched_setscheduler + sched_setscheduler_nocheck + sched_show_task + sched_trace_cfs_rq_avg + sched_trace_cfs_rq_cpu + sched_trace_cfs_rq_path + sched_trace_rd_span + sched_trace_rq_avg_dl + sched_trace_rq_avg_irq + sched_trace_rq_avg_rt + sched_trace_rq_cpu + sched_uclamp_used + schedule + schedule_timeout + schedule_timeout_interruptible + scnprintf + scsi_autopm_get_device + scsi_autopm_put_device + scsi_block_requests + scsi_block_when_processing_errors + scsi_cmd_allowed + scsi_command_size_tbl + scsi_device_get + scsi_device_put + scsi_ioctl + scsi_ioctl_block_when_processing_errors + scsi_normalize_sense + __scsi_print_sense + scsi_register_interface + scsi_unblock_requests + sdev_prefix_printk + seq_hex_dump + seq_lseek + seq_open + __seq_open_private + seq_printf + seq_putc + seq_puts + seq_read + seq_release + seq_release_private + seq_write + set_blocksize + set_capacity + set_capacity_and_notify + set_cpus_allowed_ptr + set_freezable + set_page_dirty + set_page_dirty_lock + __SetPageMovable + set_user_nice + sg_alloc_table + sg_alloc_table_from_pages_segment + sg_free_table + sg_init_one + sg_init_table + sg_next + __sg_page_iter_next + __sg_page_iter_start + shmem_file_setup + simple_attr_open + simple_attr_read + simple_attr_release + simple_attr_write + simple_open + simple_read_from_buffer + simple_strtol + simple_strtoll + simple_strtoul + simple_strtoull + simple_write_to_buffer + single_open + single_open_size + single_release + skb_add_rx_frag + skb_checksum + skb_checksum_help + skb_clone + skb_clone_sk + skb_complete_wifi_ack + skb_copy + skb_copy_bits + skb_copy_expand + skb_dequeue + skb_dequeue_tail + skb_ensure_writable + __skb_get_hash + __skb_gso_segment + skb_pull + skb_push + skb_put + skb_queue_head + skb_queue_purge + skb_queue_tail + skb_realloc_headroom + skb_trim + skip_spaces + smp_call_function + smp_call_function_single + smp_call_on_cpu + snd_compr_stop_error + snd_ctl_add + snd_ctl_boolean_mono_info + snd_ctl_enum_info + snd_ctl_new1 + snd_ctl_notify + snd_info_create_card_entry + snd_jack_set_key + snd_pcm_format_physical_width + snd_pcm_format_width + snd_pcm_hw_constraint_list + snd_pcm_lib_free_pages + snd_pcm_lib_ioctl + snd_pcm_lib_malloc_pages + snd_pcm_lib_preallocate_pages + snd_pcm_period_elapsed + snd_soc_add_component_controls + snd_soc_add_dai_controls + snd_soc_bytes_tlv_callback + snd_soc_card_get_kcontrol + snd_soc_card_jack_new + snd_soc_component_disable_pin + snd_soc_component_force_enable_pin + snd_soc_component_set_jack + 
snd_soc_component_set_pll + snd_soc_component_set_sysclk + snd_soc_daifmt_clock_provider_from_bitmap + snd_soc_daifmt_parse_clock_provider_raw + snd_soc_daifmt_parse_format + snd_soc_dai_set_pll + snd_soc_dai_set_sysclk + snd_soc_dai_set_tdm_slot + snd_soc_dapm_add_routes + snd_soc_dapm_get_enum_double + snd_soc_dapm_get_volsw + snd_soc_dapm_ignore_suspend + snd_soc_dapm_kcontrol_widget + snd_soc_dapm_mixer_update_power + snd_soc_dapm_new_controls + snd_soc_dapm_put_enum_double + snd_soc_dapm_put_volsw + snd_soc_dapm_sync + snd_soc_get_enum_double + snd_soc_get_volsw + snd_soc_get_volsw_range + snd_soc_info_enum_double + snd_soc_info_volsw + snd_soc_info_volsw_range + snd_soc_new_compress + snd_soc_of_get_dai_link_codecs + snd_soc_of_get_dai_name + snd_soc_of_parse_card_name + snd_soc_params_to_bclk + snd_soc_put_enum_double + snd_soc_put_volsw + snd_soc_put_volsw_range + snd_soc_register_card + snd_soc_register_component + snd_soc_runtime_set_dai_fmt + snd_soc_unregister_card + snd_soc_unregister_component + snprintf + soc_device_register + __sock_create + sock_release + sock_wfree + softnet_data + sort + __spi_alloc_controller + spi_bus_lock + spi_bus_unlock + spi_controller_resume + spi_controller_suspend + spi_delay_exec + spi_finalize_current_message + spi_register_controller + __spi_register_driver + spi_setup + spi_sync + spi_sync_locked + spi_unregister_controller + split_page + spmi_controller_add + spmi_controller_alloc + spmi_controller_remove + __spmi_driver_register + sprintf + sprint_symbol + srcu_init_notifier_head + srcu_notifier_call_chain + srcu_notifier_chain_register + srcu_notifier_chain_unregister + sscanf + __stack_chk_fail + static_key_disable + static_key_slow_dec + static_key_slow_inc + stop_machine + strcasecmp + strcat + strchr + strcmp + strcpy + strcspn + stream_open + strim + strlcat + strlcpy + strlen + strncasecmp + strncat + strncmp + strncpy + strncpy_from_user + strndup_user + strnlen + strnstr + strpbrk + strrchr + strscpy + strsep + strspn + strstr + submit_bio + submit_bio_wait + subsys_system_register + suspend_set_ops + __sw_hweight16 + __sw_hweight32 + __sw_hweight64 + __sw_hweight8 + sync_file_create + sync_file_get_fence + synchronize_irq + synchronize_net + synchronize_rcu + syscon_regmap_lookup_by_phandle + sysctl_sched_features + sysctl_sched_latency + sysfs_add_file_to_group + sysfs_create_file_ns + sysfs_create_files + sysfs_create_group + sysfs_create_groups + sysfs_create_link + sysfs_emit + sysfs_emit_at + sysfs_notify + sysfs_remove_file_ns + sysfs_remove_group + sysfs_remove_groups + sysfs_remove_link + sysfs_streq + sysfs_update_group + sysrq_mask + system_freezable_wq + system_freezing_cnt + system_highpri_wq + system_long_wq + system_power_efficient_wq + system_state + system_unbound_wq + system_wq + sys_tz + task_active_pid_ns + __tasklet_hi_schedule + tasklet_init + tasklet_kill + __tasklet_schedule + tasklet_setup + tasklet_unlock_wait + task_may_not_preempt + __task_pid_nr_ns + task_rq_lock + tcpci_get_tcpm_port + tcpci_irq + tcpci_register_port + tcpci_unregister_port + tcpm_cc_change + tcpm_is_debouncing + tcpm_pd_hard_reset + tcpm_pd_receive + tcpm_pd_transmit_complete + tcpm_sink_frs + tcpm_sourcing_vbus + tcpm_update_sink_capabilities + tcpm_vbus_change + thermal_cdev_update + thermal_cooling_device_unregister + thermal_of_cooling_device_register + thermal_zone_device_disable + thermal_zone_device_enable + thermal_zone_device_register + thermal_zone_device_unregister + thermal_zone_device_update + thermal_zone_get_temp + 
thermal_zone_of_sensor_register + thermal_zone_of_sensor_unregister + thread_group_cputime_adjusted + tick_nohz_get_idle_calls_cpu + time64_to_tm + topology_set_thermal_pressure + _totalram_pages + touch_softlockup_watchdog + trace_event_buffer_commit + trace_event_buffer_reserve + trace_event_ignore_this_pid + trace_event_printf + trace_event_raw_init + trace_event_reg + trace_handle_return + __traceiter_android_rvh_arm64_serror_panic + __traceiter_android_rvh_cgroup_force_kthread_migration + __traceiter_android_rvh_check_preempt_wakeup + __traceiter_android_rvh_cpu_cgroup_online + __traceiter_android_rvh_cpu_overutilized + __traceiter_android_rvh_dequeue_task + __traceiter_android_rvh_die_kernel_fault + __traceiter_android_rvh_do_mem_abort + __traceiter_android_rvh_do_ptrauth_fault + __traceiter_android_rvh_do_sea + __traceiter_android_rvh_do_sp_pc_abort + __traceiter_android_rvh_do_undefinstr + __traceiter_android_rvh_enqueue_task + __traceiter_android_rvh_find_energy_efficient_cpu + __traceiter_android_rvh_irqs_disable + __traceiter_android_rvh_irqs_enable + __traceiter_android_rvh_panic_unhandled + __traceiter_android_rvh_post_init_entity_util_avg + __traceiter_android_rvh_preempt_disable + __traceiter_android_rvh_preempt_enable + __traceiter_android_rvh_sched_fork + __traceiter_android_rvh_select_task_rq_fair + __traceiter_android_rvh_select_task_rq_rt + __traceiter_android_rvh_set_cpus_allowed_by_task + __traceiter_android_rvh_set_iowait + __traceiter_android_rvh_set_task_cpu + __traceiter_android_rvh_typec_tcpci_get_vbus + __traceiter_android_rvh_uclamp_eff_get + __traceiter_android_rvh_ufs_complete_init + __traceiter_android_rvh_ufs_reprogram_all_keys + __traceiter_android_rvh_update_rt_rq_load_avg + __traceiter_android_rvh_util_est_update + __traceiter_android_vh_arch_set_freq_scale + __traceiter_android_vh_binder_restore_priority + __traceiter_android_vh_binder_set_priority + __traceiter_android_vh_cpu_idle_enter + __traceiter_android_vh_cpu_idle_exit + __traceiter_android_vh_dump_throttled_rt_tasks + __traceiter_android_vh_dup_task_struct + __traceiter_android_vh_enable_thermal_genl_check + __traceiter_android_vh_ipi_stop + __traceiter_android_vh_scheduler_tick + __traceiter_android_vh_setscheduler_uclamp + __traceiter_android_vh_sysrq_crash + __traceiter_android_vh_typec_store_partner_src_caps + __traceiter_android_vh_typec_tcpci_override_toggling + __traceiter_android_vh_typec_tcpm_get_timer + __traceiter_android_vh_typec_tcpm_log + __traceiter_android_vh_ufs_check_int_errors + __traceiter_android_vh_ufs_compl_command + __traceiter_android_vh_ufs_fill_prdt + __traceiter_android_vh_ufs_prepare_command + __traceiter_android_vh_ufs_send_command + __traceiter_android_vh_ufs_send_tm_command + __traceiter_android_vh_ufs_send_uic_command + __traceiter_android_vh_ufs_update_sdev + __traceiter_android_vh_ufs_update_sysfs + __traceiter_clock_set_rate + __traceiter_cpu_frequency + __traceiter_device_pm_callback_end + __traceiter_device_pm_callback_start + __traceiter_gpu_mem_total + __traceiter_mmap_lock_acquire_returned + __traceiter_mmap_lock_released + __traceiter_mmap_lock_start_locking + __traceiter_pelt_cfs_tp + __traceiter_pelt_dl_tp + __traceiter_pelt_irq_tp + __traceiter_pelt_rt_tp + __traceiter_pelt_se_tp + __traceiter_sched_cpu_capacity_tp + __traceiter_sched_overutilized_tp + __traceiter_sched_switch + __traceiter_sched_util_est_cfs_tp + __traceiter_sched_util_est_se_tp + __traceiter_suspend_resume + trace_output_call + __tracepoint_android_rvh_arm64_serror_panic + 
__tracepoint_android_rvh_cgroup_force_kthread_migration + __tracepoint_android_rvh_check_preempt_wakeup + __tracepoint_android_rvh_cpu_cgroup_online + __tracepoint_android_rvh_cpu_overutilized + __tracepoint_android_rvh_dequeue_task + __tracepoint_android_rvh_die_kernel_fault + __tracepoint_android_rvh_do_mem_abort + __tracepoint_android_rvh_do_ptrauth_fault + __tracepoint_android_rvh_do_sea + __tracepoint_android_rvh_do_sp_pc_abort + __tracepoint_android_rvh_do_undefinstr + __tracepoint_android_rvh_enqueue_task + __tracepoint_android_rvh_find_energy_efficient_cpu + __tracepoint_android_rvh_irqs_disable + __tracepoint_android_rvh_irqs_enable + __tracepoint_android_rvh_panic_unhandled + __tracepoint_android_rvh_post_init_entity_util_avg + __tracepoint_android_rvh_preempt_disable + __tracepoint_android_rvh_preempt_enable + __tracepoint_android_rvh_sched_fork + __tracepoint_android_rvh_select_task_rq_fair + __tracepoint_android_rvh_select_task_rq_rt + __tracepoint_android_rvh_set_cpus_allowed_by_task + __tracepoint_android_rvh_set_iowait + __tracepoint_android_rvh_set_task_cpu + __tracepoint_android_rvh_typec_tcpci_get_vbus + __tracepoint_android_rvh_uclamp_eff_get + __tracepoint_android_rvh_ufs_complete_init + __tracepoint_android_rvh_ufs_reprogram_all_keys + __tracepoint_android_rvh_update_rt_rq_load_avg + __tracepoint_android_rvh_util_est_update + __tracepoint_android_vh_arch_set_freq_scale + __tracepoint_android_vh_binder_restore_priority + __tracepoint_android_vh_binder_set_priority + __tracepoint_android_vh_cpu_idle_enter + __tracepoint_android_vh_cpu_idle_exit + __tracepoint_android_vh_dump_throttled_rt_tasks + __tracepoint_android_vh_dup_task_struct + __tracepoint_android_vh_enable_thermal_genl_check + __tracepoint_android_vh_ipi_stop + __tracepoint_android_vh_scheduler_tick + __tracepoint_android_vh_setscheduler_uclamp + __tracepoint_android_vh_sysrq_crash + __tracepoint_android_vh_typec_store_partner_src_caps + __tracepoint_android_vh_typec_tcpci_override_toggling + __tracepoint_android_vh_typec_tcpm_get_timer + __tracepoint_android_vh_typec_tcpm_log + __tracepoint_android_vh_ufs_check_int_errors + __tracepoint_android_vh_ufs_compl_command + __tracepoint_android_vh_ufs_fill_prdt + __tracepoint_android_vh_ufs_prepare_command + __tracepoint_android_vh_ufs_send_command + __tracepoint_android_vh_ufs_send_tm_command + __tracepoint_android_vh_ufs_send_uic_command + __tracepoint_android_vh_ufs_update_sdev + __tracepoint_android_vh_ufs_update_sysfs + __tracepoint_clock_set_rate + __tracepoint_cpu_frequency + __tracepoint_device_pm_callback_end + __tracepoint_device_pm_callback_start + __tracepoint_gpu_mem_total + __tracepoint_mmap_lock_acquire_returned + __tracepoint_mmap_lock_released + __tracepoint_mmap_lock_start_locking + __tracepoint_pelt_cfs_tp + __tracepoint_pelt_dl_tp + __tracepoint_pelt_irq_tp + __tracepoint_pelt_rt_tp + __tracepoint_pelt_se_tp + tracepoint_probe_register + tracepoint_probe_unregister + __tracepoint_sched_cpu_capacity_tp + __tracepoint_sched_overutilized_tp + __tracepoint_sched_switch + __tracepoint_sched_util_est_cfs_tp + __tracepoint_sched_util_est_se_tp + __tracepoint_suspend_resume + trace_print_array_seq + trace_print_bitmask_seq + trace_print_flags_seq + trace_print_hex_seq + trace_print_symbols_seq + trace_raw_output_prep + try_module_get + try_to_del_timer_sync + tty_flip_buffer_push + tty_insert_flip_string_fixed_flag + tty_kref_put + tty_port_tty_get + typec_switch_get_drvdata + typec_switch_register + typec_switch_unregister + uart_add_one_port + 
uart_console_device + uart_console_write + uart_get_baud_rate + uart_parse_options + uart_register_driver + uart_remove_one_port + uart_resume_port + uart_set_options + uart_suspend_port + uart_try_toggle_sysrq + uart_unregister_driver + uart_update_timeout + uart_write_wakeup + __ubsan_handle_cfi_check_fail_abort + uclamp_eff_value + __udelay + udp4_hwcsum + ufshcd_auto_hibern8_update + ufshcd_bkops_ctrl + ufshcd_hold + ufshcd_pltfrm_init + ufshcd_query_attr_retry + ufshcd_query_flag_retry + ufshcd_read_desc_param + ufshcd_release + ufshcd_remove + ufshcd_shutdown + ufshcd_system_resume + ufshcd_system_suspend + unlock_page + unmap_mapping_range + unpin_user_page + unregister_blkdev + __unregister_chrdev + unregister_chrdev_region + unregister_inet6addr_notifier + unregister_inetaddr_notifier + unregister_netdev + unregister_netdevice_many + unregister_netdevice_notifier + unregister_netdevice_queue + unregister_oom_notifier + unregister_pernet_device + unregister_pernet_subsys + unregister_pm_notifier + unregister_reboot_notifier + unregister_shrinker + unregister_syscore_ops + unregister_sysctl_table + unregister_virtio_device + unregister_virtio_driver + up + update_devfreq + ___update_load_avg + __update_load_avg_blocked_se + ___update_load_sum + update_rq_clock + up_read + up_write + usb_add_function + usb_add_hcd + usb_assign_descriptors + usb_copy_descriptors + __usb_create_hcd + usb_disabled + usb_enable_autosuspend + usb_ep_alloc_request + usb_ep_autoconfig + usb_ep_disable + usb_ep_enable + usb_ep_free_request + usb_ep_queue + usb_free_all_descriptors + usb_function_register + usb_function_unregister + usb_gadget_activate + usb_gadget_deactivate + usb_gadget_set_state + usb_gstrings_attach + usb_hcd_is_primary_hcd + usb_hcd_platform_shutdown + usb_hub_find_child + usb_interface_id + usb_os_desc_prepare_interf_dir + usb_otg_state_string + usb_put_function_instance + usb_put_hcd + usb_register_notify + usb_remove_hcd + usb_role_switch_get_drvdata + usb_role_switch_register + usb_role_switch_unregister + usb_speed_string + usb_string_id + usb_unregister_notify + __usecs_to_jiffies + usleep_range_state + v4l2_ctrl_handler_free + v4l2_ctrl_handler_init_class + v4l2_ctrl_log_status + v4l2_ctrl_new_custom + v4l2_ctrl_new_std + v4l2_ctrl_new_std_menu + v4l2_device_register + v4l2_device_unregister + v4l2_fh_add + v4l2_fh_del + v4l2_fh_exit + v4l2_fh_init + v4l2_fh_open + v4l2_m2m_buf_queue + v4l2_m2m_buf_remove + v4l2_m2m_ctx_init + v4l2_m2m_ctx_release + v4l2_m2m_fop_mmap + v4l2_m2m_fop_poll + v4l2_m2m_get_curr_priv + v4l2_m2m_get_vq + v4l2_m2m_init + v4l2_m2m_ioctl_dqbuf + v4l2_m2m_ioctl_querybuf + v4l2_m2m_ioctl_reqbufs + v4l2_m2m_ioctl_streamoff + v4l2_m2m_ioctl_streamon + v4l2_m2m_job_finish + v4l2_m2m_next_buf + v4l2_m2m_qbuf + v4l2_m2m_release + vabits_actual + vb2_buffer_done + vb2_common_vm_ops + vb2_create_framevec + vb2_destroy_framevec + vb2_dma_sg_memops + vb2_dqbuf + vb2_fop_mmap + vb2_fop_poll + vb2_fop_read + vb2_fop_release + vb2_ioctl_create_bufs + vb2_ioctl_dqbuf + vb2_ioctl_expbuf + vb2_ioctl_qbuf + vb2_ioctl_querybuf + vb2_ioctl_reqbufs + vb2_ioctl_streamoff + vb2_ioctl_streamon + vb2_mmap + vb2_ops_wait_finish + vb2_ops_wait_prepare + vb2_plane_cookie + vb2_plane_vaddr + vb2_poll + vb2_qbuf + vb2_querybuf + vb2_queue_init + vb2_queue_release + vb2_reqbufs + vb2_streamoff + vb2_streamon + vb2_vmalloc_memops + vb2_wait_for_all_buffers + verify_pkcs7_signature + vfree + video_devdata + video_device_alloc + video_device_release + video_device_release_empty + 
video_ioctl2 + __video_register_device + video_unregister_device + virtqueue_add_inbuf + virtqueue_add_outbuf + virtqueue_detach_unused_buf + virtqueue_get_buf + virtqueue_get_vring_size + virtqueue_kick + virtqueue_kick_prepare + virtqueue_notify + vmalloc + vmalloc_to_page + vmalloc_user + vmap + vmf_insert_pfn_prot + vm_iomap_memory + vm_map_pages + vm_map_ram + vm_unmap_ram + vprintk + vprintk_emit + vring_del_virtqueue + vring_interrupt + vring_new_virtqueue + vsnprintf + vunmap + vzalloc + vzalloc_node + wait_for_completion + wait_for_completion_interruptible + wait_for_completion_interruptible_timeout + wait_for_completion_timeout + wait_woken + __wake_up + __wake_up_locked + wake_up_process + wakeup_source_add + wakeup_source_create + wakeup_source_destroy + wakeup_source_register + wakeup_source_unregister + __wake_up_sync + __warn_printk + watchdog_init_timeout + watchdog_register_device + watchdog_set_restart_priority + watchdog_unregister_device + wiphy_free + wiphy_new_nm + wiphy_register + wiphy_unregister + wireless_nlevent_flush + woken_wake_function + work_busy + __xfrm_state_destroy + xfrm_state_lookup_byspi + xfrm_stateonly_find + xhci_gen_setup + xhci_init_driver + xhci_resume + xhci_run + xhci_suspend + zs_compact + zs_create_pool + zs_destroy_pool + zs_free + zs_get_total_pages + zs_huge_class_size + zs_malloc + zs_map_object + zs_pool_stats + zs_unmap_object diff --git a/android/abi_gki_aarch64_virtual_device b/android/abi_gki_aarch64_virtual_device new file mode 100644 index 000000000000..a22bc01a62cb --- /dev/null +++ b/android/abi_gki_aarch64_virtual_device @@ -0,0 +1,1383 @@ +[abi_symbol_list] +# commonly used symbols + alloc_etherdev_mqs + __alloc_pages + __alloc_skb + alloc_workqueue + amba_driver_register + amba_driver_unregister + __arch_copy_from_user + __arch_copy_to_user + arm64_const_caps_ready + arm64_use_ng_mappings + bcmp + bpf_trace_run2 + bpf_trace_run3 + bt_err + bt_info + bt_warn + build_skb + cancel_delayed_work_sync + cancel_work_sync + __cfi_slowpath_diag + __check_object_size + __class_create + class_destroy + clk_disable + clk_enable + clk_get_rate + clk_prepare + clk_set_rate + clk_unprepare + complete + __const_udelay + consume_skb + cpu_hwcap_keys + cpu_hwcaps + cpu_number + __cpu_online_mask + debugfs_attr_read + debugfs_attr_write + debugfs_create_devm_seqfile + debugfs_create_dir + debugfs_create_file + debugfs_create_u32 + debugfs_create_u8 + debugfs_remove + delayed_work_timer_fn + del_timer + destroy_workqueue + dev_driver_string + _dev_err + device_create + device_create_file + device_init_wakeup + device_remove_file + device_unregister + _dev_info + __dev_kfree_skb_any + devm_clk_get + devm_clk_hw_register + devm_ioremap + devm_ioremap_resource + devm_kfree + devm_kmalloc + devm_request_threaded_irq + _dev_notice + dev_queue_xmit + _dev_warn + dma_alloc_attrs + dma_buf_export + dma_fence_context_alloc + dma_fence_init + dma_fence_release + dma_fence_signal_locked + dma_free_attrs + dmam_alloc_attrs + dma_map_page_attrs + dma_map_sgtable + dma_set_coherent_mask + dma_set_mask + dma_sync_sg_for_device + dma_sync_single_for_cpu + dma_sync_single_for_device + dma_unmap_page_attrs + dma_unmap_sg_attrs + do_trace_netlink_extack + drm_add_modes_noedid + drm_atomic_get_crtc_state + drm_atomic_helper_check + drm_atomic_helper_check_plane_state + drm_atomic_helper_commit + drm_atomic_helper_connector_destroy_state + drm_atomic_helper_connector_duplicate_state + drm_atomic_helper_connector_reset + drm_atomic_helper_disable_plane + 
drm_atomic_helper_page_flip + drm_atomic_helper_set_config + drm_atomic_helper_shutdown + drm_atomic_helper_update_plane + drm_compat_ioctl + drm_connector_attach_encoder + drm_connector_cleanup + drm_connector_init + drm_crtc_arm_vblank_event + drm_crtc_cleanup + drm_crtc_handle_vblank + drm_crtc_init_with_planes + drm_crtc_send_vblank_event + drm_crtc_vblank_get + drm_crtc_vblank_off + drm_crtc_vblank_on + __drm_dbg + drm_debugfs_create_files + drm_dev_alloc + drm_dev_put + drm_dev_register + drm_dev_unregister + __drm_err + drm_gem_fb_create + drm_gem_mmap + drm_gem_prime_fd_to_handle + drm_gem_prime_handle_to_fd + drm_gem_prime_mmap + drm_helper_probe_single_connector_modes + drm_ioctl + drmm_mode_config_init + drm_mode_config_reset + drm_open + drm_poll + drm_read + drm_release + drm_set_preferred_mode + drm_simple_encoder_init + drm_vblank_init + ether_setup + ethtool_op_get_link + ethtool_op_get_ts_info + eth_type_trans + eth_validate_addr + event_triggers_call + fd_install + finish_wait + firmware_request_nowarn + flush_work + flush_workqueue + fput + free_irq + free_netdev + __free_pages + free_pages + get_device + __get_free_pages + get_random_bytes + get_unused_fd_flags + gic_nonsecure_priorities + gpiod_put + hci_alloc_dev_priv + __hci_cmd_sync + __hci_cmd_sync_ev + hci_free_dev + hci_recv_frame + hci_register_dev + hci_unregister_dev + hrtimer_cancel + hrtimer_forward + hrtimer_init + hrtimer_start_range_ns + ida_alloc_range + ida_free + ieee80211_alloc_hw_nm + ieee80211_beacon_cntdwn_is_complete + ieee80211_beacon_get_tim + ieee80211_csa_finish + ieee80211_free_hw + ieee80211_free_txskb + ieee80211_get_buffered_bc + ieee80211_get_hdrlen_from_skb + ieee80211_get_tx_rates + ieee80211_iterate_active_interfaces_atomic + ieee80211_queue_delayed_work + ieee80211_radar_detected + ieee80211_register_hw + ieee80211_send_bar + ieee80211_sta_register_airtime + ieee80211_stop_queues + ieee80211_stop_tx_ba_cb_irqsafe + ieee80211_tx_status_ext + ieee80211_unregister_hw + ieee80211_wake_queues + __init_swait_queue_head + init_timer_key + init_wait_entry + __init_waitqueue_head + input_alloc_absinfo + input_allocate_device + input_event + input_free_device + input_mt_init_slots + input_register_device + input_set_abs_params + input_unregister_device + __ioremap + iounmap + is_vmalloc_addr + jiffies + kasan_flag_enabled + kfree + kfree_skb_reason + kimage_voffset + __kmalloc + kmalloc_caches + kmem_cache_alloc_trace + kmemdup + kstrndup + kthread_create_on_node + kthread_park + kthread_should_stop + kthread_stop + kthread_unpark + ktime_get + ktime_get_with_offset + kvfree + kvmalloc_node + __list_add_valid + __list_del_entry_valid + __local_bh_enable_ip + memcpy + memmove + memset + memstart_addr + misc_deregister + misc_register + mod_timer + module_layout + __msecs_to_jiffies + msleep + __mutex_init + mutex_lock + mutex_lock_interruptible + mutex_unlock + __napi_alloc_skb + napi_complete_done + napi_disable + napi_enable + napi_gro_receive + __napi_schedule + napi_schedule_prep + netdev_err + netdev_info + netdev_rx_handler_register + netdev_rx_handler_unregister + netdev_upper_dev_unlink + netdev_warn + netif_carrier_off + netif_carrier_on + netif_device_attach + netif_device_detach + netif_napi_add + __netif_napi_del + netif_rx + netif_schedule_queue + netif_tx_stop_all_queues + netif_tx_wake_queue + net_ratelimit + nf_conntrack_destroy + __nla_parse + nla_put_64bit + nla_put + no_llseek + nonseekable_open + noop_llseek + nr_cpu_ids + of_device_is_compatible + of_find_property + 
of_property_read_variable_u32_array + page_frag_alloc_align + __page_frag_cache_drain + page_frag_free + param_ops_bool + param_ops_int + param_ops_uint + passthru_features_check + pci_disable_device + pci_enable_device + pci_find_capability + pci_find_next_capability + pci_iounmap + pci_read_config_byte + pci_read_config_dword + __pci_register_driver + pci_release_region + pci_release_selected_regions + pci_request_region + pci_request_selected_regions + pci_set_master + pci_unregister_driver + perf_trace_buf_alloc + perf_trace_run_bpf_submit + platform_device_add + platform_device_add_data + platform_device_alloc + platform_device_del + platform_device_put + platform_device_register_full + platform_device_unregister + __platform_driver_register + platform_driver_unregister + platform_get_irq + platform_get_resource + pm_wakeup_dev_event + preempt_schedule + preempt_schedule_notrace + prepare_to_wait_event + print_hex_dump + _printk + __pskb_pull_tail + ___pskb_trim + put_device + __put_page + __put_task_struct + put_unused_fd + queue_delayed_work_on + queue_work_on + ___ratelimit + _raw_spin_lock + _raw_spin_lock_bh + _raw_spin_lock_irq + _raw_spin_lock_irqsave + _raw_spin_trylock + _raw_spin_unlock + _raw_spin_unlock_bh + _raw_spin_unlock_irq + _raw_spin_unlock_irqrestore + __rcu_read_lock + __rcu_read_unlock + refcount_warn_saturate + register_netdev + register_netdevice + register_netdevice_notifier + register_virtio_device + register_virtio_driver + __regmap_init + regmap_write + release_firmware + remap_pfn_range + request_firmware + request_threaded_irq + rtnl_lock + rtnl_unlock + sched_set_fifo_low + schedule + schedule_timeout + seq_lseek + seq_printf + seq_puts + seq_read + serio_close + serio_interrupt + serio_open + serio_reconnect + __serio_register_driver + __serio_register_port + serio_unregister_driver + sg_alloc_table + sg_free_table + sg_init_one + sg_init_table + sg_miter_next + sg_miter_start + sg_miter_stop + sg_next + simple_attr_open + simple_attr_release + single_open + single_release + skb_add_rx_frag + skb_dequeue + __skb_pad + skb_pull + skb_push + skb_put + skb_queue_tail + skb_to_sgvec + skb_trim + skb_tstamp_tx + snprintf + softnet_data + sprintf + sscanf + __stack_chk_fail + strcasecmp + strchr + strcmp + strlcpy + strlen + strncpy + strscpy + __sw_hweight8 + sync_file_create + synchronize_irq + synchronize_net + sysfs_create_group + sysfs_remove_group + system_wq + tasklet_unlock_wait + trace_event_buffer_commit + trace_event_buffer_reserve + trace_event_ignore_this_pid + trace_event_printf + trace_event_raw_init + trace_event_reg + trace_handle_return + trace_raw_output_prep + __ubsan_handle_cfi_check_fail_abort + __udelay + unregister_netdev + unregister_netdevice_notifier + unregister_netdevice_queue + unregister_virtio_device + unregister_virtio_driver + usb_add_hcd + usb_alloc_urb + usb_anchor_urb + usb_bulk_msg + usb_control_msg + usb_create_hcd + usb_create_shared_hcd + usb_deregister + usb_disabled + usb_free_urb + usb_get_dev + usb_hcd_check_unlink_urb + usb_hcd_giveback_urb + usb_hcd_is_primary_hcd + usb_hcd_link_urb_to_ep + usb_hcd_poll_rh_status + usb_hcd_resume_root_hub + usb_hcd_unlink_urb_from_ep + usb_kill_anchored_urbs + usb_put_dev + usb_put_hcd + usb_register_driver + usb_remove_hcd + usb_reset_device + usb_submit_urb + usb_unanchor_urb + usleep_range_state + vabits_actual + vfree + virtio_break_device + virtio_check_driver_offered_feature + virtio_config_changed + virtio_device_freeze + virtio_device_restore + virtqueue_add_inbuf + 
virtqueue_add_outbuf + virtqueue_add_sgs + virtqueue_detach_unused_buf + virtqueue_disable_cb + virtqueue_enable_cb + virtqueue_get_avail_addr + virtqueue_get_buf + virtqueue_get_desc_addr + virtqueue_get_used_addr + virtqueue_get_vring_size + virtqueue_is_broken + virtqueue_kick + virtqueue_kick_prepare + virtqueue_notify + vmalloc_to_page + vring_create_virtqueue + vring_del_virtqueue + vring_interrupt + vring_transport_features + wait_for_completion + __wake_up + wake_up_process + __warn_printk + +# required by ambakmi.ko + amba_release_regions + amba_request_regions + clk_get + clk_put + serio_unregister_port + +# required by armmmci.ko + clk_round_rate + devm_of_iomap + devm_pinctrl_get + __devm_reset_control_get + dma_map_sg_attrs + dma_release_channel + dma_request_chan + gpiod_direction_input + gpiod_get + gpiod_get_value + gpiod_set_value + mmc_add_host + mmc_alloc_host + mmc_free_host + mmc_gpiod_request_cd + mmc_gpiod_request_ro + mmc_gpio_get_cd + mmc_gpio_get_ro + mmc_of_parse + mmc_regulator_get_supply + mmc_regulator_set_ocr + mmc_regulator_set_vqmmc + mmc_remove_host + mmc_request_done + mmc_send_tuning + of_get_property + pinctrl_lookup_state + pinctrl_pm_select_sleep_state + pinctrl_select_default_state + pinctrl_select_state + pm_runtime_force_resume + pm_runtime_force_suspend + __pm_runtime_idle + __pm_runtime_resume + pm_runtime_set_autosuspend_delay + __pm_runtime_use_autosuspend + regulator_disable + regulator_enable + reset_control_assert + reset_control_deassert + +# required by btintel.ko + bit_wait_timeout + bt_to_errno + hci_cmd_sync + out_of_line_wait_on_bit_timeout + request_firmware_direct + wake_up_bit + +# required by btusb.ko + btbcm_set_bdaddr + btbcm_setup_apple + btbcm_setup_patchram + device_set_wakeup_capable + __dev_kfree_skb_irq + disable_irq + disable_irq_nosync + enable_irq + gpiod_get_optional + gpiod_set_value_cansleep + hci_recv_diag + irq_modify_status + irq_set_irq_wake + ktime_get_mono_fast_ns + of_irq_get_byname + of_match_device + of_property_read_variable_u16_array + pm_runtime_allow + pm_runtime_forbid + __pm_runtime_suspend + pm_system_wakeup + usb_autopm_get_interface + usb_autopm_put_interface + usb_driver_claim_interface + usb_driver_release_interface + usb_enable_autosuspend + usb_get_from_anchor + usb_ifnum_to_if + usb_match_id + usb_queue_reset_device + usb_scuttle_anchored_urbs + usb_set_interface + +# required by clk-vexpress-osc.ko + clk_hw_set_rate_range + devm_of_clk_add_hw_provider + of_clk_hw_simple_get + of_property_read_string + regmap_read + +# required by dummy-cpufreq.ko + cpufreq_generic_attr + cpufreq_register_driver + cpufreq_unregister_driver + +# required by dummy_hcd.ko + ktime_get_ts64 + scnprintf + strstr + usb_add_gadget_udc + usb_del_gadget_udc + usb_ep_set_maxpacket_limit + usb_gadget_giveback_request + usb_gadget_udc_reset + +# required by e1000.ko + csum_ipv6_magic + csum_tcpudp_nofold + device_set_wakeup_enable + dql_completed + dql_reset + ethtool_convert_legacy_u32_to_link_mode + ethtool_convert_link_mode_to_legacy_u32 + _find_first_bit + _find_next_bit + msleep_interruptible + napi_get_frags + napi_gro_frags + __netdev_alloc_frag_align + param_array_ops + pci_clear_mwi + pci_enable_device_mem + pci_enable_wake + pci_ioremap_bar + pci_read_config_word + pci_save_state + pci_select_bars + pci_set_mwi + pci_set_power_state + pci_wake_from_d3 + pcix_get_mmrbc + pcix_set_mmrbc + pskb_expand_head + skb_copy_bits + system_state + vzalloc + +# required by failover.ko + netdev_master_upper_dev_link + 
rtnl_is_locked + +# required by goldfish_address_space.ko + memremap + memunmap + +# required by goldfish_battery.ko + power_supply_changed + power_supply_get_drvdata + power_supply_register + power_supply_unregister + +# required by goldfish_pipe.ko + pin_user_pages_fast + unpin_user_pages_dirty_lock + +# required by goldfish_sync.ko + dma_fence_default_wait + dma_fence_free + +# required by gs_usb.ko + alloc_candev_mqs + alloc_can_err_skb + alloc_can_skb + can_change_mtu + can_free_echo_skb + can_get_echo_skb + can_put_echo_skb + close_candev + free_candev + open_candev + register_candev + unregister_candev + usb_alloc_coherent + usb_free_coherent + +# required by hci_vhci.ko + _copy_from_iter + iov_iter_revert + skb_queue_head + skb_queue_purge + +# required by mac80211_hwsim.ko + alloc_netdev_mqs + __cfg80211_alloc_event_skb + __cfg80211_alloc_reply_skb + __cfg80211_send_event_skb + cfg80211_vendor_cmd_reply + dev_alloc_name + device_bind_driver + device_release_driver + dst_release + eth_mac_addr + genlmsg_put + genl_notify + genl_register_family + genl_unregister_family + ieee80211_probereq_get + ieee80211_ready_on_channel + ieee80211_remain_on_channel_expired + ieee80211_rx_irqsafe + ieee80211_scan_completed + ieee80211_tx_prepare_skb + ieee80211_tx_status_irqsafe + init_net + jiffies_to_msecs + __netdev_alloc_skb + netlink_broadcast + netlink_register_notifier + netlink_unicast + netlink_unregister_notifier + net_namespace_list + nla_memcpy + register_pernet_device + regulatory_hint + rhashtable_destroy + rhashtable_init + rhashtable_insert_slow + __rht_bucket_nested + rht_bucket_nested + rht_bucket_nested_insert + schedule_timeout_interruptible + skb_copy + skb_copy_expand + __skb_ext_put + unregister_pernet_device + wiphy_apply_custom_regulatory + +# required by mt76-usb.ko + usb_init_urb + usb_kill_urb + usb_poison_urb + usb_unpoison_urb + +# required by mt76.ko + debugfs_create_blob + debugfs_create_file_unsafe + devm_kmemdup + dev_set_threaded + idr_alloc + idr_remove + ieee80211_calc_rx_airtime + ieee80211_find_sta_by_ifaddr + ieee80211_get_key_rx_seq + ieee80211_next_txq + ieee80211_return_txq + ieee80211_rx_list + ieee80211_sta_eosp + ieee80211_sta_pspoll + ieee80211_sta_ps_transition + ieee80211_sta_uapsd_trigger + ieee80211_tx_dequeue + ieee80211_txq_schedule_start + init_dummy_netdev + __ioread32_copy + __iowrite32_copy + kthread_parkme + kthread_should_park + kvfree_call_rcu + led_classdev_register_ext + led_classdev_unregister + netif_receive_skb_list + of_get_child_by_name + of_get_mac_address + of_get_next_child + of_prop_next_string + pci_disable_link_state + pcie_capability_clear_and_set_word + pcie_capability_read_word + rfc1042_header + wiphy_read_of_freq_limits + +# required by mt76x02-lib.ko + bpf_trace_run1 + debugfs_create_bool + ieee80211_calc_tx_airtime + ieee80211_hdrlen + ieee80211_iter_keys_rcu + ieee80211_restart_hw + __kfifo_init + __tasklet_schedule + tasklet_setup + wiphy_to_ieee80211_hw + +# required by mt76x02-usb.ko + hrtimer_active + ieee80211_iterate_interfaces + system_highpri_wq + +# required by nd_virtio.ko + bio_alloc_bioset + bio_chain + bio_clone_blkg_association + fs_bio_set + submit_bio + +# required by net_failover.ko + call_netdevice_notifiers + dev_close + dev_get_stats + dev_mc_sync_multiple + dev_mc_unsync + dev_open + dev_set_mtu + dev_uc_sync_multiple + dev_uc_unsync + __ethtool_get_link_ksettings + netdev_change_features + netdev_increment_features + netdev_lower_state_changed + netdev_pick_tx + pci_bus_type + vlan_uses_dev + 
vlan_vid_add + vlan_vid_del + vlan_vids_add_by_dev + vlan_vids_del_by_dev + +# required by open-dice.ko + devm_memremap + devm_memunmap + of_reserved_mem_lookup + __platform_driver_probe + simple_read_from_buffer + vm_iomap_memory + +# required by pl111_drm.ko + __clk_get_name + clk_hw_get_parent + clk_hw_round_rate + drm_fb_cma_get_gem_addr + drm_gem_cma_dumb_create + drm_gem_cma_prime_import_sg_table + drm_kms_helper_poll_init + drm_of_find_panel_or_bridge + drm_panel_bridge_add_typed + drm_panel_bridge_connector + drm_panel_bridge_remove + drm_simple_display_pipe_attach_bridge + drm_simple_display_pipe_init + of_find_device_by_node + of_find_matching_node_and_match + of_find_node_opts_by_path + of_get_next_available_child + of_graph_get_next_endpoint + of_reserved_mem_device_init_by_idx + of_reserved_mem_device_release + regmap_update_bits_base + syscon_node_to_regmap + +# required by psmouse.ko + bus_register_notifier + bus_unregister_notifier + del_timer_sync + device_add_groups + device_remove_groups + i2c_adapter_type + i2c_bus_type + i2c_client_type + i2c_for_each_dev + i2c_new_scanned_device + i2c_unregister_device + i2c_verify_adapter + input_mt_assign_slots + input_mt_drop_unused + input_mt_report_finger_count + input_mt_report_pointer_emulation + input_mt_report_slot_state + input_mt_sync_frame + input_set_capability + kstrtobool + kstrtou8 + kstrtouint + ps2_begin_command + ps2_cmd_aborted + ps2_command + ps2_drain + ps2_end_command + ps2_handle_ack + ps2_handle_response + ps2_init + ps2_sendbyte + ps2_sliced_command + serio_rescan + serio_unregister_child_port + strncmp + strsep + +# required by pulse8-cec.ko + cec_allocate_adapter + cec_delete_adapter + cec_received_msg_ts + cec_register_adapter + cec_s_log_addrs + cec_s_phys_addr + cec_transmit_attempt_done_ts + cec_unregister_adapter + wait_for_completion_timeout + +# required by rtc-test.ko + add_timer + devm_rtc_allocate_device + __devm_rtc_register_device + ktime_get_real_seconds + rtc_time64_to_tm + rtc_tm_to_time64 + rtc_update_irq + +# required by system_heap.ko + dmabuf_page_pool_alloc + dmabuf_page_pool_create + dmabuf_page_pool_destroy + dmabuf_page_pool_free + dma_heap_add + dma_heap_get_dev + dma_heap_get_name + dma_sync_sg_for_cpu + __sg_page_iter_next + __sg_page_iter_start + vmalloc + vmap + vunmap + +# required by usbip-core.ko + iov_iter_kvec + param_ops_ulong + sock_recvmsg + +# required by vexpress-config.ko + devres_add + __devres_alloc_node + devres_free + of_find_compatible_node + of_get_next_parent + of_parse_phandle + of_platform_populate + of_root + regmap_exit + __usecs_to_jiffies + +# required by vexpress-sysreg.ko + bgpio_init + devm_gpiochip_add_data_with_key + devm_mfd_add_devices + +# required by vhci-hcd.ko + kernel_sendmsg + kernel_sock_shutdown + kstrtoint + kstrtoll + platform_bus + sockfd_lookup + sysfs_remove_link + usb_speed_string + +# required by virt_wifi.ko + cfg80211_connect_done + cfg80211_disconnected + cfg80211_inform_bss_data + cfg80211_put_bss + cfg80211_scan_done + __dev_get_by_index + _dev_printk + __module_get + module_put + netdev_upper_dev_link + netif_stacked_transfer_operstate + rtnl_link_register + rtnl_link_unregister + skb_clone + unregister_netdevice_many + wiphy_free + wiphy_new_nm + wiphy_register + wiphy_unregister + +# required by virtio-gpu.ko + __devm_request_region + dma_fence_match_context + dma_fence_wait_timeout + dma_resv_add_excl_fence + dma_resv_test_signaled + dma_resv_wait_timeout + drm_add_edid_modes + drm_aperture_remove_conflicting_pci_framebuffers 
+ drm_atomic_helper_crtc_destroy_state + drm_atomic_helper_crtc_duplicate_state + drm_atomic_helper_crtc_reset + drm_atomic_helper_damage_merged + drm_atomic_helper_dirtyfb + drm_atomic_helper_plane_destroy_state + drm_atomic_helper_plane_duplicate_state + drm_atomic_helper_plane_reset + drm_connector_attach_edid_property + drm_connector_register + drm_connector_unregister + drm_connector_update_edid_property + drm_cvt_mode + drm_dev_enter + drm_dev_exit + drm_dev_get + drm_dev_printk + drm_dev_set_unique + drm_dev_unplug + drm_do_get_edid + drm_framebuffer_init + drm_gem_create_mmap_offset + drm_gem_dmabuf_mmap + drm_gem_dmabuf_release + drm_gem_dmabuf_vmap + drm_gem_dmabuf_vunmap + drm_gem_fb_create_handle + drm_gem_fb_destroy + drm_gem_free_mmap_offset + drm_gem_handle_create + drm_gem_lock_reservations + drm_gem_map_attach + drm_gem_map_detach + drm_gem_map_dma_buf + drm_gem_object_free + drm_gem_object_lookup + drm_gem_object_release + drm_gem_prime_import + drm_gem_private_object_init + drm_gem_shmem_create + drm_gem_shmem_free + drm_gem_shmem_get_sg_table + drm_gem_shmem_mmap + drm_gem_shmem_pin + drm_gem_shmem_print_info + drm_gem_shmem_unpin + drm_gem_shmem_vmap + drm_gem_shmem_vunmap + drm_gem_unlock_reservations + drm_gem_unmap_dma_buf + drm_gem_vm_close + drm_gem_vm_open + drm_helper_hpd_irq_event + drm_helper_mode_fill_fb_struct + drm_kms_helper_hotplug_event + drm_mm_init + drm_mm_insert_node_in_range + drm_mm_print + drm_mm_remove_node + drm_mm_takedown + drm_mode_probed_add + drm_plane_cleanup + __drm_printfn_seq_file + __drm_puts_seq_file + drm_universal_plane_init + __get_task_comm + iomem_resource + kmalloc_order_trace + kmem_cache_alloc + kmem_cache_create + kmem_cache_destroy + kmem_cache_free + memdup_user + sync_file_get_fence + __traceiter_dma_fence_emit + __tracepoint_dma_fence_emit + vmemdup_user + vm_get_page_prot + ww_mutex_lock_interruptible + ww_mutex_unlock + +# required by virtio-rng.ko + hwrng_register + hwrng_unregister + wait_for_completion_killable + +# required by virtio_balloon.ko + adjust_managed_page_count + alloc_anon_inode + all_vm_events + balloon_aops + balloon_page_alloc + balloon_page_dequeue + balloon_page_enqueue + __ClearPageMovable + contig_page_data + init_on_free + init_pseudo + iput + kern_mount + kern_unmount + kill_anon_super + mutex_trylock + page_relinquish + page_reporting_register + page_reporting_unregister + register_oom_notifier + register_shrinker + __SetPageMovable + si_mem_available + si_meminfo + system_freezable_wq + unregister_oom_notifier + unregister_shrinker + virtqueue_disable_dma_api_for_buffers + vm_event_states + vm_node_stat + +# required by virtio_blk.ko + blk_cleanup_disk + blk_execute_rq + blk_get_request + __blk_mq_alloc_disk + blk_mq_alloc_tag_set + blk_mq_complete_request + blk_mq_end_request + blk_mq_free_tag_set + blk_mq_quiesce_queue + blk_mq_start_request + blk_mq_start_stopped_hw_queues + blk_mq_stop_hw_queue + blk_mq_unquiesce_queue + blk_mq_virtio_map_queues + blk_put_request + blk_queue_alignment_offset + blk_queue_flag_set + blk_queue_io_min + blk_queue_io_opt + blk_queue_logical_block_size + blk_queue_max_discard_sectors + blk_queue_max_discard_segments + blk_queue_max_hw_sectors + blk_queue_max_segments + blk_queue_max_segment_size + blk_queue_max_write_zeroes_sectors + blk_queue_physical_block_size + blk_queue_write_cache + blk_rq_map_kern + __blk_rq_map_sg + blk_status_to_errno + del_gendisk + device_add_disk + __register_blkdev + set_capacity_and_notify + set_disk_ro + sg_alloc_table_chained + 
sg_free_table_chained + string_get_size + __sysfs_match_string + unregister_blkdev + virtio_max_dma_size + +# required by virtio_console.ko + cdev_add + cdev_alloc + cdev_del + device_destroy + fasync_helper + freezing_slow_path + hvc_alloc + hvc_instantiate + hvc_kick + hvc_poll + hvc_remove + __hvc_resize + kill_fasync + kobject_uevent + pipe_lock + pipe_unlock + __refrigerator + __register_chrdev + __splice_from_pipe + system_freezing_cnt + unlock_page + __unregister_chrdev + +# required by virtio_mmio.ko + device_for_each_child + device_register + devm_platform_ioremap_resource + memparse + +# required by virtio_net.ko + bpf_dispatcher_xdp_func + bpf_master_redirect_enabled_key + bpf_prog_add + bpf_prog_put + bpf_prog_sub + bpf_stats_enabled_key + bpf_warn_invalid_xdp_action + __cpuhp_remove_state + __cpuhp_setup_state + __cpuhp_state_add_instance + __cpuhp_state_remove_instance + cpumask_next + cpumask_next_wrap + cpus_read_lock + cpus_read_unlock + eth_commit_mac_addr_change + eth_prepare_mac_addr_change + ethtool_sprintf + ethtool_virtdev_set_link_ksettings + flow_keys_basic_dissector + napi_consume_skb + netdev_notify_peers + netif_set_real_num_rx_queues + netif_set_real_num_tx_queues + __netif_set_xps_queue + __num_online_cpus + sched_clock + skb_coalesce_rx_frag + __skb_flow_dissect + skb_page_frag_refill + skb_partial_csum_set + __traceiter_xdp_exception + __tracepoint_xdp_exception + virtqueue_add_inbuf_ctx + virtqueue_enable_cb_delayed + virtqueue_enable_cb_prepare + virtqueue_get_buf_ctx + virtqueue_poll + xdp_convert_zc_to_xdp_frame + xdp_do_flush + xdp_do_redirect + xdp_master_redirect + xdp_return_frame + xdp_return_frame_rx_napi + xdp_rxq_info_reg + xdp_rxq_info_reg_mem_model + xdp_rxq_info_unreg + xdp_warn + +# required by virtio_pci.ko + irq_set_affinity_hint + pci_alloc_irq_vectors_affinity + pci_device_is_present + pci_disable_sriov + pci_enable_sriov + pci_find_ext_capability + pci_free_irq_vectors + pci_iomap + pci_irq_get_affinity + pci_irq_vector + pci_vfs_assigned + +# required by virtio_pci_modern_dev.ko + pci_iomap_range + +# required by virtio_pmem.ko + nvdimm_bus_register + nvdimm_bus_unregister + nvdimm_pmem_region_create + +# required by virtio_snd.ko + snd_card_free + snd_card_new + snd_card_register + snd_jack_new + snd_jack_report + snd_pcm_add_chmap_ctls + snd_pcm_format_physical_width + snd_pcm_hw_constraint_integer + snd_pcm_lib_ioctl + snd_pcm_new + snd_pcm_period_elapsed + snd_pcm_set_managed_buffer_all + snd_pcm_set_ops + wait_for_completion_interruptible_timeout + +# required by vkms.ko + crc32_le + __devm_drm_dev_alloc + devres_open_group + devres_release_group + drm_atomic_add_affected_planes + drm_atomic_helper_cleanup_planes + drm_atomic_helper_commit_hw_done + drm_atomic_helper_commit_modeset_disables + drm_atomic_helper_commit_modeset_enables + drm_atomic_helper_commit_planes + __drm_atomic_helper_crtc_destroy_state + __drm_atomic_helper_crtc_duplicate_state + __drm_atomic_helper_crtc_reset + drm_atomic_helper_fake_vblank + drm_atomic_helper_wait_for_flip_done + drm_calc_timestamping_constants + drm_crtc_accurate_vblank_count + drm_crtc_add_crc_entry + drm_crtc_vblank_put + drm_encoder_cleanup + drm_gem_cleanup_shadow_fb + __drm_gem_destroy_shadow_plane_state + __drm_gem_duplicate_shadow_plane_state + drm_gem_fb_get_obj + drm_gem_fb_vmap + drm_gem_fb_vunmap + drm_gem_prepare_shadow_fb + __drm_gem_reset_shadow_plane + drm_gem_shmem_dumb_create + drm_gem_shmem_prime_import_sg_table + drm_mode_object_get + drm_mode_object_put + 
__drmm_universal_plane_alloc + drm_writeback_connector_init + drm_writeback_queue_job + drm_writeback_signal_completion + +# required by vmw_vsock_virtio_transport.ko + sk_error_report + synchronize_rcu + virtio_transport_connect + virtio_transport_deliver_tap_pkt + virtio_transport_destruct + virtio_transport_dgram_allow + virtio_transport_dgram_bind + virtio_transport_dgram_dequeue + virtio_transport_dgram_enqueue + virtio_transport_do_socket_init + virtio_transport_free_pkt + virtio_transport_notify_buffer_size + virtio_transport_notify_poll_in + virtio_transport_notify_poll_out + virtio_transport_notify_recv_init + virtio_transport_notify_recv_post_dequeue + virtio_transport_notify_recv_pre_block + virtio_transport_notify_recv_pre_dequeue + virtio_transport_notify_send_init + virtio_transport_notify_send_post_enqueue + virtio_transport_notify_send_pre_block + virtio_transport_notify_send_pre_enqueue + virtio_transport_recv_pkt + virtio_transport_release + virtio_transport_seqpacket_dequeue + virtio_transport_seqpacket_enqueue + virtio_transport_seqpacket_has_data + virtio_transport_shutdown + virtio_transport_stream_allow + virtio_transport_stream_dequeue + virtio_transport_stream_enqueue + virtio_transport_stream_has_data + virtio_transport_stream_has_space + virtio_transport_stream_is_active + virtio_transport_stream_rcvhiwat + vsock_core_register + vsock_core_unregister + vsock_for_each_connected_socket + +# preserved by --additions-only + __alloc_percpu + bio_endio + bio_end_io_acct_remapped + bio_start_io_acct + __blk_alloc_disk + blk_queue_flag_clear + capable + __class_register + class_unregister + __cpu_possible_mask + crypto_alloc_base + crypto_comp_compress + crypto_comp_decompress + crypto_destroy_tfm + crypto_has_alg + dec_zone_page_state + disk_end_io_acct + disk_start_io_acct + down_read + down_write + flush_dcache_page + free_percpu + fsync_bdev + hex_asc_upper + hex_to_bin + idr_destroy + idr_find + idr_for_each + inc_zone_page_state + __init_rwsem + kstrdup + kstrtou16 + kstrtoull + memset64 + mutex_is_locked + netif_rx_ni + page_endio + page_mapping + __per_cpu_offset + _raw_read_lock + _raw_read_unlock + _raw_write_lock + _raw_write_unlock + set_capacity + sock_efree + strcpy + sysfs_streq + tty_hangup + tty_mode_ioctl + tty_register_ldisc + tty_unregister_ldisc + up_read + up_write + wait_on_page_bit diff --git a/android/abi_gki_protected_exports b/android/abi_gki_protected_exports new file mode 100644 index 000000000000..1c9cd8544faf --- /dev/null +++ b/android/abi_gki_protected_exports @@ -0,0 +1,517 @@ +__cfg80211_alloc_event_skb +__cfg80211_alloc_reply_skb +__cfg80211_radar_event +__cfg80211_send_event_skb +__hci_cmd_send +__hci_cmd_sync +__hci_cmd_sync_ev +__nfc_alloc_vendor_cmd_reply_skb +alloc_can_err_skb +alloc_can_skb +alloc_candev_mqs +alloc_canfd_skb +arc4_crypt +arc4_setkey +baswap +bridge_tunnel_header +bt_accept_dequeue +bt_accept_enqueue +bt_accept_unlink +bt_debugfs +bt_err +bt_err_ratelimited +bt_info +bt_procfs_cleanup +bt_procfs_init +bt_sock_ioctl +bt_sock_link +bt_sock_poll +bt_sock_reclassify_lock +bt_sock_recvmsg +bt_sock_register +bt_sock_stream_recvmsg +bt_sock_unlink +bt_sock_unregister +bt_sock_wait_ready +bt_sock_wait_state +bt_to_errno +bt_warn +bt_warn_ratelimited +btbcm_check_bdaddr +btbcm_finalize +btbcm_initialize +btbcm_patchram +btbcm_read_pcm_int_params +btbcm_set_bdaddr +btbcm_setup_apple +btbcm_setup_patchram +btbcm_write_pcm_int_params +can_bus_off +can_change_mtu +can_change_state +can_fd_dlc2len +can_fd_len2dlc 
+can_free_echo_skb +can_get_echo_skb +can_get_state_str +can_proto_register +can_proto_unregister +can_put_echo_skb +can_rx_offload_add_fifo +can_rx_offload_add_manual +can_rx_offload_add_timestamp +can_rx_offload_del +can_rx_offload_enable +can_rx_offload_get_echo_skb +can_rx_offload_irq_finish +can_rx_offload_irq_offload_fifo +can_rx_offload_irq_offload_timestamp +can_rx_offload_queue_sorted +can_rx_offload_queue_tail +can_rx_offload_threaded_irq_finish +can_rx_register +can_rx_unregister +can_send +can_skb_get_frame_len +can_sock_destruct +cfg80211_any_usable_channels +cfg80211_assoc_comeback +cfg80211_assoc_failure +cfg80211_auth_timeout +cfg80211_background_cac_abort +cfg80211_bss_color_notify +cfg80211_bss_flush +cfg80211_bss_iter +cfg80211_cac_event +cfg80211_calculate_bitrate +cfg80211_ch_switch_notify +cfg80211_ch_switch_started_notify +cfg80211_chandef_compatible +cfg80211_chandef_create +cfg80211_chandef_dfs_required +cfg80211_chandef_usable +cfg80211_chandef_valid +cfg80211_check_combinations +cfg80211_check_station_change +cfg80211_classify8021d +cfg80211_conn_failed +cfg80211_connect_done +cfg80211_control_port_tx_status +cfg80211_cqm_beacon_loss_notify +cfg80211_cqm_pktloss_notify +cfg80211_cqm_rssi_notify +cfg80211_cqm_txe_notify +cfg80211_crit_proto_stopped +cfg80211_del_sta_sinfo +cfg80211_disconnected +cfg80211_external_auth_request +cfg80211_find_elem_match +cfg80211_find_vendor_elem +cfg80211_free_nan_func +cfg80211_ft_event +cfg80211_get_bss +cfg80211_get_drvinfo +cfg80211_get_iftype_ext_capa +cfg80211_get_p2p_attr +cfg80211_get_station +cfg80211_gtk_rekey_notify +cfg80211_ibss_joined +cfg80211_iftype_allowed +cfg80211_inform_bss_data +cfg80211_inform_bss_frame_data +cfg80211_is_element_inherited +cfg80211_iter_combinations +cfg80211_merge_profile +cfg80211_mgmt_tx_status_ext +cfg80211_michael_mic_failure +cfg80211_nan_func_terminated +cfg80211_nan_match +cfg80211_new_sta +cfg80211_notify_new_peer_candidate +cfg80211_pmksa_candidate_notify +cfg80211_pmsr_complete +cfg80211_pmsr_report +cfg80211_port_authorized +cfg80211_probe_status +cfg80211_put_bss +cfg80211_ready_on_channel +cfg80211_ref_bss +cfg80211_reg_can_beacon +cfg80211_reg_can_beacon_relax +cfg80211_register_netdevice +cfg80211_remain_on_channel_expired +cfg80211_report_obss_beacon_khz +cfg80211_report_wowlan_wakeup +cfg80211_roamed +cfg80211_rx_assoc_resp +cfg80211_rx_control_port +cfg80211_rx_mgmt_ext +cfg80211_rx_mlme_mgmt +cfg80211_rx_spurious_frame +cfg80211_rx_unexpected_4addr_frame +cfg80211_rx_unprot_mlme_mgmt +cfg80211_scan_done +cfg80211_sched_scan_results +cfg80211_sched_scan_stopped +cfg80211_sched_scan_stopped_locked +cfg80211_send_layer2_update +cfg80211_shutdown_all_interfaces +cfg80211_sinfo_alloc_tid_stats +cfg80211_sta_opmode_change_notify +cfg80211_stop_iface +cfg80211_tdls_oper_request +cfg80211_tx_mgmt_expired +cfg80211_tx_mlme_mgmt +cfg80211_unlink_bss +cfg80211_unregister_wdev +cfg80211_update_owe_info_event +cfg80211_vendor_cmd_get_sender +cfg80211_vendor_cmd_reply +close_candev +free_candev +freq_reg_info +get_wiphy_regdom +h4_recv_buf +hci_alloc_dev_priv +hci_cmd_sync +hci_conn_check_secure +hci_conn_security +hci_conn_switch_role +hci_free_dev +hci_get_route +hci_mgmt_chan_register +hci_mgmt_chan_unregister +hci_recv_diag +hci_recv_frame +hci_register_cb +hci_register_dev +hci_release_dev +hci_reset_dev +hci_resume_dev +hci_set_fw_info +hci_set_hw_info +hci_suspend_dev +hci_uart_register_device +hci_uart_tx_wakeup +hci_uart_unregister_device +hci_unregister_cb +hci_unregister_dev 
+hidp_hid_driver +ieee80211_alloc_hw_nm +ieee80211_amsdu_to_8023s +ieee80211_ap_probereq_get +ieee80211_ave_rssi +ieee80211_beacon_cntdwn_is_complete +ieee80211_beacon_get_template +ieee80211_beacon_get_tim +ieee80211_beacon_loss +ieee80211_beacon_set_cntdwn +ieee80211_beacon_update_cntdwn +ieee80211_bss_get_elem +ieee80211_calc_rx_airtime +ieee80211_calc_tx_airtime +ieee80211_chandef_to_operating_class +ieee80211_channel_to_freq_khz +ieee80211_chswitch_done +ieee80211_color_change_finish +ieee80211_connection_loss +ieee80211_cqm_beacon_loss_notify +ieee80211_cqm_rssi_notify +ieee80211_csa_finish +ieee80211_ctstoself_duration +ieee80211_ctstoself_get +ieee80211_data_to_8023_exthdr +ieee80211_disable_rssi_reports +ieee80211_disconnect +ieee80211_enable_rssi_reports +ieee80211_find_sta +ieee80211_find_sta_by_ifaddr +ieee80211_free_hw +ieee80211_free_txskb +ieee80211_freq_khz_to_channel +ieee80211_generic_frame_duration +ieee80211_get_bssid +ieee80211_get_buffered_bc +ieee80211_get_channel_khz +ieee80211_get_fils_discovery_tmpl +ieee80211_get_hdrlen_from_skb +ieee80211_get_key_rx_seq +ieee80211_get_mesh_hdrlen +ieee80211_get_num_supported_channels +ieee80211_get_response_rate +ieee80211_get_tkip_p1k_iv +ieee80211_get_tkip_p2k +ieee80211_get_tkip_rx_p1k +ieee80211_get_tx_rates +ieee80211_get_unsol_bcast_probe_resp_tmpl +ieee80211_get_vht_max_nss +ieee80211_gtk_rekey_add +ieee80211_gtk_rekey_notify +ieee80211_hdrlen +ieee80211_ie_split_ric +ieee80211_iter_chan_contexts_atomic +ieee80211_iter_keys +ieee80211_iter_keys_rcu +ieee80211_iterate_active_interfaces_atomic +ieee80211_iterate_active_interfaces_mtx +ieee80211_iterate_interfaces +ieee80211_iterate_stations_atomic +ieee80211_key_mic_failure +ieee80211_key_replay +ieee80211_manage_rx_ba_offl +ieee80211_mandatory_rates +ieee80211_mark_rx_ba_filtered_frames +ieee80211_nan_func_match +ieee80211_nan_func_terminated +ieee80211_next_txq +ieee80211_nullfunc_get +ieee80211_operating_class_to_band +ieee80211_parse_p2p_noa +ieee80211_probereq_get +ieee80211_proberesp_get +ieee80211_pspoll_get +ieee80211_queue_delayed_work +ieee80211_queue_stopped +ieee80211_queue_work +ieee80211_radar_detected +ieee80211_radiotap_iterator_init +ieee80211_radiotap_iterator_next +ieee80211_rate_control_register +ieee80211_rate_control_unregister +ieee80211_ready_on_channel +ieee80211_register_hw +ieee80211_remain_on_channel_expired +ieee80211_remove_key +ieee80211_report_low_ack +ieee80211_report_wowlan_wakeup +ieee80211_request_smps +ieee80211_reserve_tid +ieee80211_restart_hw +ieee80211_resume_disconnect +ieee80211_return_txq +ieee80211_rts_duration +ieee80211_rts_get +ieee80211_rx_ba_timer_expired +ieee80211_rx_irqsafe +ieee80211_rx_list +ieee80211_rx_napi +ieee80211_s1g_channel_width +ieee80211_scan_completed +ieee80211_sched_scan_results +ieee80211_sched_scan_stopped +ieee80211_schedule_txq +ieee80211_send_bar +ieee80211_send_eosp_nullfunc +ieee80211_set_key_rx_seq +ieee80211_sta_block_awake +ieee80211_sta_eosp +ieee80211_sta_ps_transition +ieee80211_sta_pspoll +ieee80211_sta_register_airtime +ieee80211_sta_set_buffered +ieee80211_sta_uapsd_trigger +ieee80211_start_tx_ba_cb_irqsafe +ieee80211_start_tx_ba_session +ieee80211_stop_queue +ieee80211_stop_queues +ieee80211_stop_rx_ba_session +ieee80211_stop_tx_ba_cb_irqsafe +ieee80211_stop_tx_ba_session +ieee80211_tdls_oper_request +ieee80211_tkip_add_iv +ieee80211_tx_dequeue +ieee80211_tx_prepare_skb +ieee80211_tx_rate_update +ieee80211_tx_status +ieee80211_tx_status_8023 +ieee80211_tx_status_ext 
+ieee80211_tx_status_irqsafe +ieee80211_txq_airtime_check +ieee80211_txq_get_depth +ieee80211_txq_may_transmit +ieee80211_txq_schedule_start +ieee80211_unregister_hw +ieee80211_unreserve_tid +ieee80211_update_mu_groups +ieee80211_update_p2p_noa +ieee80211_vif_to_wdev +ieee80211_wake_queue +ieee80211_wake_queues +ieee802154_alloc_hw +ieee802154_free_hw +ieee802154_hdr_peek +ieee802154_hdr_peek_addrs +ieee802154_hdr_pull +ieee802154_hdr_push +ieee802154_max_payload +ieee802154_register_hw +ieee802154_rx_irqsafe +ieee802154_stop_queue +ieee802154_unregister_hw +ieee802154_wake_queue +ieee802154_xmit_complete +ieeee80211_obss_color_collision_notify +l2cap_add_psm +l2cap_chan_close +l2cap_chan_connect +l2cap_chan_create +l2cap_chan_del +l2cap_chan_list +l2cap_chan_put +l2cap_chan_send +l2cap_chan_set_defaults +l2cap_conn_get +l2cap_conn_put +l2cap_is_socket +l2cap_register_user +l2cap_unregister_user +l2tp_recv_common +l2tp_session_create +l2tp_session_dec_refcount +l2tp_session_delete +l2tp_session_get +l2tp_session_get_by_ifname +l2tp_session_get_nth +l2tp_session_inc_refcount +l2tp_session_register +l2tp_session_set_header_len +l2tp_sk_to_tunnel +l2tp_tunnel_create +l2tp_tunnel_dec_refcount +l2tp_tunnel_delete +l2tp_tunnel_get +l2tp_tunnel_get_nth +l2tp_tunnel_get_session +l2tp_tunnel_inc_refcount +l2tp_tunnel_register +l2tp_udp_encap_recv +l2tp_xmit_skb +lowpan_header_compress +lowpan_header_decompress +lowpan_nhc_add +lowpan_nhc_del +lowpan_register_netdev +lowpan_register_netdevice +lowpan_unregister_netdev +lowpan_unregister_netdevice +nfc_add_se +nfc_alloc_recv_skb +nfc_allocate_device +nfc_class +nfc_dep_link_is_up +nfc_driver_failure +nfc_find_se +nfc_fw_download_done +nfc_get_local_general_bytes +nfc_proto_register +nfc_proto_unregister +nfc_register_device +nfc_remove_se +nfc_se_connectivity +nfc_se_transaction +nfc_send_to_raw_sock +nfc_set_remote_general_bytes +nfc_target_lost +nfc_targets_found +nfc_tm_activated +nfc_tm_data_received +nfc_tm_deactivated +nfc_unregister_device +nfc_vendor_cmd_reply +of_can_transceiver +open_candev +ppp_channel_index +ppp_dev_name +ppp_input +ppp_input_error +ppp_output_wakeup +ppp_register_channel +ppp_register_compressor +ppp_register_net_channel +ppp_unit_number +ppp_unregister_channel +ppp_unregister_compressor +pppox_compat_ioctl +pppox_ioctl +pppox_unbind_sock +qca_read_soc_version +qca_send_pre_shutdown_cmd +qca_set_bdaddr +qca_set_bdaddr_rome +qca_uart_setup +rate_control_set_rates +reg_initiator_name +reg_query_regdb_wmm +register_candev +register_pppox_proto +regulatory_hint +regulatory_pre_cac_allowed +regulatory_set_wiphy_regd +regulatory_set_wiphy_regd_sync +rfc1042_header +rfkill_alloc +rfkill_blocked +rfkill_destroy +rfkill_find_type +rfkill_get_led_trigger_name +rfkill_init_sw_state +rfkill_pause_polling +rfkill_register +rfkill_resume_polling +rfkill_set_hw_state_reason +rfkill_set_led_trigger_name +rfkill_set_states +rfkill_set_sw_state +rfkill_unregister +safe_candev_priv +slhc_compress +slhc_free +slhc_init +slhc_remember +slhc_toss +slhc_uncompress +tipc_dump_done +tipc_dump_start +tipc_nl_sk_walk +tipc_sk_fill_sock_diag +unregister_candev +unregister_pppox_proto +usb_serial_claim_interface +usb_serial_deregister_drivers +usb_serial_generic_chars_in_buffer +usb_serial_generic_close +usb_serial_generic_get_icount +usb_serial_generic_open +usb_serial_generic_process_read_urb +usb_serial_generic_read_bulk_callback +usb_serial_generic_resume +usb_serial_generic_submit_read_urbs +usb_serial_generic_throttle 
+usb_serial_generic_tiocmiwait +usb_serial_generic_unthrottle +usb_serial_generic_wait_until_sent +usb_serial_generic_write +usb_serial_generic_write_bulk_callback +usb_serial_generic_write_start +usb_serial_handle_dcd_change +usb_serial_port_softint +usb_serial_register_drivers +usb_serial_resume +usb_serial_suspend +wdev_chandef +wdev_to_ieee80211_vif +wiphy_apply_custom_regulatory +wiphy_free +wiphy_new_nm +wiphy_read_of_freq_limits +wiphy_register +wiphy_rfkill_set_hw_state_reason +wiphy_rfkill_start_polling +wiphy_to_ieee80211_hw +wiphy_unregister +wpan_phy_find +wpan_phy_for_each +wpan_phy_free +wpan_phy_new +wpan_phy_register +wpan_phy_unregister \ No newline at end of file diff --git a/android/abi_gki_rockpi4 b/android/abi_gki_rockpi4 new file mode 100644 index 000000000000..57914f291962 --- /dev/null +++ b/android/abi_gki_rockpi4 @@ -0,0 +1,4 @@ +[abi_symbol_list] +# commonly used symbols + module_layout + __put_task_struct diff --git a/android/gki_aarch64_modules b/android/gki_aarch64_modules new file mode 100644 index 000000000000..8961cde5b6f9 --- /dev/null +++ b/android/gki_aarch64_modules @@ -0,0 +1,48 @@ +mm/zsmalloc.ko +drivers/block/zram/zram.ko +drivers/net/can/dev/can-dev.ko +drivers/net/can/vcan.ko +drivers/net/can/slcan.ko +drivers/net/ppp/ppp_generic.ko +drivers/net/ppp/bsd_comp.ko +drivers/net/ppp/ppp_deflate.ko +drivers/net/ppp/ppp_mppe.ko +drivers/net/ppp/pppox.ko +drivers/net/ppp/pptp.ko +drivers/net/slip/slhc.ko +drivers/usb/class/cdc-acm.ko +drivers/usb/serial/usbserial.ko +drivers/usb/serial/ftdi_sio.ko +drivers/bluetooth/hci_uart.ko +drivers/bluetooth/btsdio.ko +drivers/bluetooth/btbcm.ko +drivers/bluetooth/btqca.ko +net/8021q/8021q.ko +net/wireless/cfg80211.ko +net/can/can.ko +net/can/can-raw.ko +net/can/can-bcm.ko +net/can/can-gw.ko +net/bluetooth/bluetooth.ko +net/bluetooth/rfcomm/rfcomm.ko +net/bluetooth/hidp/hidp.ko +net/l2tp/l2tp_core.ko +net/l2tp/l2tp_ppp.ko +net/mac80211/mac80211.ko +net/tipc/tipc.ko +net/tipc/diag.ko +net/rfkill/rfkill.ko +net/6lowpan/6lowpan.ko +net/6lowpan/nhc_dest.ko +net/6lowpan/nhc_fragment.ko +net/6lowpan/nhc_hop.ko +net/6lowpan/nhc_ipv6.ko +net/6lowpan/nhc_mobility.ko +net/6lowpan/nhc_routing.ko +net/6lowpan/nhc_udp.ko +net/ieee802154/6lowpan/ieee802154_6lowpan.ko +net/ieee802154/ieee802154.ko +net/ieee802154/ieee802154_socket.ko +net/mac802154/mac802154.ko +net/nfc/nfc.ko +lib/crypto/libarc4.ko diff --git a/android/gki_protected_modules b/android/gki_protected_modules new file mode 100644 index 000000000000..3ae19d639b6c --- /dev/null +++ b/android/gki_protected_modules @@ -0,0 +1,47 @@ +drivers/bluetooth/btbcm.ko +drivers/bluetooth/btqca.ko +drivers/bluetooth/btsdio.ko +drivers/bluetooth/hci_uart.ko +drivers/net/can/dev/can-dev.ko +drivers/net/can/slcan.ko +drivers/net/can/vcan.ko +drivers/net/ppp/bsd_comp.ko +drivers/net/ppp/ppp_deflate.ko +drivers/net/ppp/ppp_generic.ko +drivers/net/ppp/ppp_mppe.ko +drivers/net/ppp/pppox.ko +drivers/net/ppp/pptp.ko +drivers/net/slip/slhc.ko +drivers/usb/class/cdc-acm.ko +drivers/usb/serial/ftdi_sio.ko +drivers/usb/serial/usbserial.ko +lib/crypto/libarc4.ko +net/6lowpan/6lowpan.ko +net/6lowpan/nhc_dest.ko +net/6lowpan/nhc_fragment.ko +net/6lowpan/nhc_hop.ko +net/6lowpan/nhc_ipv6.ko +net/6lowpan/nhc_mobility.ko +net/6lowpan/nhc_routing.ko +net/6lowpan/nhc_udp.ko +net/8021q/8021q.ko +net/bluetooth/bluetooth.ko +net/bluetooth/hidp/hidp.ko +net/bluetooth/rfcomm/rfcomm.ko +net/can/can.ko +net/can/can-bcm.ko +net/can/can-gw.ko +net/can/can-raw.ko +net/ieee802154/6lowpan/ieee802154_6lowpan.ko 
+net/ieee802154/ieee802154.ko +net/ieee802154/ieee802154_socket.ko +net/l2tp/l2tp_core.ko +net/l2tp/l2tp_ppp.ko +net/mac80211/mac80211.ko +net/mac802154/mac802154.ko +net/nfc/nfc.ko +net/rfkill/rfkill.ko +net/tipc/diag.ko +net/tipc/tipc.ko +net/wireless/cfg80211.ko + diff --git a/android/gki_system_dlkm_modules b/android/gki_system_dlkm_modules new file mode 100644 index 000000000000..56604a8f8aa5 --- /dev/null +++ b/android/gki_system_dlkm_modules @@ -0,0 +1,49 @@ +drivers/block/zram/zram.ko +drivers/bluetooth/btbcm.ko +drivers/bluetooth/btqca.ko +drivers/bluetooth/btsdio.ko +drivers/bluetooth/hci_uart.ko +drivers/net/can/dev/can-dev.ko +drivers/net/can/slcan.ko +drivers/net/can/vcan.ko +drivers/net/ppp/bsd_comp.ko +drivers/net/ppp/ppp_deflate.ko +drivers/net/ppp/ppp_generic.ko +drivers/net/ppp/ppp_mppe.ko +drivers/net/ppp/pppox.ko +drivers/net/ppp/pptp.ko +drivers/net/slip/slhc.ko +drivers/usb/class/cdc-acm.ko +drivers/usb/serial/ftdi_sio.ko +drivers/usb/serial/usbserial.ko +lib/crypto/libarc4.ko +mm/zsmalloc.ko +net/6lowpan/6lowpan.ko +net/6lowpan/nhc_dest.ko +net/6lowpan/nhc_fragment.ko +net/6lowpan/nhc_hop.ko +net/6lowpan/nhc_ipv6.ko +net/6lowpan/nhc_mobility.ko +net/6lowpan/nhc_routing.ko +net/6lowpan/nhc_udp.ko +net/8021q/8021q.ko +net/bluetooth/bluetooth.ko +net/bluetooth/hidp/hidp.ko +net/bluetooth/rfcomm/rfcomm.ko +net/can/can.ko +net/can/can-bcm.ko +net/can/can-gw.ko +net/can/can-raw.ko +net/ieee802154/6lowpan/ieee802154_6lowpan.ko +net/ieee802154/ieee802154.ko +net/ieee802154/ieee802154_socket.ko +net/l2tp/l2tp_core.ko +net/l2tp/l2tp_ppp.ko +net/mac80211/mac80211.ko +net/mac802154/mac802154.ko +net/nfc/nfc.ko +net/rfkill/rfkill.ko +net/tipc/diag.ko +net/tipc/tipc.ko +net/wireless/cfg80211.ko + diff --git a/arch/Kconfig b/arch/Kconfig index 5987363b41c2..4fc75f3ea5d7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -24,6 +24,13 @@ config KEXEC_ELF config HAVE_IMA_KEXEC bool +config ARCH_HAS_SUBPAGE_FAULTS + bool + help + Select if the architecture can check permissions at sub-page + granularity (e.g. arm64 MTE). The probe_user_*() functions + must be implemented. + config SET_FS bool @@ -713,10 +720,7 @@ config ARCH_SUPPORTS_CFI_CLANG config CFI_CLANG bool "Use Clang's Control Flow Integrity (CFI)" depends on LTO_CLANG && ARCH_SUPPORTS_CFI_CLANG - # Clang >= 12: - # - https://bugs.llvm.org/show_bug.cgi?id=46258 - # - https://bugs.llvm.org/show_bug.cgi?id=47479 - depends on CLANG_VERSION >= 120000 + depends on CLANG_VERSION >= 140000 select KALLSYMS help This option enables Clang’s forward-edge Control Flow Integrity @@ -1238,6 +1242,9 @@ config RELR config ARCH_HAS_MEM_ENCRYPT bool +config ARCH_HAS_MEM_RELINQUISH + bool + config ARCH_HAS_CC_PLATFORM bool @@ -1295,6 +1302,17 @@ config ARCH_HAS_ELFCORE_COMPAT config ARCH_HAS_PARANOID_L1D_FLUSH bool +config ARCH_HAVE_TRACE_MMIO_ACCESS + bool + +config ARCH_HAS_NONLEAF_PMD_YOUNG + bool + help + Architectures that select this option are capable of setting the + accessed bit in non-leaf PMD entries when using them as part of linear + address translations. Page table walkers that clear the accessed bit + may use this capability to reduce their search space. 
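In walker code the capability above reduces to one early test. A minimal hypothetical C sketch (not from this patch: pmd_range_unreferenced() is an invented name, while IS_ENABLED() and pmdp_test_and_clear_young() are the kernel's existing helpers, though the latter's availability varies by configuration):

#include <linux/mm.h>
#include <linux/pgtable.h>

/*
 * Skip a whole PTE range when the accessed bit of its non-leaf PMD
 * entry shows that nothing below it was used. Only meaningful when the
 * architecture selects ARCH_HAS_NONLEAF_PMD_YOUNG; otherwise the A-bit
 * of a table entry carries no information and every PTE must be scanned.
 */
static bool pmd_range_unreferenced(struct vm_area_struct *vma,
                                   unsigned long addr, pmd_t *pmdp)
{
        if (!IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
                return false;   /* cannot prune; scan the PTEs */

        /* Test and clear in one step so the next walk starts clean. */
        return !pmdp_test_and_clear_young(vma, addr, pmdp);
}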
+ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a8ae17f5740d..858bd83d0aa8 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -54,6 +54,7 @@ config ARM select GENERIC_ATOMIC64 if CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_IRQ_IPI if SMP + select ARCH_WANTS_IRQ_RAW if GENERIC_IRQ_IPI select GENERIC_CPU_AUTOPROBE select GENERIC_EARLY_IOREMAP select GENERIC_IDLE_POLL_SETUP diff --git a/arch/arm/OWNERS b/arch/arm/OWNERS new file mode 100644 index 000000000000..54f66d6eb2ee --- /dev/null +++ b/arch/arm/OWNERS @@ -0,0 +1 @@ +include ../arm64/OWNERS diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 91265e7ff672..adc0e318a1ea 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -77,10 +77,10 @@ CPPFLAGS_vmlinux.lds += -DTEXT_OFFSET="$(TEXT_OFFSET)" CPPFLAGS_vmlinux.lds += -DMALLOC_SIZE="$(MALLOC_SIZE)" compress-$(CONFIG_KERNEL_GZIP) = gzip -compress-$(CONFIG_KERNEL_LZO) = lzo -compress-$(CONFIG_KERNEL_LZMA) = lzma -compress-$(CONFIG_KERNEL_XZ) = xzkern -compress-$(CONFIG_KERNEL_LZ4) = lz4 +compress-$(CONFIG_KERNEL_LZO) = lzo_with_size +compress-$(CONFIG_KERNEL_LZMA) = lzma_with_size +compress-$(CONFIG_KERNEL_XZ) = xzkern_with_size +compress-$(CONFIG_KERNEL_LZ4) = lz4_with_size libfdt_objs := fdt_rw.o fdt_ro.o fdt_wip.o fdt.o diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h index bd61502b9715..8133c8c81a35 100644 --- a/arch/arm/include/asm/hypervisor.h +++ b/arch/arm/include/asm/hypervisor.h @@ -6,5 +6,6 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +void kvm_arm_init_hyp_services(void); #endif diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 1626dfc6f6ce..bc6b246ab55e 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -62,14 +62,8 @@ user_backtrace(struct frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct frame_tail __user *tail; - if (guest_cbs && guest_cbs->is_in_guest()) { - /* We don't support guest os callchain now */ - return; - } - perf_callchain_store(entry, regs->ARM_pc); if (!current->mm) @@ -99,44 +93,25 @@ callchain_trace(struct stackframe *fr, void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - if (guest_cbs && guest_cbs->is_in_guest()) { - /* We don't support guest os callchain now */ - return; - } - arm_get_current_stackframe(regs, &fr); walk_stackframe(&fr, callchain_trace, entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->is_in_guest()) - return guest_cbs->get_guest_ip(); - return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - if (guest_cbs && guest_cbs->is_in_guest()) { - if (guest_cbs->is_user_mode()) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } + if 
(user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; return misc; } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 23d369ab7e03..0cdaebb28b26 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -51,6 +51,10 @@ #define CREATE_TRACE_POINTS #include +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_raise); +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_entry); +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_exit); + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -727,7 +731,12 @@ void __init set_smp_ipi_range(int ipi_base, int n) WARN_ON(err); ipi_desc[i] = irq_to_desc(ipi_base + i); - irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); + + if (i != IPI_RESCHEDULE) + irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); + else + /* The rescheduling IPI is special... */ + irq_set_status_flags(ipi_base + i, IRQ_HIDDEN|IRQ_RAW); } ipi_irq_base = ipi_base; diff --git a/arch/arm/mm/kasan_init.c b/arch/arm/mm/kasan_init.c index 948ada4a2938..29d7233e5ad2 100644 --- a/arch/arm/mm/kasan_init.c +++ b/arch/arm/mm/kasan_init.c @@ -32,7 +32,7 @@ pmd_t tmp_pmd_table[PTRS_PER_PMD] __page_aligned_bss; static __init void *kasan_alloc_block(size_t size) { return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_KASAN, NUMA_NO_NODE); + MEMBLOCK_ALLOC_NOLEAKTRACE, NUMA_NO_NODE); } static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9d3cbe786f8d..1ddfda7b9125 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -10,6 +10,7 @@ config ARM64 select ACPI_SPCR_TABLE if ACPI select ACPI_PPTT if ACPI select ARCH_HAS_DEBUG_WX + select ARCH_BINFMT_ELF_EXTRA_PHDRS select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG @@ -25,9 +26,12 @@ config ARM64 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_IOREMAP_PHYS_HOOKS select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_MEM_RELINQUISH select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL @@ -45,6 +49,7 @@ config ARM64 select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_HAVE_TRACE_MMIO_ACCESS select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION @@ -122,6 +127,7 @@ config ARM64 select GENERIC_FIND_FIRST_BIT select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_IPI + select ARCH_WANTS_IRQ_RAW select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL @@ -135,6 +141,7 @@ config ARM64 select GENERIC_VDSO_TIME_NS select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND + select HAVE_MOD_ARCH_SPECIFIC if (ARM64_MODULE_PLTS || KVM) select HAVE_MOVE_PMD select HAVE_MOVE_PUD select HAVE_PCI @@ -185,6 +192,7 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_KVM select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS @@ -203,7 +211,7 @@ config ARM64 select IOMMU_DMA if IOMMU_SUPPORT select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOC if KASAN_GENERIC + select KASAN_VMALLOC if KASAN select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH @@
-221,6 +229,7 @@ config ARM64 select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD select TRACE_IRQFLAGS_SUPPORT select TRACE_IRQFLAGS_NMI_SUPPORT + select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT help ARM 64-bit (AArch64) Linux support. @@ -699,6 +708,130 @@ config ARM64_ERRATUM_1508412 If unsure, say Y. +config ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE + bool + +config ARM64_ERRATUM_2658417 + bool "Cortex-A510: 2658417: remove BF16 support due to incorrect result" + default y + help + This option adds the workaround for ARM Cortex-A510 erratum 2658417. + Affected Cortex-A510 (r0p0 to r1p1) may produce the wrong result for + BFMMLA or VMMLA instructions in rare circumstances when a pair of + A510 CPUs are using shared neon hardware. As the sharing is not + discoverable by the kernel, hide the BF16 HWCAP to indicate that + user-space should not be using these instructions. + + If unsure, say Y. + +config ARM64_ERRATUM_2119858 + bool "Cortex-A710: 2119858: workaround TRBE overwriting trace data in FILL mode" + default y + depends on CORESIGHT_TRBE + select ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE + help + This option adds the workaround for ARM Cortex-A710 erratum 2119858. + + Affected Cortex-A710 cores could overwrite up to 3 cache lines of trace + data at the base of the buffer (pointed to by TRBASER_EL1) in FILL mode in + the event of a WRAP event. + + Work around the issue by always making sure we move the TRBPTR_EL1 by + 256 bytes before enabling the buffer and filling the first 256 bytes of + the buffer with ETM ignore packets upon disabling. + + If unsure, say Y. + +config ARM64_ERRATUM_2139208 + bool "Neoverse-N2: 2139208: workaround TRBE overwriting trace data in FILL mode" + default y + depends on CORESIGHT_TRBE + select ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE + help + This option adds the workaround for ARM Neoverse-N2 erratum 2139208. + + Affected Neoverse-N2 cores could overwrite up to 3 cache lines of trace + data at the base of the buffer (pointed to by TRBASER_EL1) in FILL mode in + the event of a WRAP event. + + Work around the issue by always making sure we move the TRBPTR_EL1 by + 256 bytes before enabling the buffer and filling the first 256 bytes of + the buffer with ETM ignore packets upon disabling. + + If unsure, say Y. + +config ARM64_WORKAROUND_TSB_FLUSH_FAILURE + bool + +config ARM64_ERRATUM_2054223 + bool "Cortex-A710: 2054223: workaround TSB instruction failing to flush trace" + default y + select ARM64_WORKAROUND_TSB_FLUSH_FAILURE + help + Enable the workaround for ARM Cortex-A710 erratum 2054223. + + Affected cores may fail to flush the trace data on a TSB instruction when + the PE is in the trace prohibited state. This will cause the loss of a few + bytes of the cached trace. + + The workaround is to issue two TSB instructions consecutively on affected + cores. + + If unsure, say Y. + +config ARM64_ERRATUM_2067961 + bool "Neoverse-N2: 2067961: workaround TSB instruction failing to flush trace" + default y + select ARM64_WORKAROUND_TSB_FLUSH_FAILURE + help + Enable the workaround for ARM Neoverse-N2 erratum 2067961. + + Affected cores may fail to flush the trace data on a TSB instruction when + the PE is in the trace prohibited state. This will cause the loss of a few + bytes of the cached trace. + + The workaround is to issue two TSB instructions consecutively on affected + cores. + + If unsure, say Y.
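The two-TSB sequence those help texts describe is small enough to sketch in C. This mirrors the shape of the arm64 barrier code — "hint #18" is the TSB CSYNC encoding and the capability name follows the Kconfig symbol above — but treat the exact helper as an assumption rather than a quote from this patch:

#include <asm/cpufeature.h>

#define __tsb_csync()   asm volatile("hint #18" : : : "memory")  /* TSB CSYNC */

static inline void tsb_csync(void)
{
        /*
         * Cores affected by errata 2054223/2067961 may fail to flush
         * the cached trace on a single TSB; issue a spare one first.
         */
        if (cpus_have_final_cap(ARM64_WORKAROUND_TSB_FLUSH_FAILURE))
                __tsb_csync();
        __tsb_csync();
}

Placing the conditional TSB before the unconditional one keeps unaffected cores on a single-instruction path.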
+ +config ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE + bool + +config ARM64_ERRATUM_2253138 + bool "Neoverse-N2: 2253138: workaround TRBE writing to address out-of-range" + depends on CORESIGHT_TRBE + default y + select ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE + help + This option adds the workaround for ARM Neoverse-N2 erratum 2253138. + + Affected Neoverse-N2 cores might write to an out-of-range address that is + not reserved for TRBE. Under some conditions, the TRBE might generate a + write to the next virtually addressed page following the last page of the + TRBE address space (i.e., the TRBLIMITR_EL1.LIMIT), instead of wrapping + around to the base. + + Work around this in the driver by always making sure that there is a + page beyond the TRBLIMITR_EL1.LIMIT, within the space allowed for the TRBE. + + If unsure, say Y. + +config ARM64_ERRATUM_2224489 + bool "Cortex-A710: 2224489: workaround TRBE writing to address out-of-range" + depends on CORESIGHT_TRBE + default y + select ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE + help + This option adds the workaround for ARM Cortex-A710 erratum 2224489. + + Affected Cortex-A710 cores might write to an out-of-range address that is + not reserved for TRBE. Under some conditions, the TRBE might generate a + write to the next virtually addressed page following the last page of the + TRBE address space (i.e., the TRBLIMITR_EL1.LIMIT), instead of wrapping + around to the base. + + Work around this in the driver by always making sure that there is a + page beyond the TRBLIMITR_EL1.LIMIT, within the space allowed for the TRBE. + + If unsure, say Y. + config ARM64_ERRATUM_2441009 bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI" default y @@ -1639,6 +1772,21 @@ config ARM64_TLB_RANGE The feature introduces new assembly instructions, which are supported by binutils >= 2.30. +config ARM64_MPAM + bool "Enable support for MPAM" + help + Memory Partitioning and Monitoring is an optional extension + that allows the CPUs to mark load and store transactions with + labels for partition-id and performance-monitoring-group. + System components, such as the caches, can use the partition-id + to apply a performance policy. MPAM monitors can use the + partition-id and performance-monitoring-group to measure the + cache occupancy or data throughput. + + Use of this extension requires CPU support, support in the + memory system components (MSC), and a description from firmware + of where the MSCs are in the address space. + endmenu menu "ARMv8.5 architectural features" @@ -1727,6 +1875,7 @@ config ARM64_MTE depends on AS_HAS_LSE_ATOMICS # Required for tag checking in the uaccess routines depends on ARM64_PAN + select ARCH_HAS_SUBPAGE_FAULTS + select ARCH_USES_HIGH_VMA_FLAGS help Memory Tagging (part of the ARMv8.5 Extensions) provides @@ -1798,7 +1947,6 @@ config ARM64_SVE config ARM64_MODULE_PLTS bool "Use PLTs to allow module memory to spill over into vmalloc area" depends on MODULES - select HAVE_MOD_ARCH_SPECIFIC help Allocate PLTs when loading modules so that jumps and calls whose targets are too far away for their relative offsets to be encoded @@ -1932,6 +2080,12 @@ config CMDLINE_FROM_BOOTLOADER the boot loader doesn't provide any, the default kernel command string provided in CMDLINE will be used. +config CMDLINE_EXTEND + bool "Extend bootloader kernel arguments" + help + The command-line arguments provided by the boot loader will be + appended to the default kernel command string.
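Taken together with CMDLINE_FROM_BOOTLOADER above and CMDLINE_FORCE below, the three policies differ only in how the built-in and bootloader strings are combined. A simplified C model — assemble_cmdline() is an invented helper; the real logic lives in the FDT/atags parsers and is more careful about sizing and whitespace:

#include <linux/kconfig.h>
#include <linux/string.h>
#include <asm/setup.h>          /* COMMAND_LINE_SIZE */

/* Illustrative only: how the three CMDLINE_* choices combine strings. */
static void assemble_cmdline(char *dst, const char *bootloader_args)
{
        if (IS_ENABLED(CONFIG_CMDLINE_FORCE)) {
                /* Always use the built-in string; ignore the bootloader. */
                strscpy(dst, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
        } else if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) {
                /* Built-in string first, bootloader arguments appended. */
                strscpy(dst, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
                strlcat(dst, " ", COMMAND_LINE_SIZE);
                strlcat(dst, bootloader_args, COMMAND_LINE_SIZE);
        } else {
                /* CMDLINE_FROM_BOOTLOADER: built-in is only a fallback. */
                strscpy(dst, bootloader_args, COMMAND_LINE_SIZE);
                if (!dst[0])
                        strscpy(dst, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
        }
}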
+ config CMDLINE_FORCE bool "Always use the default kernel command string" help diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index d7772a4c34fe..3a67559e60ac 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -167,7 +167,6 @@ config ARCH_MEDIATEK config ARCH_MESON bool "Amlogic Platforms" select COMMON_CLK - select MESON_IRQ_GPIO help This enables support for the arm64 based Amlogic SoCs such as the s905, S905X/D, S912, A113X/D or S905X/D2 diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index c744b1e7b356..f01045b82646 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -148,7 +148,10 @@ libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a boot := arch/arm64/boot KBUILD_IMAGE := $(boot)/Image.gz +# Don't compile Image in mixed build with "all" target +ifndef KBUILD_MIXED_TREE all: Image.gz +endif Image: vmlinux @@ -189,6 +192,11 @@ archclean: $(Q)$(MAKE) $(clean)=arch/arm64/kernel/vdso $(Q)$(MAKE) $(clean)=arch/arm64/kernel/vdso32 +ifeq ($(CONFIG_KVM),y) +archscripts: + $(Q)$(MAKE) $(build)=arch/arm64/tools gen-hyprel +endif + ifeq ($(KBUILD_EXTMOD),) # We need to generate vdso-offsets.h before compiling certain files in kernel/. # In order to do that, we should use the archprepare target, but we can't since diff --git a/arch/arm64/Makefile.postlink b/arch/arm64/Makefile.postlink new file mode 100644 index 000000000000..8cf297fb7dd9 --- /dev/null +++ b/arch/arm64/Makefile.postlink @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-2.0 + +# +# This file is included by the generic Kbuild makefile to permit the +# architecture to perform postlink actions on vmlinux and any .ko module file. +# In this case, we only need it for fips140.ko, which needs some postprocessing +# for the integrity check mandated by FIPS. This involves making copies of the +# relocation sections so that the module will have access to them at +# initialization time, and calculating and injecting a HMAC digest into the +# module. All other targets are NOPs. 
+# + +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include scripts/Kbuild.include + +CMD_FIPS140_GEN_HMAC = crypto/fips140_gen_hmac +quiet_cmd_gen_hmac = HMAC $@ + cmd_gen_hmac = $(OBJCOPY) $@ \ + --dump-section=$(shell $(READELF) -SW $@|grep -Eo '\.rela\.text\S*')=$@.rela.text \ + --dump-section=$(shell $(READELF) -SW $@|grep -Eo '\.rela\.rodata\S*')=$@.rela.rodata && \ + $(OBJCOPY) $@ \ + --add-section=.init.rela.text=$@.rela.text \ + --add-section=.init.rela.rodata=$@.rela.rodata \ + --set-section-flags=.init.rela.text=alloc,readonly \ + --set-section-flags=.init.rela.rodata=alloc,readonly && \ + $(CMD_FIPS140_GEN_HMAC) $@ + +# `@true` prevents complaints when there is nothing to be done + +vmlinux: FORCE + @true + +$(objtree)/crypto/fips140.ko: FORCE + $(call cmd,gen_hmac) + +%.ko: FORCE + @true + +clean: + rm -f $(objtree)/crypto/fips140.ko.rela.* + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/arm64/OWNERS b/arch/arm64/OWNERS new file mode 100644 index 000000000000..f362e24fd9cf --- /dev/null +++ b/arch/arm64/OWNERS @@ -0,0 +1,4 @@ +per-file crypto/**=file:/crypto/OWNERS +per-file {include,kernel,kvm,lib}/**=mzyngier@google.com,willdeacon@google.com +per-file mm/**=file:/mm/OWNERS +per-file net/**=file:/net/OWNERS diff --git a/arch/arm64/boot/dts/amlogic/Makefile b/arch/arm64/boot/dts/amlogic/Makefile index faa0a79a34f5..0f4b503a99d8 100644 --- a/arch/arm64/boot/dts/amlogic/Makefile +++ b/arch/arm64/boot/dts/amlogic/Makefile @@ -1,12 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 dtb-$(CONFIG_ARCH_MESON) += meson-axg-s400.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-sei510.dtb +dtb-$(CONFIG_ARCH_MESON) += meson-g12a-sei510-android.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-u200.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-x96-max.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gsking-x.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gtking.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gtking-pro.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-a311d-khadas-vim3.dtb +dtb-$(CONFIG_ARCH_MESON) += meson-g12b-a311d-khadas-vim3-android.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-s922x-khadas-vim3.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-odroid-n2.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-odroid-n2-plus.dtb @@ -50,7 +52,9 @@ dtb-$(CONFIG_ARCH_MESON) += meson-gxm-vega-s96.dtb dtb-$(CONFIG_ARCH_MESON) += meson-gxm-wetek-core2.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-bananapi-m5.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-khadas-vim3l.dtb +dtb-$(CONFIG_ARCH_MESON) += meson-sm1-khadas-vim3l-android.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-odroid-c4.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-odroid-hc4.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-sei610.dtb +dtb-$(CONFIG_ARCH_MESON) += meson-sm1-sei610-android.dtb dtb-$(CONFIG_ARCH_MESON) += meson-a1-ad401.dtb diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-sei510-android.dts b/arch/arm64/boot/dts/amlogic/meson-g12a-sei510-android.dts new file mode 100644 index 000000000000..2f89a01c04b5 --- /dev/null +++ b/arch/arm64/boot/dts/amlogic/meson-g12a-sei510-android.dts @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2020 BayLibre SAS. All rights reserved. 
+ */ + +/dts-v1/; +/plugin/; + +#include +#include +#include +#include +#include + +/ { + compatible = "seirobotics,sei510", "amlogic,g12a"; + model = "SEI Robotics SEI510"; + fragment@101 { + target-path = "/"; + + __overlay__ { + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ramoops@d000000 { + compatible = "ramoops"; + reg = <0x0 0x0d000000 0x0 0x00100000>; + record-size = <0x8000>; + console-size = <0x8000>; + ftrace-size = <0x0>; + pmsg-size = <0x8000>; + }; + }; + + adc_keys { + button-onoff { + linux,code = ; + }; + }; + + cvbs-connector { + status = "disabled"; + }; + }; + }; +}; + +&vddao_3v3_t { + gpio-open-drain; +}; + +&uart_A { + bluetooth { + interrupt-parent = <&gpio_intc>; + interrupts = <95 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "host-wakeup"; + }; +}; diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-a311d-khadas-vim3-android.dts b/arch/arm64/boot/dts/amlogic/meson-g12b-a311d-khadas-vim3-android.dts new file mode 100644 index 000000000000..3ab19e275321 --- /dev/null +++ b/arch/arm64/boot/dts/amlogic/meson-g12b-a311d-khadas-vim3-android.dts @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2019 BayLibre SAS. All rights reserved. + */ + +/dts-v1/; +/plugin/; + +#include +#include +#include +#include +#include + +/ { + compatible = "khadas,vim3", "amlogic,a311d", "amlogic,g12b"; + model = "Khadas VIM3"; + fragment@101 { + target-path = "/"; + __overlay__ { + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ramoops@d000000 { + compatible = "ramoops"; + reg = <0x0 0x0d000000 0x0 0x00100000>; + record-size = <0x8000>; + console-size = <0x8000>; + ftrace-size = <0x0>; + pmsg-size = <0x8000>; + }; + }; + + }; + }; +}; + +&vcc_5v { + gpio-open-drain; +}; + +&uart_C { + status = "okay"; + pinctrl-0 = <&uart_c_pins>; + pinctrl-names = "default"; +}; + +&emmc_pwrseq{ + status = "okay"; +}; + +&sd_emmc_a { + /* WiFi firmware requires power to be kept while in suspend */ + keep-power-in-suspend; +}; \ No newline at end of file diff --git a/arch/arm64/boot/dts/amlogic/meson-sm1-khadas-vim3l-android.dts b/arch/arm64/boot/dts/amlogic/meson-sm1-khadas-vim3l-android.dts new file mode 100644 index 000000000000..4b06bf24fc48 --- /dev/null +++ b/arch/arm64/boot/dts/amlogic/meson-sm1-khadas-vim3l-android.dts @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2019 BayLibre SAS. All rights reserved. 
+ */ + +/dts-v1/; +/plugin/; + +#include +#include +#include +#include +#include + +/ { + compatible = "khadas,vim3l", "amlogic,sm1"; + model = "Khadas VIM3L"; + fragment@101 { + target-path = "/"; + __overlay__ { + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ramoops@d000000 { + compatible = "ramoops"; + reg = <0x0 0x0d000000 0x0 0x00100000>; + record-size = <0x8000>; + console-size = <0x8000>; + ftrace-size = <0x0>; + pmsg-size = <0x8000>; + }; + }; + }; + }; +}; + +&vcc_5v { + gpio-open-drain; +}; + +&uart_A { + bluetooth { + interrupt-parent = <&gpio_intc>; + interrupts = <95 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "host-wakeup"; + }; +}; + +&uart_C { + status = "disabled"; + pinctrl-0 = <&uart_c_pins>; + pinctrl-names = "default"; +}; + +&emmc_pwrseq{ + status = "okay"; +}; + +&sd_emmc_a { + /* WiFi firmware requires power to be kept while in suspend */ + keep-power-in-suspend; +}; + +&spicc1 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&spicc1_pins>; + cs-gpios = <&gpio GPIOH_6 GPIO_ACTIVE_LOW>; + #address-cells = <1>; + #size-cells = <0>; + + spidev@0 { + compatible = "rohm,dh2228fv"; + reg = <0>; + spi-max-frequency = <500000>; + status = "okay"; + }; + + neonkey@0 { + compatible = "nanohub"; + reg = <0>; + spi-max-frequency = <500000>; + + sensorhub,nreset-gpio = <&gpio GPIOA_0 0>; + sensorhub,boot0-gpio = <&gpio GPIOA_3 0>; /* Fake */ + sensorhub,wakeup-gpio = <&gpio GPIOA_2 0>; /* A2 -> PB9 */ + sensorhub,irq1-gpio = <&gpio GPIOA_1 0>; /* A1 -> PB5 */ + interrupt-parent = <&gpio_intc>; + interrupts = <62 IRQ_TYPE_EDGE_RISING>; /* A1 */ + /* sensorhub,spi-cs-gpio = <&gpio GPIOH_6 GPIO_ACTIVE_LOW>; Optional */ + sensorhub,bl-addr = <0x08000000>; + sensorhub,kernel-addr = <0x0800C000>; + sensorhub,shared-addr = <0x08040000>; + sensorhub,flash-banks = <0 0x08000000 0x04000>, + <3 0x0800C000 0x04000>, + <4 0x08010000 0x10000>, + <5 0x08020000 0x20000>, + <6 0x08040000 0x20000>, + <7 0x08060000 0x20000>; + sensorhub,num-flash-banks = <6>; + status = "disabled"; + }; + + argonkey@0 { + compatible = "nanohub"; + reg = <0>; + spi-max-frequency = <500000>; + spi-cpol; + + sensorhub,nreset-gpio = <&gpio GPIOA_0 0>; + sensorhub,boot0-gpio = <&gpio GPIOA_3 0>; + sensorhub,wakeup-gpio = <&gpio GPIOA_1 0>; /* A1 -> PA0 */ + sensorhub,irq1-gpio = <&gpio GPIOA_2 0>; /* A2 -> PA1 */ + interrupt-parent = <&gpio_intc>; + interrupts = <63 IRQ_TYPE_EDGE_RISING>; /* A2 */ + sensorhub,bl-addr = <0x08000000>; + sensorhub,kernel-addr = <0x0800C000>; + sensorhub,num-flash-banks = <4>; + sensorhub,flash-banks = <0 0x08000000 0x04000>, + <3 0x0800C000 0x04000>, + <4 0x08010000 0x10000>, + <5 0x08020000 0x20000>; + sensorhub,shared-addr = <0x08040000>; + sensorhub,num-shared-flash-banks = <6>; + sensorhub,shared-flash-banks = <6 0x08040000 0x20000>, + <7 0x08060000 0x20000>, + <8 0x08080000 0x20000>, + <9 0x080A0000 0x20000>, + <10 0x080C0000 0x20000>, + <11 0x080E0000 0x20000>; + status = "disabled"; + }; +}; diff --git a/arch/arm64/boot/dts/amlogic/meson-sm1-sei610-android.dts b/arch/arm64/boot/dts/amlogic/meson-sm1-sei610-android.dts new file mode 100644 index 000000000000..363d6c4cbc70 --- /dev/null +++ b/arch/arm64/boot/dts/amlogic/meson-sm1-sei610-android.dts @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2020 BayLibre SAS. All rights reserved. 
+ */ + +/dts-v1/; +/plugin/; + +#include +#include +#include +#include +#include + +/ { + compatible = "seirobotics,sei610", "amlogic,sm1"; + model = "SEI Robotics SEI610"; + fragment@101 { + target-path = "/"; + __overlay__ { + + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ramoops@d000000 { + compatible = "ramoops"; + reg = <0x0 0x0d000000 0x0 0x00100000>; + record-size = <0x8000>; + console-size = <0x8000>; + ftrace-size = <0x0>; + pmsg-size = <0x8000>; + }; + }; + }; + }; +}; + +&vddao_3v3_t { + gpio-open-drain; +}; + +&emmc_pwrseq { + status = "okay"; +}; + +&sd_emmc_a { + /* WiFi firmware requires power to be kept while in suspend */ + keep-power-in-suspend; +}; + +&uart_C { + status = "disabled"; + pinctrl-0 = <&uart_c_pins>; + pinctrl-names = "default"; +}; + +&spicc0 { + status = "disabled"; + pinctrl-names = "default"; + pinctrl-0 = <&spicc0_x_pins>; + cs-gpios = <&gpio GPIOX_10 GPIO_ACTIVE_LOW>; + #address-cells = <1>; + #size-cells = <0>; + + spidev@0 { + compatible = "rohm,dh2228fv"; + reg = <0>; + spi-max-frequency = <500000>; + status = "disabled"; + }; +}; diff --git a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts index 0ce2d36ab257..c35cadea0a91 100644 --- a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts +++ b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts @@ -16,6 +16,8 @@ / { model = "Qualcomm Technologies, Inc. Robotics RB5"; compatible = "qcom,qrb5165-rb5", "qcom,sm8250"; + qcom,msm-id = <455 0x20001>; + qcom,board-id = <11 3>; aliases { serial0 = &uart12; diff --git a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts index 146d3cd3f1b3..6092b91cd939 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts +++ b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts @@ -17,6 +17,8 @@ / { model = "Thundercomm Dragonboard 845c"; compatible = "thundercomm,db845c", "qcom,sdm845"; + qcom,msm-id = <341 0x20001>; + qcom,board-id = <8 0>; aliases { serial0 = &uart9; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4b.dts b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4b.dts index 6c63e617063c..9f41a7f83ead 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4b.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4b.dts @@ -31,7 +31,7 @@ }; &uart0 { - status = "okay"; + status = "disabled"; bluetooth { compatible = "brcm,bcm43438-bt"; diff --git a/arch/arm64/configs/16k_gki.fragment b/arch/arm64/configs/16k_gki.fragment new file mode 100644 index 000000000000..b923493edf18 --- /dev/null +++ b/arch/arm64/configs/16k_gki.fragment @@ -0,0 +1,3 @@ +CONFIG_ARM64_16K_PAGES=y +# b/241785095 +# CONFIG_INCREMENTAL_FS is not set diff --git a/arch/arm64/configs/amlogic_gki.fragment b/arch/arm64/configs/amlogic_gki.fragment new file mode 100644 index 000000000000..e577f550c46c --- /dev/null +++ b/arch/arm64/configs/amlogic_gki.fragment @@ -0,0 +1,142 @@ +# +# Generic drivers/frameworks +# +CONFIG_COMMON_CLK_PWM=m +CONFIG_REGULATOR_PWM=m +CONFIG_PWRSEQ_EMMC=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_USB_DWC2=m +CONFIG_LEDS_GPIO=m + +# +# Networking +# +CONFIG_REALTEK_PHY=m +CONFIG_STMMAC_ETH=m +CONFIG_STMMAC_PLATFORM=m + +# +# WLAN +# +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_BRCMUTIL=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_SDIO=y + +# +# Amlogic +# +CONFIG_ARCH_MESON=y +CONFIG_SERIAL_MESON=m +CONFIG_SERIAL_MESON_CONSOLE=y + +# +# Amlogic drivers as modules +# + +# core +CONFIG_MESON_SM=m +CONFIG_RESET_MESON=m +CONFIG_MESON_IRQ_GPIO=m + +# clocks +CONFIG_COMMON_CLK_MESON_REGMAP=m 
+CONFIG_COMMON_CLK_MESON_DUALDIV=m +CONFIG_COMMON_CLK_MESON_MPLL=m +CONFIG_COMMON_CLK_MESON_PHASE=m +CONFIG_COMMON_CLK_MESON_PLL=m +CONFIG_COMMON_CLK_MESON_SCLK_DIV=m +CONFIG_COMMON_CLK_MESON_VID_PLL_DIV=m +CONFIG_COMMON_CLK_MESON_AO_CLKC=m +CONFIG_COMMON_CLK_MESON_EE_CLKC=m +CONFIG_COMMON_CLK_MESON_CPU_DYNDIV=m +CONFIG_COMMON_CLK_GXBB=m +CONFIG_COMMON_CLK_AXG=m +CONFIG_COMMON_CLK_G12A=m + +# PHY +CONFIG_PHY_MESON8B_USB2=m +CONFIG_PHY_MESON_GXL_USB2=m +CONFIG_PHY_MESON_G12A_USB2=m +CONFIG_PHY_MESON_G12A_USB3_PCIE=m +CONFIG_PHY_MESON_AXG_PCIE=m +CONFIG_PHY_MESON_AXG_MIPI_PCIE_ANALOG=m + +# peripherals +CONFIG_I2C_MESON=m +CONFIG_MMC_MESON_GX=m +CONFIG_HW_RANDOM_MESON=m +CONFIG_USB_DWC3_MESON_G12A=m +CONFIG_MESON_SARADC=m +CONFIG_SPI_MESON_SPICC=m +CONFIG_SPI_MESON_SPIFC=m +CONFIG_PCI_MESON=m +CONFIG_DWMAC_MESON=m +CONFIG_MDIO_BUS_MUX_MESON_G12A=m +CONFIG_MESON_GXL_PHY=m +CONFIG_PINCTRL_MESON=m +CONFIG_PINCTRL_MESON_GXBB=m +CONFIG_PINCTRL_MESON_GXL=m +CONFIG_PINCTRL_MESON_AXG=m +CONFIG_PINCTRL_MESON_AXG_PMX=m +CONFIG_PINCTRL_MESON_G12A=m +CONFIG_MESON_GXBB_WATCHDOG=m +CONFIG_MESON_WATCHDOG=m +CONFIG_MTD_NAND_MESON=m +CONFIG_PWM_MESON=m +CONFIG_IR_MESON=m +CONFIG_MESON_EFUSE=m +CONFIG_MFD_KHADAS_MCU=m +CONFIG_KHADAS_MCU_FAN_THERMAL=m +CONFIG_AMLOGIC_THERMAL=m + +# sound +CONFIG_SND_MESON_AXG_SOUND_CARD=m +CONFIG_SND_MESON_GX_SOUND_CARD=m +CONFIG_SND_MESON_G12A_TOHDMITX=m + +# display / video +CONFIG_DRM_MESON=m +CONFIG_DRM_MESON_DW_HDMI=m +CONFIG_DRM_DW_HDMI=m +CONFIG_DRM_DW_HDMI_AHB_AUDIO=m +CONFIG_DRM_DW_HDMI_I2S_AUDIO=m +CONFIG_DRM_DW_HDMI_CEC=m +CONFIG_CEC_MESON_AO=m +CONFIG_CEC_MESON_G12A_AO=m +CONFIG_VIDEO_MESON_GE2D=m + +# SoC drivers +CONFIG_MESON_CANVAS=m +CONFIG_MESON_CLK_MEASURE=m +CONFIG_MESON_GX_PM_DOMAINS=m +CONFIG_MESON_EE_PM_DOMAINS=m +CONFIG_MESON_SECURE_PM_DOMAINS=m + +# +# Amlogic drivers disable +# + +# 32-bit SoC drivers +CONFIG_MESON6_TIMER=n +CONFIG_MESON_MX_SOCINFO=n + +# only needed by DRM on S805X +CONFIG_MESON_GX_SOCINFO=n + +# +# Debug / Testing +# + +# devtmpfs needed for buildroot/udev module loading, serial console +#CONFIG_DEVTMPFS=y +#CONFIG_DEVTMPFS_MOUNT=y + +# debug/testing with FB console +#CONFIG_DRM_KMS_FB_HELPER=y +#CONFIG_DRM_FBDEV_EMULATION=y +#CONFIG_FB=y +#CONFIG_VT=y +#CONFIG_FRAMEBUFFER_CONSOLE=y +#CONFIG_LOGO=y diff --git a/arch/arm64/configs/db845c_gki.fragment b/arch/arm64/configs/db845c_gki.fragment new file mode 100644 index 000000000000..e2dd3b7396ef --- /dev/null +++ b/arch/arm64/configs/db845c_gki.fragment @@ -0,0 +1,302 @@ +# CONFIG_MODULE_SIG_ALL is not set +CONFIG_QRTR=m +CONFIG_QRTR_TUN=m +CONFIG_SCSI_UFS_QCOM=m +CONFIG_INPUT_PM8941_PWRKEY=m +CONFIG_SERIAL_MSM=m +CONFIG_I2C_QCOM_GENI=m +CONFIG_I2C_QUP=m +CONFIG_PINCTRL_MSM=m +CONFIG_PINCTRL_QCOM_SPMI_PMIC=m +CONFIG_PINCTRL_SDM845=m +CONFIG_POWER_RESET_QCOM_PON=m +CONFIG_SYSCON_REBOOT_MODE=m +CONFIG_QCOM_TSENS=m +CONFIG_QCOM_WDT=m +CONFIG_PM8916_WATCHDOG=m +CONFIG_MFD_SPMI_PMIC=m +CONFIG_SPMI_MSM_PMIC_ARB=m +CONFIG_REGULATOR_QCOM_RPMH=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_DRM_MSM=m +# CONFIG_DRM_MSM_DSI_28NM_PHY is not set +# CONFIG_DRM_MSM_DSI_20NM_PHY is not set +# CONFIG_DRM_MSM_DSI_28NM_8960_PHY is not set +CONFIG_DRM_LONTIUM_LT9611=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PLATFORM=m +# CONFIG_USB_DWC3_HAPS is not set +# CONFIG_USB_DWC3_OF_SIMPLE is not set +CONFIG_USB_GADGET_VBUS_DRAW=500 +# CONFIG_USB_DUMMY_HCD is not set +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ULPI_BUS=m +CONFIG_MMC_SDHCI_MSM=m +CONFIG_RTC_DRV_PM8XXX=m +CONFIG_COMMON_CLK_QCOM=m +CONFIG_SDM_GPUCC_845=m 
+CONFIG_QCOM_CLK_RPMH=m +CONFIG_SDM_DISPCC_845=m +CONFIG_HWSPINLOCK_QCOM=m +CONFIG_QCOM_LLCC=m +CONFIG_QCOM_RMTFS_MEM=m +CONFIG_QCOM_SMEM=m +CONFIG_QCOM_SMSM=m +CONFIG_EXTCON_USB_GPIO=m +CONFIG_RESET_QCOM_AOSS=m +CONFIG_RESET_QCOM_PDC=m +CONFIG_PHY_QCOM_QMP=m +CONFIG_PHY_QCOM_QUSB2=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_QCOM_QFPROM=m +CONFIG_INTERCONNECT_QCOM=y +CONFIG_INTERCONNECT_QCOM_OSM_L3=m +CONFIG_INTERCONNECT_QCOM_SDM845=m +CONFIG_QCOM_RPMH=m +CONFIG_QCOM_RPMHPD=m +CONFIG_WLAN_VENDOR_ATH=y +CONFIG_ATH10K_AHB=y +CONFIG_ATH10K=m +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_SNOC=m +CONFIG_QRTR_SMD=m +CONFIG_QCOM_FASTRPC=m +CONFIG_QCOM_APCS_IPC=m +CONFIG_QCOM_Q6V5_COMMON=m +CONFIG_QCOM_RPROC_COMMON=m +CONFIG_QCOM_Q6V5_ADSP=m +CONFIG_QCOM_Q6V5_MSS=m +CONFIG_QCOM_Q6V5_PAS=m +CONFIG_QCOM_Q6V5_WCSS=m +CONFIG_QCOM_SYSMON=m +CONFIG_RPMSG_QCOM_GLINK_SMEM=m +CONFIG_RPMSG_QCOM_SMD=m +CONFIG_QCOM_AOSS_QMP=m +CONFIG_QCOM_SMP2P=m +CONFIG_QCOM_SOCINFO=m +CONFIG_QCOM_APR=m +CONFIG_QCOM_GLINK_SSR=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_QCOM_PDC=m +CONFIG_QCOM_SCM=m +CONFIG_ARM_SMMU=m +CONFIG_ARM_QCOM_CPUFREQ_HW=m +# XXX Audio bits start here +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_DESIGNWARE_PLATFORM=m +CONFIG_I2C_RK3X=m +CONFIG_SPI_PL022=m +CONFIG_SPI_QCOM_QSPI=m +CONFIG_SPI_QUP=m +CONFIG_SPI_QCOM_GENI=m +CONFIG_GPIO_WCD934X=m +CONFIG_MFD_WCD934X=m +CONFIG_REGULATOR_GPIO=m +CONFIG_SND_SOC_QCOM=m +CONFIG_SND_SOC_QCOM_COMMON=m +CONFIG_SND_SOC_QDSP6_COMMON=m +CONFIG_SND_SOC_QDSP6_CORE=m +CONFIG_SND_SOC_QDSP6_AFE=m +CONFIG_SND_SOC_QDSP6_AFE_DAI=m +CONFIG_SND_SOC_QDSP6_ADM=m +CONFIG_SND_SOC_QDSP6_ROUTING=m +CONFIG_SND_SOC_QDSP6_ASM=m +CONFIG_SND_SOC_QDSP6_ASM_DAI=m +CONFIG_SND_SOC_QDSP6=m +CONFIG_SND_SOC_SDM845=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_WCD9335=m +CONFIG_SND_SOC_WCD934X=m +CONFIG_SND_SOC_WSA881X=m +CONFIG_QCOM_BAM_DMA=m +CONFIG_SPMI_PMIC_CLKDIV=m +CONFIG_SOUNDWIRE=m +CONFIG_SOUNDWIRE_QCOM=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_NGD_CTRL=m +CONFIG_DMABUF_HEAPS_SYSTEM=m +CONFIG_SDM_VIDEOCC_845=m +# CONFIG_CXD2880_SPI_DRV is not set +# CONFIG_MEDIA_TUNER_SIMPLE is not set +# CONFIG_MEDIA_TUNER_TDA18250 is not set +# CONFIG_MEDIA_TUNER_TDA8290 is not set +# CONFIG_MEDIA_TUNER_TDA827X is not set +# CONFIG_MEDIA_TUNER_TDA18271 is not set +# CONFIG_MEDIA_TUNER_TDA9887 is not set +# CONFIG_MEDIA_TUNER_TEA5761 is not set +# CONFIG_MEDIA_TUNER_TEA5767 is not set +# CONFIG_MEDIA_TUNER_MSI001 is not set +# CONFIG_MEDIA_TUNER_MT20XX is not set +# CONFIG_MEDIA_TUNER_MT2060 is not set +# CONFIG_MEDIA_TUNER_MT2063 is not set +# CONFIG_MEDIA_TUNER_MT2266 is not set +# CONFIG_MEDIA_TUNER_MT2131 is not set +# CONFIG_MEDIA_TUNER_QT1010 is not set +# CONFIG_MEDIA_TUNER_XC2028 is not set +# CONFIG_MEDIA_TUNER_XC5000 is not set +# CONFIG_MEDIA_TUNER_XC4000 is not set +# CONFIG_MEDIA_TUNER_MXL5005S is not set +# CONFIG_MEDIA_TUNER_MXL5007T is not set +# CONFIG_MEDIA_TUNER_MC44S803 is not set +# CONFIG_MEDIA_TUNER_MAX2165 is not set +# CONFIG_MEDIA_TUNER_TDA18218 is not set +# CONFIG_MEDIA_TUNER_FC0011 is not set +# CONFIG_MEDIA_TUNER_FC0012 is not set +# CONFIG_MEDIA_TUNER_FC0013 is not set +# CONFIG_MEDIA_TUNER_TDA18212 is not set +# CONFIG_MEDIA_TUNER_E4000 is not set +# CONFIG_MEDIA_TUNER_FC2580 is not set +# CONFIG_MEDIA_TUNER_M88RS6000T is not set +# CONFIG_MEDIA_TUNER_TUA9001 is not set +# CONFIG_MEDIA_TUNER_SI2157 is not set +# CONFIG_MEDIA_TUNER_IT913X is not set +# CONFIG_MEDIA_TUNER_R820T is not set +# CONFIG_MEDIA_TUNER_MXL301RF is not set +# CONFIG_MEDIA_TUNER_QM1D1C0042 is not 
set +# CONFIG_MEDIA_TUNER_QM1D1B0004 is not set +# CONFIG_DVB_STB0899 is not set +# CONFIG_DVB_STB6100 is not set +# CONFIG_DVB_STV090x is not set +# CONFIG_DVB_STV0910 is not set +# CONFIG_DVB_STV6110x is not set +# CONFIG_DVB_STV6111 is not set +# CONFIG_DVB_MXL5XX is not set +# CONFIG_DVB_M88DS3103 is not set +# CONFIG_DVB_DRXK is not set +# CONFIG_DVB_TDA18271C2DD is not set +# CONFIG_DVB_SI2165 is not set +# CONFIG_DVB_MN88472 is not set +# CONFIG_DVB_MN88473 is not set +# CONFIG_DVB_CX24110 is not set +# CONFIG_DVB_CX24123 is not set +# CONFIG_DVB_MT312 is not set +# CONFIG_DVB_ZL10036 is not set +# CONFIG_DVB_ZL10039 is not set +# CONFIG_DVB_S5H1420 is not set +# CONFIG_DVB_STV0288 is not set +# CONFIG_DVB_STB6000 is not set +# CONFIG_DVB_STV0299 is not set +# CONFIG_DVB_STV6110 is not set +# CONFIG_DVB_STV0900 is not set +# CONFIG_DVB_TDA8083 is not set +# CONFIG_DVB_TDA10086 is not set +# CONFIG_DVB_TDA8261 is not set +# CONFIG_DVB_VES1X93 is not set +# CONFIG_DVB_TUNER_ITD1000 is not set +# CONFIG_DVB_TUNER_CX24113 is not set +# CONFIG_DVB_TDA826X is not set +# CONFIG_DVB_TUA6100 is not set +# CONFIG_DVB_CX24116 is not set +# CONFIG_DVB_CX24117 is not set +# CONFIG_DVB_CX24120 is not set +# CONFIG_DVB_SI21XX is not set +# CONFIG_DVB_TS2020 is not set +# CONFIG_DVB_DS3000 is not set +# CONFIG_DVB_MB86A16 is not set +# CONFIG_DVB_TDA10071 is not set +# CONFIG_DVB_SP8870 is not set +# CONFIG_DVB_SP887X is not set +# CONFIG_DVB_CX22700 is not set +# CONFIG_DVB_CX22702 is not set +# CONFIG_DVB_S5H1432 is not set +# CONFIG_DVB_DRXD is not set +# CONFIG_DVB_L64781 is not set +# CONFIG_DVB_TDA1004X is not set +# CONFIG_DVB_NXT6000 is not set +# CONFIG_DVB_MT352 is not set +# CONFIG_DVB_ZL10353 is not set +# CONFIG_DVB_DIB3000MB is not set +# CONFIG_DVB_DIB3000MC is not set +# CONFIG_DVB_DIB7000M is not set +# CONFIG_DVB_DIB7000P is not set +# CONFIG_DVB_DIB9000 is not set +# CONFIG_DVB_TDA10048 is not set +# CONFIG_DVB_AF9013 is not set +# CONFIG_DVB_EC100 is not set +# CONFIG_DVB_STV0367 is not set +# CONFIG_DVB_CXD2820R is not set +# CONFIG_DVB_CXD2841ER is not set +# CONFIG_DVB_RTL2830 is not set +# CONFIG_DVB_RTL2832 is not set +# CONFIG_DVB_RTL2832_SDR is not set +# CONFIG_DVB_SI2168 is not set +# CONFIG_DVB_ZD1301_DEMOD is not set +# CONFIG_DVB_CXD2880 is not set +# CONFIG_DVB_VES1820 is not set +# CONFIG_DVB_TDA10021 is not set +# CONFIG_DVB_TDA10023 is not set +# CONFIG_DVB_STV0297 is not set +# CONFIG_DVB_NXT200X is not set +# CONFIG_DVB_OR51211 is not set +# CONFIG_DVB_OR51132 is not set +# CONFIG_DVB_BCM3510 is not set +# CONFIG_DVB_LGDT330X is not set +# CONFIG_DVB_LGDT3305 is not set +# CONFIG_DVB_LGDT3306A is not set +# CONFIG_DVB_LG2160 is not set +# CONFIG_DVB_S5H1409 is not set +# CONFIG_DVB_AU8522_DTV is not set +# CONFIG_DVB_AU8522_V4L is not set +# CONFIG_DVB_S5H1411 is not set +# CONFIG_DVB_S921 is not set +# CONFIG_DVB_DIB8000 is not set +# CONFIG_DVB_MB86A20S is not set +# CONFIG_DVB_TC90522 is not set +# CONFIG_DVB_MN88443X is not set +# CONFIG_DVB_PLL is not set +# CONFIG_DVB_TUNER_DIB0070 is not set +# CONFIG_DVB_TUNER_DIB0090 is not set +# CONFIG_DVB_DRX39XYJ is not set +# CONFIG_DVB_LNBH25 is not set +# CONFIG_DVB_LNBH29 is not set +# CONFIG_DVB_LNBP21 is not set +# CONFIG_DVB_LNBP22 is not set +# CONFIG_DVB_ISL6405 is not set +# CONFIG_DVB_ISL6421 is not set +# CONFIG_DVB_ISL6423 is not set +# CONFIG_DVB_A8293 is not set +# CONFIG_DVB_LGS8GL5 is not set +# CONFIG_DVB_LGS8GXX is not set +# CONFIG_DVB_ATBM8830 is not set +# CONFIG_DVB_TDA665x is not set +# 
CONFIG_DVB_IX2505V is not set +# CONFIG_DVB_M88RS2000 is not set +# CONFIG_DVB_AF9033 is not set +# CONFIG_DVB_HORUS3A is not set +# CONFIG_DVB_ASCOT2E is not set +# CONFIG_DVB_HELENE is not set +# CONFIG_DVB_CXD2099 is not set +# CONFIG_DVB_SP2 is not set +CONFIG_QCOM_COMMAND_DB=m +CONFIG_QCOM_LMH=m +# XXX RB5 bits start here +CONFIG_QCOM_IPCC=m +CONFIG_QCOM_SPMI_ADC5=m +CONFIG_QCOM_SPMI_TEMP_ALARM=m +CONFIG_RPMSG_NS=m +CONFIG_CAN_MCP251XFD=m +CONFIG_ATH11K=m +CONFIG_ATH11K_AHB=m +CONFIG_ATH11K_PCI=m +CONFIG_PINCTRL_SM8250=m +CONFIG_PINCTRL_LPASS_LPI=m +CONFIG_QCOM_SPMI_ADC_TM5=m +CONFIG_MFD_QCOM_QCA639X=m +CONFIG_REGULATOR_QCOM_USB_VBUS=m +CONFIG_DRM_DISPLAY_CONNECTOR=m +CONFIG_DRM_LONTIUM_LT9611UXC=m +CONFIG_SND_SOC_SM8250=m +CONFIG_SND_SOC_LPASS_WSA_MACRO=m +CONFIG_SND_SOC_LPASS_VA_MACRO=m +CONFIG_TYPEC_QCOM_PMIC=m +CONFIG_LEDS_CLASS_MULTICOLOR=m +CONFIG_LEDS_QCOM_LPG=m +CONFIG_SM_GPUCC_8250=m +CONFIG_SM_DISPCC_8250=m +CONFIG_SM_VIDEOCC_8250=m +CONFIG_CLK_GFM_LPASS_SM8250=m +CONFIG_PHY_QCOM_USB_SNPS_FEMTO_V2=m +CONFIG_INTERCONNECT_QCOM_SM8250=m +CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/arm64/configs/fips140_gki.fragment b/arch/arm64/configs/fips140_gki.fragment new file mode 100644 index 000000000000..aa8444d34e56 --- /dev/null +++ b/arch/arm64/configs/fips140_gki.fragment @@ -0,0 +1,2 @@ +CONFIG_CRYPTO_FIPS140_MOD=m +# CONFIG_MODULE_SIG_ALL is not set diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig new file mode 100644 index 000000000000..c2fff4bc35b2 --- /dev/null +++ b/arch/arm64/configs/gki_defconfig @@ -0,0 +1,726 @@ +CONFIG_UAPI_HEADER_TEST=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_PREEMPT=y +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_TASKSTATS=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +CONFIG_RCU_EXPERT=y +CONFIG_RCU_FAST_NO_HZ=y +CONFIG_RCU_BOOST=y +CONFIG_RCU_NOCB_CPU=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_IKHEADERS=y +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=20 +CONFIG_CGROUPS=y +CONFIG_MEMCG=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_SCHED=y +CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +# CONFIG_PID_NS is not set +CONFIG_RT_SOFTINT_OPTIMIZATION=y +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +CONFIG_BOOT_CONFIG=y +# CONFIG_SYSFS_SYSCALL is not set +# CONFIG_FHANDLE is not set +CONFIG_KALLSYMS_ALL=y +CONFIG_USERFAULTFD=y +# CONFIG_RSEQ is not set +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB_MERGE_DEFAULT is not set +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_PROFILING=y +CONFIG_ARCH_SUNXI=y +CONFIG_ARCH_HISI=y +CONFIG_ARCH_QCOM=y +CONFIG_SCHED_MC=y +CONFIG_NR_CPUS=32 +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_ARM64_SW_TTBR0_PAN=y +CONFIG_COMPAT=y +CONFIG_ARMV8_DEPRECATED=y +CONFIG_SWP_EMULATION=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y +CONFIG_ARM64_MPAM=y +CONFIG_RANDOMIZE_BASE=y +# CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set +CONFIG_CMDLINE="console=ttynull stack_depot_disable=on cgroup_disable=pressure kasan.page_alloc.sample=10 kasan.stacktrace=off bootconfig ioremap_guard kvm-arm.mode=protected" +CONFIG_CMDLINE_EXTEND=y +# CONFIG_DMI is not set +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set 
+CONFIG_ENERGY_MODEL=y +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_CPU_IDLE_GOV_TEO=y +CONFIG_ARM_CPUIDLE=y +CONFIG_ARM_PSCI_CPUIDLE=y +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_TIMES=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_ARM_SCPI_CPUFREQ=y +CONFIG_ARM_SCMI_CPUFREQ=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=y +CONFIG_CRYPTO_SHA2_ARM64_CE=y +CONFIG_CRYPTO_SHA512_ARM64_CE=y +CONFIG_CRYPTO_POLYVAL_ARM64_CE=y +CONFIG_CRYPTO_AES_ARM64_CE_BLK=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +CONFIG_SHADOW_CALL_STACK=y +CONFIG_LTO_CLANG_FULL=y +CONFIG_CFI_CLANG=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SCMVERSION=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SIG_PROTECT=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +CONFIG_GKI_HACKS_TO_FIX=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=32768 +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_CLEANCACHE=y +CONFIG_CMA=y +CONFIG_CMA_DEBUGFS=y +CONFIG_CMA_SYSFS=y +CONFIG_CMA_AREAS=16 +CONFIG_ZSMALLOC=m +# CONFIG_ZONE_DMA is not set +CONFIG_ANON_VMA_NAME=y +CONFIG_LRU_GEN=y +CONFIG_DAMON=y +CONFIG_DAMON_PADDR=y +CONFIG_DAMON_RECLAIM=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_XFRM_INTERFACE=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_NET_KEY=y +CONFIG_XDP_SOCKETS=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_NET_IPIP=y +CONFIG_NET_IPGRE_DEMUX=y +CONFIG_NET_IPGRE=y +CONFIG_NET_IPVTI=y +CONFIG_INET_ESP=y +CONFIG_INET_UDP_DIAG=y +CONFIG_INET_DIAG_DESTROY=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_VTI=y +CONFIG_IPV6_GRE=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_MROUTE=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_DSCP=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_NOTRACK=y +CONFIG_NETFILTER_XT_TARGET_TEE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_BPF=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_DSCP=y +CONFIG_NETFILTER_XT_MATCH_ESP=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_L2TP=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y 
+CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y +CONFIG_NETFILTER_XT_MATCH_OWNER=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_NETMAP=y +CONFIG_IP_NF_TARGET_REDIRECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MATCH_RPFILTER=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_TIPC=m +CONFIG_L2TP=m +CONFIG_BRIDGE=y +CONFIG_VLAN_8021Q=m +CONFIG_6LOWPAN=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_SCH_PRIO=y +CONFIG_NET_SCH_MULTIQ=y +CONFIG_NET_SCH_SFQ=y +CONFIG_NET_SCH_TBF=y +CONFIG_NET_SCH_NETEM=y +CONFIG_NET_SCH_CODEL=y +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_FQ=y +CONFIG_NET_SCH_INGRESS=y +CONFIG_NET_CLS_BASIC=y +CONFIG_NET_CLS_TCINDEX=y +CONFIG_NET_CLS_FW=y +CONFIG_NET_CLS_U32=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_FLOW=y +CONFIG_NET_CLS_BPF=y +CONFIG_NET_CLS_MATCHALL=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_CMP=y +CONFIG_NET_EMATCH_NBYTE=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_EMATCH_META=y +CONFIG_NET_EMATCH_TEXT=y +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=y +CONFIG_NET_ACT_GACT=y +CONFIG_NET_ACT_MIRRED=y +CONFIG_NET_ACT_SKBEDIT=y +CONFIG_NET_ACT_BPF=y +CONFIG_VSOCKETS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CAN=m +CONFIG_CAN_VCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_BT=m +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_HIDP=m +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_CFG80211=m +CONFIG_NL80211_TESTMODE=y +# CONFIG_CFG80211_DEFAULT_PS is not set +# CONFIG_CFG80211_CRDA_SUPPORT is not set +CONFIG_MAC80211=m +CONFIG_RFKILL=m +CONFIG_NFC=m +CONFIG_PCI=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCIEAER=y +CONFIG_PCI_IOV=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCIE_QCOM=y +CONFIG_PCIE_KIRIN=y +CONFIG_PCI_ENDPOINT=y +CONFIG_FW_LOADER_USER_HELPER=y +# CONFIG_FW_CACHE is not set +# CONFIG_SUN50I_DE2_BUS is not set +# CONFIG_SUNXI_RSB is not set +CONFIG_ARM_SCMI_PROTOCOL=y +# CONFIG_ARM_SCMI_POWER_DOMAIN is not set +CONFIG_ARM_SCPI_PROTOCOL=y +# CONFIG_ARM_SCPI_POWER_DOMAIN is not set +# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set +CONFIG_GNSS=y +CONFIG_ZRAM=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_LOOP_MIN_COUNT=16 +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_SRAM=y +CONFIG_UID_SYS_STATS=y +CONFIG_SCSI=y +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=y +CONFIG_SCSI_LOGGING=y +CONFIG_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_DEFAULT_KEY=y +CONFIG_DM_SNAPSHOT=y +CONFIG_DM_INIT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_BOW=y +CONFIG_NETDEVICES=y +CONFIG_DUMMY=y +CONFIG_WIREGUARD=y +CONFIG_IFB=y +CONFIG_MACSEC=y +CONFIG_TUN=y +CONFIG_VETH=y +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_MPPE=m 
+CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_USB_RTL8150=y +CONFIG_USB_RTL8152=y +CONFIG_USB_USBNET=y +CONFIG_USB_NET_CDC_EEM=y +# CONFIG_USB_NET_NET1080 is not set +# CONFIG_USB_NET_CDC_SUBSET is not set +# CONFIG_USB_NET_ZAURUS is not set +CONFIG_USB_NET_AQC111=y +# CONFIG_WLAN_VENDOR_ADMTEK is not set +# CONFIG_WLAN_VENDOR_ATH is not set +# CONFIG_WLAN_VENDOR_ATMEL is not set +# CONFIG_WLAN_VENDOR_BROADCOM is not set +# CONFIG_WLAN_VENDOR_CISCO is not set +# CONFIG_WLAN_VENDOR_INTEL is not set +# CONFIG_WLAN_VENDOR_INTERSIL is not set +# CONFIG_WLAN_VENDOR_MARVELL is not set +# CONFIG_WLAN_VENDOR_MEDIATEK is not set +# CONFIG_WLAN_VENDOR_RALINK is not set +# CONFIG_WLAN_VENDOR_REALTEK is not set +# CONFIG_WLAN_VENDOR_RSI is not set +# CONFIG_WLAN_VENDOR_ST is not set +# CONFIG_WLAN_VENDOR_TI is not set +# CONFIG_WLAN_VENDOR_ZYDAS is not set +# CONFIG_WLAN_VENDOR_QUANTENNA is not set +CONFIG_INPUT_EVDEV=y +CONFIG_KEYBOARD_GPIO=y +# CONFIG_MOUSE_PS2 is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_UINPUT=y +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_CONSOLE=y +# CONFIG_SERIAL_8250_EXAR is not set +CONFIG_SERIAL_8250_RUNTIME_UARTS=0 +CONFIG_SERIAL_8250_DW=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_SERIAL_SAMSUNG=y +CONFIG_SERIAL_SAMSUNG_CONSOLE=y +CONFIG_SERIAL_QCOM_GENI=y +# CONFIG_SERIAL_QCOM_GENI_CONSOLE_DEFAULT_ENABLED is not set +CONFIG_SERIAL_QCOM_GENI_CONSOLE=y +CONFIG_SERIAL_SPRD=y +CONFIG_SERIAL_SPRD_CONSOLE=y +CONFIG_NULL_TTY=y +CONFIG_HVC_DCC=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_HW_RANDOM=y +# CONFIG_DEVMEM is not set +# CONFIG_DEVPORT is not set +# CONFIG_I2C_COMPAT is not set +# CONFIG_I2C_HELPER_AUTO is not set +CONFIG_I3C=y +CONFIG_SPI=y +CONFIG_SPI_MEM=y +# CONFIG_SPMI_MSM_PMIC_ARB is not set +# CONFIG_PINCTRL_SUN8I_H3_R is not set +# CONFIG_PINCTRL_SUN50I_A64 is not set +# CONFIG_PINCTRL_SUN50I_A64_R is not set +# CONFIG_PINCTRL_SUN50I_A100 is not set +# CONFIG_PINCTRL_SUN50I_A100_R is not set +# CONFIG_PINCTRL_SUN50I_H5 is not set +# CONFIG_PINCTRL_SUN50I_H6 is not set +# CONFIG_PINCTRL_SUN50I_H6_R is not set +# CONFIG_PINCTRL_SUN50I_H616 is not set +# CONFIG_PINCTRL_SUN50I_H616_R is not set +CONFIG_GPIO_GENERIC_PLATFORM=y +CONFIG_POWER_RESET_HISI=y +CONFIG_POWER_RESET_SYSCON=y +# CONFIG_HWMON is not set +CONFIG_THERMAL=y +CONFIG_THERMAL_NETLINK=y +CONFIG_THERMAL_STATISTICS=y +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y +CONFIG_CPU_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +CONFIG_THERMAL_EMULATION=y +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +CONFIG_MFD_ACT8945A=y +CONFIG_REGULATOR=y +CONFIG_REGULATOR_FIXED_VOLTAGE=y +CONFIG_RC_CORE=y +# CONFIG_RC_MAP is not set +CONFIG_LIRC=y +CONFIG_BPF_LIRC_MODE2=y +CONFIG_RC_DECODERS=y +CONFIG_RC_DEVICES=y +CONFIG_MEDIA_CEC_RC=y +# CONFIG_MEDIA_ANALOG_TV_SUPPORT is not set +# CONFIG_MEDIA_DIGITAL_TV_SUPPORT is not set +# CONFIG_MEDIA_RADIO_SUPPORT is not set +# CONFIG_MEDIA_SDR_SUPPORT is not set +# CONFIG_MEDIA_TEST_SUPPORT is not set +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_MEDIA_USB_SUPPORT=y +CONFIG_USB_VIDEO_CLASS=y +CONFIG_USB_GSPCA=y +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_V4L_MEM2MEM_DRIVERS=y +# CONFIG_VGA_ARB is not set +CONFIG_DRM=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_SOUND=y +CONFIG_SND=y 
+CONFIG_SND_HRTIMER=y +# CONFIG_SND_SUPPORT_OLD_API is not set +# CONFIG_SND_VERBOSE_PROCFS is not set +# CONFIG_SND_DRIVERS is not set +CONFIG_SND_USB_AUDIO=y +CONFIG_SND_SOC=y +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_APPLE=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_ELECOM=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NINTENDO=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PLANTRONICS=y +CONFIG_HID_PLAYSTATION=y +CONFIG_PLAYSTATION_FF=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SONY=y +CONFIG_SONY_FF=y +CONFIG_HID_STEAM=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_OTG=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_ACM=m +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_USB_DWC3=y +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_GADGET=y +CONFIG_USB_CONFIGFS=y +CONFIG_USB_CONFIGFS_UEVENT=y +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_ACC=y +CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_TYPEC=y +CONFIG_TYPEC_TCPM=y +CONFIG_TYPEC_TCPCI=y +CONFIG_TYPEC_UCSI=y +CONFIG_MMC=y +# CONFIG_PWRSEQ_EMMC is not set +# CONFIG_PWRSEQ_SIMPLE is not set +CONFIG_MMC_CRYPTO=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_SCSI_UFSHCD=y +CONFIG_SCSI_UFS_BSG=y +CONFIG_SCSI_UFS_CRYPTO=y +CONFIG_SCSI_UFS_HPB=y +CONFIG_SCSI_UFSHCD_PCI=y +CONFIG_SCSI_UFSHCD_PLATFORM=y +CONFIG_SCSI_UFS_DWC_TC_PLATFORM=y +CONFIG_SCSI_UFS_HISI=y +CONFIG_LEDS_CLASS_FLASH=y +CONFIG_LEDS_TRIGGER_TIMER=y +CONFIG_LEDS_TRIGGER_TRANSIENT=y +CONFIG_EDAC=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_PL030=y +CONFIG_RTC_DRV_PL031=y +CONFIG_DMABUF_HEAPS=y +CONFIG_DMABUF_SYSFS_STATS=y +CONFIG_DMABUF_HEAPS_DEFERRED_FREE=y +CONFIG_DMABUF_HEAPS_PAGE_POOL=y +CONFIG_UIO=y +CONFIG_VHOST_VSOCK=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_DEBUG_KINFO=y +CONFIG_COMMON_CLK_SCPI=y +# CONFIG_CLK_SUNXI is not set +# CONFIG_SUNXI_CCU is not set +CONFIG_HWSPINLOCK=y +# CONFIG_SUN50I_ERRATUM_UNKNOWN1 is not set +CONFIG_MAILBOX=y +CONFIG_REMOTEPROC=y +CONFIG_REMOTEPROC_CDEV=y +CONFIG_RPMSG_CHAR=y +CONFIG_QCOM_GENI_SE=y +CONFIG_DEVFREQ_GOV_PERFORMANCE=y +CONFIG_DEVFREQ_GOV_POWERSAVE=y +CONFIG_DEVFREQ_GOV_USERSPACE=y +CONFIG_DEVFREQ_GOV_PASSIVE=y +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_IIO=y +CONFIG_IIO_BUFFER=y +CONFIG_IIO_TRIGGER=y +CONFIG_PWM=y +CONFIG_GENERIC_PHY=y +CONFIG_POWERCAP=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_BINDERFS=y +CONFIG_ANDROID_DEBUG_SYMBOLS=y +CONFIG_ANDROID_VENDOR_HOOKS=y +CONFIG_LIBNVDIMM=y +# CONFIG_ND_BLK is not set +CONFIG_INTERCONNECT=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_F2FS_FS=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_FS_COMPRESSION=y +CONFIG_F2FS_UNFAIR_RWSEM=y +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y +CONFIG_FS_VERITY=y +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +# CONFIG_DNOTIFY is not set +CONFIG_QUOTA=y +CONFIG_QFMT_V2=y +CONFIG_FUSE_FS=y +CONFIG_VIRTIO_FS=y +CONFIG_FUSE_BPF=y +CONFIG_OVERLAY_FS=y +CONFIG_INCREMENTAL_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_EXFAT_FS=y 
+CONFIG_TMPFS=y +# CONFIG_EFIVAR_FS is not set +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_PMSG=y +CONFIG_PSTORE_RAM=y +CONFIG_EROFS_FS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_CODEPAGE_737=y +CONFIG_NLS_CODEPAGE_775=y +CONFIG_NLS_CODEPAGE_850=y +CONFIG_NLS_CODEPAGE_852=y +CONFIG_NLS_CODEPAGE_855=y +CONFIG_NLS_CODEPAGE_857=y +CONFIG_NLS_CODEPAGE_860=y +CONFIG_NLS_CODEPAGE_861=y +CONFIG_NLS_CODEPAGE_862=y +CONFIG_NLS_CODEPAGE_863=y +CONFIG_NLS_CODEPAGE_864=y +CONFIG_NLS_CODEPAGE_865=y +CONFIG_NLS_CODEPAGE_866=y +CONFIG_NLS_CODEPAGE_869=y +CONFIG_NLS_CODEPAGE_936=y +CONFIG_NLS_CODEPAGE_950=y +CONFIG_NLS_CODEPAGE_932=y +CONFIG_NLS_CODEPAGE_949=y +CONFIG_NLS_CODEPAGE_874=y +CONFIG_NLS_ISO8859_8=y +CONFIG_NLS_CODEPAGE_1250=y +CONFIG_NLS_CODEPAGE_1251=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_ISO8859_2=y +CONFIG_NLS_ISO8859_3=y +CONFIG_NLS_ISO8859_4=y +CONFIG_NLS_ISO8859_5=y +CONFIG_NLS_ISO8859_6=y +CONFIG_NLS_ISO8859_7=y +CONFIG_NLS_ISO8859_9=y +CONFIG_NLS_ISO8859_13=y +CONFIG_NLS_ISO8859_14=y +CONFIG_NLS_ISO8859_15=y +CONFIG_NLS_KOI8_R=y +CONFIG_NLS_KOI8_U=y +CONFIG_NLS_MAC_ROMAN=y +CONFIG_NLS_MAC_CELTIC=y +CONFIG_NLS_MAC_CENTEURO=y +CONFIG_NLS_MAC_CROATIAN=y +CONFIG_NLS_MAC_CYRILLIC=y +CONFIG_NLS_MAC_GAELIC=y +CONFIG_NLS_MAC_GREEK=y +CONFIG_NLS_MAC_ICELAND=y +CONFIG_NLS_MAC_INUIT=y +CONFIG_NLS_MAC_ROMANIAN=y +CONFIG_NLS_MAC_TURKISH=y +CONFIG_NLS_UTF8=y +CONFIG_UNICODE=y +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_HARDENED_USERCOPY=y +# CONFIG_HARDENED_USERCOPY_FALLBACK is not set +CONFIG_STATIC_USERMODEHELPER=y +CONFIG_STATIC_USERMODEHELPER_PATH="" +CONFIG_SECURITY_SELINUX=y +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +CONFIG_CRYPTO_ECDH=y +CONFIG_CRYPTO_CCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=y +CONFIG_CRYPTO_ADIANTUM=y +CONFIG_CRYPTO_HCTR2=y +CONFIG_CRYPTO_CMAC=y +CONFIG_CRYPTO_XCBC=y +CONFIG_CRYPTO_BLAKE2B=y +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_DES=y +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_ZSTD=y +CONFIG_CRYPTO_ANSI_CPRNG=y +CONFIG_CRC_CCITT=y +CONFIG_XZ_DEC=y +CONFIG_DMA_RESTRICTED_POOL=y +CONFIG_DMA_CMA=y +CONFIG_STACK_HASH_ORDER=12 +CONFIG_PRINTK_TIME=y +CONFIG_PRINTK_CALLER=y +CONFIG_DYNAMIC_DEBUG_CORE=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_MODULE_ALLOW_BTF_MISMATCH=y +CONFIG_HEADERS_INSTALL=y +# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_UBSAN=y +CONFIG_UBSAN_TRAP=y +CONFIG_UBSAN_LOCAL_BOUNDS=y +# CONFIG_UBSAN_SHIFT is not set +# CONFIG_UBSAN_BOOL is not set +# CONFIG_UBSAN_ENUM is not set +CONFIG_PAGE_OWNER=y +CONFIG_PAGE_PINNER=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_KASAN=y +CONFIG_KASAN_HW_TAGS=y +CONFIG_KFENCE=y +CONFIG_KFENCE_SAMPLE_INTERVAL=500 +CONFIG_KFENCE_NUM_OBJECTS=63 +CONFIG_KFENCE_STATIC_KEYS=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_TIMEOUT=-1 +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_WQ_WATCHDOG=y +CONFIG_SCHEDSTATS=y +# CONFIG_DEBUG_PREEMPT is not set +CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_TRACE_MMIO_ACCESS=y +CONFIG_HIST_TRIGGERS=y +CONFIG_PID_IN_CONTEXTIDR=y +# CONFIG_RUNTIME_TESTING_MENU is not set diff --git a/arch/arm64/configs/rockpi4_gki.fragment b/arch/arm64/configs/rockpi4_gki.fragment new file mode 100644 index 000000000000..2af01b890c59 --- /dev/null +++ b/arch/arm64/configs/rockpi4_gki.fragment @@ -0,0 +1,82 @@ +# Core features +CONFIG_ARCH_ROCKCHIP=y +# CONFIG_CLK_PX30 is not set +# CONFIG_CLK_RV110X is not set +# CONFIG_CLK_RK3036 is not set +# CONFIG_CLK_RK312X is not set +# 
CONFIG_CLK_RK3188 is not set +# CONFIG_CLK_RK322X is not set +# CONFIG_CLK_RK3288 is not set +# CONFIG_CLK_RK3308 is not set +# CONFIG_CLK_RK3328 is not set +# CONFIG_CLK_RK3368 is not set +CONFIG_COMMON_CLK_RK808=m +CONFIG_CPUFREQ_DT=m +CONFIG_MFD_RK808=m +CONFIG_PCIE_ROCKCHIP_HOST=m +CONFIG_PHY_ROCKCHIP_PCIE=m +CONFIG_PL330_DMA=m +CONFIG_PWM_ROCKCHIP=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_RK808=m +CONFIG_ROCKCHIP_EFUSE=m +CONFIG_ROCKCHIP_IOMMU=y +CONFIG_ROCKCHIP_IODOMAIN=m +CONFIG_ROCKCHIP_MBOX=y +CONFIG_ROCKCHIP_PM_DOMAINS=y +CONFIG_ROCKCHIP_THERMAL=m + +# Ethernet +CONFIG_STMMAC_ETH=m +# CONFIG_DWMAC_GENERIC is not set +# CONFIG_DWMAC_IPQ806X is not set +# CONFIG_DWMAC_QCOM_ETHQOS is not set +# CONFIG_DWMAC_SUNXI is not set +# CONFIG_DWMAC_SUN8I is not set + +# I2C +CONFIG_I2C_RK3X=m + +# Watchdog +CONFIG_DW_WATCHDOG=m + +# Display +CONFIG_DRM_ROCKCHIP=m +CONFIG_ROCKCHIP_ANALOGIX_DP=y +CONFIG_ROCKCHIP_DW_HDMI=y +CONFIG_ROCKCHIP_DW_MIPI_DSI=y + +# USB 2.x +CONFIG_PHY_ROCKCHIP_INNO_USB2=m +CONFIG_USB_OHCI_HCD=m +# CONFIG_USB_OHCI_HCD_PCI is not set +CONFIG_USB_OHCI_HCD_PLATFORM=m + +# eMMC / SD-Card +CONFIG_MMC_SDHCI_OF_ARASAN=m +CONFIG_MMC_DW=m +CONFIG_MMC_DW_ROCKCHIP=m +CONFIG_PHY_ROCKCHIP_EMMC=m + +# Real-time clock +CONFIG_RTC_DRV_RK808=m + +# Type-C +CONFIG_PHY_ROCKCHIP_TYPEC=m + +# SAR ADC +CONFIG_ROCKCHIP_SARADC=m + +# Audio +CONFIG_SND_SOC_ROCKCHIP_I2S=m + +# To boot Linux distributions like Debian +CONFIG_DEVTMPFS=y + +# To bootstrap rootfs with QEMU +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_BLK=m +CONFIG_VIRTIO_NET=m diff --git a/arch/arm64/crypto/Kbuild.fips140 b/arch/arm64/crypto/Kbuild.fips140 new file mode 100644 index 000000000000..9aa0af602130 --- /dev/null +++ b/arch/arm64/crypto/Kbuild.fips140 @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Create a separate FIPS archive that duplicates the modules that are relevant +# for FIPS 140 certification as builtin objects +# + +sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o +sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o +sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o +ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o +aes-ce-blk-y := aes-glue-ce.o aes-ce.o +aes-neon-blk-y := aes-glue-neon.o aes-neon.o +sha256-arm64-y := sha256-glue.o sha256-core.o +sha512-arm64-y := sha512-glue.o sha512-core.o +aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o +aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o + +crypto-arm64-fips-src := $(srctree)/arch/arm64/crypto/ +crypto-arm64-fips-modules := sha1-ce.o sha2-ce.o sha512-ce.o ghash-ce.o \ + aes-ce-cipher.o aes-ce-blk.o aes-neon-blk.o \ + sha256-arm64.o sha512-arm64.o aes-arm64.o \ + aes-neon-bs.o + +crypto-fips-objs += $(foreach o,$(crypto-arm64-fips-modules),$($(o:.o=-y):.o=-fips-arch.o)) + +CFLAGS_aes-glue-ce-fips-arch.o := -DUSE_V8_CRYPTO_EXTENSIONS + +$(obj)/aes-glue-%-fips-arch.o: KBUILD_CFLAGS += $(FIPS140_CFLAGS) +$(obj)/aes-glue-%-fips-arch.o: $(crypto-arm64-fips-src)/aes-glue.c FORCE + $(call if_changed_rule,cc_o_c) + +$(obj)/%-fips-arch.o: KBUILD_CFLAGS += $(FIPS140_CFLAGS) +$(obj)/%-fips-arch.o: $(crypto-arm64-fips-src)/%.c FORCE + $(call if_changed_rule,cc_o_c) + +$(obj)/%-fips-arch.o: $(crypto-arm64-fips-src)/%.S FORCE + $(call if_changed_rule,as_o_S) + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) void $(@) + +$(obj)/%-core.S: $(crypto-arm64-fips-src)/%-armv8.pl + $(call cmd,perlasm) + +$(obj)/sha256-core.S: 
$(crypto-arm64-fips-src)/sha512-armv8.pl + $(call cmd,perlasm) + +clean-files += sha256-core.S sha512-core.S + +$(obj)/%-fips-arch.o: $(obj)/%.S FORCE + $(call if_changed_rule,as_o_S) diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 1a5406e599ba..0db557ee8519 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -61,6 +61,11 @@ config CRYPTO_GHASH_ARM64_CE select CRYPTO_LIB_AES select CRYPTO_AEAD +config CRYPTO_POLYVAL_ARM64_CE + tristate "POLYVAL using ARMv8 Crypto Extensions (for HCTR2)" + depends on KERNEL_MODE_NEON + select CRYPTO_POLYVAL + config CRYPTO_CRCT10DIF_ARM64_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" depends on KERNEL_MODE_NEON && CRC_T10DIF @@ -85,20 +90,16 @@ config CRYPTO_AES_ARM64_CE_CCM select CRYPTO_LIB_AES config CRYPTO_AES_ARM64_CE_BLK - tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" + tristate "AES in ECB/CBC/CTR/XTS/XCTR modes using ARMv8 Crypto Extensions" depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER select CRYPTO_AES_ARM64_CE - select CRYPTO_AES_ARM64 - select CRYPTO_SIMD config CRYPTO_AES_ARM64_NEON_BLK - tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" + tristate "AES in ECB/CBC/CTR/XTS/XCTR modes using NEON instructions" depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER - select CRYPTO_AES_ARM64 select CRYPTO_LIB_AES - select CRYPTO_SIMD config CRYPTO_CHACHA20_NEON tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions" @@ -123,8 +124,6 @@ config CRYPTO_AES_ARM64_BS depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER select CRYPTO_AES_ARM64_NEON_BLK - select CRYPTO_AES_ARM64 select CRYPTO_LIB_AES - select CRYPTO_SIMD endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 09a805cc32d7..53f9af962b86 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -26,6 +26,9 @@ sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o +obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o +polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o + obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 445082aff0b8..453e54570911 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -24,7 +24,6 @@ #ifdef USE_V8_CRYPTO_EXTENSIONS #define MODE "ce" #define PRIO 300 -#define STRIDE 5 #define aes_expandkey ce_aes_expandkey #define aes_ecb_encrypt ce_aes_ecb_encrypt #define aes_ecb_decrypt ce_aes_ecb_decrypt @@ -35,14 +34,14 @@ #define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt #define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt #define aes_ctr_encrypt ce_aes_ctr_encrypt +#define aes_xctr_encrypt ce_aes_xctr_encrypt #define aes_xts_encrypt ce_aes_xts_encrypt #define aes_xts_decrypt ce_aes_xts_decrypt #define aes_mac_update ce_aes_mac_update -MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions"); #else #define MODE "neon" #define PRIO 200 -#define STRIDE 4 #define aes_ecb_encrypt neon_aes_ecb_encrypt #define aes_ecb_decrypt neon_aes_ecb_decrypt #define aes_cbc_encrypt neon_aes_cbc_encrypt @@ -52,16 +51,18 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); #define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt #define 
aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt #define aes_ctr_encrypt neon_aes_ctr_encrypt +#define aes_xctr_encrypt neon_aes_xctr_encrypt #define aes_xts_encrypt neon_aes_xts_encrypt #define aes_xts_decrypt neon_aes_xts_decrypt #define aes_mac_update neon_aes_mac_update -MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 NEON"); #endif #if defined(USE_V8_CRYPTO_EXTENSIONS) MODULE_ALIAS_CRYPTO("ecb(aes)"); MODULE_ALIAS_CRYPTO("cbc(aes)"); MODULE_ALIAS_CRYPTO("ctr(aes)"); MODULE_ALIAS_CRYPTO("xts(aes)"); +MODULE_ALIAS_CRYPTO("xctr(aes)"); MODULE_ALIAS_CRYPTO("cts(cbc(aes))"); MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)"); MODULE_ALIAS_CRYPTO("cmac(aes)"); @@ -89,7 +90,10 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, int bytes, u8 const iv[]); asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int bytes, u8 ctr[], u8 finalbuf[]); + int rounds, int bytes, u8 ctr[]); + +asmlinkage void aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[], int byte_ctr); asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int bytes, u32 const rk2[], u8 iv[], @@ -444,7 +448,53 @@ static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req) return err ?: cbc_decrypt_walk(req, &walk); } -static int ctr_encrypt(struct skcipher_request *req) +static int __maybe_unused xctr_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key_length / 4; + struct skcipher_walk walk; + unsigned int byte_ctr = 0; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes > 0) { + const u8 *src = walk.src.virt.addr; + unsigned int nbytes = walk.nbytes; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + + /* + * If given less than 16 bytes, we must copy the partial block + * into a temporary buffer of 16 bytes to avoid out of bounds + * reads and writes. Furthermore, this code is somewhat unusual + * in that it expects the end of the data to be at the end of + * the temporary buffer, rather than the start of the data at + * the start of the temporary buffer. + */ + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + else if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + kernel_neon_begin(); + aes_xctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, + walk.iv, byte_ctr); + kernel_neon_end(); + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); + byte_ctr += nbytes; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int __maybe_unused ctr_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -458,26 +508,29 @@ static int ctr_encrypt(struct skcipher_request *req) unsigned int nbytes = walk.nbytes; u8 *dst = walk.dst.virt.addr; u8 buf[AES_BLOCK_SIZE]; - unsigned int tail; + /* + * If given less than 16 bytes, we must copy the partial block + * into a temporary buffer of 16 bytes to avoid out of bounds + * reads and writes. 
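+ * (For example, with nbytes == 5 the data is copied to buf[11..15];
+ * the 16-byte end-aligned load and store issued by the assembly then
+ * cover buf[0..15] and stay within the buffer.)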
Furthermore, this code is somewhat unusual + * in that it expects the end of the data to be at the end of + * the temporary buffer, rather than the start of the data at + * the start of the temporary buffer. + */ if (unlikely(nbytes < AES_BLOCK_SIZE)) - src = memcpy(buf, src, nbytes); + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); else if (nbytes < walk.total) nbytes &= ~(AES_BLOCK_SIZE - 1); kernel_neon_begin(); aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, - walk.iv, buf); + walk.iv); kernel_neon_end(); - tail = nbytes % (STRIDE * AES_BLOCK_SIZE); - if (tail > 0 && tail < AES_BLOCK_SIZE) - /* - * The final partial block could not be returned using - * an overlapping store, so it was passed via buf[] - * instead. - */ - memcpy(dst + nbytes - tail, buf, tail); + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } @@ -485,29 +538,6 @@ static int ctr_encrypt(struct skcipher_request *req) return err; } -static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst) -{ - const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); - unsigned long flags; - - /* - * Temporarily disable interrupts to avoid races where - * cachelines are evicted when the CPU is interrupted - * to do something else. - */ - local_irq_save(flags); - aes_encrypt(ctx, dst, src); - local_irq_restore(flags); -} - -static int __maybe_unused ctr_encrypt_sync(struct skcipher_request *req) -{ - if (!crypto_simd_usable()) - return crypto_ctr_encrypt_walk(req, ctr_encrypt_one); - - return ctr_encrypt(req); -} - static int __maybe_unused xts_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -656,10 +686,9 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req) static struct skcipher_alg aes_algs[] = { { #if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) .base = { - .cra_name = "__ecb(aes)", - .cra_driver_name = "__ecb-aes-" MODE, + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-" MODE, .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_module = THIS_MODULE, @@ -671,10 +700,9 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = ecb_decrypt, }, { .base = { - .cra_name = "__cbc(aes)", - .cra_driver_name = "__cbc-aes-" MODE, + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-" MODE, .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_module = THIS_MODULE, @@ -687,10 +715,9 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = cbc_decrypt, }, { .base = { - .cra_name = "__ctr(aes)", - .cra_driver_name = "__ctr-aes-" MODE, + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-" MODE, .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_module = THIS_MODULE, @@ -704,9 +731,9 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = ctr_encrypt, }, { .base = { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-" MODE, - .cra_priority = PRIO - 1, + .cra_name = "xctr(aes)", + .cra_driver_name = "xctr-aes-" MODE, + .cra_priority = PRIO, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_module = THIS_MODULE, @@ -716,14 +743,13 @@ static struct skcipher_alg aes_algs[] = { { .ivsize = 
AES_BLOCK_SIZE, .chunksize = AES_BLOCK_SIZE, .setkey = skcipher_aes_setkey, - .encrypt = ctr_encrypt_sync, - .decrypt = ctr_encrypt_sync, + .encrypt = xctr_encrypt, + .decrypt = xctr_encrypt, }, { .base = { - .cra_name = "__xts(aes)", - .cra_driver_name = "__xts-aes-" MODE, + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-" MODE, .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), .cra_module = THIS_MODULE, @@ -738,10 +764,9 @@ static struct skcipher_alg aes_algs[] = { { }, { #endif .base = { - .cra_name = "__cts(cbc(aes))", - .cra_driver_name = "__cts-cbc-aes-" MODE, + .cra_name = "cts(cbc(aes))", + .cra_driver_name = "cts-cbc-aes-" MODE, .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_module = THIS_MODULE, @@ -755,10 +780,9 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = cts_cbc_decrypt, }, { .base = { - .cra_name = "__essiv(cbc(aes),sha256)", - .cra_driver_name = "__essiv-cbc-aes-sha256-" MODE, + .cra_name = "essiv(cbc(aes),sha256)", + .cra_driver_name = "essiv-cbc-aes-sha256-" MODE, .cra_priority = PRIO + 1, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_essiv_cbc_ctx), .cra_module = THIS_MODULE, @@ -997,28 +1021,15 @@ static struct shash_alg mac_algs[] = { { .descsize = sizeof(struct mac_desc_ctx), } }; -static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)]; - static void aes_exit(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++) - if (aes_simd_algs[i]) - simd_skcipher_free(aes_simd_algs[i]); - crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs)); crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } static int __init aes_init(void) { - struct simd_skcipher_alg *simd; - const char *basename; - const char *algname; - const char *drvname; int err; - int i; err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); if (err) @@ -1028,26 +1039,8 @@ static int __init aes_init(void) if (err) goto unregister_ciphers; - for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { - if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL)) - continue; - - algname = aes_algs[i].base.cra_name + 2; - drvname = aes_algs[i].base.cra_driver_name + 2; - basename = aes_algs[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(algname, drvname, basename); - err = PTR_ERR(simd); - if (IS_ERR(simd)) - goto unregister_simds; - - aes_simd_algs[i] = simd; - } - return 0; -unregister_simds: - aes_exit(); - return err; unregister_ciphers: crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); return err; diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index b495de22bb38..5abc834271f4 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -318,127 +318,211 @@ AES_FUNC_END(aes_cbc_cts_decrypt) .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .previous - /* - * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int bytes, u8 ctr[], u8 finalbuf[]) + * This macro generates the code for CTR and XCTR mode. 
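+ *
+ * In CTR mode the IV is a big-endian counter that is incremented for each
+ * block, which is why the carry handling below is needed. In XCTR mode (as
+ * used by HCTR2) the block counter is instead XORed into the first 64 bits
+ * of the IV, so no carry can ever propagate.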
*/ +.macro ctr_encrypt xctr + // Arguments + OUT .req x0 + IN .req x1 + KEY .req x2 + ROUNDS_W .req w3 + BYTES_W .req w4 + IV .req x5 + BYTE_CTR_W .req w6 // XCTR only + // Intermediate values + CTR_W .req w11 // XCTR only + CTR .req x11 // XCTR only + IV_PART .req x12 + BLOCKS .req x13 + BLOCKS_W .req w13 -AES_FUNC_START(aes_ctr_encrypt) stp x29, x30, [sp, #-16]! mov x29, sp - enc_prepare w3, x2, x12 - ld1 {vctr.16b}, [x5] + enc_prepare ROUNDS_W, KEY, IV_PART + ld1 {vctr.16b}, [IV] - umov x12, vctr.d[1] /* keep swabbed ctr in reg */ - rev x12, x12 + /* + * Keep 64 bits of the IV in a register. For CTR mode this lets us + * easily increment the IV. For XCTR mode this lets us efficiently XOR + * the 64-bit counter with the IV. + */ + .if \xctr + umov IV_PART, vctr.d[0] + lsr CTR_W, BYTE_CTR_W, #4 + .else + umov IV_PART, vctr.d[1] + rev IV_PART, IV_PART + .endif -.LctrloopNx: - add w7, w4, #15 - sub w4, w4, #MAX_STRIDE << 4 - lsr w7, w7, #4 +.LctrloopNx\xctr: + add BLOCKS_W, BYTES_W, #15 + sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 + lsr BLOCKS_W, BLOCKS_W, #4 mov w8, #MAX_STRIDE - cmp w7, w8 - csel w7, w7, w8, lt - adds x12, x12, x7 + cmp BLOCKS_W, w8 + csel BLOCKS_W, BLOCKS_W, w8, lt + /* + * Set up the counter values in v0-v{MAX_STRIDE-1}. + * + * If we are encrypting less than MAX_STRIDE blocks, the tail block + * handling code expects the last keystream block to be in + * v{MAX_STRIDE-1}. For example: if encrypting two blocks with + * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks. + */ + .if \xctr + add CTR, CTR, BLOCKS + .else + adds IV_PART, IV_PART, BLOCKS + .endif mov v0.16b, vctr.16b mov v1.16b, vctr.16b mov v2.16b, vctr.16b mov v3.16b, vctr.16b ST5( mov v4.16b, vctr.16b ) - bcs 0f + .if \xctr + sub x6, CTR, #MAX_STRIDE - 1 + sub x7, CTR, #MAX_STRIDE - 2 + sub x8, CTR, #MAX_STRIDE - 3 + sub x9, CTR, #MAX_STRIDE - 4 +ST5( sub x10, CTR, #MAX_STRIDE - 5 ) + eor x6, x6, IV_PART + eor x7, x7, IV_PART + eor x8, x8, IV_PART + eor x9, x9, IV_PART +ST5( eor x10, x10, IV_PART ) + mov v0.d[0], x6 + mov v1.d[0], x7 + mov v2.d[0], x8 + mov v3.d[0], x9 +ST5( mov v4.d[0], x10 ) + .else + bcs 0f + .subsection 1 + /* + * This subsection handles carries. + * + * Conditional branching here is allowed with respect to time + * invariance since the branches are dependent on the IV instead + * of the plaintext or key. This code is rarely executed in + * practice anyway. + */ - .subsection 1 - /* apply carry to outgoing counter */ -0: umov x8, vctr.d[0] - rev x8, x8 - add x8, x8, #1 - rev x8, x8 - ins vctr.d[0], x8 + /* Apply carry to outgoing counter. */ +0: umov x8, vctr.d[0] + rev x8, x8 + add x8, x8, #1 + rev x8, x8 + ins vctr.d[0], x8 - /* apply carry to N counter blocks for N := x12 */ - cbz x12, 2f - adr x16, 1f - sub x16, x16, x12, lsl #3 - br x16 - hint 34 // bti c - mov v0.d[0], vctr.d[0] - hint 34 // bti c - mov v1.d[0], vctr.d[0] - hint 34 // bti c - mov v2.d[0], vctr.d[0] - hint 34 // bti c - mov v3.d[0], vctr.d[0] -ST5( hint 34 ) -ST5( mov v4.d[0], vctr.d[0] ) -1: b 2f - .previous + /* + * Apply carry to counter blocks if needed. + * + * Since the carry flag was set, we know 0 <= IV_PART < + * MAX_STRIDE. Using the value of IV_PART we can determine how + * many counter blocks need to be updated. 
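A minimal userspace sketch of the carry case handled by that subsection, assuming the usual layout of the CTR IV as two 64-bit big-endian words: a wrap of the low word must propagate into the high word.

  #include <stdint.h>
  #include <stdio.h>

  static void ctr_add(uint64_t iv[2], uint64_t nblocks)
  {
          uint64_t lo = __builtin_bswap64(iv[1]);

          iv[1] = __builtin_bswap64(lo + nblocks);
          if (lo + nblocks < lo)  /* unsigned wrap: carry into high word */
                  iv[0] = __builtin_bswap64(__builtin_bswap64(iv[0]) + 1);
  }

  int main(void)
  {
          uint64_t iv[2] = { 0, ~0ULL };  /* low word is all-ones */

          ctr_add(iv, 1); /* wraps the low word, carries into the high word */
          printf("%016llx%016llx\n",
                 (unsigned long long)__builtin_bswap64(iv[0]),
                 (unsigned long long)__builtin_bswap64(iv[1]));
          return 0;
  }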
+ */ + cbz IV_PART, 2f + adr x16, 1f + sub x16, x16, IV_PART, lsl #3 + br x16 + bti c + mov v0.d[0], vctr.d[0] + bti c + mov v1.d[0], vctr.d[0] + bti c + mov v2.d[0], vctr.d[0] + bti c + mov v3.d[0], vctr.d[0] +ST5( bti c ) +ST5( mov v4.d[0], vctr.d[0] ) +1: b 2f + .previous -2: rev x7, x12 - ins vctr.d[1], x7 - sub x7, x12, #MAX_STRIDE - 1 - sub x8, x12, #MAX_STRIDE - 2 - sub x9, x12, #MAX_STRIDE - 3 - rev x7, x7 - rev x8, x8 - mov v1.d[1], x7 - rev x9, x9 -ST5( sub x10, x12, #MAX_STRIDE - 4 ) - mov v2.d[1], x8 -ST5( rev x10, x10 ) - mov v3.d[1], x9 -ST5( mov v4.d[1], x10 ) - tbnz w4, #31, .Lctrtail - ld1 {v5.16b-v7.16b}, [x1], #48 +2: rev x7, IV_PART + ins vctr.d[1], x7 + sub x7, IV_PART, #MAX_STRIDE - 1 + sub x8, IV_PART, #MAX_STRIDE - 2 + sub x9, IV_PART, #MAX_STRIDE - 3 + rev x7, x7 + rev x8, x8 + mov v1.d[1], x7 + rev x9, x9 +ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) + mov v2.d[1], x8 +ST5( rev x10, x10 ) + mov v3.d[1], x9 +ST5( mov v4.d[1], x10 ) + .endif + + /* + * If there are at least MAX_STRIDE blocks left, XOR the data with + * keystream and store. Otherwise jump to tail handling. + */ + tbnz BYTES_W, #31, .Lctrtail\xctr + ld1 {v5.16b-v7.16b}, [IN], #48 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) eor v0.16b, v5.16b, v0.16b -ST4( ld1 {v5.16b}, [x1], #16 ) +ST4( ld1 {v5.16b}, [IN], #16 ) eor v1.16b, v6.16b, v1.16b -ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) +ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) eor v2.16b, v7.16b, v2.16b eor v3.16b, v5.16b, v3.16b ST5( eor v4.16b, v6.16b, v4.16b ) - st1 {v0.16b-v3.16b}, [x0], #64 -ST5( st1 {v4.16b}, [x0], #16 ) - cbz w4, .Lctrout - b .LctrloopNx + st1 {v0.16b-v3.16b}, [OUT], #64 +ST5( st1 {v4.16b}, [OUT], #16 ) + cbz BYTES_W, .Lctrout\xctr + b .LctrloopNx\xctr -.Lctrout: - st1 {vctr.16b}, [x5] /* return next CTR value */ +.Lctrout\xctr: + .if !\xctr + st1 {vctr.16b}, [IV] /* return next CTR value */ + .endif ldp x29, x30, [sp], #16 ret -.Lctrtail: - /* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */ +.Lctrtail\xctr: + /* + * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext + * + * This code expects the last keystream block to be in v{MAX_STRIDE-1}. + * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and + * v4 should have the next two counter blocks. + * + * This allows us to store the ciphertext by writing to overlapping + * regions of memory. Any invalid ciphertext blocks get overwritten by + * correctly computed blocks. This approach greatly simplifies the + * logic for storing the ciphertext. 
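The overlapping-store idea can be sketched in portable C as follows (a hypothetical helper, not the kernel routine; in the real code the input for the final block is loaded from the same overlapped offsets, so both stores contain correct bytes for the region they share).

  #include <stdio.h>
  #include <string.h>

  /* Write blk0 at out, then write blk1 so it *ends* at out + len. */
  static void store_tail(unsigned char *out, size_t len,
                         const unsigned char blk0[16],
                         const unsigned char blk1[16])
  {
          memcpy(out, blk0, 16);                  /* first full block */
          memcpy(out + len - 16, blk1, 16);       /* overlaps blk0's tail */
  }

  int main(void)
  {
          unsigned char out[32] = { 0 };
          unsigned char a[16], b[16];

          memset(a, 'A', 16);
          memset(b, 'B', 16);
          store_tail(out, 24, a, b);      /* 24-byte message: 8-byte overlap */
          fwrite(out, 1, 24, stdout);     /* AAAAAAAABBBBBBBBBBBBBBBB */
          putchar('\n');
          return 0;
  }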
+ */ mov x16, #16 - ands x13, x4, #0xf - csel x13, x13, x16, ne + ands w7, BYTES_W, #0xf + csel x13, x7, x16, ne -ST5( cmp w4, #64 - (MAX_STRIDE << 4) ) +ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) ST5( csel x14, x16, xzr, gt ) - cmp w4, #48 - (MAX_STRIDE << 4) + cmp BYTES_W, #48 - (MAX_STRIDE << 4) csel x15, x16, xzr, gt - cmp w4, #32 - (MAX_STRIDE << 4) + cmp BYTES_W, #32 - (MAX_STRIDE << 4) csel x16, x16, xzr, gt - cmp w4, #16 - (MAX_STRIDE << 4) - ble .Lctrtail1x + cmp BYTES_W, #16 - (MAX_STRIDE << 4) - adr_l x12, .Lcts_permute_table - add x12, x12, x13 + adr_l x9, .Lcts_permute_table + add x9, x9, x13 + ble .Lctrtail1x\xctr -ST5( ld1 {v5.16b}, [x1], x14 ) - ld1 {v6.16b}, [x1], x15 - ld1 {v7.16b}, [x1], x16 +ST5( ld1 {v5.16b}, [IN], x14 ) + ld1 {v6.16b}, [IN], x15 + ld1 {v7.16b}, [IN], x16 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) - ld1 {v8.16b}, [x1], x13 - ld1 {v9.16b}, [x1] - ld1 {v10.16b}, [x12] + ld1 {v8.16b}, [IN], x13 + ld1 {v9.16b}, [IN] + ld1 {v10.16b}, [x9] ST4( eor v6.16b, v6.16b, v0.16b ) ST4( eor v7.16b, v7.16b, v1.16b ) @@ -453,24 +537,91 @@ ST5( eor v7.16b, v7.16b, v2.16b ) ST5( eor v8.16b, v8.16b, v3.16b ) ST5( eor v9.16b, v9.16b, v4.16b ) -ST5( st1 {v5.16b}, [x0], x14 ) - st1 {v6.16b}, [x0], x15 - st1 {v7.16b}, [x0], x16 - add x13, x13, x0 +ST5( st1 {v5.16b}, [OUT], x14 ) + st1 {v6.16b}, [OUT], x15 + st1 {v7.16b}, [OUT], x16 + add x13, x13, OUT st1 {v9.16b}, [x13] // overlapping stores - st1 {v8.16b}, [x0] - b .Lctrout + st1 {v8.16b}, [OUT] + b .Lctrout\xctr -.Lctrtail1x: - csel x0, x0, x6, eq // use finalbuf if less than a full block - ld1 {v5.16b}, [x1] +.Lctrtail1x\xctr: + /* + * Handle <= 16 bytes of plaintext + * + * This code always reads and writes 16 bytes. To avoid out of bounds + * accesses, XCTR and CTR modes must use a temporary buffer when + * encrypting/decrypting less than 16 bytes. + * + * This code is unusual in that it loads the input and stores the output + * relative to the end of the buffers rather than relative to the start. + * This causes unusual behaviour when encrypting/decrypting less than 16 + * bytes; the end of the data is expected to be at the end of the + * temporary buffer rather than the start of the data being at the start + * of the temporary buffer. + */ + sub x8, x7, #16 + csel x7, x7, x8, eq + add IN, IN, x7 + add OUT, OUT, x7 + ld1 {v5.16b}, [IN] + ld1 {v6.16b}, [OUT] ST5( mov v3.16b, v4.16b ) - encrypt_block v3, w3, x2, x8, w7 + encrypt_block v3, ROUNDS_W, KEY, x8, w7 + ld1 {v10.16b-v11.16b}, [x9] + tbl v3.16b, {v3.16b}, v10.16b + sshr v11.16b, v11.16b, #7 eor v5.16b, v5.16b, v3.16b - st1 {v5.16b}, [x0] - b .Lctrout + bif v5.16b, v6.16b, v11.16b + st1 {v5.16b}, [OUT] + b .Lctrout\xctr + + // Arguments + .unreq OUT + .unreq IN + .unreq KEY + .unreq ROUNDS_W + .unreq BYTES_W + .unreq IV + .unreq BYTE_CTR_W // XCTR only + // Intermediate values + .unreq CTR_W // XCTR only + .unreq CTR // XCTR only + .unreq IV_PART + .unreq BLOCKS + .unreq BLOCKS_W +.endm + + /* + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 ctr[]) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. 
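A minimal userspace sketch of this end-of-buffer convention, mirroring what the updated glue code above does with its temporary buf[]: fewer than 16 bytes are staged so the data ends at the end of a 16-byte buffer, letting the assembly always load and store whole blocks.

  #include <stdio.h>
  #include <string.h>

  #define AES_BLOCK_SIZE 16

  int main(void)
  {
          unsigned char buf[AES_BLOCK_SIZE] = { 0 };
          const unsigned char msg[] = { 1, 2, 3, 4, 5 };
          const size_t nbytes = sizeof(msg);

          /* Stage the tail at the *end* of the buffer, as the glue does. */
          memcpy(buf + sizeof(buf) - nbytes, msg, nbytes);

          /* ... aes_ctr_encrypt(buf, buf, ..., nbytes, ...) would run here;
           * the result is then copied back out from the same offset. */
          for (size_t i = 0; i < nbytes; i++)
                  printf("%02x ", buf[sizeof(buf) - nbytes + i]);
          putchar('\n');
          return 0;
  }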
+ */ + +AES_FUNC_START(aes_ctr_encrypt) + ctr_encrypt 0 AES_FUNC_END(aes_ctr_encrypt) + /* + * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 const iv[], int byte_ctr) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. + */ + +AES_FUNC_START(aes_xctr_encrypt) + ctr_encrypt 1 +AES_FUNC_END(aes_xctr_encrypt) + /* * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index cc52829d426a..23c209ec518f 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -58,11 +58,6 @@ struct aesbs_cbc_ctx { u32 enc[AES_MAX_KEYLENGTH_U32]; }; -struct aesbs_ctr_ctx { - struct aesbs_ctx key; /* must be first member */ - struct crypto_aes_ctx fallback; -}; - struct aesbs_xts_ctx { struct aesbs_ctx key; u32 twkey[AES_MAX_KEYLENGTH_U32]; @@ -202,25 +197,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = aes_expandkey(&ctx->fallback, in_key, key_len); - if (err) - return err; - - ctx->key.rounds = 6 + key_len / 4; - - kernel_neon_begin(); - aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds); - kernel_neon_end(); - - return 0; -} - static int ctr_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -287,29 +263,6 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, return aesbs_setkey(tfm, in_key, key_len); } -static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst) -{ - struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); - unsigned long flags; - - /* - * Temporarily disable interrupts to avoid races where - * cachelines are evicted when the CPU is interrupted - * to do something else. 
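For context, the removed ctr_encrypt_sync() wrappers and the new polyval glue later in this patch share the same check-at-call-time shape, sketched below with stand-in function names (not kernel APIs): try the NEON path when SIMD is usable, otherwise fall back to a portable implementation.

  #include <stdbool.h>
  #include <stddef.h>
  #include <stdio.h>

  static bool simd_usable(void)
  {
          return false;   /* e.g. the NEON unit is off-limits in hard IRQ */
  }

  static void neon_xor_stream(unsigned char *d, const unsigned char *s,
                              size_t n)
  {
          /* would run under kernel_neon_begin()/kernel_neon_end() */
          (void)d; (void)s; (void)n;
  }

  static void scalar_xor_stream(unsigned char *d, const unsigned char *s,
                                size_t n)
  {
          while (n--)
                  *d++ = *s++ ^ 0x5a;     /* placeholder transform */
  }

  static void xor_stream(unsigned char *d, const unsigned char *s, size_t n)
  {
          if (simd_usable())
                  neon_xor_stream(d, s, n);
          else
                  scalar_xor_stream(d, s, n);
  }

  int main(void)
  {
          unsigned char buf[4] = "abc";

          xor_stream(buf, buf, 3);
          printf("%02x %02x %02x\n", buf[0], buf[1], buf[2]);
          return 0;
  }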
- */ - local_irq_save(flags); - aes_encrypt(&ctx->fallback, dst, src); - local_irq_restore(flags); -} - -static int ctr_encrypt_sync(struct skcipher_request *req) -{ - if (!crypto_simd_usable()) - return crypto_ctr_encrypt_walk(req, ctr_encrypt_one); - - return ctr_encrypt(req); -} - static int __xts_crypt(struct skcipher_request *req, bool encrypt, void (*fn)(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[])) @@ -426,13 +379,12 @@ static int xts_decrypt(struct skcipher_request *req) } static struct skcipher_alg aes_algs[] = { { - .base.cra_name = "__ecb(aes)", - .base.cra_driver_name = "__ecb-aes-neonbs", + .base.cra_name = "ecb(aes)", + .base.cra_driver_name = "ecb-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aesbs_ctx), .base.cra_module = THIS_MODULE, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, @@ -441,13 +393,12 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(aes)", - .base.cra_driver_name = "__cbc-aes-neonbs", + .base.cra_name = "cbc(aes)", + .base.cra_driver_name = "cbc-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx), .base.cra_module = THIS_MODULE, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, @@ -457,13 +408,12 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, { - .base.cra_name = "__ctr(aes)", - .base.cra_driver_name = "__ctr-aes-neonbs", + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aesbs_ctx), .base.cra_module = THIS_MODULE, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, @@ -474,29 +424,12 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = ctr_encrypt, .decrypt = ctr_encrypt, }, { - .base.cra_name = "ctr(aes)", - .base.cra_driver_name = "ctr-aes-neonbs", - .base.cra_priority = 250 - 1, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .chunksize = AES_BLOCK_SIZE, - .walksize = 8 * AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aesbs_ctr_setkey_sync, - .encrypt = ctr_encrypt_sync, - .decrypt = ctr_encrypt_sync, -}, { - .base.cra_name = "__xts(aes)", - .base.cra_driver_name = "__xts-aes-neonbs", + .base.cra_name = "xts(aes)", + .base.cra_driver_name = "xts-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aesbs_xts_ctx), .base.cra_module = THIS_MODULE, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = 2 * AES_MIN_KEY_SIZE, .max_keysize = 2 * AES_MAX_KEY_SIZE, @@ -507,54 +440,17 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = xts_decrypt, } }; -static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)]; - static void aes_exit(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++) - if (aes_simd_algs[i]) - simd_skcipher_free(aes_simd_algs[i]); - crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } static int __init aes_init(void) { - struct simd_skcipher_alg *simd; - const char *basename; - const char *algname; - const char *drvname; - int err; 
- int i; - if (!cpu_have_named_feature(ASIMD)) return -ENODEV; - err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); - if (err) - return err; - - for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { - if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL)) - continue; - - algname = aes_algs[i].base.cra_name + 2; - drvname = aes_algs[i].base.cra_driver_name + 2; - basename = aes_algs[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(algname, drvname, basename); - err = PTR_ERR(simd); - if (IS_ERR(simd)) - goto unregister_simds; - - aes_simd_algs[i] = simd; - } - return 0; - -unregister_simds: - aes_exit(); - return err; + return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } module_init(aes_init); diff --git a/arch/arm64/crypto/polyval-ce-core.S b/arch/arm64/crypto/polyval-ce-core.S new file mode 100644 index 000000000000..b5326540d2e3 --- /dev/null +++ b/arch/arm64/crypto/polyval-ce-core.S @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Implementation of POLYVAL using ARMv8 Crypto Extensions. + * + * Copyright 2021 Google LLC + */ +/* + * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions. + * It works on 8 blocks at a time, by precomputing the first 8 key powers h^8, + * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split + * finite field multiplication into two steps. + * + * In the first step, we consider h^i, m_i as normal polynomials of degree less + * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication + * is simply polynomial multiplication. + * + * In the second step, we compute the reduction of p(x) modulo the finite field + * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. + * + * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where + * multiplication is finite field multiplication. The advantage is that the + * two-step process only requires 1 finite field reduction for every 8 + * polynomial multiplications. Further parallelism is gained by interleaving the + * multiplications and polynomial reductions. + */ + +#include <linux/linkage.h> +#define STRIDE_BLOCKS 8 + +KEY_POWERS .req x0 +MSG .req x1 +BLOCKS_LEFT .req x2 +ACCUMULATOR .req x3 +KEY_START .req x10 +EXTRA_BYTES .req x11 +TMP .req x13 + +M0 .req v0 +M1 .req v1 +M2 .req v2 +M3 .req v3 +M4 .req v4 +M5 .req v5 +M6 .req v6 +M7 .req v7 +KEY8 .req v8 +KEY7 .req v9 +KEY6 .req v10 +KEY5 .req v11 +KEY4 .req v12 +KEY3 .req v13 +KEY2 .req v14 +KEY1 .req v15 +PL .req v16 +PH .req v17 +TMP_V .req v18 +LO .req v20 +MI .req v21 +HI .req v22 +SUM .req v23 +GSTAR .req v24 + + .text + + .arch armv8-a+crypto + .align 4 + +.Lgstar: + .quad 0xc200000000000000, 0xc200000000000000 + +/* + * Computes the product of two 128-bit polynomials in X and Y and XORs the + * components of the 256-bit product into LO, MI, HI. + * + * Given: + * X = [X_1 : X_0] + * Y = [Y_1 : Y_0] + * + * We compute: + * LO += X_0 * Y_0 + * MI += (X_0 + X_1) * (Y_0 + Y_1) + * HI += X_1 * Y_1 + * + * Later, the 256-bit result can be extracted as: + * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0] + * This step is done when computing the polynomial reduction for efficiency + * reasons. + * + * Karatsuba multiplication is used instead of Schoolbook multiplication because + * it was found to be slightly faster on ARM64 CPUs.
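The three-multiply split can be sanity-checked with ordinary integer arithmetic; the assembly applies the same identity to the 64-bit halves of the 128-bit polynomials, with XOR playing the role of both addition and subtraction (so the explicit subtractions below vanish in GF(2^128)).

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t x = 0x1234, y = 0xabcd;
          uint32_t x1 = x >> 8, x0 = x & 0xff;
          uint32_t y1 = y >> 8, y0 = y & 0xff;

          uint32_t lo = x0 * y0;
          uint32_t hi = x1 * y1;
          /* One multiply yields both cross terms: x0*y1 + x1*y0. */
          uint32_t mi = (x0 + x1) * (y0 + y1) - lo - hi;

          /* Recombine: three multiplies instead of four. */
          uint32_t prod = (hi << 16) + (mi << 8) + lo;

          printf("%08x %08x\n", (unsigned)prod, (unsigned)(x * y));
          return 0;
  }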
+ * + */ +.macro karatsuba1 X Y + X .req \X + Y .req \Y + ext v25.16b, X.16b, X.16b, #8 + ext v26.16b, Y.16b, Y.16b, #8 + eor v25.16b, v25.16b, X.16b + eor v26.16b, v26.16b, Y.16b + pmull2 v28.1q, X.2d, Y.2d + pmull v29.1q, X.1d, Y.1d + pmull v27.1q, v25.1d, v26.1d + eor HI.16b, HI.16b, v28.16b + eor LO.16b, LO.16b, v29.16b + eor MI.16b, MI.16b, v27.16b + .unreq X + .unreq Y +.endm + +/* + * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into + * them. + */ +.macro karatsuba1_store X Y + X .req \X + Y .req \Y + ext v25.16b, X.16b, X.16b, #8 + ext v26.16b, Y.16b, Y.16b, #8 + eor v25.16b, v25.16b, X.16b + eor v26.16b, v26.16b, Y.16b + pmull2 HI.1q, X.2d, Y.2d + pmull LO.1q, X.1d, Y.1d + pmull MI.1q, v25.1d, v26.1d + .unreq X + .unreq Y +.endm + +/* + * Computes the 256-bit polynomial represented by LO, HI, MI. Stores + * the result in PL, PH. + * [PH : PL] = + * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0] + */ +.macro karatsuba2 + // v4 = [HI_1 + MI_1 : HI_0 + MI_0] + eor v4.16b, HI.16b, MI.16b + // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0] + eor v4.16b, v4.16b, LO.16b + // v5 = [HI_0 : LO_1] + ext v5.16b, LO.16b, HI.16b, #8 + // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0] + eor v4.16b, v4.16b, v5.16b + // HI = [HI_0 : HI_1] + ext HI.16b, HI.16b, HI.16b, #8 + // LO = [LO_0 : LO_1] + ext LO.16b, LO.16b, LO.16b, #8 + // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1] + ext PH.16b, v4.16b, HI.16b, #8 + // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0] + ext PL.16b, LO.16b, v4.16b, #8 +.endm + +/* + * Computes the 128-bit reduction of PH : PL. Stores the result in dest. + * + * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = + * x^128 + x^127 + x^126 + x^121 + 1. + * + * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the + * product of two 128-bit polynomials in Montgomery form. We need to reduce it + * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor + * of x^128, this product has two extra factors of x^128. To get it back into + * Montgomery form, we need to remove one of these factors by dividing by x^128. + * + * To accomplish both of these goals, we add multiples of g(x) that cancel out + * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low + * bits are zero, the polynomial division by x^128 can be done by right + * shifting. + * + * Since the only nonzero term in the low 64 bits of g(x) is the constant term, + * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can + * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + + * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to + * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T + * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. + * + * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits + * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 + * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * + * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : + * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). 
+ * + * So our final computation is: + * T = T_1 : T_0 = g*(x) * P_0 + * V = V_1 : V_0 = g*(x) * (P_1 + T_0) + * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 + * + * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 + * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : + * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. + */ +.macro montgomery_reduction dest + DEST .req \dest + // TMP_V = T_1 : T_0 = P_0 * g*(x) + pmull TMP_V.1q, PL.1d, GSTAR.1d + // TMP_V = T_0 : T_1 + ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 + // TMP_V = P_1 + T_0 : P_0 + T_1 + eor TMP_V.16b, PL.16b, TMP_V.16b + // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 + eor PH.16b, PH.16b, TMP_V.16b + // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x) + pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d + eor DEST.16b, PH.16b, TMP_V.16b + .unreq DEST +.endm + +/* + * Compute Polyval on 8 blocks. + * + * If reduce is set, also computes the montgomery reduction of the + * previous full_stride call and XORs with the first message block. + * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. + * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. + * + * Sets PL, PH. + */ +.macro full_stride reduce + eor LO.16b, LO.16b, LO.16b + eor MI.16b, MI.16b, MI.16b + eor HI.16b, HI.16b, HI.16b + + ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 + ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64 + + karatsuba1 M7 KEY1 + .if \reduce + pmull TMP_V.1q, PL.1d, GSTAR.1d + .endif + + karatsuba1 M6 KEY2 + .if \reduce + ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 + .endif + + karatsuba1 M5 KEY3 + .if \reduce + eor TMP_V.16b, PL.16b, TMP_V.16b + .endif + + karatsuba1 M4 KEY4 + .if \reduce + eor PH.16b, PH.16b, TMP_V.16b + .endif + + karatsuba1 M3 KEY5 + .if \reduce + pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d + .endif + + karatsuba1 M2 KEY6 + .if \reduce + eor SUM.16b, PH.16b, TMP_V.16b + .endif + + karatsuba1 M1 KEY7 + eor M0.16b, M0.16b, SUM.16b + + karatsuba1 M0 KEY8 + karatsuba2 +.endm + +/* + * Handle any extra blocks after full_stride loop. + */ +.macro partial_stride + add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4) + sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4 + ld1 {KEY1.16b}, [KEY_POWERS], #16 + + ld1 {TMP_V.16b}, [MSG], #16 + eor SUM.16b, SUM.16b, TMP_V.16b + karatsuba1_store KEY1 SUM + sub BLOCKS_LEFT, BLOCKS_LEFT, #1 + + tst BLOCKS_LEFT, #4 + beq .Lpartial4BlocksDone + ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 + ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 + karatsuba1 M0 KEY8 + karatsuba1 M1 KEY7 + karatsuba1 M2 KEY6 + karatsuba1 M3 KEY5 +.Lpartial4BlocksDone: + tst BLOCKS_LEFT, #2 + beq .Lpartial2BlocksDone + ld1 {M0.16b, M1.16b}, [MSG], #32 + ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32 + karatsuba1 M0 KEY8 + karatsuba1 M1 KEY7 +.Lpartial2BlocksDone: + tst BLOCKS_LEFT, #1 + beq .LpartialDone + ld1 {M0.16b}, [MSG], #16 + ld1 {KEY8.16b}, [KEY_POWERS], #16 + karatsuba1 M0 KEY8 +.LpartialDone: + karatsuba2 + montgomery_reduction SUM +.endm + +/* + * Perform montgomery multiplication in GF(2^128) and store result in op1. + * + * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 + * If op1, op2 are in montgomery form, this computes the montgomery + * form of op1*op2. 
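The bookkeeping around the extra factor can be illustrated with an integer analogy of Montgomery form, where R = 2^16 and a small prime stand in for x^128 and g(x); the GF(2^128) version follows the same shape with carry-less multiplies.

  #include <stdint.h>
  #include <stdio.h>

  /* Modular exponentiation, used only to get R^-1 mod N via Fermat. */
  static uint64_t modpow(uint64_t b, uint64_t e, uint64_t m)
  {
          uint64_t r = 1;

          b %= m;
          while (e) {
                  if (e & 1)
                          r = r * b % m;
                  b = b * b % m;
                  e >>= 1;
          }
          return r;
  }

  int main(void)
  {
          const uint64_t N = 65521, R = 65536;    /* N prime, R = 2^16 */
          const uint64_t Rinv = modpow(R, N - 2, N);
          const uint64_t a = 12345, b = 54321;

          /* Montgomery forms carry one extra factor of R. */
          uint64_t am = a * R % N, bm = b * R % N;

          /* A raw product carries R twice; one division by R (the
           * "reduction") returns it to Montgomery form. */
          uint64_t abm = am * bm % N * Rinv % N;

          /* Leaving Montgomery form divides by R once more. */
          printf("%llu %llu\n", (unsigned long long)(abm * Rinv % N),
                 (unsigned long long)(a * b % N));
          return 0;
  }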
+ * + * void pmull_polyval_mul(u8 *op1, const u8 *op2); + */ +SYM_FUNC_START(pmull_polyval_mul) + adr TMP, .Lgstar + ld1 {GSTAR.2d}, [TMP] + ld1 {v0.16b}, [x0] + ld1 {v1.16b}, [x1] + karatsuba1_store v0 v1 + karatsuba2 + montgomery_reduction SUM + st1 {SUM.16b}, [x0] + ret +SYM_FUNC_END(pmull_polyval_mul) + +/* + * Perform polynomial evaluation as specified by POLYVAL. This computes: + * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} + * where n=nblocks, h is the hash key, and m_i are the message blocks. + * + * x0 - pointer to precomputed key powers h^8 ... h^1 + * x1 - pointer to message blocks + * x2 - number of blocks to hash + * x3 - pointer to accumulator + * + * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, + * size_t nblocks, u8 *accumulator); + */ +SYM_FUNC_START(pmull_polyval_update) + adr TMP, .Lgstar + mov KEY_START, KEY_POWERS + ld1 {GSTAR.2d}, [TMP] + ld1 {SUM.16b}, [ACCUMULATOR] + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + blt .LstrideLoopExit + ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 + ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64 + full_stride 0 + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + blt .LstrideLoopExitReduce +.LstrideLoop: + full_stride 1 + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + bge .LstrideLoop +.LstrideLoopExitReduce: + montgomery_reduction SUM +.LstrideLoopExit: + adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + beq .LskipPartial + partial_stride +.LskipPartial: + st1 {SUM.16b}, [ACCUMULATOR] + ret +SYM_FUNC_END(pmull_polyval_update) diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c new file mode 100644 index 000000000000..0a3b5718df85 --- /dev/null +++ b/arch/arm64/crypto/polyval-ce-glue.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Glue code for POLYVAL using ARMv8 Crypto Extensions + * + * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Copyright 2021 Google LLC + */ + +/* + * Glue code based on ghash-clmulni-intel_glue.c. + * + * This implementation of POLYVAL uses montgomery multiplication accelerated by + * ARMv8 Crypto Extensions instructions to implement the finite field operations. + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/polyval.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#define NUM_KEY_POWERS 8 + +struct polyval_tfm_ctx { + /* + * These powers must be in the order h^8, ..., h^1.
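A sketch of how the setkey path below fills this h^8, ..., h^1 table, with integer arithmetic mod a prime standing in for the GF(2^128) multiplication (p and h are arbitrary stand-ins):

  #include <stdint.h>
  #include <stdio.h>

  #define NUM_KEY_POWERS 8

  int main(void)
  {
          const uint64_t p = 1000003, h = 12345;  /* stand-ins */
          uint64_t pow[NUM_KEY_POWERS];           /* pow[i] holds h^(8-i) */
          int i;

          pow[NUM_KEY_POWERS - 1] = h;            /* h^1 last, as required */
          for (i = NUM_KEY_POWERS - 2; i >= 0; i--)
                  pow[i] = pow[i + 1] * h % p;

          for (i = 0; i < NUM_KEY_POWERS; i++)
                  printf("h^%d = %llu\n", NUM_KEY_POWERS - i,
                         (unsigned long long)pow[i]);
          return 0;
  }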
+ */ + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; +}; + +struct polyval_desc_ctx { + u8 buffer[POLYVAL_BLOCK_SIZE]; + u32 bytes; +}; + +asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator); +asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); + +static void internal_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator) +{ + if (likely(crypto_simd_usable())) { + kernel_neon_begin(); + pmull_polyval_update(keys, in, nblocks, accumulator); + kernel_neon_end(); + } else { + polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, + nblocks, accumulator); + } +} + +static void internal_polyval_mul(u8 *op1, const u8 *op2) +{ + if (likely(crypto_simd_usable())) { + kernel_neon_begin(); + pmull_polyval_mul(op1, op2); + kernel_neon_end(); + } else { + polyval_mul_non4k(op1, op2); + } +} + +static int polyval_arm64_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); + int i; + + if (keylen != POLYVAL_BLOCK_SIZE) + return -EINVAL; + + memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); + + for (i = NUM_KEY_POWERS-2; i >= 0; i--) { + memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); + internal_polyval_mul(tctx->key_powers[i], + tctx->key_powers[i+1]); + } + + return 0; +} + +static int polyval_arm64_init(struct shash_desc *desc) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int polyval_arm64_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + u8 *pos; + unsigned int nblocks; + unsigned int n; + + if (dctx->bytes) { + n = min(srclen, dctx->bytes); + pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + while (srclen >= POLYVAL_BLOCK_SIZE) { + /* allow rescheduling every 4K bytes */ + nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; + internal_polyval_update(tctx, src, nblocks, dctx->buffer); + srclen -= nblocks * POLYVAL_BLOCK_SIZE; + src += nblocks * POLYVAL_BLOCK_SIZE; + } + + if (srclen) { + dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; + pos = dctx->buffer; + while (srclen--) + *pos++ ^= *src++; + } + + return 0; +} + +static int polyval_arm64_final(struct shash_desc *desc, u8 *dst) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + + if (dctx->bytes) { + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg polyval_alg = { + .digestsize = POLYVAL_DIGEST_SIZE, + .init = polyval_arm64_init, + .update = polyval_arm64_update, + .final = polyval_arm64_final, + .setkey = polyval_arm64_setkey, + .descsize = sizeof(struct polyval_desc_ctx), + .base = { + .cra_name = "polyval", + .cra_driver_name = "polyval-ce", + .cra_priority = 200, + .cra_blocksize = POLYVAL_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct polyval_tfm_ctx), + .cra_module = THIS_MODULE, + }, +}; + +static int __init polyval_ce_mod_init(void) +{ + return crypto_register_shash(&polyval_alg); +} + +static void __exit 
polyval_ce_mod_exit(void) +{ + crypto_unregister_shash(&polyval_alg); +} + +module_cpu_feature_match(PMULL, polyval_ce_mod_init) +module_exit(polyval_ce_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions"); +MODULE_ALIAS_CRYPTO("polyval"); +MODULE_ALIAS_CRYPTO("polyval-ce"); diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 7e157ab6cd50..610fcdc0d771 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -7,6 +7,7 @@ #define ARM64_CB_PATCH ARM64_NCAPS +#ifndef BUILD_FIPS140_KO #ifndef __ASSEMBLY__ #include @@ -207,4 +208,33 @@ alternative_endif #define ALTERNATIVE(oldinstr, newinstr, ...) \ _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) +#else + +/* + * The FIPS140 module does not support alternatives patching, as this + * invalidates the HMAC digest of the .text section. However, some alternatives + * are known to be irrelevant so we can tolerate them in the FIPS140 module, as + * they will never be applied in the first place in the use cases that the + * FIPS140 module targets (Android running on a production phone). Any other + * uses of alternatives should be avoided, as it is not safe in the general + * case to simply use the default sequence in one place (the fips module) and + * the alternative sequence everywhere else. + * + * Below is an allowlist of features that we can ignore, by simply taking the + * safe default instruction sequence. Note that this implies that the FIPS140 + * module is not compatible with VHE, or with pseudo-NMI support. + */ + +#define __ALT_ARM64_HAS_LDAPR 0, +#define __ALT_ARM64_HAS_VIRT_HOST_EXTN 0, +#define __ALT_ARM64_HAS_IRQ_PRIO_MASKING 0, + +#define ALTERNATIVE(oldinstr, newinstr, feature, ...) \ + _ALTERNATIVE(oldinstr, __ALT_ ## feature, #feature) + +#define _ALTERNATIVE(oldinstr, feature, feature_str) \ + __take_second_arg(feature oldinstr, \ + ".err Feature " feature_str " not supported in fips140 module") + +#endif /* BUILD_FIPS140_KO */ #endif /* __ASM_ALTERNATIVE_MACROS_H */ diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index 09e43272ccb0..dd7d9b4b6685 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -109,7 +109,7 @@ static inline bool __init __early_cpu_has_rndr(void) { /* Open code as we run prior to the first call to cpufeature. 
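A standalone demo of the token-pasting trick behind this ALTERNATIVE() override. The macro names mirror the kernel's but the feature names are made up, and the poison here is a printable string rather than the real .err assembler directive: an allowlisted feature expands to "0,", which shifts the default sequence into the second argument slot; anything else leaves the poison behind.

  #include <stdio.h>

  #define __take_second_arg(__ignored, val, ...) val

  #define __ALT_FEATURE_OK 0,     /* allowlisted: keep the default sequence */

  #define ALTERNATIVE(oldinstr, feature) \
          _ALTERNATIVE(oldinstr, __ALT_ ## feature, #feature)

  #define _ALTERNATIVE(oldinstr, feature, feature_str) \
          __take_second_arg(feature oldinstr, \
                            "error: feature " feature_str " not supported")

  int main(void)
  {
          puts(ALTERNATIVE("default instruction sequence", FEATURE_OK));
          puts(ALTERNATIVE("default instruction sequence", FEATURE_BAD));
          return 0;
  }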
*/ unsigned long ftr = read_sysreg_s(SYS_ID_AA64ISAR0_EL1); - return (ftr >> ID_AA64ISAR0_RNDR_SHIFT) & 0xf; + return (ftr >> ID_AA64ISAR0_EL1_RNDR_SHIFT) & 0xf; } static inline bool __init __must_check diff --git a/arch/arm64/include/asm/asm_pointer_auth.h b/arch/arm64/include/asm/asm_pointer_auth.h index f1bba5fc61c4..13ecc79854ee 100644 --- a/arch/arm64/include/asm/asm_pointer_auth.h +++ b/arch/arm64/include/asm/asm_pointer_auth.h @@ -59,7 +59,10 @@ alternative_else_nop_endif .macro __ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3 mrs \tmp1, id_aa64isar1_el1 - ubfx \tmp1, \tmp1, #ID_AA64ISAR1_APA_SHIFT, #8 + ubfx \tmp1, \tmp1, #ID_AA64ISAR1_EL1_APA_SHIFT, #8 + mrs_s \tmp2, SYS_ID_AA64ISAR2_EL1 + ubfx \tmp2, \tmp2, #ID_AA64ISAR2_EL1_APA3_SHIFT, #4 + orr \tmp1, \tmp1, \tmp2 cbz \tmp1, .Lno_addr_auth\@ mov_q \tmp1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 448a575db8e8..60cf15176337 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -395,8 +395,8 @@ alternative_cb_end .macro tcr_compute_pa_size, tcr, pos, tmp0, tmp1 mrs \tmp0, ID_AA64MMFR0_EL1 // Narrow PARange to fit the PS field in TCR_ELx - ubfx \tmp0, \tmp0, #ID_AA64MMFR0_PARANGE_SHIFT, #3 - mov \tmp1, #ID_AA64MMFR0_PARANGE_MAX + ubfx \tmp0, \tmp0, #ID_AA64MMFR0_EL1_PARANGE_SHIFT, #3 + mov \tmp1, #ID_AA64MMFR0_EL1_PARANGE_MAX cmp \tmp0, \tmp1 csel \tmp0, \tmp1, \tmp0, hi bfi \tcr, \tmp0, \pos, #3 @@ -479,7 +479,7 @@ alternative_endif */ .macro reset_pmuserenr_el0, tmpreg mrs \tmpreg, id_aa64dfr0_el1 - sbfx \tmpreg, \tmpreg, #ID_AA64DFR0_PMUVER_SHIFT, #4 + sbfx \tmpreg, \tmpreg, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 cmp \tmpreg, #1 // Skip if no PMU present b.lt 9000f msr pmuserenr_el0, xzr // Disable PMU access from EL0 @@ -491,7 +491,7 @@ alternative_endif */ .macro reset_amuserenr_el0, tmpreg mrs \tmpreg, id_aa64pfr0_el1 // Check ID_AA64PFR0_EL1 - ubfx \tmpreg, \tmpreg, #ID_AA64PFR0_AMU_SHIFT, #4 + ubfx \tmpreg, \tmpreg, #ID_AA64PFR0_EL1_AMU_SHIFT, #4 cbz \tmpreg, .Lskip_\@ // Skip if no AMU present msr_s SYS_AMUSERENR_EL0, xzr // Disable AMU access from EL0 .Lskip_\@: @@ -584,7 +584,7 @@ alternative_endif .macro offset_ttbr1, ttbr, tmp #ifdef CONFIG_ARM64_VA_BITS_52 mrs_s \tmp, SYS_ID_AA64MMFR2_EL1 - and \tmp, \tmp, #(0xf << ID_AA64MMFR2_LVA_SHIFT) + and \tmp, \tmp, #(0xf << ID_AA64MMFR2_EL1_VARange_SHIFT) cbnz \tmp, .Lskipoffs_\@ orr \ttbr, \ttbr, #TTBR1_BADDR_4852_OFFSET .Lskipoffs_\@ : @@ -787,6 +787,16 @@ alternative_endif .Lnoyield_\@: .endm +/* + * Branch Target Identifier (BTI) + */ + .macro bti, targets + .equ .L__bti_targets_c, 34 + .equ .L__bti_targets_j, 36 + .equ .L__bti_targets_jc,38 + hint #.L__bti_targets_\targets + .endm + /* * This macro emits a program property note section identifying * architecture features which require special handling, mainly for diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 451e11e5fd23..7eb4ed873a9d 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -16,14 +16,18 @@ #define sev() asm volatile("sev" : : : "memory") #define wfe() asm volatile("wfe" : : : "memory") +#define wfet(val) asm volatile("msr s0_3_c1_c0_0, %0" \ + : : "r" (val) : "memory") #define wfi() asm volatile("wfi" : : : "memory") +#define wfit(val) asm volatile("msr s0_3_c1_c0_1, %0" \ + : : "r" (val) : "memory") #define isb() asm volatile("isb" : : : "memory") #define dmb(opt) asm volatile("dmb " 
#opt : : : "memory") #define dsb(opt) asm volatile("dsb " #opt : : : "memory") #define psb_csync() asm volatile("hint #17" : : : "memory") -#define tsb_csync() asm volatile("hint #18" : : : "memory") +#define __tsb_csync() asm volatile("hint #18" : : : "memory") #define csdb() asm volatile("hint #20" : : : "memory") #ifdef CONFIG_ARM64_PSEUDO_NMI @@ -46,6 +50,20 @@ #define dma_rmb() dmb(oshld) #define dma_wmb() dmb(oshst) + +#define tsb_csync() \ + do { \ + /* \ + * CPUs affected by Arm Erratum 2054223 or 2067961 needs \ + * another TSB to ensure the trace is flushed. The barriers \ + * don't have to be strictly back to back, as long as the \ + * CPU is in trace prohibited state. \ + */ \ + if (cpus_have_final_cap(ARM64_WORKAROUND_TSB_FLUSH_FAILURE)) \ + __tsb_csync(); \ + __tsb_csync(); \ + } while (0) + /* * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz * and 0 otherwise. diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index a074459f8f2f..933facc11ee6 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -5,33 +5,9 @@ #ifndef __ASM_CACHE_H #define __ASM_CACHE_H -#include - -#define CTR_L1IP_SHIFT 14 -#define CTR_L1IP_MASK 3 -#define CTR_DMINLINE_SHIFT 16 -#define CTR_IMINLINE_SHIFT 0 -#define CTR_IMINLINE_MASK 0xf -#define CTR_ERG_SHIFT 20 -#define CTR_CWG_SHIFT 24 -#define CTR_CWG_MASK 15 -#define CTR_IDC_SHIFT 28 -#define CTR_DIC_SHIFT 29 - -#define CTR_CACHE_MINLINE_MASK \ - (0xf << CTR_DMINLINE_SHIFT | CTR_IMINLINE_MASK << CTR_IMINLINE_SHIFT) - -#define CTR_L1IP(ctr) (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK) - -#define ICACHE_POLICY_VPIPT 0 -#define ICACHE_POLICY_RESERVED 1 -#define ICACHE_POLICY_VIPT 2 -#define ICACHE_POLICY_PIPT 3 - #define L1_CACHE_SHIFT (6) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) - #define CLIDR_LOUU_SHIFT 27 #define CLIDR_LOC_SHIFT 24 #define CLIDR_LOUIS_SHIFT 21 @@ -59,6 +35,11 @@ #include +#include +#include + +#define CTR_L1IP(ctr) (((ctr) >> CTR_EL0_L1Ip_SHIFT) & CTR_EL0_L1Ip_MASK) + #define ICACHEF_ALIASING 0 #define ICACHEF_VPIPT 1 extern unsigned long __icache_flags; @@ -79,7 +60,7 @@ static __always_inline int icache_is_vpipt(void) static inline u32 cache_type_cwg(void) { - return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK; + return (read_cpuid_cachetype() >> CTR_EL0_CWG_SHIFT) & CTR_EL0_CWG_MASK; } #define __read_mostly __section(".data..read_mostly") @@ -113,12 +94,12 @@ static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void) { u32 ctr = read_cpuid_cachetype(); - if (!(ctr & BIT(CTR_IDC_SHIFT))) { + if (!(ctr & BIT(CTR_EL0_IDC_SHIFT))) { u64 clidr = read_sysreg(clidr_el1); if (CLIDR_LOC(clidr) == 0 || (CLIDR_LOUIS(clidr) == 0 && CLIDR_LOUU(clidr) == 0)) - ctr |= BIT(CTR_IDC_SHIFT); + ctr |= BIT(CTR_EL0_IDC_SHIFT); } return ctr; diff --git a/arch/arm64/include/asm/compiler.h b/arch/arm64/include/asm/compiler.h index dc3ea4080e2e..6fb2e6bcc392 100644 --- a/arch/arm64/include/asm/compiler.h +++ b/arch/arm64/include/asm/compiler.h @@ -23,20 +23,4 @@ #define __builtin_return_address(val) \ (void *)(ptrauth_clear_pac((unsigned long)__builtin_return_address(val))) -#ifdef CONFIG_CFI_CLANG -/* - * With CONFIG_CFI_CLANG, the compiler replaces function address - * references with the address of the function's CFI jump table - * entry. The function_nocfi macro always returns the address of the - * actual function instead. 
- */ -#define function_nocfi(x) ({ \ - void *addr; \ - asm("adrp %0, " __stringify(x) "\n\t" \ - "add %0, %0, :lo12:" __stringify(x) \ - : "=r" (addr)); \ - addr; \ -}) -#endif - #endif /* __ASM_COMPILER_H */ diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index a58e366f0b07..115cdec1ae87 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -58,11 +58,15 @@ struct cpuinfo_arm64 { u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; u64 reg_id_aa64zfr0; + u64 reg_id_aa64smfr0; struct cpuinfo_32bit aarch32; /* pseudo-ZCR for recording maximum ZCR_EL1 LEN value: */ u64 reg_zcr; + + /* pseudo-SMCR for recording maximum SMCR_EL1 LEN value: */ + u64 reg_smcr; }; DECLARE_PER_CPU(struct cpuinfo_arm64, cpu_data); diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a77b5f49b3a6..2185304a20bb 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -356,6 +356,7 @@ struct arm64_cpu_capabilities { struct { /* Feature register checking */ u32 sys_reg; u8 field_pos; + u8 field_width; u8 min_field_value; u8 hwcap_type; bool sign; @@ -552,7 +553,7 @@ cpuid_feature_cap_perfmon_field(u64 features, int field, u64 cap) u64 mask = GENMASK_ULL(field + 3, field); /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */ - if (val == ID_AA64DFR0_PMUVER_IMP_DEF) + if (val == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) val = 0; if (val > cap) { @@ -594,36 +595,43 @@ static inline s64 arm64_ftr_value(const struct arm64_ftr_bits *ftrp, u64 val) static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0) { - return cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_BIGENDEL_SHIFT) == 0x1 || - cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_BIGENDEL0_SHIFT) == 0x1; + return cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGEND_SHIFT) == 0x1 || + cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT) == 0x1; } static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) { - u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT); + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_EL1_SHIFT); - return val == ID_AA64PFR0_ELx_32BIT_64BIT; + return val == ID_AA64PFR0_EL1_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { - u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_EL0_SHIFT); - return val == ID_AA64PFR0_ELx_32BIT_64BIT; + return val == ID_AA64PFR0_EL1_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_sve(u64 pfr0) { - u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_SVE_SHIFT); + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SVE_SHIFT); + + return val > 0; +} + +static inline bool id_aa64pfr1_sme(u64 pfr1) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_SME_SHIFT); return val > 0; } static inline bool id_aa64pfr1_mte(u64 pfr1) { - u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_MTE_SHIFT); + u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_MTE_SHIFT); - return val >= ID_AA64PFR1_MTE; + return val >= ID_AA64PFR1_EL1_MTE_MTE2; } void __init setup_cpu_features(void); @@ -649,7 +657,7 @@ static inline bool supports_csv2p3(int scope) pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); csv2_val = cpuid_feature_extract_unsigned_field(pfr0, - ID_AA64PFR0_CSV2_SHIFT); + ID_AA64PFR0_EL1_CSV2_SHIFT); return csv2_val 
== 3; } @@ -663,7 +671,7 @@ static inline bool supports_clearbhb(int scope) isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); return cpuid_feature_extract_unsigned_field(isar2, - ID_AA64ISAR2_CLEARBHB_SHIFT); + ID_AA64ISAR2_EL1_BC_SHIFT); } const struct cpumask *system_32bit_el0_cpumask(void); @@ -684,10 +692,10 @@ static inline bool system_supports_4kb_granule(void) mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, - ID_AA64MMFR0_TGRAN4_SHIFT); + ID_AA64MMFR0_EL1_TGRAN4_SHIFT); - return (val >= ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN) && - (val <= ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX); + return (val >= ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN) && + (val <= ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX); } static inline bool system_supports_64kb_granule(void) @@ -697,10 +705,10 @@ static inline bool system_supports_64kb_granule(void) mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, - ID_AA64MMFR0_TGRAN64_SHIFT); + ID_AA64MMFR0_EL1_TGRAN64_SHIFT); - return (val >= ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN) && - (val <= ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX); + return (val >= ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN) && + (val <= ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX); } static inline bool system_supports_16kb_granule(void) @@ -710,10 +718,10 @@ static inline bool system_supports_16kb_granule(void) mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, - ID_AA64MMFR0_TGRAN16_SHIFT); + ID_AA64MMFR0_EL1_TGRAN16_SHIFT); - return (val >= ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN) && - (val <= ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX); + return (val >= ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN) && + (val <= ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX); } static inline bool system_supports_mixed_endian_el0(void) @@ -728,7 +736,7 @@ static inline bool system_supports_mixed_endian(void) mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, - ID_AA64MMFR0_BIGENDEL_SHIFT); + ID_AA64MMFR0_EL1_BIGEND_SHIFT); return val == 0x1; } @@ -756,6 +764,23 @@ static __always_inline bool system_supports_sve(void) cpus_have_const_cap(ARM64_SVE); } +static __always_inline bool system_supports_sme(void) +{ + return IS_ENABLED(CONFIG_ARM64_SME) && + cpus_have_const_cap(ARM64_SME); +} + +static __always_inline bool system_supports_fa64(void) +{ + return IS_ENABLED(CONFIG_ARM64_SME) && + cpus_have_const_cap(ARM64_SME_FA64); +} + +static __always_inline bool system_supports_tpidr2(void) +{ + return system_supports_sme(); +} + static __always_inline bool system_supports_cnp(void) { return IS_ENABLED(CONFIG_ARM64_CNP) && @@ -813,13 +838,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { - case ID_AA64MMFR0_PARANGE_32: return 32; - case ID_AA64MMFR0_PARANGE_36: return 36; - case ID_AA64MMFR0_PARANGE_40: return 40; - case ID_AA64MMFR0_PARANGE_42: return 42; - case ID_AA64MMFR0_PARANGE_44: return 44; - case ID_AA64MMFR0_PARANGE_48: return 48; - case ID_AA64MMFR0_PARANGE_52: return 52; + case ID_AA64MMFR0_EL1_PARANGE_32: return 32; + case ID_AA64MMFR0_EL1_PARANGE_36: return 36; + case ID_AA64MMFR0_EL1_PARANGE_40: return 40; + case ID_AA64MMFR0_EL1_PARANGE_42: return 42; + case ID_AA64MMFR0_EL1_PARANGE_44: return 44; + case ID_AA64MMFR0_EL1_PARANGE_48: return 48; + case ID_AA64MMFR0_EL1_PARANGE_52: return 52; /* * A future PE could use a value unknown to the 
kernel. * However, by the "D10.1.4 Principles of the ID scheme @@ -841,14 +866,14 @@ static inline bool cpu_has_hw_af(void) mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); return cpuid_feature_extract_unsigned_field(mmfr1, - ID_AA64MMFR1_HADBS_SHIFT); + ID_AA64MMFR1_EL1_HAFDBS_SHIFT); } static inline bool cpu_has_pan(void) { u64 mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); return cpuid_feature_extract_unsigned_field(mmfr1, - ID_AA64MMFR1_PAN_SHIFT); + ID_AA64MMFR1_EL1_PAN_SHIFT); } #ifdef CONFIG_ARM64_AMU_EXTN @@ -869,8 +894,8 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) int vmid_bits; vmid_bits = cpuid_feature_extract_unsigned_field(mmfr1, - ID_AA64MMFR1_VMIDBITS_SHIFT); - if (vmid_bits == ID_AA64MMFR1_VMIDBITS_16) + ID_AA64MMFR1_EL1_VMIDBits_SHIFT); + if (vmid_bits == ID_AA64MMFR1_EL1_VMIDBits_16) return 16; /* @@ -880,9 +905,12 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) return 8; } +struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id); + extern struct arm64_ftr_override id_aa64mmfr1_override; extern struct arm64_ftr_override id_aa64pfr1_override; extern struct arm64_ftr_override id_aa64isar1_override; +extern struct arm64_ftr_override id_aa64isar2_override; u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 9cf5d9551e99..190ed1c259ec 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -112,6 +112,10 @@ #define APPLE_CPU_PART_M1_ICESTORM 0x022 #define APPLE_CPU_PART_M1_FIRESTORM 0x023 +#define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024 +#define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 +#define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 +#define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 #define AMPERE_CPU_PART_AMPERE1 0xAC3 @@ -154,6 +158,10 @@ #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) +#define MIDR_APPLE_M1_ICESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_PRO) +#define MIDR_APPLE_M1_FIRESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_PRO) +#define MIDR_APPLE_M1_ICESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_MAX) +#define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 7f3c87f7a0ce..2256f22e336d 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -40,7 +40,7 @@ .macro __init_el2_debug mrs x1, id_aa64dfr0_el1 - sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 + sbfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 cmp x0, #1 b.lt .Lskip_pmu_\@ // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps @@ -49,7 +49,7 @@ csel x2, xzr, x0, lt // all PMU counters from EL1 /* Statistical profiling */ - ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 + ubfx x0, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 cbz x0, .Lskip_spe_\@ // Skip if SPE not present mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2, @@ -65,7 +65,7 @@ .Lskip_spe_\@: /* Trace buffer */ - ubfx x0, x1, #ID_AA64DFR0_TRBE_SHIFT, #4 + ubfx x0, x1, #ID_AA64DFR0_EL1_TraceBuffer_SHIFT, #4 cbz x0, 
.Lskip_trace_\@ // Skip if TraceBuffer is not present mrs_s x0, SYS_TRBIDR_EL1 @@ -83,7 +83,7 @@ /* LORegions */ .macro __init_el2_lor mrs x1, id_aa64mmfr1_el1 - ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4 + ubfx x0, x1, #ID_AA64MMFR1_EL1_LO_SHIFT, 4 cbz x0, .Lskip_lor_\@ msr_s SYS_LORC_EL1, xzr .Lskip_lor_\@: @@ -97,7 +97,7 @@ /* GICv3 system register access */ .macro __init_el2_gicv3 mrs x0, id_aa64pfr0_el1 - ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4 + ubfx x0, x0, #ID_AA64PFR0_EL1_GIC_SHIFT, #4 cbz x0, .Lskip_gicv3_\@ mrs_s x0, SYS_ICC_SRE_EL2 @@ -132,7 +132,7 @@ /* SVE register access */ .macro __init_el2_nvhe_sve mrs x1, id_aa64pfr0_el1 - ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4 + ubfx x1, x1, #ID_AA64PFR0_EL1_SVE_SHIFT, #4 cbz x1, .Lskip_sve_\@ bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps @@ -143,29 +143,84 @@ .Lskip_sve_\@: .endm +/* SME register access and priority mapping */ +.macro __init_el2_nvhe_sme + mrs x1, id_aa64pfr1_el1 + ubfx x1, x1, #ID_AA64PFR1_EL1_SME_SHIFT, #4 + cbz x1, .Lskip_sme_\@ + + bic x0, x0, #CPTR_EL2_TSM // Also disable SME traps + msr cptr_el2, x0 // Disable copro. traps to EL2 + isb + + mrs x1, sctlr_el2 + orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps + msr sctlr_el2, x1 + isb + + mov x1, #0 // SMCR controls + + mrs_s x2, SYS_ID_AA64SMFR0_EL1 + ubfx x2, x2, #ID_AA64SMFR0_EL1_FA64_SHIFT, #1 // Full FP in SM? + cbz x2, .Lskip_sme_fa64_\@ + + orr x1, x1, SMCR_ELx_FA64_MASK +.Lskip_sme_fa64_\@: + + orr x1, x1, #SMCR_ELx_LEN_MASK // Enable full SME vector + msr_s SYS_SMCR_EL2, x1 // length for EL1. + + mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported? + ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1 + cbz x1, .Lskip_sme_\@ + + msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal + + mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present? + ubfx x1, x1, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 + cbz x1, .Lskip_sme_\@ + + mrs_s x1, SYS_HCRX_EL2 + orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping + msr_s SYS_HCRX_EL2, x1 + +.Lskip_sme_\@: +.endm + /* Disable any fine grained traps */ .macro __init_el2_fgt mrs x1, id_aa64mmfr0_el1 - ubfx x1, x1, #ID_AA64MMFR0_FGT_SHIFT, #4 + ubfx x1, x1, #ID_AA64MMFR0_EL1_FGT_SHIFT, #4 cbz x1, .Lskip_fgt_\@ mov x0, xzr mrs x1, id_aa64dfr0_el1 - ubfx x1, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 + ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 cmp x1, #3 - b.lt .Lset_fgt_\@ + b.lt .Lset_debug_fgt_\@ /* Disable PMSNEVFR_EL1 read and write traps */ orr x0, x0, #(1 << 62) -.Lset_fgt_\@: +.Lset_debug_fgt_\@: msr_s SYS_HDFGRTR_EL2, x0 msr_s SYS_HDFGWTR_EL2, x0 - msr_s SYS_HFGRTR_EL2, xzr - msr_s SYS_HFGWTR_EL2, xzr + + mov x0, xzr + mrs x1, id_aa64pfr1_el1 + ubfx x1, x1, #ID_AA64PFR1_EL1_SME_SHIFT, #4 + cbz x1, .Lset_fgt_\@ + + /* Disable nVHE traps of TPIDR2 and SMPRI */ + orr x0, x0, #HFGxTR_EL2_nSMPRI_EL1_MASK + orr x0, x0, #HFGxTR_EL2_nTPIDR2_EL0_MASK + +.Lset_fgt_\@: + msr_s SYS_HFGRTR_EL2, x0 + msr_s SYS_HFGWTR_EL2, x0 msr_s SYS_HFGITR_EL2, xzr mrs x1, id_aa64pfr0_el1 // AMU traps UNDEF without AMU - ubfx x1, x1, #ID_AA64PFR0_AMU_SHIFT, #4 + ubfx x1, x1, #ID_AA64PFR0_EL1_AMU_SHIFT, #4 cbz x1, .Lskip_fgt_\@ msr_s SYS_HAFGRTR_EL2, xzr @@ -177,6 +232,22 @@ msr spsr_el2, x0 .endm +.macro __init_el2_mpam +#ifdef CONFIG_ARM64_MPAM + /* Memory Partioning And Monitoring: disable EL2 traps */ + mrs x1, id_aa64pfr0_el1 + ubfx x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4 + cbz x0, .Lskip_mpam_\@ // skip if no MPAM + msr_s SYS_MPAM0_EL1, xzr // use the default partition.. 
+ msr_s SYS_MPAM2_EL2, xzr // ..and disable lower traps + msr_s SYS_MPAM1_EL1, xzr + mrs_s x0, SYS_MPAMIDR_EL1 + tbz x0, #17, .Lskip_mpam_\@ // skip if no MPAMHCR reg + msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 +.Lskip_mpam_\@: +#endif /* CONFIG_ARM64_MPAM */ +.endm + /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. Note that everything gets initialised as @@ -193,9 +264,11 @@ __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr + __init_el2_mpam __init_el2_nvhe_idregs __init_el2_nvhe_cptr __init_el2_nvhe_sve + __init_el2_nvhe_sme __init_el2_fgt __init_el2_nvhe_prepare_eret .endm diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 9f91c8906edd..80ad5accc187 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -37,7 +37,8 @@ #define ESR_ELx_EC_ERET (0x1a) /* EL2 only */ /* Unallocated EC: 0x1B */ #define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */ -/* Unallocated EC: 0x1D - 0x1E */ +#define ESR_ELx_EC_SME (0x1D) +/* Unallocated EC: 0x1E */ #define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ #define ESR_ELx_EC_IABT_LOW (0x20) #define ESR_ELx_EC_IABT_CUR (0x21) @@ -75,6 +76,7 @@ #define ESR_ELx_IL_SHIFT (25) #define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) +#define ESR_ELx_ISS(esr) ((esr) & ESR_ELx_ISS_MASK) /* ISS field definitions shared by different classes */ #define ESR_ELx_WNR_SHIFT (6) @@ -112,6 +114,7 @@ #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) #define ESR_ELx_FSC_PERM (0x0C) +#define ESR_ELx_FSC_TLBCONF (0x30) /* ISS field definitions for Data Aborts */ #define ESR_ELx_ISV_SHIFT (24) @@ -133,7 +136,10 @@ #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) -#define ESR_ELx_WFx_ISS_TI (UL(1) << 0) +#define ESR_ELx_WFx_ISS_RN (UL(0x1F) << 5) +#define ESR_ELx_WFx_ISS_RV (UL(1) << 2) +#define ESR_ELx_WFx_ISS_TI (UL(3) << 0) +#define ESR_ELx_WFx_ISS_WFxT (UL(2) << 0) #define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) #define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1) @@ -146,7 +152,8 @@ #define DISR_EL1_ESR_MASK (ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC) /* ESR value templates for specific events */ -#define ESR_ELx_WFx_MASK (ESR_ELx_EC_MASK | ESR_ELx_WFx_ISS_TI) +#define ESR_ELx_WFx_MASK (ESR_ELx_EC_MASK | \ + (ESR_ELx_WFx_ISS_TI & ~ESR_ELx_WFx_ISS_WFxT)) #define ESR_ELx_WFx_WFI_VAL ((ESR_ELx_EC_WFx << ESR_ELx_EC_SHIFT) | \ ESR_ELx_WFx_ISS_WFI) @@ -321,6 +328,15 @@ #define ESR_ELx_CP15_32_ISS_SYS_CNTFRQ (ESR_ELx_CP15_32_ISS_SYS_VAL(0, 0, 14, 0) |\ ESR_ELx_CP15_32_ISS_DIR_READ) +/* + * ISS values for SME traps + */ + +#define ESR_ELx_SME_ISS_SME_DISABLED 0 +#define ESR_ELx_SME_ISS_ILL 1 +#define ESR_ELx_SME_ISS_SM_DISABLED 2 +#define ESR_ELx_SME_ISS_ZA_DISABLED 3 + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 0e6535aa78c2..d94aecff9690 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -64,6 +64,7 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, struct pt_regs *regs); void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs); void do_sve_acc(unsigned long esr, struct pt_regs *regs); +void do_sme_acc(unsigned long esr, struct pt_regs *regs); void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs); void do_sysinstr(unsigned long esr, struct pt_regs 
*regs); void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs); diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index daff882883f9..f4b7a1c1e522 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -107,6 +107,8 @@ void __init early_fixmap_init(void); extern void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot); +extern pte_t *__get_fixmap_pte(enum fixed_addresses idx); + #include #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 9a62884183e5..1926ca23aa4b 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -46,13 +46,25 @@ extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, - void *sve_state, unsigned int sve_vl); + void *sve_state, unsigned int sve_vl, + void *za_state, unsigned int sme_vl, + u64 *svcr); extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_save_and_flush_cpu_state(void); -/* Maximum VL that SVE VL-agnostic software can transparently support */ -#define SVE_VL_ARCH_MAX 0x100 +static inline bool thread_sm_enabled(struct thread_struct *thread) +{ + return system_supports_sme() && (thread->svcr & SVCR_SM_MASK); +} + +static inline bool thread_za_enabled(struct thread_struct *thread) +{ + return system_supports_sme() && (thread->svcr & SVCR_ZA_MASK); +} + +/* Maximum VL that SVE/SME VL-agnostic software can transparently support */ +#define VL_ARCH_MAX 0x100 /* Offset of FFR in the SVE register dump */ static inline size_t sve_ffr_offset(int vl) @@ -62,26 +74,33 @@ static inline size_t sve_ffr_offset(int vl) static inline void *sve_pffr(struct thread_struct *thread) { - return (char *)thread->sve_state + sve_ffr_offset(thread->sve_vl); + unsigned int vl; + + if (system_supports_sme() && thread_sm_enabled(thread)) + vl = thread_get_sme_vl(thread); + else + vl = thread_get_sve_vl(thread); + + return (char *)thread->sve_state + sve_ffr_offset(vl); } -extern void sve_save_state(void *state, u32 *pfpsr); +extern void sve_save_state(void *state, u32 *pfpsr, int save_ffr); extern void sve_load_state(void const *state, u32 const *pfpsr, - unsigned long vq_minus_1); -extern void sve_flush_live(unsigned long vq_minus_1); -extern void sve_load_from_fpsimd_state(struct user_fpsimd_state const *state, - unsigned long vq_minus_1); + int restore_ffr); +extern void sve_flush_live(bool flush_ffr, unsigned long vq_minus_1); extern unsigned int sve_get_vl(void); extern void sve_set_vq(unsigned long vq_minus_1); +extern void sme_set_vq(unsigned long vq_minus_1); +extern void za_save_state(void *state); +extern void za_load_state(void const *state); struct arm64_cpu_capabilities; extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused); +extern void sme_kernel_enable(const struct arm64_cpu_capabilities *__unused); +extern void fa64_kernel_enable(const struct arm64_cpu_capabilities *__unused); extern u64 read_zcr_features(void); - -extern int __ro_after_init sve_max_vl; -extern int __ro_after_init sve_max_virtualisable_vl; -extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); +extern u64 read_smcr_features(void); /* * Helpers to translate bit indices in sve_vq_map to VQ values (and @@ -98,23 +117,38 @@ static inline unsigned int __bit_to_vq(unsigned int bit) return SVE_VQ_MAX - bit; } -/* 
Ensure vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX before calling this function */ -static inline bool sve_vq_available(unsigned int vq) -{ - return test_bit(__vq_to_bit(vq), sve_vq_map); -} + +struct vl_info { + enum vec_type type; + const char *name; /* For display purposes */ + + /* Minimum supported vector length across all CPUs */ + int min_vl; + + /* Maximum supported vector length across all CPUs */ + int max_vl; + int max_virtualisable_vl; + + /* + * Set of available vector lengths, + * where length vq encoded as bit __vq_to_bit(vq): + */ + DECLARE_BITMAP(vq_map, SVE_VQ_MAX); + + /* Set of vector lengths present on at least one cpu: */ + DECLARE_BITMAP(vq_partial_map, SVE_VQ_MAX); +}; #ifdef CONFIG_ARM64_SVE -extern size_t sve_state_size(struct task_struct const *task); - -extern void sve_alloc(struct task_struct *task); +extern void sve_alloc(struct task_struct *task, bool flush); extern void fpsimd_release_task(struct task_struct *task); extern void fpsimd_sync_to_sve(struct task_struct *task); +extern void fpsimd_force_sync_to_sve(struct task_struct *task); extern void sve_sync_to_fpsimd(struct task_struct *task); extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task); -extern int sve_set_vector_length(struct task_struct *task, +extern int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags); extern int sve_set_current_vl(unsigned long arg); @@ -143,18 +177,83 @@ static inline void sve_user_enable(void) * Probing and setup functions. * Calls to these functions must be serialised with one another. */ -extern void __init sve_init_vq_map(void); -extern void sve_update_vq_map(void); -extern int sve_verify_vq_map(void); +enum vec_type; + +extern void __init vec_init_vq_map(enum vec_type type); +extern void vec_update_vq_map(enum vec_type type); +extern int vec_verify_vq_map(enum vec_type type); extern void __init sve_setup(void); +extern __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX]; + +static inline void write_vl(enum vec_type type, u64 val) +{ + u64 tmp; + + switch (type) { +#ifdef CONFIG_ARM64_SVE + case ARM64_VEC_SVE: + tmp = read_sysreg_s(SYS_ZCR_EL1) & ~ZCR_ELx_LEN_MASK; + write_sysreg_s(tmp | val, SYS_ZCR_EL1); + break; +#endif +#ifdef CONFIG_ARM64_SME + case ARM64_VEC_SME: + tmp = read_sysreg_s(SYS_SMCR_EL1) & ~SMCR_ELx_LEN_MASK; + write_sysreg_s(tmp | val, SYS_SMCR_EL1); + break; +#endif + default: + WARN_ON_ONCE(1); + break; + } +} + +static inline int vec_max_vl(enum vec_type type) +{ + return vl_info[type].max_vl; +} + +static inline int vec_max_virtualisable_vl(enum vec_type type) +{ + return vl_info[type].max_virtualisable_vl; +} + +static inline int sve_max_vl(void) +{ + return vec_max_vl(ARM64_VEC_SVE); +} + +static inline int sve_max_virtualisable_vl(void) +{ + return vec_max_virtualisable_vl(ARM64_VEC_SVE); +} + +/* Ensure vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX before calling this function */ +static inline bool vq_available(enum vec_type type, unsigned int vq) +{ + return test_bit(__vq_to_bit(vq), vl_info[type].vq_map); +} + +static inline bool sve_vq_available(unsigned int vq) +{ + return vq_available(ARM64_VEC_SVE, vq); +} + +size_t sve_state_size(struct task_struct const *task); + #else /* ! 
CONFIG_ARM64_SVE */ -static inline void sve_alloc(struct task_struct *task) { } +static inline void sve_alloc(struct task_struct *task, bool flush) { } static inline void fpsimd_release_task(struct task_struct *task) { } static inline void sve_sync_to_fpsimd(struct task_struct *task) { } static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { } +static inline int sve_max_virtualisable_vl(void) +{ + return 0; +} + static inline int sve_set_current_vl(unsigned long arg) { return -EINVAL; @@ -165,18 +264,110 @@ static inline int sve_get_current_vl(void) return -EINVAL; } +static inline int sve_max_vl(void) +{ + return -EINVAL; +} + +static inline bool sve_vq_available(unsigned int vq) { return false; } + static inline void sve_user_disable(void) { BUILD_BUG(); } static inline void sve_user_enable(void) { BUILD_BUG(); } #define sve_cond_update_zcr_vq(val, reg) do { } while (0) -static inline void sve_init_vq_map(void) { } -static inline void sve_update_vq_map(void) { } -static inline int sve_verify_vq_map(void) { return 0; } +static inline void vec_init_vq_map(enum vec_type t) { } +static inline void vec_update_vq_map(enum vec_type t) { } +static inline int vec_verify_vq_map(enum vec_type t) { return 0; } static inline void sve_setup(void) { } +static inline size_t sve_state_size(struct task_struct const *task) +{ + return 0; +} + #endif /* ! CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static inline void sme_user_disable(void) +{ + sysreg_clear_set(cpacr_el1, CPACR_EL1_SMEN_EL0EN, 0); +} + +static inline void sme_user_enable(void) +{ + sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_SMEN_EL0EN); +} + +static inline void sme_smstart_sm(void) +{ + asm volatile(__msr_s(SYS_SVCR_SMSTART_SM_EL0, "xzr")); +} + +static inline void sme_smstop_sm(void) +{ + asm volatile(__msr_s(SYS_SVCR_SMSTOP_SM_EL0, "xzr")); +} + +static inline void sme_smstop(void) +{ + asm volatile(__msr_s(SYS_SVCR_SMSTOP_SMZA_EL0, "xzr")); +} + +extern void __init sme_setup(void); + +static inline int sme_max_vl(void) +{ + return vec_max_vl(ARM64_VEC_SME); +} + +static inline int sme_max_virtualisable_vl(void) +{ + return vec_max_virtualisable_vl(ARM64_VEC_SME); +} + +extern void sme_alloc(struct task_struct *task); +extern unsigned int sme_get_vl(void); +extern int sme_set_current_vl(unsigned long arg); +extern int sme_get_current_vl(void); + +/* + * Return how many bytes of memory are required to store the full SME + * specific state (currently just ZA) for task, given task's currently + * configured vector length. + */ +static inline size_t za_state_size(struct task_struct const *task) +{ + unsigned int vl = task_get_sme_vl(task); + + return ZA_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +} + +#else + +static inline void sme_user_disable(void) { BUILD_BUG(); } +static inline void sme_user_enable(void) { BUILD_BUG(); } + +static inline void sme_smstart_sm(void) { } +static inline void sme_smstop_sm(void) { } +static inline void sme_smstop(void) { } + +static inline void sme_alloc(struct task_struct *task) { } +static inline void sme_setup(void) { } +static inline unsigned int sme_get_vl(void) { return 0; } +static inline int sme_max_vl(void) { return 0; } +static inline int sme_max_virtualisable_vl(void) { return 0; } +static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; } +static inline int sme_get_current_vl(void) { return -EINVAL; } + +static inline size_t za_state_size(struct task_struct const *task) +{ + return 0; +} + +#endif /* ! 
CONFIG_ARM64_SME */ + /* For use by EFI runtime services calls only */ extern void __efi_fpsimd_begin(void); extern void __efi_fpsimd_end(void); diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index 00a2c0b69c2b..5e0910cf4832 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -93,6 +93,12 @@ .endif .endm +.macro _sme_check_wv v + .if (\v) < 12 || (\v) > 15 + .error "Bad vector select register \v." + .endif +.endm + /* SVE instruction encodings for non-SVE-capable assemblers */ /* (pre binutils 2.28, all kernel capable clang versions support SVE) */ @@ -174,6 +180,54 @@ | (\np) .endm +/* SME instruction encodings for non-SME-capable assemblers */ +/* (pre binutils 2.38/LLVM 13) */ + +/* RDSVL X\nx, #\imm */ +.macro _sme_rdsvl nx, imm + _check_general_reg \nx + _check_num (\imm), -0x20, 0x1f + .inst 0x04bf5800 \ + | (\nx) \ + | (((\imm) & 0x3f) << 5) +.endm + +/* + * STR (vector from ZA array): + * STR ZA[\nw, #\offset], [X\nxbase, #\offset, MUL VL] + */ +.macro _sme_str_zav nw, nxbase, offset=0 + _sme_check_wv \nw + _check_general_reg \nxbase + _check_num (\offset), -0x100, 0xff + .inst 0xe1200000 \ + | (((\nw) & 3) << 13) \ + | ((\nxbase) << 5) \ + | ((\offset) & 7) +.endm + +/* + * LDR (vector to ZA array): + * LDR ZA[\nw, #\offset], [X\nxbase, #\offset, MUL VL] + */ +.macro _sme_ldr_zav nw, nxbase, offset=0 + _sme_check_wv \nw + _check_general_reg \nxbase + _check_num (\offset), -0x100, 0xff + .inst 0xe1000000 \ + | (((\nw) & 3) << 13) \ + | ((\nxbase) << 5) \ + | ((\offset) & 7) +.endm + +/* + * Zero the entire ZA array + * ZERO ZA + */ +.macro zero_za + .inst 0xc00800ff +.endm + .macro __for from:req, to:req .if (\from) == (\to) _for__body %\from @@ -208,6 +262,17 @@ 921: .endm +/* Update SMCR_EL1.LEN with the new VQ */ +.macro sme_load_vq xvqminus1, xtmp, xtmp2 + mrs_s \xtmp, SYS_SMCR_EL1 + bic \xtmp2, \xtmp, SMCR_ELx_LEN_MASK + orr \xtmp2, \xtmp2, \xvqminus1 + cmp \xtmp2, \xtmp + b.eq 921f + msr_s SYS_SMCR_EL1, \xtmp2 //self-synchronising +921: +.endm + /* Preserve the first 128-bits of Znz and zero the rest. 
*/ .macro _sve_flush_z nz _sve_check_zreg \nz @@ -217,28 +282,36 @@ .macro sve_flush_z _for n, 0, 31, _sve_flush_z \n .endm -.macro sve_flush_p_ffr +.macro sve_flush_p _for n, 0, 15, _sve_pfalse \n +.endm +.macro sve_flush_ffr _sve_wrffr 0 .endm -.macro sve_save nxbase, xpfpsr, nxtmp +.macro sve_save nxbase, xpfpsr, save_ffr, nxtmp _for n, 0, 31, _sve_str_v \n, \nxbase, \n - 34 _for n, 0, 15, _sve_str_p \n, \nxbase, \n - 16 + cbz \save_ffr, 921f _sve_rdffr 0 _sve_str_p 0, \nxbase _sve_ldr_p 0, \nxbase, -16 - + b 922f +921: + str xzr, [x\nxbase] // Zero out FFR +922: mrs x\nxtmp, fpsr str w\nxtmp, [\xpfpsr] mrs x\nxtmp, fpcr str w\nxtmp, [\xpfpsr, #4] .endm -.macro __sve_load nxbase, xpfpsr, nxtmp +.macro sve_load nxbase, xpfpsr, restore_ffr, nxtmp _for n, 0, 31, _sve_ldr_v \n, \nxbase, \n - 34 + cbz \restore_ffr, 921f _sve_ldr_p 0, \nxbase _sve_wrffr 0 +921: _for n, 0, 15, _sve_ldr_p \n, \nxbase, \n - 16 ldr w\nxtmp, [\xpfpsr] @@ -247,7 +320,24 @@ msr fpcr, x\nxtmp .endm -.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2 - sve_load_vq \xvqminus1, x\nxtmp, \xtmp2 - __sve_load \nxbase, \xpfpsr, \nxtmp +.macro sme_save_za nxbase, xvl, nw + mov w\nw, #0 + +423: + _sme_str_zav \nw, \nxbase + add x\nxbase, x\nxbase, \xvl + add x\nw, x\nw, #1 + cmp \xvl, x\nw + bne 423b +.endm + +.macro sme_load_za nxbase, xvl, nw + mov w\nw, #0 + +423: + _sme_ldr_zav \nw, \nxbase + add x\nxbase, x\nxbase, \xvl + add x\nw, x\nw, #1 + cmp \xvl, x\nw + bne 423b .endm diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index 91fa4baa1a93..c96d47cb8f46 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -12,6 +12,17 @@ #define HAVE_FUNCTION_GRAPH_FP_TEST +/* + * HAVE_FUNCTION_GRAPH_RET_ADDR_PTR means that the architecture can provide a + * "return address pointer" which can be used to uniquely identify a return + * address which has been overwritten. + * + * On arm64 we use the address of the caller's frame record, which remains the + * same for the lifetime of the instrumented function, unlike the return + * address in the LR. + */ +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS #define ARCH_SUPPORTS_FTRACE_OPS 1 #else diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index bc7aaed4b34e..fa4c6ff3aa9b 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -142,7 +142,7 @@ static inline int get_num_brps(void) u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); return 1 + cpuid_feature_extract_unsigned_field(dfr0, - ID_AA64DFR0_BRPS_SHIFT); + ID_AA64DFR0_EL1_BRPs_SHIFT); } /* Determine number of WRP registers available. 
*/ @@ -151,7 +151,7 @@ static inline int get_num_wrps(void) u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); return 1 + cpuid_feature_extract_unsigned_field(dfr0, - ID_AA64DFR0_WRPS_SHIFT); + ID_AA64DFR0_EL1_WRPs_SHIFT); } #endif /* __ASM_BREAKPOINT_H */ diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index f68fbb207473..aa443d8f8cfb 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -108,6 +108,16 @@ #define KERNEL_HWCAP_ECV __khwcap2_feature(ECV) #define KERNEL_HWCAP_AFP __khwcap2_feature(AFP) #define KERNEL_HWCAP_RPRES __khwcap2_feature(RPRES) +#define KERNEL_HWCAP_MTE3 __khwcap2_feature(MTE3) +#define KERNEL_HWCAP_SME __khwcap2_feature(SME) +#define KERNEL_HWCAP_SME_I16I64 __khwcap2_feature(SME_I16I64) +#define KERNEL_HWCAP_SME_F64F64 __khwcap2_feature(SME_F64F64) +#define KERNEL_HWCAP_SME_I8I32 __khwcap2_feature(SME_I8I32) +#define KERNEL_HWCAP_SME_F16F32 __khwcap2_feature(SME_F16F32) +#define KERNEL_HWCAP_SME_B16F32 __khwcap2_feature(SME_B16F32) +#define KERNEL_HWCAP_SME_F32F32 __khwcap2_feature(SME_F32F32) +#define KERNEL_HWCAP_SME_FA64 __khwcap2_feature(SME_FA64) +#define KERNEL_HWCAP_WFXT __khwcap2_feature(WFXT) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index 0ae427f352c8..9b4e4ed79623 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -6,5 +6,14 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +void kvm_arm_init_hyp_services(void); +void kvm_init_memshare_services(void); +void kvm_init_ioremap_services(void); + +#ifdef CONFIG_MEMORY_RELINQUISH +void kvm_init_memrelinquish_services(void); +#else +static inline void kvm_init_memrelinquish_services(void) {} +#endif #endif diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index f67a561e0935..7313fac7086d 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -80,11 +80,12 @@ * FMO: Override CPSR.F and enable signaling with VF * SWIO: Turn set/way invalidates into set/way clean+invalidate * PTW: Take a stage2 fault if a stage1 walk steps in device memory + * TID3: Trap EL1 reads of group 3 ID registers */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ - HCR_FMO | HCR_IMO | HCR_PTW ) + HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) @@ -134,7 +135,7 @@ * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). + * The VTCR_EL2 is configured per VM and is initialised in kvm_init_stage2_mmu. * * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. 
@@ -279,6 +280,7 @@ #define CPTR_EL2_TCPAC (1U << 31) #define CPTR_EL2_TAM (1 << 30) #define CPTR_EL2_TTA (1 << 20) +#define CPTR_EL2_TSM (1 << 12) #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) #define CPTR_EL2_TZ (1 << 8) #define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ @@ -295,6 +297,7 @@ #define MDCR_EL2_HPMFZO (UL(1) << 29) #define MDCR_EL2_MTPME (UL(1) << 28) #define MDCR_EL2_TDCC (UL(1) << 27) +#define MDCR_EL2_HLP (UL(1) << 26) #define MDCR_EL2_HCCD (UL(1) << 23) #define MDCR_EL2_TTRF (UL(1) << 19) #define MDCR_EL2_HPMD (UL(1) << 17) @@ -341,6 +344,8 @@ #define PAR_TO_HPFAR(par) \ (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) +#define FAR_MASK GENMASK_ULL(11, 0) + #define ECN(x) { ESR_ELx_EC_##x, #x } #define kvm_arm_exception_class \ @@ -354,8 +359,17 @@ ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ ECN(BKPT32), ECN(VECTOR32), ECN(BRK64) -#define CPACR_EL1_FPEN (3 << 20) #define CPACR_EL1_TTA (1 << 28) -#define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN) +#define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\ + CPACR_EL1_ZEN_EL1EN) + +/* + * ARMv8 Reset Values + */ +#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ + PSR_F_BIT | PSR_D_BIT) + +#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ + PSR_AA32_I_BIT | PSR_AA32_F_BIT) #endif /* __ARM64_KVM_ARM_H__ */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index e86045ac43ba..1cb42d5b5484 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -44,31 +44,72 @@ #define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name) #define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0 -#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1 -#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 -#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 -#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 -#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 -#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 -#define __KVM_HOST_SMCCC_FUNC___pkvm_init 15 -#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16 -#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 -#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 -#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 -#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20 #ifndef __ASSEMBLY__ #include +enum __kvm_host_smccc_func { + /* Hypercalls available only prior to pKVM finalisation */ + /* __KVM_HOST_SMCCC_FUNC___kvm_hyp_init */ + __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 = __KVM_HOST_SMCCC_FUNC___kvm_hyp_init + 1, + __KVM_HOST_SMCCC_FUNC___pkvm_init, + __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping, + __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector, + __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config, + __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, + 
__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, + + /* + * __pkvm_alloc_module_va may temporarily serve as the privileged hcall + * limit when module loading is enabled, see early_pkvm_enable_modules(). + */ + __KVM_HOST_SMCCC_FUNC___pkvm_alloc_module_va, + __KVM_HOST_SMCCC_FUNC___pkvm_map_module_page, + __KVM_HOST_SMCCC_FUNC___pkvm_unmap_module_page, + __KVM_HOST_SMCCC_FUNC___pkvm_init_module, + __KVM_HOST_SMCCC_FUNC___pkvm_register_hcall, + __KVM_HOST_SMCCC_FUNC___pkvm_close_module_registration, + __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize, + + /* Hypercalls available after pKVM finalisation */ + __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, + __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp, + __KVM_HOST_SMCCC_FUNC___pkvm_host_map_guest, + __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, + __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, + __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, + __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, + __KVM_HOST_SMCCC_FUNC___pkvm_start_teardown_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_finalize_teardown_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_reclaim_dying_guest_page, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_sync_state, + __KVM_HOST_SMCCC_FUNC___pkvm_iommu_driver_init, + __KVM_HOST_SMCCC_FUNC___pkvm_iommu_register, + __KVM_HOST_SMCCC_FUNC___pkvm_iommu_pm_notify, + __KVM_HOST_SMCCC_FUNC___pkvm_iommu_finalize, + __KVM_HOST_SMCCC_FUNC___pkvm_start_tracing, + __KVM_HOST_SMCCC_FUNC___pkvm_stop_tracing, + __KVM_HOST_SMCCC_FUNC___pkvm_rb_swap_reader_page, + __KVM_HOST_SMCCC_FUNC___pkvm_rb_update_footers, + __KVM_HOST_SMCCC_FUNC___pkvm_enable_event, + + /* + * Start of the dynamically registered hypercalls. Start a bit + * further, just in case some modules... + */ + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls = 128, +}; + #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] #define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[] @@ -97,7 +138,7 @@ #define per_cpu_ptr_nvhe_sym(sym, cpu) \ ({ \ unsigned long base, off; \ - base = kvm_arm_hyp_percpu_base[cpu]; \ + base = kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; \ off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \ (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \ base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \ @@ -160,12 +201,29 @@ struct kvm_nvhe_init_params { unsigned long tcr_el2; unsigned long tpidr_el2; unsigned long stack_hyp_va; + unsigned long stack_pa; phys_addr_t pgd_pa; unsigned long hcr_el2; unsigned long vttbr; unsigned long vtcr; }; +/* + * Used by the host in EL1 to dump the nVHE hypervisor backtrace on + * hyp_panic() in non-protected mode. + * + * @stack_base: hyp VA of the hyp_stack base. + * @overflow_stack_base: hyp VA of the hyp_overflow_stack base. + * @fp: hyp FP where the backtrace begins. + * @pc: hyp PC where the backtrace begins. 
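+ *
+ * A host-side unwinder would typically seed its walk from @fp/@pc and
+ * use the two stack base addresses to bounds-check the hyp frame
+ * records it follows.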
+ */ +struct kvm_nvhe_stacktrace_info { + unsigned long stack_base; + unsigned long overflow_stack_base; + unsigned long fp; + unsigned long pc; +}; + /* Translate a kernel address @ptr into its equivalent linear mapping */ #define kvm_ksym_ref(ptr) \ ({ \ @@ -185,7 +243,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) -extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +extern unsigned long kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[]; DECLARE_KVM_NVHE_SYM(__per_cpu_start); DECLARE_KVM_NVHE_SYM(__per_cpu_end); @@ -205,8 +263,6 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_gic_config(void); -extern u64 __vgic_v3_read_vmcr(void); -extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); extern u64 __kvm_get_mdcr_el2(void); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 64f8a90d3327..8d375229a770 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -40,11 +40,31 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu); void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); +void kvm_inject_size_fault(struct kvm_vcpu *vcpu); +unsigned long get_except64_offset(unsigned long psr, unsigned long target_mode, + enum exception_type type); +unsigned long get_except64_cpsr(unsigned long old, bool has_mte, + unsigned long sctlr, unsigned long mode); + +void kvm_vcpu_wfi(struct kvm_vcpu *vcpu); + +#if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__) static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { return !(vcpu->arch.hcr_el2 & HCR_RW); } +#else +static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, + &kvm->arch.flags)); + + return test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); +} +#endif static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { @@ -70,17 +90,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= HCR_TVM; } - if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) + if (vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_RW; - /* - * TID3: trap feature register accesses that we virtualise. - * For now this is conditional, since no AArch32 feature regs - * are currently virtualised. 
- */ - if (!vcpu_el1_is_32bit(vcpu)) - vcpu->arch.hcr_el2 |= HCR_TID3; - if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID2; @@ -404,7 +416,7 @@ static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) *vcpu_cpsr(vcpu) |= PSR_AA32_E_BIT; } else { u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); - sctlr |= (1 << 25); + sctlr |= SCTLR_ELx_EE; vcpu_write_sys_reg(vcpu, sctlr, SCTLR_EL1); } } @@ -414,7 +426,10 @@ static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) if (vcpu_mode_is_32bit(vcpu)) return !!(*vcpu_cpsr(vcpu) & PSR_AA32_E_BIT); - return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & (1 << 25)); + if (vcpu_mode_priv(vcpu)) + return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_EE); + else + return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_EL1_E0E); } static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, @@ -481,12 +496,78 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) { - vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC; + WARN_ON(vcpu_get_flag(vcpu, PENDING_EXCEPTION)); + vcpu_set_flag(vcpu, INCREMENT_PC); } +#define kvm_pend_exception(v, e) \ + do { \ + WARN_ON(vcpu_get_flag((v), INCREMENT_PC)); \ + vcpu_set_flag((v), PENDING_EXCEPTION); \ + vcpu_set_flag((v), e); \ + } while (0) + + static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) { return test_bit(feature, vcpu->arch.features); } +static inline int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) +{ + /* + * For now make sure that both address/generic pointer authentication + * features are requested by the userspace together and the system + * supports these capabilities. + */ + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + !vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC) || + !system_has_full_ptr_auth()) + return -EINVAL; + + vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); + return 0; +} + +/* Reset a vcpu's core registers. */ +static inline void kvm_reset_vcpu_core(struct kvm_vcpu *vcpu) +{ + u32 pstate; + + if (vcpu_el1_is_32bit(vcpu)) { + pstate = VCPU_RESET_PSTATE_SVC; + } else { + pstate = VCPU_RESET_PSTATE_EL1; + } + + /* Reset core registers */ + memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); + memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); + vcpu->arch.ctxt.spsr_abt = 0; + vcpu->arch.ctxt.spsr_und = 0; + vcpu->arch.ctxt.spsr_irq = 0; + vcpu->arch.ctxt.spsr_fiq = 0; + vcpu_gp_regs(vcpu)->pstate = pstate; +} + +/* PSCI reset handling for a vcpu. 
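+ * Per the PSCI call convention, the caller of CPU_ON/SYSTEM_SUSPEND
+ * supplies an entry point address and a context id; the helper below
+ * installs them as the PC and x0/r0 respectively, with bit 0 of an
+ * AArch32 entry point selecting Thumb.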
*/ +static inline void kvm_reset_vcpu_psci(struct kvm_vcpu *vcpu, + struct vcpu_reset_state *reset_state) +{ + unsigned long target_pc = reset_state->pc; + + /* Gracefully handle Thumb2 entry point */ + if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { + target_pc &= ~1UL; + vcpu_set_thumb(vcpu); + } + + /* Propagate caller endianness */ + if (reset_state->be) + kvm_vcpu_set_be(vcpu); + + *vcpu_pc(vcpu) = target_pc; + vcpu_set_reg(vcpu, 0, reset_state->r0); +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1713630bf8f5..cf6ce6fe2bb3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -26,7 +26,6 @@ #include #include #include -#include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -47,10 +46,13 @@ #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) +#define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) +#define KVM_HAVE_MMU_RWLOCK + /* * Mode of operation configurable with kvm-arm.mode early param. * See Documentation/admin-guide/kernel-parameters.txt for more information. @@ -58,6 +60,7 @@ enum kvm_mode { KVM_MODE_DEFAULT, KVM_MODE_PROTECTED, + KVM_MODE_NONE, }; enum kvm_mode kvm_get_mode(void); @@ -70,10 +73,66 @@ u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); +struct kvm_hyp_memcache { + phys_addr_t head; + unsigned long nr_pages; +}; + +static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, + phys_addr_t *p, + phys_addr_t (*to_pa)(void *virt)) +{ + *p = mc->head; + mc->head = to_pa(p); + mc->nr_pages++; +} + +static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, + void *(*to_va)(phys_addr_t phys)) +{ + phys_addr_t *p = to_va(mc->head); + + if (!mc->nr_pages) + return NULL; + + mc->head = *p; + mc->nr_pages--; + + return p; +} + +static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc, + unsigned long min_pages, + void *(*alloc_fn)(void *arg), + phys_addr_t (*to_pa)(void *virt), + void *arg) +{ + while (mc->nr_pages < min_pages) { + phys_addr_t *p = alloc_fn(arg); + + if (!p) + return -ENOMEM; + push_hyp_memcache(mc, p, to_pa); + } + + return 0; +} + +static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, + void (*free_fn)(void *virt, void *arg), + void *(*to_va)(phys_addr_t phys), + void *arg) +{ + while (mc->nr_pages) + free_fn(pop_hyp_memcache(mc, to_va), arg); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm); +void free_hyp_stage2_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm); +int topup_hyp_memcache(struct kvm_vcpu *vcpu); + struct kvm_vmid { - /* The VMID generation used for the virt. 
memory system */
- u64 vmid_gen;
- u32 vmid;
+ atomic64_t id;
};
struct kvm_s2_mmu {
@@ -101,6 +160,36 @@ struct kvm_s2_mmu {
struct kvm_arch_memory_slot {
};
+/**
+ * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests
+ *
+ * @std_bmap: Bitmap of standard secure service calls
+ * @std_hyp_bmap: Bitmap of standard hypervisor service calls
+ * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls
+ */
+struct kvm_smccc_features {
+ unsigned long std_bmap;
+ unsigned long std_hyp_bmap;
+ unsigned long vendor_hyp_bmap;
+};
+
+struct kvm_pinned_page {
+ struct rb_node node;
+ struct page *page;
+ u64 ipa;
+};
+
+typedef unsigned int pkvm_handle_t;
+
+struct kvm_protected_vm {
+ pkvm_handle_t handle;
+ struct kvm_hyp_memcache teardown_mc;
+ struct kvm_hyp_memcache teardown_stage2_mc;
+ struct rb_root pinned_pages;
+ gpa_t pvmfw_load_addr;
+ bool enabled;
+};
+
struct kvm_arch {
struct kvm_s2_mmu mmu;
@@ -122,20 +211,46 @@ struct kvm_arch {
* should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
* supported.
*/
- bool return_nisv_io_abort_to_user;
+#define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER 0
+ /* Memory Tagging Extension enabled for the guest */
+#define KVM_ARCH_FLAG_MTE_ENABLED 1
+ /* At least one vCPU has run in the VM */
+#define KVM_ARCH_FLAG_HAS_RAN_ONCE 2
+ /*
+ * The following two bits are used to indicate the guest's EL1
+ * register width configuration. The KVM_ARCH_FLAG_EL1_32BIT
+ * bit is valid only when KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED is set.
+ * Otherwise, the guest's EL1 register width has not yet been
+ * determined.
+ */
+#define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 3
+#define KVM_ARCH_FLAG_EL1_32BIT 4
+ /* PSCI SYSTEM_SUSPEND enabled for the guest */
+#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5
+ /* Guest has bought into the MMIO guard extension */
+#define KVM_ARCH_FLAG_MMIO_GUARD 6
+ unsigned long flags;
/*
* VM-wide PMU filter, implemented as a bitmap and big enough for
* up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
*/
unsigned long *pmu_filter;
- unsigned int pmuver;
+ struct arm_pmu *arm_pmu;
+
+ cpumask_var_t supported_cpus;
u8 pfr0_csv2;
u8 pfr0_csv3;
- /* Memory Tagging Extension enabled for the guest */
- bool mte_enabled;
+ /* Hypercall features firmware registers' descriptor */
+ struct kvm_smccc_features smccc_feat;
+
+ /*
+ * For an untrusted host VM, 'pkvm.handle' is used to look up
+ * the associated pKVM instance in the hypervisor.
+ */
+ struct kvm_protected_vm pkvm;
};
struct kvm_vcpu_fault_info {
@@ -171,6 +286,7 @@ enum vcpu_sysreg {
PAR_EL1, /* Physical Address Register */
MDSCR_EL1, /* Monitor Debug System Control Register */
MDCCINT_EL1, /* Monitor Debug Comms Channel Interrupt Enable Reg */
+ OSLSR_EL1, /* OS Lock Status Register */
DISR_EL1, /* Deferred Interrupt Status Register */
/* Performance Monitors Registers */
@@ -239,19 +355,14 @@ struct kvm_cpu_context {
struct kvm_vcpu *__hyp_running_vcpu;
};
-struct kvm_pmu_events {
- u32 events_host;
- u32 events_guest;
-};
-
struct kvm_host_data {
struct kvm_cpu_context host_ctxt;
- struct kvm_pmu_events pmu_events;
};
struct kvm_host_psci_config {
/* PSCI version used by host. */
u32 version;
+ u32 smccc_version;
/* Function IDs used by host if version is v0.1.
*/ struct psci_0_1_function_ids function_ids_0_1; @@ -271,6 +382,28 @@ extern s64 kvm_nvhe_sym(hyp_physvirt_offset); extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; #define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map) +enum pkvm_iommu_pm_event { + PKVM_IOMMU_PM_SUSPEND, + PKVM_IOMMU_PM_RESUME, +}; + +struct pkvm_iommu_ops; + +struct pkvm_iommu_driver { + const struct pkvm_iommu_ops *ops; + struct list_head list; + atomic_t state; +}; + +int pkvm_iommu_driver_init(u64 drv, void *data, size_t size); +int pkvm_iommu_register(struct device *dev, u64 drv, + phys_addr_t pa, size_t size, struct device *parent); +int pkvm_iommu_suspend(struct device *dev); +int pkvm_iommu_resume(struct device *dev); + +/* Reject future calls to pkvm_iommu_driver_init() and pkvm_iommu_register(). */ +int pkvm_iommu_finalize(void); + struct vcpu_reset_state { unsigned long pc; unsigned long r0; @@ -280,8 +413,11 @@ struct vcpu_reset_state { struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; + + /* Guest floating point state */ void *sve_state; unsigned int sve_max_vl; + u64 svcr; /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; @@ -297,11 +433,30 @@ struct kvm_vcpu_arch { /* Exception Information */ struct kvm_vcpu_fault_info fault; - /* State of various workarounds, see kvm_asm.h for bit assignment */ - u64 workaround_flags; + /* Ownership of the FP regs */ + enum { + FP_STATE_FREE, + FP_STATE_HOST_OWNED, + FP_STATE_GUEST_OWNED, + } fp_state; - /* Miscellaneous vcpu state flags */ - u64 flags; + /* Configuration flags, set once and for all before the vcpu can run */ + u8 cflags; + + /* Input flags to the hypervisor code, potentially cleared after use */ + u8 iflags; + + /* State flags for kernel bookkeeping, unused by the hypervisor code */ + u8 sflags; + + /* + * Don't run the guest (internal implementation need). + * + * Contrary to the flags above, this is set/cleared outside of + * a vcpu context, and thus cannot be mixed with the flags + * themselves (or the flag accesses need to be made atomic). + */ + bool pause; /* * We maintain more than a single set of debug registers to support @@ -320,8 +475,8 @@ struct kvm_vcpu_arch { struct kvm_guest_debug_arch vcpu_debug_state; struct kvm_guest_debug_arch external_debug_state; - struct thread_info *host_thread_info; /* hyp VA */ struct user_fpsimd_state *host_fpsimd_state; /* hyp VA */ + struct task_struct *parent_task; struct { /* {Break,watch}point registers */ @@ -337,11 +492,6 @@ struct kvm_vcpu_arch { struct arch_timer_cpu timer_cpu; struct kvm_pmu pmu; - /* - * Anything that is not used directly from assembly code goes - * here. - */ - /* * Guest registers we preserve during guest debugging. 
*
@@ -351,34 +501,29 @@
*/
struct {
u32 mdscr_el1;
+ bool pstate_ss;
} guest_debug_preserved;
- /* vcpu power-off state */
- bool power_off;
+ /* vcpu power state */
+ struct kvm_mp_state mp_state;
- /* Don't run the guest (internal implementation need) */
- bool pause;
-
- /* Cache some mmu pages needed inside spinlock regions */
- struct kvm_mmu_memory_cache mmu_page_cache;
+ union {
+ /* Cache some mmu pages needed inside spinlock regions */
+ struct kvm_mmu_memory_cache mmu_page_cache;
+ /* Pages to be donated to pkvm/EL2 if it runs out */
+ struct kvm_hyp_memcache pkvm_memcache;
+ };
/* Target CPU and feature flags */
int target;
DECLARE_BITMAP(features, KVM_VCPU_MAX_FEATURES);
- /* Detect first run of a vcpu */
- bool has_run_once;
-
/* Virtual SError ESR to restore when HCR_EL2.VSE is set */
u64 vsesr_el2;
/* Additional reset state */
struct vcpu_reset_state reset_state;
- /* True when deferrable sysregs are loaded on the physical CPU,
- * see kvm_vcpu_load_sysregs_vhe and kvm_vcpu_put_sysregs_vhe. */
- bool sysregs_loaded_on_cpu;
-
/* Guest PV state */
struct {
u64 last_steal;
@@ -386,6 +531,147 @@ struct kvm_vcpu_arch {
} steal;
};
+/*
+ * Each 'flag' is composed of a comma-separated triplet:
+ *
+ * - the flag-set it belongs to in the vcpu->arch structure
+ * - the value for that flag
+ * - the mask for that flag
+ *
+ * __vcpu_single_flag() builds such a triplet for a single-bit flag.
+ * unpack_vcpu_flag() extracts the flag value from the triplet for
+ * direct use outside of the flag accessors.
+ */
+#define __vcpu_single_flag(_set, _f) _set, (_f), (_f)
+
+#define __unpack_flag(_set, _f, _m) _f
+#define unpack_vcpu_flag(...) __unpack_flag(__VA_ARGS__)
+
+#define __build_check_flag(v, flagset, f, m) \
+ do { \
+ typeof(v->arch.flagset) *_fset; \
+ \
+ /* Check that the flags fit in the mask */ \
+ BUILD_BUG_ON(HWEIGHT(m) != HWEIGHT((f) | (m))); \
+ /* Check that the flags fit in the type */ \
+ BUILD_BUG_ON((sizeof(*_fset) * 8) <= __fls(m)); \
+ } while (0)
+
+#define __vcpu_get_flag(v, flagset, f, m) \
+ ({ \
+ __build_check_flag(v, flagset, f, m); \
+ \
+ v->arch.flagset & (m); \
+ })
+
+#define __vcpu_set_flag(v, flagset, f, m) \
+ do { \
+ typeof(v->arch.flagset) *fset; \
+ \
+ __build_check_flag(v, flagset, f, m); \
+ \
+ fset = &v->arch.flagset; \
+ if (HWEIGHT(m) > 1) \
+ *fset &= ~(m); \
+ *fset |= (f); \
+ } while (0)
+
+#define __vcpu_clear_flag(v, flagset, f, m) \
+ do { \
+ typeof(v->arch.flagset) *fset; \
+ \
+ __build_check_flag(v, flagset, f, m); \
+ \
+ fset = &v->arch.flagset; \
+ *fset &= ~(m); \
+ } while (0)
+
+#define __vcpu_copy_flag(vt, vs, flagset, f, m) \
+ do { \
+ typeof(vs->arch.flagset) tmp, val; \
+ \
+ __build_check_flag(vs, flagset, f, m); \
+ \
+ val = READ_ONCE(vs->arch.flagset); \
+ val &= (m); \
+ tmp = READ_ONCE(vt->arch.flagset); \
+ tmp &= ~(m); \
+ tmp |= val; \
+ WRITE_ONCE(vt->arch.flagset, tmp); \
+ } while (0)
+
+
+#define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__)
+#define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__)
+#define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__)
+#define vcpu_copy_flag(vt, vs,...)
__vcpu_copy_flag((vt), (vs), __VA_ARGS__) + +/* SVE exposed to guest */ +#define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0)) +/* SVE config completed */ +#define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1)) +/* PTRAUTH exposed to guest */ +#define GUEST_HAS_PTRAUTH __vcpu_single_flag(cflags, BIT(2)) + +/* Exception pending */ +#define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0)) +/* + * PC increment. Overlaps with EXCEPT_MASK on purpose so that it can't + * be set together with an exception... + */ +#define INCREMENT_PC __vcpu_single_flag(iflags, BIT(1)) +/* Target EL/MODE (not a single flag, but let's abuse the macro) */ +#define EXCEPT_MASK __vcpu_single_flag(iflags, GENMASK(3, 1)) +/* Cover both PENDING_EXCEPTION and EXCEPT_MASK for global operations */ +#define PC_UPDATE_REQ __vcpu_single_flag(iflags, GENMASK(3, 0)) + +/* Helpers to encode exceptions with minimum fuss */ +#define __EXCEPT_MASK_VAL unpack_vcpu_flag(EXCEPT_MASK) +#define __EXCEPT_SHIFT __builtin_ctzl(__EXCEPT_MASK_VAL) +#define __vcpu_except_flags(_f) iflags, (_f << __EXCEPT_SHIFT), __EXCEPT_MASK_VAL + +/* + * When PENDING_EXCEPTION is set, EXCEPT_MASK can take the following + * values: + * + * For AArch32 EL1: + */ +#define EXCEPT_AA32_UND __vcpu_except_flags(0) +#define EXCEPT_AA32_IABT __vcpu_except_flags(1) +#define EXCEPT_AA32_DABT __vcpu_except_flags(2) +/* For AArch64: */ +#define EXCEPT_AA64_EL1_SYNC __vcpu_except_flags(0) +#define EXCEPT_AA64_EL1_IRQ __vcpu_except_flags(1) +#define EXCEPT_AA64_EL1_FIQ __vcpu_except_flags(2) +#define EXCEPT_AA64_EL1_SERR __vcpu_except_flags(3) +/* For AArch64 with NV (one day): */ +#define EXCEPT_AA64_EL2_SYNC __vcpu_except_flags(4) +#define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) +#define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) +#define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) +/* Guest debug is live */ +#define DEBUG_DIRTY __vcpu_single_flag(iflags, BIT(4)) +/* Save SPE context if active */ +#define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5)) +/* Save TRBE context if active */ +#define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) +/* pKVM host vcpu state is dirty, needs resync */ +#define PKVM_HOST_STATE_DIRTY __vcpu_single_flag(iflags, BIT(7)) + +/* SVE enabled for host EL0 */ +#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) +/* SME enabled for EL0 */ +#define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) +/* Physical CPU not in supported_cpus */ +#define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) +/* WFIT instruction trapped */ +#define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) +/* vcpu system registers loaded on physical CPU */ +#define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) +/* Software step state is Active-pending */ +#define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5)) + + /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ sve_ffr_offset((vcpu)->arch.sve_max_vl)) @@ -406,59 +692,32 @@ struct kvm_vcpu_arch { __size_ret; \ }) -/* vcpu_arch flags field values: */ -#define KVM_ARM64_DEBUG_DIRTY (1 << 0) -#define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ -#define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */ -#define KVM_ARM64_HOST_SVE_IN_USE (1 << 3) /* backup for host TIF_SVE */ -#define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ -#define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */ -#define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* 
SVE config completed */ -#define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */ -#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */ -#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */ -#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active */ -#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */ - #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_USE_HW | \ KVM_GUESTDBG_SINGLESTEP) -/* - * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can - * take the following values: - * - * For AArch32 EL1: - */ -#define KVM_ARM64_EXCEPT_AA32_UND (0 << 9) -#define KVM_ARM64_EXCEPT_AA32_IABT (1 << 9) -#define KVM_ARM64_EXCEPT_AA32_DABT (2 << 9) -/* For AArch64: */ -#define KVM_ARM64_EXCEPT_AA64_ELx_SYNC (0 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_IRQ (1 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_FIQ (2 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_SERR (3 << 9) -#define KVM_ARM64_EXCEPT_AA64_EL1 (0 << 11) -#define KVM_ARM64_EXCEPT_AA64_EL2 (1 << 11) - -/* - * Overlaps with KVM_ARM64_EXCEPT_MASK on purpose so that it can't be - * set together with an exception... - */ -#define KVM_ARM64_INCREMENT_PC (1 << 9) /* Increment PC */ #define vcpu_has_sve(vcpu) (system_supports_sve() && \ - ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) + vcpu_get_flag(vcpu, GUEST_HAS_SVE)) #ifdef CONFIG_ARM64_PTR_AUTH #define vcpu_has_ptrauth(vcpu) \ ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || \ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && \ - (vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH) + vcpu_get_flag(vcpu, GUEST_HAS_PTRAUTH)) #else #define vcpu_has_ptrauth(vcpu) false #endif +#define vcpu_on_unsupported_cpu(vcpu) \ + vcpu_get_flag(vcpu, ON_UNSUPPORTED_CPU) + +#define vcpu_set_on_unsupported_cpu(vcpu) \ + vcpu_set_flag(vcpu, ON_UNSUPPORTED_CPU) + +#define vcpu_clear_on_unsupported_cpu(vcpu) \ + vcpu_clear_flag(vcpu, ON_UNSUPPORTED_CPU) + #define vcpu_gp_regs(v) (&(v)->arch.ctxt.regs) /* @@ -474,9 +733,6 @@ struct kvm_vcpu_arch { #define __vcpu_sys_reg(v,r) (ctxt_sys_reg(&(v)->arch.ctxt, (r))) -u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); -void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); - static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) { /* @@ -568,8 +824,32 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) return true; } +#define vcpu_read_sys_reg(__vcpu, reg) \ + ({ \ + u64 __val = 0x8badf00d8badf00d; \ + \ + /* SYSREGS_ON_CPU is only used in VHE */ \ + ((!is_nvhe_hyp_code() && \ + vcpu_get_flag(__vcpu, SYSREGS_ON_CPU) && \ + __vcpu_read_sys_reg_from_cpu(reg, &__val))) ? 
\ + __val \ + : \ + ctxt_sys_reg(&__vcpu->arch.ctxt, reg); \ + }) + +#define vcpu_write_sys_reg(__vcpu, __val, reg) \ + do { \ + /* SYSREGS_ON_CPU is only used in VHE */ \ + if (is_nvhe_hyp_code() || \ + !vcpu_get_flag(__vcpu, SYSREGS_ON_CPU) || \ + !__vcpu_write_sys_reg_to_cpu(__val, reg)) \ + ctxt_sys_reg(&__vcpu->arch.ctxt, reg) = __val; \ + } while (0) + struct kvm_vm_stat { struct kvm_vm_stat_generic generic; + atomic64_t protected_hyp_mem; + atomic64_t protected_shared_mem; }; struct kvm_vcpu_stat { @@ -583,7 +863,7 @@ struct kvm_vcpu_stat { u64 exits; }; -int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); +void kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); @@ -591,8 +871,6 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); -int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); @@ -605,6 +883,8 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); +#define vcpu_has_run_once(vcpu) !!rcu_access_pointer((vcpu)->pid) + #ifndef __KVM_NVHE_HYPERVISOR__ #define kvm_call_hyp_nvhe(f, ...) \ ({ \ @@ -662,10 +942,11 @@ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu); int kvm_handle_cp15_32(struct kvm_vcpu *vcpu); int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); +int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); -void kvm_sys_reg_table_init(void); +int kvm_sys_reg_table_init(void); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); @@ -674,8 +955,15 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); -int kvm_perf_init(void); -int kvm_perf_teardown(void); +/* + * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event, + * arrived in guest context. For arm64, any event that arrives while a vCPU is + * loaded is considered to be "in guest". 
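+ * For example, a PMU overflow interrupt taken between kvm_arch_vcpu_load()
+ * and kvm_arch_vcpu_put() is attributed to the guest even if the vCPU was
+ * not actually running at that instant.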
+ */ +static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu) +{ + return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu; +} long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); @@ -689,6 +977,12 @@ int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); +extern unsigned int kvm_arm_vmid_bits; +int kvm_arm_vmid_alloc_init(void); +void kvm_arm_vmid_alloc_free(void); +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); +void kvm_arm_vmid_clear_active(void); + static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) { vcpu_arch->steal.base = GPA_INVALID; @@ -728,6 +1022,27 @@ void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); + +#define __vcpu_save_guest_debug_regs(vcpu) \ + do { \ + u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); \ + \ + (vcpu)->arch.guest_debug_preserved.mdscr_el1 = val; \ + } while(0) + +#define __vcpu_restore_guest_debug_regs(vcpu) \ + do { \ + u64 val = (vcpu)->arch.guest_debug_preserved.mdscr_el1; \ + \ + vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); \ + } while (0) + +#define kvm_vcpu_os_lock_enabled(vcpu) \ + (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & SYS_OSLSR_OSLK)) + +#define kvm_vcpu_needs_debug_regs(vcpu) \ + ((vcpu)->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) + int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, @@ -741,8 +1056,10 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu); +void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu); static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) { @@ -753,17 +1070,9 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); -#ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */ -static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) -{ - return kvm_arch_vcpu_run_map_fp(vcpu); -} - +#ifdef CONFIG_KVM void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u32 clr); - -void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); -void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); #else static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} @@ -778,22 +1087,18 @@ int kvm_set_ipa_limit(void); struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); -int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); +#define kvm_vm_is_protected(kvm) ((kvm)->arch.pkvm.enabled) -static inline bool kvm_vm_is_protected(struct kvm *kvm) -{ - return false; -} +void kvm_init_protected_traps(struct kvm_vcpu *vcpu); int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); -#define kvm_arm_vcpu_sve_finalized(vcpu) \ - ((vcpu)->arch.flags & 
KVM_ARM64_VCPU_SVE_FINALIZED) +#define kvm_arm_vcpu_sve_finalized(vcpu) vcpu_get_flag(vcpu, VCPU_SVE_FINALIZED) -#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled) -#define kvm_vcpu_has_pmu(vcpu) \ - (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) +#define kvm_has_mte(kvm) \ + (system_supports_mte() && \ + test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags)) #define kvm_supports_32bit_el0() \ (system_supports_32bit_el0() && \ @@ -808,4 +1113,7 @@ void __init kvm_hyp_reserve(void); static inline void kvm_hyp_reserve(void) { } #endif +void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); +bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 657d0c94cf82..4b236df6c1f6 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -15,6 +15,9 @@ DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +DECLARE_PER_CPU(int, hyp_cpu_number); + +#define hyp_smp_processor_id() (__this_cpu_read(hyp_cpu_number)) #define read_sysreg_elx(r,nvh,vh) \ ({ \ @@ -61,8 +64,8 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); #ifdef __KVM_NVHE_HYPERVISOR__ @@ -115,7 +118,25 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); #endif +extern u64 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64isar0_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64isar1_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64isar2_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); +extern unsigned long kvm_nvhe_sym(__icache_flags); +extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); +extern bool kvm_nvhe_sym(smccc_trng_available); + +extern bool kvm_nvhe_sym(__pkvm_modules_enabled); + +struct kvm_nvhe_clock_data { + u32 mult; + u32 shift; + u64 epoch_ns; + u64 epoch_cyc; +}; #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h new file mode 100644 index 000000000000..8a2dd41b8569 --- /dev/null +++ b/arch/arm64/include/asm/kvm_hypevents.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ARM64_KVM_HYPEVENTS_H_ +#define __ARM64_KVM_HYPEVENTS_H_ + +#ifdef __KVM_NVHE_HYPERVISOR__ +#include +#endif + +/* + * Hypervisor events definitions. 
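+ *
+ * Each HYP_EVENT() below pairs with HYP_EVENT_FORMAT() from
+ * kvm_hypevents_defs.h, where HE_STRUCT()/he_field() spell out the
+ * binary record layout. Roughly, host_hcall expands to:
+ *
+ *	struct trace_hyp_format_host_hcall {
+ *		struct hyp_entry_hdr hdr;
+ *		unsigned int id;
+ *		u8 invalid;
+ *	};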
+ */ + +HYP_EVENT(hyp_enter, + HE_PROTO(void), + HE_STRUCT( + ), + HE_ASSIGN( + ), + HE_PRINTK(" ") +); + +HYP_EVENT(hyp_exit, + HE_PROTO(void), + HE_STRUCT( + ), + HE_ASSIGN( + ), + HE_PRINTK(" ") +); + +HYP_EVENT(host_hcall, + HE_PROTO(unsigned int id, u8 invalid), + HE_STRUCT( + he_field(unsigned int, id) + he_field(u8, invalid) + ), + HE_ASSIGN( + __entry->id = id; + __entry->invalid = invalid; + ), + HE_PRINTK("id=%u invalid=%u", + __entry->id, __entry->invalid) +); + +HYP_EVENT(host_smc, + HE_PROTO(u64 id, u8 forwarded), + HE_STRUCT( + he_field(u64, id) + he_field(u8, forwarded) + ), + HE_ASSIGN( + __entry->id = id; + __entry->forwarded = forwarded; + ), + HE_PRINTK("id=%llu forwarded=%u", + __entry->id, __entry->forwarded) +); + + +HYP_EVENT(host_mem_abort, + HE_PROTO(u64 esr, u64 addr), + HE_STRUCT( + he_field(u64, esr) + he_field(u64, addr) + ), + HE_ASSIGN( + __entry->esr = esr; + __entry->addr = addr; + ), + HE_PRINTK("esr=0x%llx addr=0x%llx", + __entry->esr, __entry->addr) +); +#endif diff --git a/arch/arm64/include/asm/kvm_hypevents_defs.h b/arch/arm64/include/asm/kvm_hypevents_defs.h new file mode 100644 index 000000000000..e228d894a898 --- /dev/null +++ b/arch/arm64/include/asm/kvm_hypevents_defs.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ARM64_KVM_HYPEVENTS_DEFS_H +#define __ARM64_KVM_HYPEVENTS_DEFS_H + +struct hyp_event_id { + unsigned short id; + void *data; +}; + +struct hyp_entry_hdr { + unsigned short id; +}; + +/* + * Hyp events definitions common to the hyp and the host + */ +#define HYP_EVENT_FORMAT(__name, __struct) \ + struct trace_hyp_format_##__name { \ + struct hyp_entry_hdr hdr; \ + __struct \ + } + +#define HE_PROTO(args...) args +#define HE_STRUCT(args...) args +#define HE_ASSIGN(args...) args +#define HE_PRINTK(args...) args + +#define he_field(type, item) type item; +#endif diff --git a/arch/arm64/include/asm/kvm_hyptrace.h b/arch/arm64/include/asm/kvm_hyptrace.h new file mode 100644 index 000000000000..d32b445b4cf6 --- /dev/null +++ b/arch/arm64/include/asm/kvm_hyptrace.h @@ -0,0 +1,21 @@ +#ifndef __ARM64_KVM_HYPTRACE_H_ +#define __ARM64_KVM_HYPTRACE_H_ +#include + +#include + +/* + * Host donations to the hypervisor to store the struct hyp_buffer_page.
+ */ +struct hyp_buffer_pages_backing { + unsigned long start; + size_t size; +}; + +struct hyp_trace_pack { + struct hyp_buffer_pages_backing backing; + struct kvm_nvhe_clock_data trace_clock_data; + struct trace_buffer_pack trace_buffer_pack; + +}; +#endif diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 02d378887743..dd98fe849dff 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -115,6 +115,8 @@ alternative_cb_end #include #include #include +#include +#include void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); @@ -150,7 +152,12 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) #include #include +int kvm_share_hyp(void *from, void *to); +void kvm_unshare_hyp(void *from, void *to); int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot); +int __create_hyp_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot); +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, void __iomem **haddr); @@ -159,7 +166,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu); +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); @@ -180,8 +187,13 @@ static inline void *__kvm_vector_slot2addr(void *base, struct kvm; -#define kvm_flush_dcache_to_poc(a,l) \ - dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l)) +#define kvm_flush_dcache_to_poc(a, l) do { \ + unsigned long __a = (unsigned long)(a); \ + unsigned long __l = (unsigned long)(l); \ + \ + if (__l) \ + dcache_clean_inval_poc(__a, __a + __l); \ +} while (0) static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) { @@ -264,7 +276,8 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) u64 cnp = system_supports_cnp() ? 
VTTBR_CNP_BIT : 0; baddr = mmu->pgd_phys; - vmid_field = (u64)READ_ONCE(vmid->vmid) << VTTBR_VMID_SHIFT; + vmid_field = atomic64_read(&vmid->id) << VTTBR_VMID_SHIFT; + vmid_field &= VTTBR_VMID_MASK(kvm_arm_vmid_bits); return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 027783829584..04f9050c482d 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -13,12 +13,24 @@ #define KVM_PGTABLE_MAX_LEVELS 4U +/* + * The largest supported block sizes for KVM (no 52-bit PA support): + * - 4K (level 1): 1GB + * - 16K (level 2): 32MB + * - 64K (level 2): 512MB + */ +#ifdef CONFIG_ARM64_4K_PAGES +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1U +#else +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2U +#endif + static inline u64 kvm_get_parange(u64 mmfr0) { u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, - ID_AA64MMFR0_PARANGE_SHIFT); - if (parange > ID_AA64MMFR0_PARANGE_MAX) - parange = ID_AA64MMFR0_PARANGE_MAX; + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + if (parange > ID_AA64MMFR0_EL1_PARANGE_MAX) + parange = ID_AA64MMFR0_EL1_PARANGE_MAX; return parange; } @@ -30,6 +42,38 @@ typedef u64 kvm_pte_t; #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) +#define KVM_PHYS_INVALID (-1ULL) + +#define KVM_PTE_TYPE BIT(1) +#define KVM_PTE_TYPE_BLOCK 0 +#define KVM_PTE_TYPE_PAGE 1 +#define KVM_PTE_TYPE_TABLE 1 + +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) + +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; @@ -45,6 +89,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte) return pa; } +static inline kvm_pte_t kvm_phys_to_pte(u64 pa) +{ + kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) { + pa &= GENMASK(51, 48); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + } + + return pte; +} + static inline u64 kvm_granule_shift(u32 level) { /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ @@ -58,11 +114,18 @@ static inline u64 kvm_granule_size(u32 level) static inline bool kvm_level_supports_block_mapping(u32 level) { - /* - * Reject invalid block mappings and don't bother with 4TB mappings for - * 52-bit PAs. - */ - return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); + return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; +} + +static inline bool kvm_pte_table(kvm_pte_t pte, u32 level) +{ + if (level == KVM_PGTABLE_MAX_LEVELS - 1) + return false; + + if (!kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } /** @@ -121,6 +184,7 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. 
+ * @KVM_PGTABLE_PROT_NC: Normal non-cacheable attributes. * @KVM_PGTABLE_PROT_SW0: Software bit 0. * @KVM_PGTABLE_PROT_SW1: Software bit 1. * @KVM_PGTABLE_PROT_SW2: Software bit 2. @@ -132,6 +196,7 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), + KVM_PGTABLE_PROT_NC = BIT(4), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), @@ -145,6 +210,20 @@ enum kvm_pgtable_prot { #define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX #define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW +#define KVM_HOST_S2_DEFAULT_MASK (KVM_PTE_LEAF_ATTR_HI | \ + KVM_PTE_LEAF_ATTR_LO) + +#define KVM_HOST_S2_DEFAULT_MEM_PTE \ + (PTE_S2_MEMATTR(MT_S2_NORMAL) | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_LO_S2_AF | \ + FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, KVM_PTE_LEAF_ATTR_LO_S2_SH_IS)) + +#define KVM_HOST_S2_DEFAULT_MMIO_PTE \ + (KVM_HOST_S2_DEFAULT_MEM_PTE | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN) + #define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) @@ -153,6 +232,22 @@ enum kvm_pgtable_prot { typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, enum kvm_pgtable_prot prot); +typedef bool (*kvm_pgtable_pte_is_counted_cb_t)(kvm_pte_t pte, u32 level); + +/** + * struct kvm_pgtable_pte_ops - PTE callbacks. + * @force_pte_cb: Return true if the mapping granularity for the + * given range must be forced to page level + * instead of block mappings. + * @pte_is_counted_cb: Verify the attributes of the @pte argument + * and return true if the descriptor needs to be + * refcounted, otherwise return false. + */ +struct kvm_pgtable_pte_ops { + kvm_pgtable_force_pte_cb_t force_pte_cb; + kvm_pgtable_pte_is_counted_cb_t pte_is_counted_cb; +}; + /** * struct kvm_pgtable - KVM page-table. * @ia_bits: Maximum input address size, in bits. @@ -161,8 +256,7 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, * @mm_ops: Memory management callbacks. * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. * @flags: Stage-2 page-table flags. - * @force_pte_cb: Function that returns true if page level mappings must - * be used instead of block mappings. + * @pte_ops: PTE callbacks. */ struct kvm_pgtable { u32 ia_bits; @@ -173,7 +267,7 @@ struct kvm_pgtable { /* Stage-2 only */ struct kvm_s2_mmu *mmu; enum kvm_pgtable_stage2_flags flags; - kvm_pgtable_force_pte_cb_t force_pte_cb; + struct kvm_pgtable_pte_ops *pte_ops; }; /** @@ -251,6 +345,27 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt); int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot); +/** + * kvm_pgtable_hyp_unmap() - Remove a mapping from a hypervisor stage-1 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). + * @addr: Virtual address from which to remove the mapping. + * @size: Size of the mapping. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * TLB invalidation is performed for each page-table entry cleared during the + * unmapping operation and the reference count for the page-table page + * containing the cleared entry is decremented, with unreferenced pages being + * freed. The unmapping operation will stop early if it encounters either an + * invalid page-table entry or a valid block mapping which maps beyond the range + * being unmapped.
+ * + * Return: Number of bytes unmapped, which may be 0. + */ +u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); + /** * kvm_get_vtcr() - Helper to construct VTCR_EL2 * @mmfr0: Sanitized value of SYS_ID_AA64MMFR0_EL1 register. @@ -267,25 +382,31 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, */ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); +/** + * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD + * @vtcr: Content of the VTCR register. + * + * Return: the size (in bytes) of the stage-2 PGD + */ +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr); + /** * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. - * @arch: Arch-specific KVM structure representing the guest virtual - * machine. + * @mmu: S2 MMU context for this S2 translation * @mm_ops: Memory management callbacks. * @flags: Stage-2 configuration flags. - * @force_pte_cb: Function that returns true if page level mappings must - * be used instead of block mappings. + * @pte_ops: PTE callbacks. * * Return: 0 on success, negative error code on failure. */ -int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, - kvm_pgtable_force_pte_cb_t force_pte_cb); + struct kvm_pgtable_pte_ops *pte_ops); -#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \ - __kvm_pgtable_stage2_init(pgt, arch, mm_ops, 0, NULL) +#define kvm_pgtable_stage2_init(pgt, mmu, mm_ops, pte_ops) \ + __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, pte_ops) /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. @@ -329,14 +450,16 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc); /** - * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to - * track ownership. + * kvm_pgtable_stage2_annotate() - Unmap and annotate pages in the IPA space + * to track ownership (and more). * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Base intermediate physical address to annotate. * @size: Size of the annotated range. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. - * @owner_id: Unique identifier for the owner of the page. + * @annotation: A 63 bit value that will be stored in the page tables. + * @annotation[0] must be 0, and @annotation[63:1] is stored + * in the page tables. * * By default, all page-tables are owned by identifier 0. This function can be * used to mark portions of the IPA space as owned by other entities. When a @@ -345,8 +468,8 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, - void *mc, u8 owner_id); +int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, kvm_pte_t annotation); /** * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. 
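[Illustration, not part of the patch] The struct kvm_pgtable_pte_ops introduced in the kvm_pgtable.h changes above bundles the old bare force_pte_cb argument together with the new pte_is_counted_cb refcounting hook. As a rough orientation only, a stage-2 user could wire the ops up along the following lines; this is a minimal sketch under assumptions, and the helper names stage2_force_pte_cb(), stage2_pte_is_counted() and stage2_init_example() are invented for illustration. Only kvm_pte_valid(), kvm_pgtable_stage2_init() and the involved types come from the header itself:

/*
 * Illustrative sketch only: the helpers below are assumptions for the sake
 * of the example, not code from this patch.
 */
static bool stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
{
	/* Returning false keeps block mappings permitted for the range. */
	return false;
}

static bool stage2_pte_is_counted(kvm_pte_t pte, u32 level)
{
	/* Refcount valid descriptors only; invalid annotations are skipped. */
	return kvm_pte_valid(pte);
}

static struct kvm_pgtable_pte_ops stage2_pte_ops = {
	.force_pte_cb		= stage2_force_pte_cb,
	.pte_is_counted_cb	= stage2_pte_is_counted,
};

static int stage2_init_example(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			       struct kvm_pgtable_mm_ops *mm_ops)
{
	/* The reworked signature takes the S2 MMU context and the PTE ops. */
	return kvm_pgtable_stage2_init(pgt, mmu, mm_ops, &stage2_pte_ops);
}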
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h new file mode 100644 index 000000000000..2c7652bfe362 --- /dev/null +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 - Google LLC + * Author: Quentin Perret + * Author: Fuad Tabba + */ +#ifndef __ARM64_KVM_PKVM_H__ +#define __ARM64_KVM_PKVM_H__ + +#include +#include +#include +#include +#include + +/* Maximum number of VMs that can co-exist under pKVM. */ +#define KVM_MAX_PVMS 255 + +#define HYP_MEMBLOCK_REGIONS 128 +#define PVMFW_INVALID_LOAD_ADDR (-1) + +int pkvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); +int pkvm_init_host_vm(struct kvm *kvm, unsigned long type); +int pkvm_create_hyp_vm(struct kvm *kvm); +void pkvm_destroy_hyp_vm(struct kvm *kvm); +void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa); + +/* + * Definitions for features to be allowed or restricted for guest virtual + * machines, depending on the mode KVM is running in and on the type of guest + * that is running. + * + * The ALLOW masks represent a bitmask of feature fields that are allowed + * without any restrictions as long as they are supported by the system. + * + * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for + * features that are restricted to support at most the specified feature. + * + * If a feature field is not present in either, then it is not supported. + * + * The approach taken for protected VMs is to allow features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ + +/* + * Allow for protected VMs: + * - Floating-point and Advanced SIMD + * - GICv3(+) system register interface + * - Data Independent Timing + */ +#define PVM_ID_AA64PFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - AArch64 guests only (no support for AArch32 guests): + * AArch32 adds complexity in trap handling, emulation, condition codes, + * etc...
+ * - RAS (v1) + * Supported by KVM + */ +#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \ + ) + +/* + * Allow for protected VMs: + * - Branch Target Identification + * - Speculative Store Bypassing + */ +#define PVM_ID_AA64PFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ + ) + +/* + * Allow for protected VMs: + * - Mixed-endian + * - Distinction between Secure and Non-secure Memory + * - Mixed-endian at EL0 only + * - Non-context synchronizing exception entry and exit + */ +#define PVM_ID_AA64MMFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - 40-bit IPA + * - 16-bit ASID + */ +#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ + ) + +/* + * Allow for protected VMs: + * - Hardware translation table updates to Access flag and Dirty state + * - Number of VMID bits from CPU + * - Hierarchical Permission Disables + * - Privileged Access Never + * - SError interrupt exceptions from speculative reads + * - Enhanced Translation Synchronization + */ +#define PVM_ID_AA64MMFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) \ + ) + +/* + * Allow for protected VMs: + * - Common not Private translations + * - User Access Override + * - IESB bit in the SCTLR_ELx registers + * - Unaligned single-copy atomicity and atomic functions + * - ESR_ELx.EC value on an exception by read access to feature ID space + * - TTL field in address operations. + * - Break-before-make sequences when changing translation block size + * - E0PDx mechanism + */ +#define PVM_ID_AA64MMFR2_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ + ) + +/* + * No support for Scalable Vectors for protected VMs: + * Requires additional support from KVM, e.g., context-switching and + * trapping at EL2 + */ +#define PVM_ID_AA64ZFR0_ALLOW (0ULL) + +/* + * No support for debug, including breakpoints, and watchpoints for protected + * VMs: + * The Arm architecture mandates support for at least the Armv8 debug + * architecture, which would include at least 2 hardware breakpoints and + * watchpoints. 
Providing that support to protected guests adds + * considerable state and complexity. Therefore, the reserved value of 0 is + * used for debug-related fields. + */ +#define PVM_ID_AA64DFR0_ALLOW (0ULL) +#define PVM_ID_AA64DFR1_ALLOW (0ULL) + +/* + * No support for implementation defined features. + */ +#define PVM_ID_AA64AFR0_ALLOW (0ULL) +#define PVM_ID_AA64AFR1_ALLOW (0ULL) + +/* + * No restrictions on instructions implemented in AArch64. + */ +#define PVM_ID_AA64ISAR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ + ) + +#define PVM_ID_AA64ISAR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ + ) + +#define PVM_ID_AA64ISAR2_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \ + ) + +/* + * Returns the maximum number of breakpoints supported for protected VMs. + */ +static inline int pkvm_get_max_brps(void) +{ + int num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), + PVM_ID_AA64DFR0_ALLOW); + + /* + * If breakpoints are supported, the maximum number is 1 + the field. + * Otherwise, return 0, which is not compliant with the architecture, + * but is reserved and is used here to indicate no debug support. + */ + return num ? num + 1 : 0; +} + +/* + * Returns the maximum number of watchpoints supported for protected VMs. + */ +static inline int pkvm_get_max_wrps(void) +{ + int num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), + PVM_ID_AA64DFR0_ALLOW); + + return num ? 
num + 1 : 0; +} + +enum pkvm_moveable_reg_type { + PKVM_MREG_MEMORY, + PKVM_MREG_PROTECTED_RANGE, +}; + +struct pkvm_moveable_reg { + phys_addr_t start; + u64 size; + enum pkvm_moveable_reg_type type; +}; + +#define PKVM_NR_MOVEABLE_REGS 512 +extern struct pkvm_moveable_reg kvm_nvhe_sym(pkvm_moveable_regs)[]; +extern unsigned int kvm_nvhe_sym(pkvm_moveable_regs_nr); + +extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; +extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); + +extern phys_addr_t kvm_nvhe_sym(pvmfw_base); +extern phys_addr_t kvm_nvhe_sym(pvmfw_size); + +static inline unsigned long +hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size) +{ + unsigned long nr_pages = reg->size >> PAGE_SHIFT; + unsigned long start, end; + + start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size; + end = start + nr_pages * vmemmap_entry_size; + start = ALIGN_DOWN(start, PAGE_SIZE); + end = ALIGN(end, PAGE_SIZE); + + return end - start; +} + +static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size) +{ + unsigned long res = 0, i; + + for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { + res += hyp_vmemmap_memblock_size(&kvm_nvhe_sym(hyp_memory)[i], + vmemmap_entry_size); + } + + return res >> PAGE_SHIFT; +} + +static inline unsigned long hyp_vm_table_pages(void) +{ + return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT; +} + +static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) +{ + unsigned long total = 0, i; + + /* Provision the worst case scenario */ + for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { + nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); + total += nr_pages; + } + + return total; +} + +static inline unsigned long __hyp_pgtable_moveable_regs_pages(void) +{ + unsigned long res = 0, i; + + /* Cover all of the moveable regions with page granularity */ + for (i = 0; i < kvm_nvhe_sym(pkvm_moveable_regs_nr); i++) { + struct pkvm_moveable_reg *reg = &kvm_nvhe_sym(pkvm_moveable_regs)[i]; + res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); + } + + return res; +} + +#define __PKVM_PRIVATE_SZ SZ_1G + +static inline unsigned long hyp_s1_pgtable_pages(void) +{ + unsigned long res; + + res = __hyp_pgtable_moveable_regs_pages(); + + res += __hyp_pgtable_max_pages(__PKVM_PRIVATE_SZ >> PAGE_SHIFT); + + return res; +} + +static inline unsigned long host_s2_pgtable_pages(void) +{ + unsigned long res; + + /* + * Include an extra 16 pages to safely upper-bound the worst case of + * concatenated pgds. + */ + res = __hyp_pgtable_moveable_regs_pages() + 16; + + /* Allow 1 GiB for non-moveable regions */ + res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); + + return res; +} + +#define KVM_FFA_MBOX_NR_PAGES 1 + +/* + * Maximum number of constituents allowed in a descriptor. This number is + * arbitrary, see comment below on SG_MAX_SEGMENTS in hyp_ffa_proxy_pages(). + */ +#define KVM_FFA_MAX_NR_CONSTITUENTS 4096 + +static inline unsigned long hyp_ffa_proxy_pages(void) +{ + size_t desc_max; + + /* + * SG_MAX_SEGMENTS is supposed to bound the number of elements in an + * sglist, which should match the number of constituents in the + * corresponding FFA descriptor. As such, the EL2 buffer needs to be + * large enough to hold a descriptor with SG_MAX_SEGMENTS constituents + * at least. But the kernel's DMA code doesn't enforce the limit, and + * it is sometimes abused, so let's allow larger descriptors and hope + * for the best.
+ */ + BUILD_BUG_ON(KVM_FFA_MAX_NR_CONSTITUENTS < SG_MAX_SEGMENTS); + + /* + * The hypervisor FFA proxy needs enough memory to buffer a fragmented + * descriptor returned from EL3 in response to a RETRIEVE_REQ call. + */ + desc_max = sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes) + + sizeof(struct ffa_composite_mem_region) + + KVM_FFA_MAX_NR_CONSTITUENTS * sizeof(struct ffa_mem_region_addr_range); + + /* Plus a page each for the hypervisor's RX and TX mailboxes. */ + return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); +} + +#endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h new file mode 100644 index 000000000000..99c7de4e2e81 --- /dev/null +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ARM64_KVM_PKVM_MODULE_H__ +#define __ARM64_KVM_PKVM_MODULE_H__ + +#include +#include + +typedef void (*dyn_hcall_t)(struct kvm_cpu_context *); + +enum pkvm_psci_notification { + PKVM_PSCI_CPU_SUSPEND, + PKVM_PSCI_SYSTEM_SUSPEND, + PKVM_PSCI_CPU_ENTRY, +}; + +#ifdef CONFIG_MODULES +struct pkvm_module_ops { + int (*create_private_mapping)(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot, + unsigned long *haddr); + void *(*alloc_module_va)(u64 nr_pages); + int (*map_module_page)(u64 pfn, void *va, enum kvm_pgtable_prot prot, bool is_protected); + int (*register_serial_driver)(void (*hyp_putc_cb)(char)); + void (*puts)(const char *str); + void (*putx64)(u64 num); + void *(*fixmap_map)(phys_addr_t phys); + void (*fixmap_unmap)(void); + void *(*linear_map_early)(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot); + void (*linear_unmap_early)(void *addr, size_t size); + void (*flush_dcache_to_poc)(void *addr, size_t size); + int (*register_host_perm_fault_handler)(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); + int (*host_stage2_mod_prot)(u64 pfn, enum kvm_pgtable_prot prot); + int (*host_stage2_get_leaf)(phys_addr_t phys, kvm_pte_t *ptep, u32 *level); + int (*register_host_smc_handler)(bool (*cb)(struct kvm_cpu_context *)); + int (*register_default_trap_handler)(bool (*cb)(struct kvm_cpu_context *)); + int (*register_illegal_abt_notifier)(void (*cb)(struct kvm_cpu_context *)); + int (*register_psci_notifier)(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)); + int (*register_hyp_panic_notifier)(void (*cb)(struct kvm_cpu_context *host_ctxt)); + int (*host_donate_hyp)(u64 pfn, u64 nr_pages); + int (*hyp_donate_host)(u64 pfn, u64 nr_pages); + void* (*memcpy)(void *to, const void *from, size_t count); + void* (*memset)(void *dst, int c, size_t count); + phys_addr_t (*hyp_pa)(void *x); + void* (*hyp_va)(phys_addr_t phys); + unsigned long (*kern_hyp_va)(unsigned long x); +}; + +int __pkvm_load_el2_module(struct module *this, unsigned long *token); + +int __pkvm_register_el2_call(unsigned long hfn_hyp_va); +#else +static inline int __pkvm_load_el2_module(struct module *this, + unsigned long *token) +{ + return -ENOSYS; +} + +static inline int __pkvm_register_el2_call(unsigned long hfn_hyp_va) +{ + return -ENOSYS; +} +#endif /* CONFIG_MODULES */ + +#ifdef MODULE +/* + * Convert an EL2 module addr from the kernel VA to the hyp VA + */ +#define pkvm_el2_mod_va(kern_va, token) \ +({ \ + unsigned long hyp_text_kern_va = \ + (unsigned long)THIS_MODULE->arch.hyp.text.start; \ + unsigned long offset; \ + \ + offset = (unsigned long)kern_va - hyp_text_kern_va; \ + token + offset; \ 
+}) + +/* + * function_nocfi() does not work with function pointers, hence the macro in + * lieu of a function. + */ +#define pkvm_load_el2_module(init_fn, token) \ +({ \ + THIS_MODULE->arch.hyp.init = function_nocfi(init_fn); \ + __pkvm_load_el2_module(THIS_MODULE, token); \ +}) + +#define pkvm_register_el2_mod_call(hfn, token) \ +({ \ + __pkvm_register_el2_call(pkvm_el2_mod_va(function_nocfi(hfn), \ + token)); \ +}) + +#define pkvm_el2_mod_call(id, ...) \ + ({ \ + struct arm_smccc_res res; \ + \ + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_ID(id), \ + ##__VA_ARGS__, &res); \ + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \ + \ + res.a1; \ + }) +#endif +#endif diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index 9906541a6861..57b31b5707e1 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -1,17 +1,16 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H +#ifdef __ASSEMBLY__ +#include +#endif + #define __ALIGN .align 2 #define __ALIGN_STR ".align 2" #if defined(CONFIG_ARM64_BTI_KERNEL) && defined(__aarch64__) -/* - * Since current versions of gas reject the BTI instruction unless we - * set the architecture version to v8.5 we use the hint instruction - * instead. - */ -#define BTI_C hint 34 ; +#define BTI_C bti c ; /* * When using in-kernel BTI we need to ensure that PCS-conformant assembly diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h index 5d10051c3e62..29d0719bd2ed 100644 --- a/arch/arm64/include/asm/lse.h +++ b/arch/arm64/include/asm/lse.h @@ -4,7 +4,7 @@ #include -#ifdef CONFIG_ARM64_LSE_ATOMICS +#if defined(CONFIG_ARM64_LSE_ATOMICS) && !defined(BUILD_FIPS140_KO) #define __LSE_PREAMBLE ".arch_extension lse\n" diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..300c8b8cbebe --- /dev/null +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_MEM_ENCRYPT_H +#define __ASM_MEM_ENCRYPT_H + +bool mem_encrypt_active(void); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/mem_relinquish.h b/arch/arm64/include/asm/mem_relinquish.h new file mode 100644 index 000000000000..ac51786a4a11 --- /dev/null +++ b/arch/arm64/include/asm/mem_relinquish.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Google LLC + * Author: Keir Fraser + */ + +#ifndef __ASM_MEM_RELINQUISH_H +#define __ASM_MEM_RELINQUISH_H + +struct page; + +bool kvm_has_memrelinquish_services(void); +void page_relinquish(struct page *page); + +#endif /* __ASM_MEM_RELINQUISH_H */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 05886322c300..0fd5a5854948 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -113,6 +113,14 @@ #define OVERFLOW_STACK_SIZE SZ_4K +/* + * With the minimum frame size of [x29, x30], exactly half the combined + * sizes of the hyp and overflow stacks is the maximum size needed to + * save the unwound stacktrace; plus an additional entry to delimit the + * end. + */ +#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + PAGE_SIZE) / 2 + sizeof(long)) + /* * Alignment of kernel segments (e.g. .text, .data).
* @@ -134,11 +142,13 @@ #define MT_NORMAL_NC 2 #define MT_DEVICE_nGnRnE 3 #define MT_DEVICE_nGnRE 4 +#define MT_NORMAL_iNC_oWB 5 /* * Memory types for Stage-2 translation */ #define MT_S2_NORMAL 0xf +#define MT_S2_NORMAL_NC 0x5 #define MT_S2_DEVICE_nGnRE 0x1 /* @@ -146,6 +156,7 @@ * Stage-2 enforces Normal-WB and Device-nGnRE */ #define MT_S2_FWB_NORMAL 6 +#define MT_S2_FWB_NORMAL_NC 5 #define MT_S2_FWB_DEVICE_nGnRE 1 #ifdef CONFIG_ARM64_4K_PAGES @@ -240,6 +251,7 @@ static inline const void *__tag_set(const void *addr, u8 tag) #ifdef CONFIG_KASAN_HW_TAGS #define arch_enable_tagging_sync() mte_enable_kernel_sync() #define arch_enable_tagging_async() mte_enable_kernel_async() +#define arch_enable_tagging_asymm() mte_enable_kernel_asymm() #define arch_force_async_tag_fault() mte_check_tfsr_exit() #define arch_get_random_tag() mte_get_random_tag() #define arch_get_mem_tag(addr) mte_get_mem_tag(addr) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index e9c30859f80c..48f8466a4be9 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -15,6 +15,7 @@ #ifndef __ASSEMBLY__ #include +#include typedef struct { atomic64_t id; diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 4e7fa2623896..8eca53e96d5a 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -14,12 +14,50 @@ struct mod_plt_sec { int plt_max_entries; }; -struct mod_arch_specific { - struct mod_plt_sec core; - struct mod_plt_sec init; - - /* for CONFIG_DYNAMIC_FTRACE */ +#define ARM64_MODULE_PLTS_ARCHDATA \ + struct mod_plt_sec core; \ + struct mod_plt_sec init; \ + \ + /* for CONFIG_DYNAMIC_FTRACE */ \ struct plt_entry *ftrace_trampolines; +#else +#define ARM64_MODULE_PLTS_ARCHDATA +#endif + +#ifdef CONFIG_KVM +struct pkvm_module_section { + void *start; + void *end; +}; + +typedef s32 kvm_nvhe_reloc_t; +struct pkvm_module_ops; + +struct pkvm_el2_module { + struct pkvm_module_section text; + struct pkvm_module_section bss; + struct pkvm_module_section rodata; + struct pkvm_module_section data; + kvm_nvhe_reloc_t *relocs; + unsigned int nr_relocs; + int (*init)(const struct pkvm_module_ops *ops); +}; + +void kvm_apply_hyp_module_relocations(void *mod_start, void *hyp_va, + kvm_nvhe_reloc_t *begin, + kvm_nvhe_reloc_t *end); + +#define ARM64_MODULE_KVM_ARCHDATA \ + /* For pKVM hypervisor modules */ \ + struct pkvm_el2_module hyp; +#else +#define ARM64_MODULE_KVM_ARCHDATA +#endif + +#ifdef CONFIG_HAVE_MOD_ARCH_SPECIFIC +struct mod_arch_specific { + ARM64_MODULE_PLTS_ARCHDATA + ARM64_MODULE_KVM_ARCHDATA }; #endif diff --git a/arch/arm64/include/asm/module.lds.h b/arch/arm64/include/asm/module.lds.h index 094701ec5500..f11d92211862 100644 --- a/arch/arm64/include/asm/module.lds.h +++ b/arch/arm64/include/asm/module.lds.h @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#include + SECTIONS { #ifdef CONFIG_ARM64_MODULE_PLTS .plt 0 : { BYTE(0) } @@ -17,4 +20,24 @@ SECTIONS { */ .text.hot : { *(.text.hot) } #endif + +#ifdef CONFIG_KVM + .hyp.text : ALIGN(PAGE_SIZE) { + *(.hyp.text) + . = ALIGN(PAGE_SIZE); + } + .hyp.bss : ALIGN(PAGE_SIZE) { + *(.hyp.bss) + . = ALIGN(PAGE_SIZE); + } + .hyp.rodata : ALIGN(PAGE_SIZE) { + *(.hyp.rodata) + . = ALIGN(PAGE_SIZE); + } + .hyp.data : ALIGN(PAGE_SIZE) { + *(.hyp.data) + . 
= ALIGN(PAGE_SIZE); + } + .hyp.reloc : ALIGN(4) { *(.hyp.reloc) } +#endif } diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index 592aabb25b0e..a857bcacf0fe 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -133,6 +133,7 @@ static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, void mte_enable_kernel_sync(void); void mte_enable_kernel_async(void); +void mte_enable_kernel_asymm(void); #else /* CONFIG_ARM64_MTE */ @@ -164,6 +165,10 @@ static inline void mte_enable_kernel_async(void) { } +static inline void mte_enable_kernel_asymm(void) +{ +} + #endif /* CONFIG_ARM64_MTE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 3e368ca66623..760c62f8e22f 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -11,7 +11,9 @@ #ifndef __ASSEMBLY__ #include +#include #include +#include #include #include @@ -47,6 +49,7 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg); long get_mte_ctrl(struct task_struct *task); int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data); +size_t mte_probe_user_range(const char __user *uaddr, size_t size); #else /* CONFIG_ARM64_MTE */ @@ -91,13 +94,33 @@ static inline int mte_ptrace_copy_tags(struct task_struct *child, #endif /* CONFIG_ARM64_MTE */ +static inline void mte_disable_tco_entry(struct task_struct *task) +{ + if (!system_supports_mte()) + return; + + /* + * Re-enable tag checking (TCO set on exception entry). This is only + * necessary if MTE is enabled in either the kernel or the userspace + * task in synchronous or asymmetric mode (SCTLR_EL1.TCF0 bit 0 is set + * for both). With MTE disabled in the kernel and disabled or + * asynchronous in userspace, tag check faults (including in uaccesses) + * are not reported, therefore there is no need to re-enable checking. + * This is beneficial on microarchitectures where re-enabling TCO is + * expensive. + */ + if (kasan_hw_tags_enabled() || + (task->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT))) + asm volatile(SET_PSTATE_TCO(0)); +} + #ifdef CONFIG_KASAN_HW_TAGS /* Whether the MTE asynchronous mode is enabled. 
*/ -DECLARE_STATIC_KEY_FALSE(mte_async_mode); +DECLARE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); -static inline bool system_uses_mte_async_mode(void) +static inline bool system_uses_mte_async_or_asymm_mode(void) { - return static_branch_unlikely(&mte_async_mode); + return static_branch_unlikely(&mte_async_or_asymm_mode); } void mte_check_tfsr_el1(void); @@ -126,7 +149,7 @@ static inline void mte_check_tfsr_exit(void) mte_check_tfsr_el1(); } #else -static inline bool system_uses_mte_async_mode(void) +static inline bool system_uses_mte_async_or_asymm_mode(void) { return false; } diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h index 6bf5adc56295..82b1e0c66809 100644 --- a/arch/arm64/include/asm/patching.h +++ b/arch/arm64/include/asm/patching.h @@ -6,6 +6,7 @@ int aarch64_insn_read(void *addr, u32 *insnp); int aarch64_insn_write(void *addr, u32 insn); +int aarch64_addr_write(void *addr, u64 dst); int aarch64_insn_patch_text_nosync(void *addr, u32 insn); int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index ed57717cd004..d1b19eaf9479 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -524,6 +524,15 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) __pgprot_modify(prot, PTE_ATTRINDX_MASK, \ PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) +/* + * Mark the prot value as outer cacheable and inner non-cacheable. Non-coherent + * devices on a system with support for a system or last level cache use these + * attributes to cache allocations in the system cache. + */ +#define pgprot_syscached(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, \ + PTE_ATTRINDX(MT_NORMAL_iNC_oWB) | PTE_PXN | PTE_UXN) + #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -999,23 +1008,13 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, * page after fork() + CoW for pfn mappings. We don't always have a * hardware-managed access flag on arm64. */ -static inline bool arch_faults_on_old_pte(void) -{ - WARN_ON(preemptible()); - - return !cpu_has_hw_af(); -} -#define arch_faults_on_old_pte arch_faults_on_old_pte +#define arch_has_hw_pte_young cpu_has_hw_af /* * Experimentally, it's cheap to set the access flag in hardware and we * benefit from prefaulting mappings as 'old' to start with. 
*/ -static inline bool arch_wants_old_prefaulted_pte(void) -{ - return !arch_faults_on_old_pte(); -} -#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte +#define arch_wants_old_prefaulted_pte cpu_has_hw_af #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 7364530de0a7..16b03a90ee26 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -21,6 +21,7 @@ #define MTE_CTRL_TCF_SYNC (1UL << 16) #define MTE_CTRL_TCF_ASYNC (1UL << 17) +#define MTE_CTRL_TCF_ASYMM (1UL << 18) #ifndef __ASSEMBLY__ @@ -30,6 +31,7 @@ #include #include #include +#include #include @@ -115,6 +117,12 @@ struct debug_info { #endif }; +enum vec_type { + ARM64_VEC_SVE = 0, + ARM64_VEC_SME, + ARM64_VEC_MAX, +}; + struct cpu_context { unsigned long x19; unsigned long x20; @@ -145,10 +153,13 @@ struct thread_struct { struct user_fpsimd_state fpsimd_state; } uw; + ANDROID_VENDOR_DATA(1); + unsigned int fpsimd_cpu; void *sve_state; /* SVE registers, if any */ - unsigned int sve_vl; /* SVE vector length */ - unsigned int sve_vl_onexec; /* SVE vl after next exec */ + void *za_state; /* ZA register, if any */ + unsigned int vl[ARM64_VEC_MAX]; /* vector length */ + unsigned int vl_onexec[ARM64_VEC_MAX]; /* vl after next exec */ unsigned long fault_address; /* fault info */ unsigned long fault_code; /* ESR_EL1 value */ struct debug_info debug; /* debugging */ @@ -162,8 +173,68 @@ struct thread_struct { u64 mte_ctrl; #endif u64 sctlr_user; + u64 svcr; + u64 tpidr2_el0; }; +static inline unsigned int thread_get_vl(struct thread_struct *thread, + enum vec_type type) +{ + return thread->vl[type]; +} + +static inline unsigned int thread_get_sve_vl(struct thread_struct *thread) +{ + return thread_get_vl(thread, ARM64_VEC_SVE); +} + +static inline unsigned int thread_get_sme_vl(struct thread_struct *thread) +{ + return thread_get_vl(thread, ARM64_VEC_SME); +} + +static inline unsigned int thread_get_cur_vl(struct thread_struct *thread) +{ + if (system_supports_sme() && (thread->svcr & SVCR_SM_MASK)) + return thread_get_sme_vl(thread); + else + return thread_get_sve_vl(thread); +} + +unsigned int task_get_vl(const struct task_struct *task, enum vec_type type); +void task_set_vl(struct task_struct *task, enum vec_type type, + unsigned long vl); +void task_set_vl_onexec(struct task_struct *task, enum vec_type type, + unsigned long vl); +unsigned int task_get_vl_onexec(const struct task_struct *task, + enum vec_type type); + +static inline unsigned int task_get_sve_vl(const struct task_struct *task) +{ + return task_get_vl(task, ARM64_VEC_SVE); +} + +static inline unsigned int task_get_sme_vl(const struct task_struct *task) +{ + return task_get_vl(task, ARM64_VEC_SME); +} + +static inline void task_set_sve_vl(struct task_struct *task, unsigned long vl) +{ + task_set_vl(task, ARM64_VEC_SVE, vl); +} + +static inline unsigned int task_get_sve_vl_onexec(const struct task_struct *task) +{ + return task_get_vl_onexec(task, ARM64_VEC_SVE); +} + +static inline void task_set_sve_vl_onexec(struct task_struct *task, + unsigned long vl) +{ + task_set_vl_onexec(task, ARM64_VEC_SVE, vl); +} + #define SCTLR_USER_MASK \ (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | SCTLR_ELx_ENDA | SCTLR_ELx_ENDB | \ SCTLR_EL1_TCF0_MASK) @@ -309,9 +380,11 @@ extern void __init minsigstksz_setup(void); */ #include -/* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */ +/* Userspace interface for PR_S[MV]E_{SET,GET}_VL prctl()s: */ #define SVE_SET_VL(arg) 
sve_set_current_vl(arg) #define SVE_GET_VL() sve_get_current_vl() +#define SME_SET_VL(arg) sme_set_current_vl(arg) +#define SME_GET_VL() sme_get_current_vl() /* PR_PAC_RESET_KEYS prctl */ #define PAC_RESET_KEYS(tsk, arg) ptrauth_prctl_reset_keys(tsk, arg) diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 552891e626e5..6b2ce1ed1d4c 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -11,6 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[]; extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; extern char __hyp_text_start[], __hyp_text_end[]; +extern char __hyp_data_start[], __hyp_data_end[]; extern char __hyp_rodata_start[], __hyp_rodata_end[]; extern char __hyp_reloc_begin[], __hyp_reloc_end[]; extern char __hyp_bss_start[], __hyp_bss_end[]; diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h index 6a75d7ecdcaa..543fa59e72e0 100644 --- a/arch/arm64/include/asm/simd.h +++ b/arch/arm64/include/asm/simd.h @@ -35,9 +35,7 @@ static __must_check inline bool may_use_simd(void) * migrated, and if it's clear we cannot be migrated to a CPU * where it is set. */ - return !WARN_ON(!system_capabilities_finalized()) && - system_supports_fpsimd() && - !in_hardirq() && !irqs_disabled() && !in_nmi() && + return !in_hardirq() && !irqs_disabled() && !in_nmi() && !this_cpu_read(fpsimd_context_busy); } diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index fc55f5a57a06..b93637756f91 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -87,6 +87,8 @@ extern void secondary_entry(void); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); +extern int nr_ipi_get(void); +extern struct irq_desc **ipi_desc_get(void); #ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 8aebc00c1718..3f125c1c8b4a 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -8,85 +8,19 @@ #include #include #include -#include #include +#include #include #include -enum stack_type { - STACK_TYPE_UNKNOWN, - STACK_TYPE_TASK, - STACK_TYPE_IRQ, - STACK_TYPE_OVERFLOW, - STACK_TYPE_SDEI_NORMAL, - STACK_TYPE_SDEI_CRITICAL, - __NR_STACK_TYPES -}; +#include -struct stack_info { - unsigned long low; - unsigned long high; - enum stack_type type; -}; - -/* - * A snapshot of a frame record or fp/lr register values, along with some - * accounting information necessary for robust unwinding. - * - * @fp: The fp value in the frame record (or the real fp) - * @pc: The lr value in the frame record (or the real lr) - * - * @stacks_done: Stacks which have been entirely unwound, for which it is no - * longer valid to unwind to. - * - * @prev_fp: The fp that pointed to this frame record, or a synthetic value - * of 0. This is used to ensure that within a stack, each - * subsequent frame record is at an increasing address. - * @prev_type: The type of stack this frame record was on, or a synthetic - * value of STACK_TYPE_UNKNOWN. This is used to detect a - * transition from one stack to another. - * - * @graph: When FUNCTION_GRAPH_TRACER is selected, holds the index of a - * replacement lr value in the ftrace graph stack. 
- */ -struct stackframe { - unsigned long fp; - unsigned long pc; - DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); - unsigned long prev_fp; - enum stack_type prev_type; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - int graph; -#endif -}; - -extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); -extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - bool (*fn)(void *, unsigned long), void *data); extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); -static inline bool on_stack(unsigned long sp, unsigned long size, - unsigned long low, unsigned long high, - enum stack_type type, struct stack_info *info) -{ - if (!low) - return false; - - if (sp < low || sp + size < sp || sp + size > high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = type; - } - return true; -} - static inline bool on_irq_stack(unsigned long sp, unsigned long size, struct stack_info *info) { @@ -122,33 +56,4 @@ static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { return false; } #endif - -/* - * We can only safely access per-cpu stacks from current in a non-preemptible - * context. - */ -static inline bool on_accessible_stack(const struct task_struct *tsk, - unsigned long sp, unsigned long size, - struct stack_info *info) -{ - if (info) - info->type = STACK_TYPE_UNKNOWN; - - if (on_task_stack(tsk, sp, size, info)) - return true; - if (tsk != current || preemptible()) - return false; - if (on_irq_stack(sp, size, info)) - return true; - if (on_overflow_stack(sp, size, info)) - return true; - if (on_sdei_stack(sp, size, info)) - return true; - - return false; -} - -void start_backtrace(struct stackframe *frame, unsigned long fp, - unsigned long pc); - #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h new file mode 100644 index 000000000000..dfbf2a7399ae --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common arm64 stack unwinder code. + * + * To implement a new arm64 stack unwinder: + * 1) Include this header + * + * 2) Call into unwind_next_common() from your top-level unwind + * function, passing it the validation and translation callbacks + * (though the latter can be NULL if no translation is required). + * + * See: arch/arm64/kernel/stacktrace.c for the reference implementation. + * + * Copyright (C) 2012 ARM Ltd. + */ +#ifndef __ASM_STACKTRACE_COMMON_H +#define __ASM_STACKTRACE_COMMON_H + +#include +#include +#include +#include + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_OVERFLOW, + STACK_TYPE_SDEI_NORMAL, + STACK_TYPE_SDEI_CRITICAL, + STACK_TYPE_HYP, + __NR_STACK_TYPES +}; + +struct stack_info { + unsigned long low; + unsigned long high; + enum stack_type type; +}; + +/* + * A snapshot of a frame record or fp/lr register values, along with some + * accounting information necessary for robust unwinding. + * + * @fp: The fp value in the frame record (or the real fp) + * @pc: The lr value in the frame record (or the real lr) + * + * @stacks_done: Stacks which have been entirely unwound, for which it is no + * longer valid to unwind to. + * + * @prev_fp: The fp that pointed to this frame record, or a synthetic value + * of 0.
This is used to ensure that within a stack, each + * subsequent frame record is at an increasing address. + * @prev_type: The type of stack this frame record was on, or a synthetic + * value of STACK_TYPE_UNKNOWN. This is used to detect a + * transition from one stack to another. + * + * @task: The task being unwound. + */ +struct unwind_state { + unsigned long fp; + unsigned long pc; + DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); + unsigned long prev_fp; + enum stack_type prev_type; + struct task_struct *task; +}; + +static inline bool on_stack(unsigned long sp, unsigned long size, + unsigned long low, unsigned long high, + enum stack_type type, struct stack_info *info) +{ + if (!low) + return false; + + if (sp < low || sp + size < sp || sp + size > high) + return false; + + if (info) { + info->low = low; + info->high = high; + info->type = type; + } + return true; +} + +static inline void unwind_init_common(struct unwind_state *state, + struct task_struct *task) +{ + state->task = task; + + /* + * Prime the first unwind. + * + * In unwind_next() we'll check that the FP points to a valid stack, + * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be + * treated as a transition to whichever stack that happens to be. The + * prev_fp value won't be used, but we set it to 0 such that it is + * definitely not an accessible stack address. + */ + bitmap_zero(state->stacks_done, __NR_STACK_TYPES); + state->prev_fp = 0; + state->prev_type = STACK_TYPE_UNKNOWN; +} + +/* + * stack_trace_translate_fp_fn() - Translates a non-kernel frame pointer to + * a kernel address. + * + * @fp: the frame pointer to be updated to its kernel address. + * @type: the stack type associated with frame pointer @fp + * + * Returns true on success, and @fp is updated to the corresponding + * kernel virtual address; otherwise returns false. + */ +typedef bool (*stack_trace_translate_fp_fn)(unsigned long *fp, + enum stack_type type); + +/* + * on_accessible_stack_fn() - Check whether a stack range is on any + * of the possible stacks. + * + * @tsk: task whose stack is being unwound + * @sp: stack address being checked + * @size: size of the stack range being checked + * @info: stack unwinding context + */ +typedef bool (*on_accessible_stack_fn)(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info); + +static inline int unwind_next_common(struct unwind_state *state, + struct stack_info *info, + on_accessible_stack_fn accessible, + stack_trace_translate_fp_fn translate_fp) +{ + unsigned long fp = state->fp, kern_fp = fp; + struct task_struct *tsk = state->task; + + if (fp & 0x7) + return -EINVAL; + + if (!accessible(tsk, fp, 16, info)) + return -EINVAL; + + if (test_bit(info->type, state->stacks_done)) + return -EINVAL; + + /* + * If fp is not from the current address space perform the necessary + * translation before dereferencing it to get the next fp. + */ + if (translate_fp && !translate_fp(&kern_fp, info->type)) + return -EINVAL; + + /* + * As stacks grow downward, any valid record on the same stack must be + * at a strictly higher address than the prior record. + * + * Stacks can nest in several valid orders, e.g. + * + * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL + * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW + * HYP -> OVERFLOW + * + * ... but the nesting itself is strict. Once we transition from one + * stack to another, it's never valid to unwind back to that first + * stack.
+ */ + if (info->type == state->prev_type) { + if (fp <= state->prev_fp) + return -EINVAL; + } else { + __set_bit(state->prev_type, state->stacks_done); + } + + /* + * Record this frame record's values and location. The prev_fp and + * prev_type are only meaningful to the next unwind_next() invocation. + */ + state->fp = READ_ONCE(*(unsigned long *)(kern_fp)); + state->pc = READ_ONCE(*(unsigned long *)(kern_fp + 8)); + state->prev_fp = fp; + state->prev_type = info->type; + + return 0; +} + +#endif /* __ASM_STACKTRACE_COMMON_H */ diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h new file mode 100644 index 000000000000..d5527b600390 --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. This saves having + * to allocate shared buffers for the host to read the unwound + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwound in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. + * + * Copyright (C) 2022 Google LLC + */ +#ifndef __ASM_STACKTRACE_NVHE_H +#define __ASM_STACKTRACE_NVHE_H + +#include + +/* + * kvm_nvhe_unwind_init - Start an unwind from the given nVHE HYP fp and pc + * + * @state : unwind_state to initialize + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + */ +static inline void kvm_nvhe_unwind_init(struct unwind_state *state, + unsigned long fp, + unsigned long pc) +{ + unwind_init_common(state, NULL); + + state->fp = fp; + state->pc = pc; +} + +#ifndef __KVM_NVHE_HYPERVISOR__ +/* + * Conventional (non-protected) nVHE HYP stack unwinder + * + * In non-protected mode, the unwinding is done from kernel proper context + * (by the host in EL1). + */ + +DECLARE_KVM_NVHE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); +DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); + +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset); + +#endif /* __KVM_NVHE_HYPERVISOR__ */ +#endif /* __ASM_STACKTRACE_NVHE_H */ diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index fe341a6578c3..c8dca8ae359c 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -10,13 +10,6 @@ #include -/* - * PGDIR_SHIFT determines the size a top-level page table entry can map - * and depends on the number of levels in the page table. Compute the - * PGDIR_SHIFT for a given number of levels.
- */ -#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) - /* * The hardware supports concatenation of up to 16 tables at stage2 entry * level and we use the feature whenever possible, which means we resolve 4 @@ -30,11 +23,6 @@ #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) -/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ -#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) -#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) -#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) - /* * kvm_mmu_cache_min_pages() is the number of pages required to install * a stage-2 translation. We pre-allocate the entry level page table at @@ -42,12 +30,4 @@ */ #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) -static inline phys_addr_t -stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); - - return (boundary - 1 < end - 1) ? boundary : end; -} - #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index f79f3720e4cb..1479f5c89cea 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -116,6 +116,10 @@ * System registers, organised loosely by encoding but grouped together * where the architected name contains an index. e.g. ID_MMFR_EL1. */ +#define SYS_SVCR_SMSTOP_SM_EL0 sys_reg(0, 3, 4, 2, 3) +#define SYS_SVCR_SMSTART_SM_EL0 sys_reg(0, 3, 4, 3, 3) +#define SYS_SVCR_SMSTOP_SMZA_EL0 sys_reg(0, 3, 4, 6, 3) + #define SYS_OSDTRRX_EL1 sys_reg(2, 0, 0, 0, 2) #define SYS_MDCCINT_EL1 sys_reg(2, 0, 0, 2, 0) #define SYS_MDSCR_EL1 sys_reg(2, 0, 0, 2, 2) @@ -126,8 +130,16 @@ #define SYS_DBGWVRn_EL1(n) sys_reg(2, 0, 0, n, 6) #define SYS_DBGWCRn_EL1(n) sys_reg(2, 0, 0, n, 7) #define SYS_MDRAR_EL1 sys_reg(2, 0, 1, 0, 0) + #define SYS_OSLAR_EL1 sys_reg(2, 0, 1, 0, 4) +#define SYS_OSLAR_OSLK BIT(0) + #define SYS_OSLSR_EL1 sys_reg(2, 0, 1, 1, 4) +#define SYS_OSLSR_OSLM_MASK (BIT(3) | BIT(0)) +#define SYS_OSLSR_OSLM_NI 0 +#define SYS_OSLSR_OSLM_IMPLEMENTED BIT(3) +#define SYS_OSLSR_OSLK BIT(1) + #define SYS_OSDLR_EL1 sys_reg(2, 0, 1, 3, 4) #define SYS_DBGPRCR_EL1 sys_reg(2, 0, 1, 4, 4) #define SYS_DBGCLAIMSET_EL1 sys_reg(2, 0, 7, 8, 6) @@ -171,6 +183,7 @@ #define SYS_ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 4, 0) #define SYS_ID_AA64PFR1_EL1 sys_reg(3, 0, 0, 4, 1) #define SYS_ID_AA64ZFR0_EL1 sys_reg(3, 0, 0, 4, 4) +#define SYS_ID_AA64SMFR0_EL1 sys_reg(3, 0, 0, 4, 5) #define SYS_ID_AA64DFR0_EL1 sys_reg(3, 0, 0, 5, 0) #define SYS_ID_AA64DFR1_EL1 sys_reg(3, 0, 0, 5, 1) @@ -194,6 +207,8 @@ #define SYS_ZCR_EL1 sys_reg(3, 0, 1, 2, 0) #define SYS_TRFCR_EL1 sys_reg(3, 0, 1, 2, 1) +#define SYS_SMPRI_EL1 sys_reg(3, 0, 1, 2, 4) +#define SYS_SMCR_EL1 sys_reg(3, 0, 1, 2, 6) #define SYS_TTBR0_EL1 sys_reg(3, 0, 2, 0, 0) #define SYS_TTBR1_EL1 sys_reg(3, 0, 2, 0, 1) @@ -386,6 +401,8 @@ #define TRBIDR_ALIGN_MASK GENMASK(3, 0) #define TRBIDR_ALIGN_SHIFT 0 +#define SMPRI_EL1_PRIORITY_MASK 0xf + #define SYS_PMINTENSET_EL1 sys_reg(3, 0, 9, 14, 1) #define SYS_PMINTENCLR_EL1 sys_reg(3, 0, 9, 14, 2) @@ -398,7 +415,10 @@ #define SYS_LOREA_EL1 sys_reg(3, 0, 10, 4, 1) #define SYS_LORN_EL1 sys_reg(3, 0, 10, 4, 2) #define SYS_LORC_EL1 sys_reg(3, 0, 10, 4, 3) +#define SYS_MPAMIDR_EL1 sys_reg(3, 0, 10, 4, 4) #define SYS_LORID_EL1 sys_reg(3, 0, 10, 4, 7) +#define 
SYS_MPAM1_EL1 sys_reg(3, 0, 10, 5, 0) +#define SYS_MPAM0_EL1 sys_reg(3, 0, 10, 5, 1) #define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0) #define SYS_DISR_EL1 sys_reg(3, 0, 12, 1, 1) @@ -441,8 +461,13 @@ #define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0) #define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1) #define SYS_GMID_EL1 sys_reg(3, 1, 0, 0, 4) +#define SYS_SMIDR_EL1 sys_reg(3, 1, 0, 0, 6) #define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) +#define SMIDR_EL1_IMPLEMENTER_SHIFT 24 +#define SMIDR_EL1_SMPS_SHIFT 15 +#define SMIDR_EL1_AFFINITY_SHIFT 0 + #define SYS_CSSELR_EL1 sys_reg(3, 2, 0, 0, 0) #define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1) @@ -451,6 +476,10 @@ #define SYS_RNDR_EL0 sys_reg(3, 3, 2, 4, 0) #define SYS_RNDRRS_EL0 sys_reg(3, 3, 2, 4, 1) +#define SYS_SVCR sys_reg(3, 3, 4, 2, 2) +#define SVCR_ZA_MASK 2 +#define SVCR_SM_MASK 1 + #define SYS_PMCR_EL0 sys_reg(3, 3, 9, 12, 0) #define SYS_PMCNTENSET_EL0 sys_reg(3, 3, 9, 12, 1) #define SYS_PMCNTENCLR_EL0 sys_reg(3, 3, 9, 12, 2) @@ -467,6 +496,7 @@ #define SYS_TPIDR_EL0 sys_reg(3, 3, 13, 0, 2) #define SYS_TPIDRRO_EL0 sys_reg(3, 3, 13, 0, 3) +#define SYS_TPIDR2_EL0 sys_reg(3, 3, 13, 0, 5) #define SYS_SCXTNUM_EL0 sys_reg(3, 3, 13, 0, 7) @@ -533,6 +563,9 @@ #define SYS_HFGITR_EL2 sys_reg(3, 4, 1, 1, 6) #define SYS_ZCR_EL2 sys_reg(3, 4, 1, 2, 0) #define SYS_TRFCR_EL2 sys_reg(3, 4, 1, 2, 1) +#define SYS_HCRX_EL2 sys_reg(3, 4, 1, 2, 2) +#define SYS_SMPRIMAP_EL2 sys_reg(3, 4, 1, 2, 5) +#define SYS_SMCR_EL2 sys_reg(3, 4, 1, 2, 6) #define SYS_DACR32_EL2 sys_reg(3, 4, 3, 0, 0) #define SYS_HDFGRTR_EL2 sys_reg(3, 4, 3, 1, 4) #define SYS_HDFGWTR_EL2 sys_reg(3, 4, 3, 1, 5) @@ -546,6 +579,10 @@ #define SYS_TFSR_EL2 sys_reg(3, 4, 5, 6, 0) #define SYS_FAR_EL2 sys_reg(3, 4, 6, 0, 0) +#define SYS_MPAMHCR_EL2 sys_reg(3, 4, 10, 4, 0) +#define SYS_MPAMVPMV_EL2 sys_reg(3, 4, 10, 4, 1) +#define SYS_MPAM2_EL2 sys_reg(3, 4, 10, 5, 0) + #define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1) #define __SYS__AP0Rx_EL2(x) sys_reg(3, 4, 12, 8, x) #define SYS_ICH_AP0R0_EL2 __SYS__AP0Rx_EL2(0) @@ -592,6 +629,7 @@ #define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0) #define SYS_CPACR_EL12 sys_reg(3, 5, 1, 0, 2) #define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0) +#define SYS_SMCR_EL12 sys_reg(3, 5, 1, 2, 6) #define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0) #define SYS_TTBR1_EL12 sys_reg(3, 5, 2, 0, 1) #define SYS_TCR_EL12 sys_reg(3, 5, 2, 0, 2) @@ -615,6 +653,7 @@ #define SYS_CNTV_CVAL_EL02 sys_reg(3, 5, 14, 3, 2) /* Common SCTLR_ELx flags. 
*/ +#define SCTLR_ELx_ENTP2 (BIT(60)) #define SCTLR_ELx_DSSBS (BIT(44)) #define SCTLR_ELx_ATA (BIT(43)) @@ -622,6 +661,7 @@ #define SCTLR_ELx_TCF_NONE (UL(0x0) << SCTLR_ELx_TCF_SHIFT) #define SCTLR_ELx_TCF_SYNC (UL(0x1) << SCTLR_ELx_TCF_SHIFT) #define SCTLR_ELx_TCF_ASYNC (UL(0x2) << SCTLR_ELx_TCF_SHIFT) +#define SCTLR_ELx_TCF_ASYMM (UL(0x3) << SCTLR_ELx_TCF_SHIFT) #define SCTLR_ELx_TCF_MASK (UL(0x3) << SCTLR_ELx_TCF_SHIFT) #define SCTLR_ELx_ENIA_SHIFT 31 @@ -667,6 +707,7 @@ #define SCTLR_EL1_TCF0_NONE (UL(0x0) << SCTLR_EL1_TCF0_SHIFT) #define SCTLR_EL1_TCF0_SYNC (UL(0x1) << SCTLR_EL1_TCF0_SHIFT) #define SCTLR_EL1_TCF0_ASYNC (UL(0x2) << SCTLR_EL1_TCF0_SHIFT) +#define SCTLR_EL1_TCF0_ASYMM (UL(0x3) << SCTLR_EL1_TCF0_SHIFT) #define SCTLR_EL1_TCF0_MASK (UL(0x3) << SCTLR_EL1_TCF0_SHIFT) #define SCTLR_EL1_BT1 (BIT(36)) @@ -709,66 +750,70 @@ #define MAIR_ATTR_NORMAL_TAGGED UL(0xf0) #define MAIR_ATTR_NORMAL UL(0xff) #define MAIR_ATTR_MASK UL(0xff) +#define MAIR_ATTR_NORMAL_iNC_oWB UL(0xf4) /* Position the attr at the correct index */ #define MAIR_ATTRIDX(attr, idx) ((attr) << ((idx) * 8)) /* id_aa64isar0 */ -#define ID_AA64ISAR0_RNDR_SHIFT 60 -#define ID_AA64ISAR0_TLB_SHIFT 56 -#define ID_AA64ISAR0_TS_SHIFT 52 -#define ID_AA64ISAR0_FHM_SHIFT 48 -#define ID_AA64ISAR0_DP_SHIFT 44 -#define ID_AA64ISAR0_SM4_SHIFT 40 -#define ID_AA64ISAR0_SM3_SHIFT 36 -#define ID_AA64ISAR0_SHA3_SHIFT 32 -#define ID_AA64ISAR0_RDM_SHIFT 28 -#define ID_AA64ISAR0_ATOMICS_SHIFT 20 -#define ID_AA64ISAR0_CRC32_SHIFT 16 -#define ID_AA64ISAR0_SHA2_SHIFT 12 -#define ID_AA64ISAR0_SHA1_SHIFT 8 -#define ID_AA64ISAR0_AES_SHIFT 4 +#define ID_AA64ISAR0_EL1_RNDR_SHIFT 60 +#define ID_AA64ISAR0_EL1_TLB_SHIFT 56 +#define ID_AA64ISAR0_EL1_TS_SHIFT 52 +#define ID_AA64ISAR0_EL1_FHM_SHIFT 48 +#define ID_AA64ISAR0_EL1_DP_SHIFT 44 +#define ID_AA64ISAR0_EL1_SM4_SHIFT 40 +#define ID_AA64ISAR0_EL1_SM3_SHIFT 36 +#define ID_AA64ISAR0_EL1_SHA3_SHIFT 32 +#define ID_AA64ISAR0_EL1_RDM_SHIFT 28 +#define ID_AA64ISAR0_EL1_ATOMIC_SHIFT 20 +#define ID_AA64ISAR0_EL1_CRC32_SHIFT 16 +#define ID_AA64ISAR0_EL1_SHA2_SHIFT 12 +#define ID_AA64ISAR0_EL1_SHA1_SHIFT 8 +#define ID_AA64ISAR0_EL1_AES_SHIFT 4 -#define ID_AA64ISAR0_TLB_RANGE_NI 0x0 -#define ID_AA64ISAR0_TLB_RANGE 0x2 +#define ID_AA64ISAR0_EL1_TLB_RANGE_NI 0x0 +#define ID_AA64ISAR0_EL1_TLB_RANGE 0x2 /* id_aa64isar1 */ -#define ID_AA64ISAR1_I8MM_SHIFT 52 -#define ID_AA64ISAR1_DGH_SHIFT 48 -#define ID_AA64ISAR1_BF16_SHIFT 44 -#define ID_AA64ISAR1_SPECRES_SHIFT 40 -#define ID_AA64ISAR1_SB_SHIFT 36 -#define ID_AA64ISAR1_FRINTTS_SHIFT 32 -#define ID_AA64ISAR1_GPI_SHIFT 28 -#define ID_AA64ISAR1_GPA_SHIFT 24 -#define ID_AA64ISAR1_LRCPC_SHIFT 20 -#define ID_AA64ISAR1_FCMA_SHIFT 16 -#define ID_AA64ISAR1_JSCVT_SHIFT 12 -#define ID_AA64ISAR1_API_SHIFT 8 -#define ID_AA64ISAR1_APA_SHIFT 4 -#define ID_AA64ISAR1_DPB_SHIFT 0 +#define ID_AA64ISAR1_EL1_I8MM_SHIFT 52 +#define ID_AA64ISAR1_EL1_DGH_SHIFT 48 +#define ID_AA64ISAR1_EL1_BF16_SHIFT 44 +#define ID_AA64ISAR1_EL1_BF16_MASK (0xfUL << ID_AA64ISAR1_EL1_BF16_SHIFT) +#define ID_AA64ISAR1_EL1_SPECRES_SHIFT 40 +#define ID_AA64ISAR1_EL1_SB_SHIFT 36 +#define ID_AA64ISAR1_EL1_FRINTTS_SHIFT 32 +#define ID_AA64ISAR1_EL1_GPI_SHIFT 28 +#define ID_AA64ISAR1_EL1_GPA_SHIFT 24 +#define ID_AA64ISAR1_EL1_LRCPC_SHIFT 20 +#define ID_AA64ISAR1_EL1_FCMA_SHIFT 16 +#define ID_AA64ISAR1_EL1_JSCVT_SHIFT 12 +#define ID_AA64ISAR1_EL1_API_SHIFT 8 +#define ID_AA64ISAR1_EL1_APA_SHIFT 4 +#define ID_AA64ISAR1_EL1_DPB_SHIFT 0 -#define ID_AA64ISAR1_APA_NI 0x0 -#define ID_AA64ISAR1_APA_ARCHITECTED 0x1 
-#define ID_AA64ISAR1_APA_ARCH_EPAC 0x2 -#define ID_AA64ISAR1_APA_ARCH_EPAC2 0x3 -#define ID_AA64ISAR1_APA_ARCH_EPAC2_FPAC 0x4 -#define ID_AA64ISAR1_APA_ARCH_EPAC2_FPAC_CMB 0x5 -#define ID_AA64ISAR1_API_NI 0x0 -#define ID_AA64ISAR1_API_IMP_DEF 0x1 -#define ID_AA64ISAR1_API_IMP_DEF_EPAC 0x2 -#define ID_AA64ISAR1_API_IMP_DEF_EPAC2 0x3 -#define ID_AA64ISAR1_API_IMP_DEF_EPAC2_FPAC 0x4 -#define ID_AA64ISAR1_API_IMP_DEF_EPAC2_FPAC_CMB 0x5 -#define ID_AA64ISAR1_GPA_NI 0x0 -#define ID_AA64ISAR1_GPA_ARCHITECTED 0x1 -#define ID_AA64ISAR1_GPI_NI 0x0 -#define ID_AA64ISAR1_GPI_IMP_DEF 0x1 +#define ID_AA64ISAR1_EL1_APA_NI 0x0 +#define ID_AA64ISAR1_EL1_APA_PAuth 0x1 +#define ID_AA64ISAR1_EL1_APA_ARCH_EPAC 0x2 +#define ID_AA64ISAR1_EL1_APA_Pauth2 0x3 +#define ID_AA64ISAR1_EL1_APA_FPAC 0x4 +#define ID_AA64ISAR1_EL1_APA_FPACCOMBINE 0x5 +#define ID_AA64ISAR1_EL1_API_NI 0x0 +#define ID_AA64ISAR1_EL1_API_PAuth 0x1 +#define ID_AA64ISAR1_EL1_API_EPAC 0x2 +#define ID_AA64ISAR1_EL1_API_PAuth2 0x3 +#define ID_AA64ISAR1_EL1_API_FPAC 0x4 +#define ID_AA64ISAR1_EL1_API_FPACCOMBINE 0x5 +#define ID_AA64ISAR1_EL1_GPA_NI 0x0 +#define ID_AA64ISAR1_EL1_GPA_IMP 0x1 +#define ID_AA64ISAR1_EL1_GPI_NI 0x0 +#define ID_AA64ISAR1_EL1_GPI_IMP 0x1 /* id_aa64isar2 */ -#define ID_AA64ISAR2_CLEARBHB_SHIFT 28 -#define ID_AA64ISAR2_RPRES_SHIFT 4 -#define ID_AA64ISAR2_WFXT_SHIFT 0 +#define ID_AA64ISAR2_EL1_BC_SHIFT 28 +#define ID_AA64ISAR2_EL1_APA3_SHIFT 12 +#define ID_AA64ISAR2_EL1_GPA3_SHIFT 8 +#define ID_AA64ISAR2_EL1_RPRES_SHIFT 4 +#define ID_AA64ISAR2_EL1_WFxT_SHIFT 0 #define ID_AA64ISAR2_RPRES_8BIT 0x0 #define ID_AA64ISAR2_RPRES_12BIT 0x1 @@ -777,180 +822,211 @@ * reserved, but has not yet been removed from the ARM ARM * as of ARM DDI 0487G.b. */ -#define ID_AA64ISAR2_WFXT_NI 0x0 -#define ID_AA64ISAR2_WFXT_SUPPORTED 0x2 +#define ID_AA64ISAR2_EL1_WFxT_NI 0x0 +#define ID_AA64ISAR2_EL1_WFxT_IMP 0x2 + +#define ID_AA64ISAR2_EL1_APA3_NI 0x0 +#define ID_AA64ISAR2_EL1_APA3_PAuth 0x1 +#define ID_AA64ISAR2_EL1_APA3_EPAC 0x2 +#define ID_AA64ISAR2_EL1_APA3_PAuth2 0x3 +#define ID_AA64ISAR2_EL1_APA3_FPAC 0x4 +#define ID_AA64ISAR2_EL1_APA3_FPACCOMBINE 0x5 + +#define ID_AA64ISAR2_EL1_GPA3_NI 0x0 +#define ID_AA64ISAR2_EL1_GPA3_IMP 0x1 /* id_aa64pfr0 */ -#define ID_AA64PFR0_CSV3_SHIFT 60 -#define ID_AA64PFR0_CSV2_SHIFT 56 -#define ID_AA64PFR0_DIT_SHIFT 48 -#define ID_AA64PFR0_AMU_SHIFT 44 -#define ID_AA64PFR0_MPAM_SHIFT 40 -#define ID_AA64PFR0_SEL2_SHIFT 36 -#define ID_AA64PFR0_SVE_SHIFT 32 -#define ID_AA64PFR0_RAS_SHIFT 28 -#define ID_AA64PFR0_GIC_SHIFT 24 -#define ID_AA64PFR0_ASIMD_SHIFT 20 -#define ID_AA64PFR0_FP_SHIFT 16 -#define ID_AA64PFR0_EL3_SHIFT 12 -#define ID_AA64PFR0_EL2_SHIFT 8 -#define ID_AA64PFR0_EL1_SHIFT 4 -#define ID_AA64PFR0_EL0_SHIFT 0 +#define ID_AA64PFR0_EL1_CSV3_SHIFT 60 +#define ID_AA64PFR0_EL1_CSV2_SHIFT 56 +#define ID_AA64PFR0_EL1_DIT_SHIFT 48 +#define ID_AA64PFR0_EL1_AMU_SHIFT 44 +#define ID_AA64PFR0_EL1_MPAM_SHIFT 40 +#define ID_AA64PFR0_EL1_SEL2_SHIFT 36 +#define ID_AA64PFR0_EL1_SVE_SHIFT 32 +#define ID_AA64PFR0_EL1_RAS_SHIFT 28 +#define ID_AA64PFR0_EL1_GIC_SHIFT 24 +#define ID_AA64PFR0_EL1_AdvSIMD_SHIFT 20 +#define ID_AA64PFR0_EL1_FP_SHIFT 16 +#define ID_AA64PFR0_EL1_EL3_SHIFT 12 +#define ID_AA64PFR0_EL1_EL2_SHIFT 8 +#define ID_AA64PFR0_EL1_EL1_SHIFT 4 +#define ID_AA64PFR0_EL1_EL0_SHIFT 0 -#define ID_AA64PFR0_AMU 0x1 -#define ID_AA64PFR0_SVE 0x1 -#define ID_AA64PFR0_RAS_V1 0x1 -#define ID_AA64PFR0_RAS_V1P1 0x2 -#define ID_AA64PFR0_FP_NI 0xf -#define ID_AA64PFR0_FP_SUPPORTED 0x0 -#define ID_AA64PFR0_ASIMD_NI 0xf -#define 
ID_AA64PFR0_ASIMD_SUPPORTED 0x0 -#define ID_AA64PFR0_ELx_64BIT_ONLY 0x1 -#define ID_AA64PFR0_ELx_32BIT_64BIT 0x2 +#define ID_AA64PFR0_EL1_AMU_IMP 0x1 +#define ID_AA64PFR0_EL1_SVE_IMP 0x1 +#define ID_AA64PFR0_EL1_RAS_IMP 0x1 +#define ID_AA64PFR0_EL1_RAS_V1P1 0x2 +#define ID_AA64PFR0_EL1_FP_NI 0xf +#define ID_AA64PFR0_EL1_FP_IMP 0x0 +#define ID_AA64PFR0_EL1_AdvSIMD_NI 0xf +#define ID_AA64PFR0_EL1_ELx_64BIT_ONLY 0x1 +#define ID_AA64PFR0_EL1_ELx_32BIT_64BIT 0x2 /* id_aa64pfr1 */ -#define ID_AA64PFR1_MPAMFRAC_SHIFT 16 -#define ID_AA64PFR1_RASFRAC_SHIFT 12 -#define ID_AA64PFR1_MTE_SHIFT 8 -#define ID_AA64PFR1_SSBS_SHIFT 4 -#define ID_AA64PFR1_BT_SHIFT 0 +#define ID_AA64PFR1_EL1_SME_SHIFT 24 +#define ID_AA64PFR1_EL1_MPAM_frac_SHIFT 16 +#define ID_AA64PFR1_EL1_RAS_frac_SHIFT 12 +#define ID_AA64PFR1_EL1_MTE_SHIFT 8 +#define ID_AA64PFR1_EL1_SSBS_SHIFT 4 +#define ID_AA64PFR1_EL1_BT_SHIFT 0 -#define ID_AA64PFR1_SSBS_PSTATE_NI 0 -#define ID_AA64PFR1_SSBS_PSTATE_ONLY 1 -#define ID_AA64PFR1_SSBS_PSTATE_INSNS 2 -#define ID_AA64PFR1_BT_BTI 0x1 +#define ID_AA64PFR1_EL1_SSBS_NI 0 +#define ID_AA64PFR1_EL1_SSBS_IMP 1 +#define ID_AA64PFR1_EL1_SSBS_SSBS2 2 +#define ID_AA64PFR1_EL1_BT_IMP 0x1 +#define ID_AA64PFR1_EL1_SME_IMP 1 -#define ID_AA64PFR1_MTE_NI 0x0 -#define ID_AA64PFR1_MTE_EL0 0x1 -#define ID_AA64PFR1_MTE 0x2 +#define ID_AA64PFR1_EL1_MTE_NI 0x0 +#define ID_AA64PFR1_EL1_MTE_IMP 0x1 +#define ID_AA64PFR1_EL1_MTE_MTE2 0x2 +#define ID_AA64PFR1_EL1_MTE_MTE3 0x3 /* id_aa64zfr0 */ -#define ID_AA64ZFR0_F64MM_SHIFT 56 -#define ID_AA64ZFR0_F32MM_SHIFT 52 -#define ID_AA64ZFR0_I8MM_SHIFT 44 -#define ID_AA64ZFR0_SM4_SHIFT 40 -#define ID_AA64ZFR0_SHA3_SHIFT 32 -#define ID_AA64ZFR0_BF16_SHIFT 20 -#define ID_AA64ZFR0_BITPERM_SHIFT 16 -#define ID_AA64ZFR0_AES_SHIFT 4 -#define ID_AA64ZFR0_SVEVER_SHIFT 0 +#define ID_AA64ZFR0_EL1_F64MM_SHIFT 56 +#define ID_AA64ZFR0_EL1_F32MM_SHIFT 52 +#define ID_AA64ZFR0_EL1_I8MM_SHIFT 44 +#define ID_AA64ZFR0_EL1_SM4_SHIFT 40 +#define ID_AA64ZFR0_EL1_SHA3_SHIFT 32 +#define ID_AA64ZFR0_EL1_BF16_SHIFT 20 +#define ID_AA64ZFR0_EL1_BitPerm_SHIFT 16 +#define ID_AA64ZFR0_EL1_AES_SHIFT 4 +#define ID_AA64ZFR0_EL1_SVEver_SHIFT 0 -#define ID_AA64ZFR0_F64MM 0x1 -#define ID_AA64ZFR0_F32MM 0x1 -#define ID_AA64ZFR0_I8MM 0x1 -#define ID_AA64ZFR0_BF16 0x1 -#define ID_AA64ZFR0_SM4 0x1 -#define ID_AA64ZFR0_SHA3 0x1 -#define ID_AA64ZFR0_BITPERM 0x1 -#define ID_AA64ZFR0_AES 0x1 -#define ID_AA64ZFR0_AES_PMULL 0x2 -#define ID_AA64ZFR0_SVEVER_SVE2 0x1 +#define ID_AA64ZFR0_EL1_F64MM_IMP 0x1 +#define ID_AA64ZFR0_EL1_F32MM_IMP 0x1 +#define ID_AA64ZFR0_EL1_I8MM_IMP 0x1 +#define ID_AA64ZFR0_EL1_BF16_IMP 0x1 +#define ID_AA64ZFR0_EL1_SM4_IMP 0x1 +#define ID_AA64ZFR0_EL1_SHA3_IMP 0x1 +#define ID_AA64ZFR0_EL1_BitPerm_IMP 0x1 +#define ID_AA64ZFR0_EL1_AES_IMP 0x1 +#define ID_AA64ZFR0_EL1_AES_PMULL128 0x2 +#define ID_AA64ZFR0_EL1_SVEver_SVE2 0x1 + +/* id_aa64smfr0 */ +#define ID_AA64SMFR0_EL1_FA64_SHIFT 63 +#define ID_AA64SMFR0_EL1_I16I64_SHIFT 52 +#define ID_AA64SMFR0_EL1_F64F64_SHIFT 48 +#define ID_AA64SMFR0_EL1_I8I32_SHIFT 36 +#define ID_AA64SMFR0_EL1_F16F32_SHIFT 35 +#define ID_AA64SMFR0_EL1_B16F32_SHIFT 34 +#define ID_AA64SMFR0_EL1_F32F32_SHIFT 32 + +#define ID_AA64SMFR0_EL1_FA64_IMP 0x1 +#define ID_AA64SMFR0_EL1_I16I64_IMP 0xf +#define ID_AA64SMFR0_EL1_F64F64_IMP 0x1 +#define ID_AA64SMFR0_EL1_I8I32_IMP 0xf +#define ID_AA64SMFR0_EL1_F16F32_IMP 0x1 +#define ID_AA64SMFR0_EL1_B16F32_IMP 0x1 +#define ID_AA64SMFR0_EL1_F32F32_IMP 0x1 /* id_aa64mmfr0 */ -#define ID_AA64MMFR0_ECV_SHIFT 60 -#define ID_AA64MMFR0_FGT_SHIFT 56 
-#define ID_AA64MMFR0_EXS_SHIFT 44 -#define ID_AA64MMFR0_TGRAN4_2_SHIFT 40 -#define ID_AA64MMFR0_TGRAN64_2_SHIFT 36 -#define ID_AA64MMFR0_TGRAN16_2_SHIFT 32 -#define ID_AA64MMFR0_TGRAN4_SHIFT 28 -#define ID_AA64MMFR0_TGRAN64_SHIFT 24 -#define ID_AA64MMFR0_TGRAN16_SHIFT 20 -#define ID_AA64MMFR0_BIGENDEL0_SHIFT 16 -#define ID_AA64MMFR0_SNSMEM_SHIFT 12 -#define ID_AA64MMFR0_BIGENDEL_SHIFT 8 -#define ID_AA64MMFR0_ASID_SHIFT 4 -#define ID_AA64MMFR0_PARANGE_SHIFT 0 +#define ID_AA64MMFR0_EL1_ECV_SHIFT 60 +#define ID_AA64MMFR0_EL1_FGT_SHIFT 56 +#define ID_AA64MMFR0_EL1_EXS_SHIFT 44 +#define ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT 40 +#define ID_AA64MMFR0_EL1_TGRAN64_2_SHIFT 36 +#define ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT 32 +#define ID_AA64MMFR0_EL1_TGRAN4_SHIFT 28 +#define ID_AA64MMFR0_EL1_TGRAN64_SHIFT 24 +#define ID_AA64MMFR0_EL1_TGRAN16_SHIFT 20 +#define ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT 16 +#define ID_AA64MMFR0_EL1_SNSMEM_SHIFT 12 +#define ID_AA64MMFR0_EL1_BIGEND_SHIFT 8 +#define ID_AA64MMFR0_EL1_ASIDBITS_SHIFT 4 +#define ID_AA64MMFR0_EL1_PARANGE_SHIFT 0 -#define ID_AA64MMFR0_ASID_8 0x0 -#define ID_AA64MMFR0_ASID_16 0x2 +#define ID_AA64MMFR0_EL1_ASIDBITS_8 0x0 +#define ID_AA64MMFR0_EL1_ASIDBITS_16 0x2 -#define ID_AA64MMFR0_TGRAN4_NI 0xf -#define ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN 0x0 -#define ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX 0x7 -#define ID_AA64MMFR0_TGRAN64_NI 0xf -#define ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN 0x0 -#define ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX 0x7 -#define ID_AA64MMFR0_TGRAN16_NI 0x0 -#define ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN 0x1 -#define ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX 0xf +#define ID_AA64MMFR0_EL1_TGRAN4_NI 0xf +#define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN 0x0 +#define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX 0x7 +#define ID_AA64MMFR0_EL1_TGRAN64_NI 0xf +#define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN 0x0 +#define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX 0x7 +#define ID_AA64MMFR0_EL1_TGRAN16_NI 0x0 +#define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN 0x1 +#define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX 0xf -#define ID_AA64MMFR0_PARANGE_32 0x0 -#define ID_AA64MMFR0_PARANGE_36 0x1 -#define ID_AA64MMFR0_PARANGE_40 0x2 -#define ID_AA64MMFR0_PARANGE_42 0x3 -#define ID_AA64MMFR0_PARANGE_44 0x4 -#define ID_AA64MMFR0_PARANGE_48 0x5 -#define ID_AA64MMFR0_PARANGE_52 0x6 +#define ID_AA64MMFR0_EL1_PARANGE_32 0x0 +#define ID_AA64MMFR0_EL1_PARANGE_36 0x1 +#define ID_AA64MMFR0_EL1_PARANGE_40 0x2 +#define ID_AA64MMFR0_EL1_PARANGE_42 0x3 +#define ID_AA64MMFR0_EL1_PARANGE_44 0x4 +#define ID_AA64MMFR0_EL1_PARANGE_48 0x5 +#define ID_AA64MMFR0_EL1_PARANGE_52 0x6 #define ARM64_MIN_PARANGE_BITS 32 -#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 -#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 -#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 -#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MAX 0x7 +#define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT 0x0 +#define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE 0x1 +#define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN 0x2 +#define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX 0x7 #ifdef CONFIG_ARM64_PA_BITS_52 -#define ID_AA64MMFR0_PARANGE_MAX ID_AA64MMFR0_PARANGE_52 +#define ID_AA64MMFR0_EL1_PARANGE_MAX ID_AA64MMFR0_EL1_PARANGE_52 #else -#define ID_AA64MMFR0_PARANGE_MAX ID_AA64MMFR0_PARANGE_48 +#define ID_AA64MMFR0_EL1_PARANGE_MAX ID_AA64MMFR0_EL1_PARANGE_48 #endif /* id_aa64mmfr1 */ -#define ID_AA64MMFR1_ECBHB_SHIFT 60 -#define ID_AA64MMFR1_AFP_SHIFT 44 -#define ID_AA64MMFR1_ETS_SHIFT 36 -#define ID_AA64MMFR1_TWED_SHIFT 32 -#define ID_AA64MMFR1_XNX_SHIFT 28 -#define ID_AA64MMFR1_SPECSEI_SHIFT 24 
-#define ID_AA64MMFR1_PAN_SHIFT 20 -#define ID_AA64MMFR1_LOR_SHIFT 16 -#define ID_AA64MMFR1_HPD_SHIFT 12 -#define ID_AA64MMFR1_VHE_SHIFT 8 -#define ID_AA64MMFR1_VMIDBITS_SHIFT 4 -#define ID_AA64MMFR1_HADBS_SHIFT 0 +#define ID_AA64MMFR1_EL1_ECBHB_SHIFT 60 +#define ID_AA64MMFR1_EL1_HCX_SHIFT 40 +#define ID_AA64MMFR1_EL1_AFP_SHIFT 44 +#define ID_AA64MMFR1_EL1_ETS_SHIFT 36 +#define ID_AA64MMFR1_EL1_TWED_SHIFT 32 +#define ID_AA64MMFR1_EL1_XNX_SHIFT 28 +#define ID_AA64MMFR1_EL1_SpecSEI_SHIFT 24 +#define ID_AA64MMFR1_EL1_PAN_SHIFT 20 +#define ID_AA64MMFR1_EL1_LO_SHIFT 16 +#define ID_AA64MMFR1_EL1_HPDS_SHIFT 12 +#define ID_AA64MMFR1_EL1_VH_SHIFT 8 +#define ID_AA64MMFR1_EL1_VMIDBits_SHIFT 4 +#define ID_AA64MMFR1_EL1_HAFDBS_SHIFT 0 -#define ID_AA64MMFR1_VMIDBITS_8 0 -#define ID_AA64MMFR1_VMIDBITS_16 2 +#define ID_AA64MMFR1_EL1_VMIDBits_8 0 +#define ID_AA64MMFR1_EL1_VMIDBits_16 2 /* id_aa64mmfr2 */ -#define ID_AA64MMFR2_E0PD_SHIFT 60 -#define ID_AA64MMFR2_EVT_SHIFT 56 -#define ID_AA64MMFR2_BBM_SHIFT 52 -#define ID_AA64MMFR2_TTL_SHIFT 48 -#define ID_AA64MMFR2_FWB_SHIFT 40 -#define ID_AA64MMFR2_IDS_SHIFT 36 -#define ID_AA64MMFR2_AT_SHIFT 32 -#define ID_AA64MMFR2_ST_SHIFT 28 -#define ID_AA64MMFR2_NV_SHIFT 24 -#define ID_AA64MMFR2_CCIDX_SHIFT 20 -#define ID_AA64MMFR2_LVA_SHIFT 16 -#define ID_AA64MMFR2_IESB_SHIFT 12 -#define ID_AA64MMFR2_LSM_SHIFT 8 -#define ID_AA64MMFR2_UAO_SHIFT 4 -#define ID_AA64MMFR2_CNP_SHIFT 0 +#define ID_AA64MMFR2_EL1_E0PD_SHIFT 60 +#define ID_AA64MMFR2_EL1_EVT_SHIFT 56 +#define ID_AA64MMFR2_EL1_BBM_SHIFT 52 +#define ID_AA64MMFR2_EL1_TTL_SHIFT 48 +#define ID_AA64MMFR2_EL1_FWB_SHIFT 40 +#define ID_AA64MMFR2_EL1_IDS_SHIFT 36 +#define ID_AA64MMFR2_EL1_AT_SHIFT 32 +#define ID_AA64MMFR2_EL1_ST_SHIFT 28 +#define ID_AA64MMFR2_EL1_NV_SHIFT 24 +#define ID_AA64MMFR2_EL1_CCIDX_SHIFT 20 +#define ID_AA64MMFR2_EL1_VARange_SHIFT 16 +#define ID_AA64MMFR2_EL1_IESB_SHIFT 12 +#define ID_AA64MMFR2_EL1_LSM_SHIFT 8 +#define ID_AA64MMFR2_EL1_UAO_SHIFT 4 +#define ID_AA64MMFR2_EL1_CnP_SHIFT 0 /* id_aa64dfr0 */ -#define ID_AA64DFR0_MTPMU_SHIFT 48 -#define ID_AA64DFR0_TRBE_SHIFT 44 -#define ID_AA64DFR0_TRACE_FILT_SHIFT 40 -#define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 -#define ID_AA64DFR0_PMSVER_SHIFT 32 -#define ID_AA64DFR0_CTX_CMPS_SHIFT 28 -#define ID_AA64DFR0_WRPS_SHIFT 20 -#define ID_AA64DFR0_BRPS_SHIFT 12 -#define ID_AA64DFR0_PMUVER_SHIFT 8 -#define ID_AA64DFR0_TRACEVER_SHIFT 4 -#define ID_AA64DFR0_DEBUGVER_SHIFT 0 +#define ID_AA64DFR0_EL1_MTPMU_SHIFT 48 +#define ID_AA64DFR0_EL1_TraceBuffer_SHIFT 44 +#define ID_AA64DFR0_EL1_TraceFilt_SHIFT 40 +#define ID_AA64DFR0_EL1_DoubleLock_SHIFT 36 +#define ID_AA64DFR0_EL1_PMSVer_SHIFT 32 +#define ID_AA64DFR0_EL1_CTX_CMPs_SHIFT 28 +#define ID_AA64DFR0_EL1_WRPs_SHIFT 20 +#define ID_AA64DFR0_EL1_BRPs_SHIFT 12 +#define ID_AA64DFR0_EL1_PMUVer_SHIFT 8 +#define ID_AA64DFR0_EL1_TraceVer_SHIFT 4 +#define ID_AA64DFR0_EL1_DebugVer_SHIFT 0 -#define ID_AA64DFR0_PMUVER_8_0 0x1 -#define ID_AA64DFR0_PMUVER_8_1 0x4 -#define ID_AA64DFR0_PMUVER_8_4 0x5 -#define ID_AA64DFR0_PMUVER_8_5 0x6 -#define ID_AA64DFR0_PMUVER_IMP_DEF 0xf +#define ID_AA64DFR0_EL1_PMUVer_IMP 0x1 +#define ID_AA64DFR0_EL1_PMUVer_V3P1 0x4 +#define ID_AA64DFR0_EL1_PMUVer_V3P4 0x5 +#define ID_AA64DFR0_EL1_PMUVer_V3P5 0x6 +#define ID_AA64DFR0_EL1_PMUVer_V3P7 0x7 +#define ID_AA64DFR0_EL1_PMUVer_IMP_DEF 0xf -#define ID_AA64DFR0_PMSVER_8_2 0x1 -#define ID_AA64DFR0_PMSVER_8_3 0x2 +#define ID_AA64DFR0_EL1_PMSVer_IMP 0x1 +#define ID_AA64DFR0_EL1_PMSVer_V1P1 0x2 #define ID_DFR0_PERFMON_SHIFT 24 @@ -1059,40 +1135,62 @@ #define 
ID_PFR1_PROGMOD_SHIFT 0 #if defined(CONFIG_ARM64_4K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN -#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX -#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN4_2_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN4_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN +#define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX +#define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN -#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX -#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN16_2_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN16_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN +#define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX +#define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT #elif defined(CONFIG_ARM64_64K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN -#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX -#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN64_2_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN64_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN +#define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX +#define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN64_2_SHIFT #endif #define MVFR2_FPMISC_SHIFT 4 #define MVFR2_SIMDMISC_SHIFT 0 -#define DCZID_DZP_SHIFT 4 -#define DCZID_BS_SHIFT 0 +#define CTR_EL0_L1Ip_VPIPT 0 +#define CTR_EL0_L1Ip_VIPT 2 +#define CTR_EL0_L1Ip_PIPT 3 + +#define CTR_EL0_L1Ip_SHIFT 14 +#define CTR_EL0_L1Ip_MASK 3 +#define CTR_EL0_DminLine_SHIFT 16 +#define CTR_EL0_IminLine_SHIFT 0 +#define CTR_EL0_IminLine_MASK 0xf +#define CTR_EL0_ERG_SHIFT 20 +#define CTR_EL0_CWG_SHIFT 24 +#define CTR_EL0_CWG_MASK 15 +#define CTR_EL0_IDC_SHIFT 28 +#define CTR_EL0_DIC_SHIFT 29 + +#define DCZID_EL0_DZP_SHIFT 4 +#define DCZID_EL0_BS_SHIFT 0 -/* - * The ZCR_ELx_LEN_* definitions intentionally include bits [8:4] which - * are reserved by the SVE architecture for future expansion of the LEN - * field, with compatible semantics. 
- */ #define ZCR_ELx_LEN_SHIFT 0 -#define ZCR_ELx_LEN_SIZE 9 -#define ZCR_ELx_LEN_MASK 0x1ff +#define ZCR_ELx_LEN_WIDTH 4 +#define ZCR_ELx_LEN_MASK 0xf + +#define SMCR_ELx_FA64_SHIFT 31 +#define SMCR_ELx_FA64_MASK (1 << SMCR_ELx_FA64_SHIFT) + +#define SMCR_ELx_LEN_SHIFT 0 +#define SMCR_ELx_LEN_WIDTH 4 +#define SMCR_ELx_LEN_MASK 0xf + +#define CPACR_EL1_FPEN_EL1EN (BIT(20)) /* enable EL1 access */ +#define CPACR_EL1_FPEN_EL0EN (BIT(21)) /* enable EL0 access, if EL1EN set */ + +#define CPACR_EL1_SMEN_EL1EN (BIT(24)) /* enable EL1 access */ +#define CPACR_EL1_SMEN_EL0EN (BIT(25)) /* enable EL0 access, if EL1EN set */ #define CPACR_EL1_ZEN_EL1EN (BIT(16)) /* enable EL1 access */ #define CPACR_EL1_ZEN_EL0EN (BIT(17)) /* enable EL0 access, if EL1EN set */ -#define CPACR_EL1_ZEN (CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) /* GCR_EL1 Definitions */ #define SYS_GCR_EL1_RRND (BIT(16)) @@ -1119,8 +1217,8 @@ #define SYS_RGSR_EL1_SEED_MASK 0xffffUL /* GMID_EL1 field definitions */ -#define SYS_GMID_EL1_BS_SHIFT 0 -#define SYS_GMID_EL1_BS_SIZE 4 +#define GMID_EL1_BS_SHIFT 0 +#define GMID_EL1_BS_SIZE 4 /* TFSR{,E0}_EL1 bit definitions */ #define SYS_TFSR_EL1_TF0_SHIFT 0 @@ -1139,6 +1237,8 @@ #define TRFCR_ELx_ExTRE BIT(1) #define TRFCR_ELx_E0TRE BIT(0) +/* HCRX_EL2 definitions */ +#define HCRX_EL2_SMPME_MASK (1 << 5) /* GIC Hypervisor interface registers */ /* ICH_MISR_EL2 bit definitions */ @@ -1166,6 +1266,7 @@ #define ICH_HCR_TC (1 << 10) #define ICH_HCR_TALL0 (1 << 11) #define ICH_HCR_TALL1 (1 << 12) +#define ICH_HCR_TDIR (1 << 14) #define ICH_HCR_EOIcount_SHIFT 27 #define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT) @@ -1198,6 +1299,14 @@ #define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT) #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) +#define ICH_VTR_TDS_SHIFT 19 +#define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT) + +/* HFG[WR]TR_EL2 bit definitions */ +#define HFGxTR_EL2_nTPIDR2_EL0_SHIFT 55 +#define HFGxTR_EL2_nTPIDR2_EL0_MASK BIT_MASK(HFGxTR_EL2_nTPIDR2_EL0_SHIFT) +#define HFGxTR_EL2_nSMPRI_EL1_SHIFT 54 +#define HFGxTR_EL2_nSMPRI_EL1_MASK BIT_MASK(HFGxTR_EL2_nSMPRI_EL1_SHIFT) #define ARM64_FEATURE_FIELD_BITS 4 @@ -1221,6 +1330,7 @@ #else +#include #include #include #include @@ -1320,6 +1430,21 @@ par; \ }) +/* + * Disallow any use of the following backported SYS_FIELD macros. They rely on + * use of pre-shifted masks (for example, as generated by GENMASK). Most of + * the masks defined in this tree are not shifted, making these macros subtly + * dangerous!
+ */ +#define SYS_FIELD_GET(reg, field, val) \ + BUILD_BUG() + +#define SYS_FIELD_PREP(reg, field, val) \ + BUILD_BUG() + +#define SYS_FIELD_PREP_ENUM(reg, field, val) \ + BUILD_BUG() + #endif #endif /* __ASM_SYSREG_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 6623c99f0984..03cb88f63317 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -78,9 +78,11 @@ int arch_dup_task_struct(struct task_struct *dst, #define TIF_SINGLESTEP 21 #define TIF_32BIT 22 /* 32bit process */ #define TIF_SVE 23 /* Scalable Vector Extension in use */ -#define TIF_SVE_VL_INHERIT 24 /* Inherit sve_vl_onexec across exec */ +#define TIF_SVE_VL_INHERIT 24 /* Inherit SVE vl_onexec across exec */ #define TIF_SSBD 25 /* Wants SSB mitigation */ #define TIF_TAGGED_ADDR 26 /* Allow tagged user addresses */ +#define TIF_SME 27 /* SME in use */ +#define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 0fd6056ba412..7e25950510d1 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -196,13 +196,13 @@ static inline void __uaccess_enable_tco(void) */ static inline void __uaccess_disable_tco_async(void) { - if (system_uses_mte_async_mode()) + if (system_uses_mte_async_or_asymm_mode()) __uaccess_disable_tco(); } static inline void __uaccess_enable_tco_async(void) { - if (system_uses_mte_async_mode()) + if (system_uses_mte_async_or_asymm_mode()) __uaccess_enable_tco(); } @@ -495,4 +495,19 @@ static inline int __copy_from_user_flushcache(void *dst, const void __user *src, } #endif +#ifdef CONFIG_ARCH_HAS_SUBPAGE_FAULTS + +/* + * Return 0 on success, the number of bytes not probed otherwise. + */ +static inline size_t probe_subpage_writeable(const char __user *uaddr, + size_t size) +{ + if (!system_supports_mte()) + return 0; + return mte_probe_user_range(uaddr, size); +} + +#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 7379f35ae2c6..8bc48dec6952 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -72,6 +72,12 @@ void __hyp_reset_vectors(void); DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); +static inline bool is_pkvm_initialized(void) +{ + return IS_ENABLED(CONFIG_KVM) && + static_branch_likely(&kvm_protected_mode_initialized); +} + /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) { @@ -79,8 +85,7 @@ static inline bool is_hyp_mode_available(void) * If KVM protected mode is initialized, all CPUs must have been booted * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1. */ - if (IS_ENABLED(CONFIG_KVM) && - static_branch_likely(&kvm_protected_mode_initialized)) + if (is_pkvm_initialized()) return true; return (__boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 && @@ -94,8 +99,7 @@ static inline bool is_hyp_mode_mismatched(void) * If KVM protected mode is initialized, all CPUs must have been booted * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1. 
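probe_subpage_writeable() above only narrows a range that has already been faulted in at page granularity; the generic fault-in path pairs the two checks. A sketch of that pairing, modelled on the generic fault_in_subpage_writeable() helper (simplified, not a verbatim copy):

/*
 * Fault a user range in at page granularity first, then narrow the
 * writable portion using sub-page (MTE tag granule) probing.
 * Returns the number of trailing bytes that are not writable.
 */
static size_t fault_in_subpage_writeable_sketch(char __user *uaddr, size_t size)
{
	size_t faulted_in;

	/* fault_in_writeable() returns the number of bytes NOT faulted in */
	faulted_in = size - fault_in_writeable(uaddr, size);

	/* probe_subpage_writeable() returns the number of bytes NOT probed */
	if (faulted_in)
		faulted_in -= probe_subpage_writeable(uaddr, faulted_in);

	return size - faulted_in;
}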
*/ - if (IS_ENABLED(CONFIG_KVM) && - static_branch_likely(&kvm_protected_mode_initialized)) + if (is_pkvm_initialized()) return false; return __boot_cpu_mode[0] != __boot_cpu_mode[1]; @@ -111,6 +115,9 @@ static __always_inline bool has_vhe(void) /* * Code only run in VHE/NVHE hyp context can assume VHE is present or * absent. Otherwise fall back to caps. + * This allows the compiler to discard VHE-specific code from the + * nVHE object, reducing the number of external symbol references + * needed to link. */ if (is_vhe_hyp_code()) return true; diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 7a22aeea9bb5..1ed0aace67b4 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -2,6 +2,7 @@ #define _ASM_ARM64_VMALLOC_H #include +#include #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP @@ -25,4 +26,10 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) #endif +#define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return pgprot_tagged(prot); +} + #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 894e031b28d2..20873099c035 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -17,10 +17,13 @@ */ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { + void *p; + BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, __builtin_return_address(0)); + return kasan_reset_tag(p); } #endif /* __ASM_VMAP_STACK_H */ diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index f03731847d9d..4bb2cc8ac446 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -78,5 +78,15 @@ #define HWCAP2_ECV (1 << 19) #define HWCAP2_AFP (1 << 20) #define HWCAP2_RPRES (1 << 21) +#define HWCAP2_MTE3 (1 << 22) +#define HWCAP2_SME (1 << 23) +#define HWCAP2_SME_I16I64 (1 << 24) +#define HWCAP2_SME_F64F64 (1 << 25) +#define HWCAP2_SME_I8I32 (1 << 26) +#define HWCAP2_SME_F16F32 (1 << 27) +#define HWCAP2_SME_B16F32 (1 << 28) +#define HWCAP2_SME_F32F32 (1 << 29) +#define HWCAP2_SME_FA64 (1 << 30) +#define HWCAP2_WFXT (1UL << 31) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 323e251ed37b..c317504f3ae1 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -75,9 +75,11 @@ struct kvm_regs { /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define KVM_ARM_DEVICE_TYPE_MASK (0xffff << KVM_ARM_DEVICE_TYPE_SHIFT) +#define KVM_ARM_DEVICE_TYPE_MASK GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ + KVM_ARM_DEVICE_TYPE_SHIFT) #define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK (0xffff << KVM_ARM_DEVICE_ID_SHIFT) +#define KVM_ARM_DEVICE_ID_MASK GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ + KVM_ARM_DEVICE_ID_SHIFT) /* Supported device IDs */ #define KVM_ARM_DEVICE_VGIC_V2 0 @@ -332,6 +334,40 @@ struct kvm_arm_copy_mte_tags { #define KVM_ARM64_SVE_VLS_WORDS \ ((KVM_ARM64_SVE_VQ_MAX - KVM_ARM64_SVE_VQ_MIN) / 64 + 1) +/* Bitmap feature firmware registers */ +#define KVM_REG_ARM_FW_FEAT_BMAP (0x0016 << KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_FW_FEAT_BMAP_REG(r) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ + 
KVM_REG_ARM_FW_FEAT_BMAP | \ + ((r) & 0xffff)) + +#define KVM_REG_ARM_STD_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(0) + +enum { + KVM_REG_ARM_STD_BIT_TRNG_V1_0 = 0, +#ifdef __KERNEL__ + KVM_REG_ARM_STD_BMAP_BIT_COUNT, +#endif +}; + +#define KVM_REG_ARM_STD_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(1) + +enum { + KVM_REG_ARM_STD_HYP_BIT_PV_TIME = 0, +#ifdef __KERNEL__ + KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT, +#endif +}; + +#define KVM_REG_ARM_VENDOR_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(2) + +enum { + KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT = 0, + KVM_REG_ARM_VENDOR_HYP_BIT_PTP = 1, +#ifdef __KERNEL__ + KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT, +#endif +}; + /* Device Control API: ARM VGIC */ #define KVM_DEV_ARM_VGIC_GRP_ADDR 0 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 @@ -367,6 +403,7 @@ struct kvm_arm_copy_mte_tags { #define KVM_ARM_VCPU_PMU_V3_IRQ 0 #define KVM_ARM_VCPU_PMU_V3_INIT 1 #define KVM_ARM_VCPU_PMU_V3_FILTER 2 +#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3 #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 @@ -418,6 +455,25 @@ struct kvm_arm_copy_mte_tags { #define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS #define KVM_PSCI_RET_DENIED PSCI_RET_DENIED +/* Protected KVM */ +#define KVM_CAP_ARM_PROTECTED_VM_FLAGS_SET_FW_IPA 0 +#define KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO 1 + +struct kvm_protected_vm_info { + __u64 firmware_size; + __u64 __reserved[7]; +}; + +/* arm64-specific kvm_run::system_event flags */ +/* + * Reset caused by a PSCI v1.1 SYSTEM_RESET2 call. + * Valid only when the system event has a type of KVM_SYSTEM_EVENT_RESET. + */ +#define KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (1ULL << 0) + +/* run->fail_entry.hardware_entry_failure_reason codes. */ +#define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED (1ULL << 0) + #endif #endif /* __ARM_KVM_H__ */ diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 758ae984ff97..7fa2f7036aa7 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -109,7 +109,7 @@ struct user_hwdebug_state { } dbg_regs[16]; }; -/* SVE/FP/SIMD state (NT_ARM_SVE) */ +/* SVE/FP/SIMD state (NT_ARM_SVE & NT_ARM_SSVE) */ struct user_sve_header { __u32 size; /* total meaningful regset content in bytes */ @@ -220,6 +220,7 @@ struct user_sve_header { (SVE_PT_SVE_PREG_OFFSET(vq, __SVE_NUM_PREGS) - \ SVE_PT_SVE_PREGS_OFFSET(vq)) +/* For streaming mode SVE (SSVE) FFR must be read and written as zero */ #define SVE_PT_SVE_FFR_OFFSET(vq) \ (SVE_PT_REGS_OFFSET + __SVE_FFR_OFFSET(vq)) @@ -240,10 +241,12 @@ struct user_sve_header { - SVE_PT_SVE_OFFSET + (__SVE_VQ_BYTES - 1)) \ / __SVE_VQ_BYTES * __SVE_VQ_BYTES) -#define SVE_PT_SIZE(vq, flags) \ - (((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \ - SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags) \ - : SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags)) +#define SVE_PT_SIZE(vq, flags) \ + (((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \ + SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags) \ + : ((((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD ? 
\ + SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags) \ + : SVE_PT_REGS_OFFSET))) /* pointer authentication masks (NT_ARM_PAC_MASK) */ @@ -265,6 +268,62 @@ struct user_pac_generic_keys { __uint128_t apgakey; }; +/* ZA state (NT_ARM_ZA) */ + +struct user_za_header { + __u32 size; /* total meaningful regset content in bytes */ + __u32 max_size; /* maximum possible size for this thread */ + __u16 vl; /* current vector length */ + __u16 max_vl; /* maximum possible vector length */ + __u16 flags; + __u16 __reserved; +}; + +/* + * Common ZA_PT_* flags: + * These must be kept in sync with prctl interface in <linux/prctl.h> + */ +#define ZA_PT_VL_INHERIT ((1 << 17) /* PR_SME_VL_INHERIT */ >> 16) +#define ZA_PT_VL_ONEXEC ((1 << 18) /* PR_SME_SET_VL_ONEXEC */ >> 16) + + +/* + * The remainder of the ZA state follows struct user_za_header. The + * total size of the ZA state (including header) depends on the + * metadata in the header: ZA_PT_SIZE(vq, flags) gives the total size + * of the state in bytes, including the header. + * + * Refer to <asm/sigcontext.h> for details of how to pass the correct + * "vq" argument to these macros. + */ + +/* Offset from the start of struct user_za_header to the register data */ +#define ZA_PT_ZA_OFFSET \ + ((sizeof(struct user_za_header) + (__SVE_VQ_BYTES - 1)) \ + / __SVE_VQ_BYTES * __SVE_VQ_BYTES) + +/* + * The payload starts at offset ZA_PT_ZA_OFFSET, and is of size + * ZA_PT_ZA_SIZE(vq, flags). + * + * The ZA array is stored as a sequence of horizontal vectors ZAV of SVL/8 + * bytes each, starting from vector 0. + * + * Additional data might be appended in the future. + * + * The ZA matrix is represented in memory in an endianness-invariant layout + * which differs from the layout used for the FPSIMD V-registers on big-endian + * systems: see sigcontext.h for more explanation. + */ + +#define ZA_PT_ZAV_OFFSET(vq, n) \ + (ZA_PT_ZA_OFFSET + ((vq * __SVE_VQ_BYTES) * n)) + +#define ZA_PT_ZA_SIZE(vq) ((vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES)) + +#define ZA_PT_SIZE(vq) \ + (ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq)) + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 0c796c795dbe..4aaf31e3bf16 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -132,6 +132,17 @@ struct extra_context { #define SVE_MAGIC 0x53564501 struct sve_context { + struct _aarch64_ctx head; + __u16 vl; + __u16 flags; + __u16 __reserved[2]; +}; + +#define SVE_SIG_FLAG_SM 0x1 /* Context describes streaming mode */ + +#define ZA_MAGIC 0x54366345 + +struct za_context { struct _aarch64_ctx head; __u16 vl; __u16 __reserved[3]; @@ -186,9 +197,16 @@ struct sve_context { * sve_context.vl must equal the thread's current vector length when * doing a sigreturn. * + * On systems with support for SME the SVE register state may reflect either + * streaming or non-streaming mode. In streaming mode the streaming mode + * vector length will be used and the flag SVE_SIG_FLAG_SM will be set in + * the flags field. It is permitted to enter or leave streaming mode in + * a signal return; applications should take care to ensure that any difference + * in vector length between the two modes is handled, including any resizing + * and movement of context blocks. * - * Note: for all these macros, the "vq" argument denotes the SVE 
+ * Note: for all these macros, the "vq" argument denotes the vector length + * in quadwords (i.e., units of 128 bits). * * The correct way to obtain vq is to use sve_vq_from_vl(vl). The * result is valid if and only if sve_vl_valid(vl) is true. This is @@ -249,4 +267,37 @@ struct sve_context { #define SVE_SIG_CONTEXT_SIZE(vq) \ (SVE_SIG_REGS_OFFSET + SVE_SIG_REGS_SIZE(vq)) +/* + * If the ZA register is enabled for the thread at signal delivery then + * za_context.head.size >= ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za_context.vl)) + * and the register data may be accessed using the ZA_SIG_*() macros. + * + * If za_context.head.size < ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za_context.vl)) + * then the ZA register was not enabled for the thread and no register data + * was included, in which case the ZA_SIG_*() macros should not be used + * except for this check. + * + * The same convention applies when returning from a signal: a caller + * will need to remove or resize the za_context block if it wants to + * enable the ZA register when it was previously non-live or vice-versa. + * This may require the caller to allocate fresh memory and/or move other + * context blocks in the signal frame. + * + * Changing the vector length during signal return is not permitted: + * za_context.vl must equal the thread's current SME vector length when + * doing a sigreturn. + */ + +#define ZA_SIG_REGS_OFFSET \ + ((sizeof(struct za_context) + (__SVE_VQ_BYTES - 1)) \ + / __SVE_VQ_BYTES * __SVE_VQ_BYTES) + +#define ZA_SIG_REGS_SIZE(vq) ((vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES)) + +#define ZA_SIG_ZAV_OFFSET(vq, n) (ZA_SIG_REGS_OFFSET + \ + (SVE_SIG_ZREG_SIZE(vq) * n)) + +#define ZA_SIG_CONTEXT_SIZE(vq) \ + (ZA_SIG_REGS_OFFSET + ZA_SIG_REGS_SIZE(vq)) + #endif /* _UAPI__ASM_SIGCONTEXT_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 749e31475e41..f737e67a78dd 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -8,12 +8,18 @@ CFLAGS_armv8_deprecated.o := -I$(src) CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) +CFLAGS_io.o := -D__DISABLE_TRACE_MMIO__ # Remove stack protector to avoid triggering unneeded stack canary # checks due to randomize_kstack_offset. CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong CFLAGS_syscall.o += -fno-stack-protector +# When KASAN is enabled, a stack trace is recorded for every alloc/free, which +# can significantly impact performance. Avoid instrumenting the stack trace +# collection code to minimize this impact. +KASAN_SANITIZE_stacktrace.o := n + # It's not safe to invoke KCOV when portions of the kernel environment aren't # available or are out-of-sync with HW state. Since `noinstr` doesn't always # inhibit KCOV instrumentation, disable it for the entire compilation unit. 
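The ZA_PT_*() and ZA_SIG_*() macros above all encode the same geometry: for a vector length of vl bytes (vq = vl / 16 quadwords), ZA is a vl-by-vl byte matrix stored as consecutive horizontal vectors. A small userspace-style arithmetic check (the vq value here is an illustrative example):

#include <stdio.h>

#define __SVE_VQ_BYTES 16 /* bytes per 128-bit quadword, as in the uapi headers */

/* Mirrors ZA_PT_ZA_SIZE()/ZA_SIG_REGS_SIZE(): a (vq*16) x (vq*16) byte matrix */
static unsigned int za_regs_size(unsigned int vq)
{
	return (vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES);
}

int main(void)
{
	unsigned int vq = 4; /* e.g. a 512-bit streaming vector length (vl = 64 bytes) */

	/* 64 horizontal vectors of 64 bytes each: 4096 bytes of register data */
	printf("ZA payload for vq=%u: %u bytes\n", vq, za_regs_size(vq));
	return 0;
}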
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 7bbf5104b7b7..9bcaa5eacf16 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -121,7 +121,7 @@ static void clean_dcache_range_nopatch(u64 start, u64 end) ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); d_size = 4 << cpuid_feature_extract_unsigned_field(ctr_el0, - CTR_DMINLINE_SHIFT); + CTR_EL0_DminLine_SHIFT); cur = start & ~(d_size - 1); do { /* diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 551427ae8cc5..494590009ddf 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -110,7 +110,6 @@ int main(void) #ifdef CONFIG_KVM DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); - DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags)); DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs)); DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1])); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index ce59811616d8..b371a18cb6e4 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -121,6 +121,22 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCI, 0); } +static DEFINE_RAW_SPINLOCK(reg_user_mask_modification); +static void __maybe_unused +cpu_clear_bf16_from_user_emulation(const struct arm64_cpu_capabilities *__unused) +{ + struct arm64_ftr_reg *regp; + + regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1); + if (!regp) + return; + + raw_spin_lock(®_user_mask_modification); + if (regp->user_mask & ID_AA64ISAR1_EL1_BF16_MASK) + regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK; + raw_spin_unlock(®_user_mask_modification); +} + #define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max) \ .matches = is_affected_midr_range, \ .midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max) @@ -187,7 +203,7 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, int scope) { u32 midr = read_cpuid_id(); - bool has_dic = read_cpuid_cachetype() & BIT(CTR_DIC_SHIFT); + bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT); const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1); WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); @@ -355,6 +371,42 @@ static const struct midr_range erratum_1463225[] = { }; #endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE +static const struct midr_range trbe_overwrite_fill_mode_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2139208 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2119858 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE */ + +#ifdef CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE +static const struct midr_range tsb_flush_fail_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2067961 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2054223 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE */ + +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE +static struct midr_range trbe_write_out_of_range_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2253138 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2224489 + 
MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ + #ifdef CONFIG_ARM64_ERRATUM_1742098 static struct midr_range broken_aarch32_aes[] = { MIDR_RANGE(MIDR_CORTEX_A57, 0, 1, 0xf, 0xf), @@ -564,6 +616,34 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), }, #endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE + { + /* + * The erratum workaround is handled within the TRBE + * driver and can be applied per-cpu. So, we can allow + * a late CPU to come online with this erratum. + */ + .desc = "ARM erratum 2119858 or 2139208", + .capability = ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + CAP_MIDR_RANGE_LIST(trbe_overwrite_fill_mode_cpus), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE + { + .desc = "ARM erratum 2067961 or 2054223", + .capability = ARM64_WORKAROUND_TSB_FLUSH_FAILURE, + ERRATA_MIDR_RANGE_LIST(tsb_flush_fail_cpus), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE + { + .desc = "ARM erratum 2253138 or 2224489", + .capability = ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), + }, +#endif #ifdef CONFIG_ARM64_ERRATUM_2457168 { .desc = "ARM erratum 2457168", @@ -580,6 +660,16 @@ const struct arm64_cpu_capabilities arm64_errata[] = { CAP_MIDR_RANGE_LIST(broken_aarch32_aes), .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2658417 + { + .desc = "ARM erratum 2658417", + .capability = ARM64_WORKAROUND_2658417, + /* Cortex-A510 r0p0 - r1p1 */ + ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1), + MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)), + .cpu_enable = cpu_clear_bf16_from_user_emulation, + }, #endif { } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d4ee345ff429..e7539ecd0379 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -192,108 +192,133 @@ static bool __system_matches_cap(unsigned int n); * sync with the documentation of the CPU feature register ABI. 
*/ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM4_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RDM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_ATOMICS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_CRC32_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA1_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TLB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_DP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_I8MM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DGH_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_BF16_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SPECRES_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FRINTTS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, 
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SPECRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPI_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPI_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_API_SHIFT, 4, 0), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_API_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_APA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DPB_SHIFT, 4, 0), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_APA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_CLEARBHB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_RPRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_GPA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_DIT_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_AMU_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_MPAM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SEL2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AMU_SHIFT, 4, 0), + 
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_MPAM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SEL2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), - S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), - S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SVE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_RAS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_GIC_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, ID_AA64PFR0_EL1_AdvSIMD_NI), + S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_FP_SHIFT, 4, ID_AA64PFR0_EL1_FP_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_RAS_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MTE_SHIFT, 4, ID_AA64PFR1_MTE_NI), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_SHIFT, 4, ID_AA64PFR1_EL1_MTE_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SSBS_SHIFT, 4, ID_AA64PFR1_EL1_SSBS_NI), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_BT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_F64MM_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0), 
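/*
 * A note on the ARM64_FTR_BITS() entries above: each one records a
 * visibility (FTR_VISIBLE fields are exposed to EL0 via MRS emulation
 * and hwcaps, FTR_HIDDEN fields read as their safe value), a
 * strictness (FTR_STRICT mismatches taint the kernel), a policy for
 * combining the field across CPUs, the field's shift and width, and a
 * safe default. For FTR_LOWER_SAFE the sanitised system-wide value is
 * effectively the minimum across CPUs; a minimal sketch, not the
 * in-tree helper (arm64_ftr_safe_value() handles the general case,
 * including signed fields and FTR_EXACT):
 *
 *	static u64 lower_safe(u64 a, u64 b, unsigned int shift,
 *			      unsigned int width)
 *	{
 *		u64 mask = GENMASK_ULL(width - 1, 0);
 *
 *		return min((a >> shift) & mask, (b >> shift) & mask);
 *	}
 */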
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_F32MM_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_I8MM_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SM4_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SHA3_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_BF16_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_BITPERM_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_AES_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SVEVER_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EXS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_FGT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_EXS_SHIFT, 4, 0), /* * Page size not being supported at Stage-2 is not fatal. You * just give up KVM if PAGE_SIZE isn't supported there. Go fix @@ -309,9 +334,9 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { * fields are inconsistent across vCPUs, then it isn't worth * trying to bring KVM up. 
*/ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN4_2_SHIFT, 4, 1), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN64_2_SHIFT, 4, 1), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN16_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN64_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT, 4, 1), /* * We already refuse to boot CPUs that don't support our configured * page size, so we can only detect mismatches for a page size other @@ -319,71 +344,71 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { * exist in the wild so, even though we don't like it, we'll have to go * along with it and treat them as non-strict. */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN4_SHIFT, 4, ID_AA64MMFR0_TGRAN4_NI), - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN64_SHIFT, 4, ID_AA64MMFR0_TGRAN64_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN16_SHIFT, 4, ID_AA64MMFR0_TGRAN16_NI), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN4_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN4_NI), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN64_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN64_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN16_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN16_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_BIGENDEL0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT, 4, 0), /* Linux shouldn't care about secure memory */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_SNSMEM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_BIGENDEL_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ASID_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_SNSMEM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGEND_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ASIDBITS_SHIFT, 4, 0), /* * Differing PARange is fine as long as all peripherals and memory are mapped * within the minimum PARange of all CPUs */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_PARANGE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_PARANGE_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_AFP_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_ETS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_TWED_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_XNX_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_SPECSEI_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_PAN_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_LOR_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HPD_SHIFT, 4, 0), - 
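/*
 * For reference, the architecture encodes the TGRANx_2 fields checked
 * above as: 0 means "same as the corresponding TGRANx field", 1 means
 * the granule is not supported at stage-2, and 2 or higher means it
 * is. The FTR_EXACT entries with a safe value of 1 therefore force a
 * granule to "unsupported at stage-2" whenever CPUs disagree, which is
 * the "give up KVM" behaviour the comment describes:
 *
 *	switch (cpuid_feature_extract_unsigned_field(mmfr0,
 *				ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT)) {
 *	case 0:		// follow ID_AA64MMFR0_EL1.TGRAN4 instead
 *		break;
 *	case 1:		// no 4K granule at stage-2
 *		break;
 *	default:	// 2 and up: supported at stage-2
 *		break;
 *	}
 */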
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_VHE_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_VMIDBITS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HADBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TWED_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_XNX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1_SpecSEI_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_PAN_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_LO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HPDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VMIDBits_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_E0PD_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EVT_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_BBM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_TTL_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IDS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_ST_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_NV_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CCIDX_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_UAO_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CNP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_E0PD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_BBM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_TTL_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_FWB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_AT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_ST_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_NV_SHIFT, 4, 0), + 
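/*
 * The wholesale renames in these tables track the conversion of the
 * ID registers to the generated descriptions under
 * arch/arm64/tools/sysreg: every constant gains the register's _EL1
 * suffix and the field names now follow the Arm ARM spelling (HPD
 * becomes HPDS, VHE becomes VH, HADBS becomes HAFDBS, LOR becomes LO,
 * and mixed-case names such as VMIDBits and SpecSEI appear).
 * Consumers read the fields exactly as before, for example:
 *
 *	u64 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
 *	unsigned int pan = cpuid_feature_extract_unsigned_field(mmfr1,
 *					ID_AA64MMFR1_EL1_PAN_SHIFT);
 */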
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CCIDX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_VARange_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IESB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_LSM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_UAO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CnP_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 1), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 1), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_CWG_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_ERG_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DMINLINE_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IDC_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_CWG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_ERG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DminLine_SHIFT, 4, 1), /* * Linux can handle differing I-cache policies. Userspace JITs will * make use of *minLine. * If we have differing I-cache policies, report it as the weakest - VIPT. */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_L1IP_SHIFT, 2, ICACHE_POLICY_VIPT), /* L1Ip */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IMINLINE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_EL0_L1Ip_SHIFT, 2, CTR_EL0_L1Ip_VIPT), /* L1Ip */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IminLine_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -408,17 +433,41 @@ static const struct arm64_ftr_bits ftr_id_mmfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_DOUBLELOCK_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_BRPS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_DoubleLock_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_PMSVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_CTX_CMPs_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_WRPs_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_BRPs_SHIFT, 4, 0), /* * We can instantiate multiple PMU instances with different levels * of support. 
*/ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_PMUVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_DEBUGVER_SHIFT, 4, 0x6), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_mvfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPROUND_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSHVEC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSQRT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPDIVIDE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPTRAP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPDP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_SIMD_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_mvfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDFMAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPHP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDHP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDSP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDINT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDLS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPDNAN_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPFTZ_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -429,13 +478,13 @@ static const struct arm64_ftr_bits ftr_mvfr2[] = { }; static const struct arm64_ftr_bits ftr_dczid[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_DZP_SHIFT, 1, 1), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_BS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_EL0_DZP_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_EL0_BS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_gmid[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, SYS_GMID_EL1_BS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, GMID_EL1_BS_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -452,10 +501,10 @@ static const struct arm64_ftr_bits ftr_id_isar0[] = { static const struct arm64_ftr_bits ftr_id_isar5[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_RDM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_CRC32_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA1_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_CRC32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_AES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 
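/*
 * The ID_ISAR5 crypto fields above flip from FTR_HIDDEN to
 * FTR_VISIBLE. That appears to pair with the new
 * has_user_cpuid_feature() path added further down, which only grants
 * a hwcap when the field is present in the register's user-visible
 * mask; with the fields hidden, the corresponding compat hwcaps
 * (AES/PMULL/SHA1/SHA2/CRC32) would be refused by the gate:
 *
 *	mask = cpuid_feature_extract_unsigned_field_width(
 *			regp->user_mask, entry->field_pos,
 *			entry->field_width);
 *	if (!mask)
 *		return false;	// field hidden from userspace: no hwcap
 */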
ID_ISAR5_SEVL_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -554,7 +603,13 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = { static const struct arm64_ftr_bits ftr_zcr[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, - ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_SIZE, 0), /* LEN */ + ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_WIDTH, 0), /* LEN */ + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_smcr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, + SMCR_ELx_LEN_SHIFT, SMCR_ELx_LEN_WIDTH, 0), /* LEN */ ARM64_FTR_END, }; @@ -562,7 +617,7 @@ static const struct arm64_ftr_bits ftr_zcr[] = { * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of * 0. Covers the following 32bit registers: - * id_isar[1-4], id_mmfr[1-3], id_pfr1, mvfr[0-1] + * id_isar[1-3], id_mmfr[1-3] */ static const struct arm64_ftr_bits ftr_generic_32bits[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), @@ -603,6 +658,7 @@ static const struct arm64_ftr_bits ftr_raz[] = { struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; struct arm64_ftr_override __ro_after_init id_aa64isar1_override; +struct arm64_ftr_override __ro_after_init id_aa64isar2_override; static const struct __ftr_reg_entry { u32 sys_id; @@ -629,8 +685,8 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6), /* Op1 = 0, CRn = 0, CRm = 3 */ - ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_mvfr0), + ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_mvfr1), ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2), ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2), ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1), @@ -641,6 +697,7 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, &id_aa64pfr1_override), ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0), + ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -650,7 +707,8 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1, &id_aa64isar1_override), - ARM64_FTR_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, + &id_aa64isar2_override), /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), @@ -660,6 +718,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 1, CRm = 2 */ ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), + ARM64_FTR_REG(SYS_SMCR_EL1, ftr_smcr), /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -708,7 +767,7 @@ static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id) * returns - Upon success, matching ftr_reg entry for id. * - NULL on failure but with an WARN_ON(). */ -static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) { struct arm64_ftr_reg *reg; @@ -804,7 +863,7 @@ static void __init sort_ftr_regs(void) * to sys_id for subsequent binary search in get_arm64_ftr_reg() * to work correctly. 
*/ - BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id); + BUG_ON(arm64_ftr_regs[i].sys_id <= arm64_ftr_regs[i - 1].sys_id); } } @@ -954,13 +1013,20 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); + init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) init_32bit_cpu_features(&info->aarch32); if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr); - sve_init_vq_map(); + vec_init_vq_map(ARM64_VEC_SVE); + } + + if (id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { + init_cpu_ftr_reg(SYS_SMCR_EL1, info->reg_smcr); + if (IS_ENABLED(CONFIG_ARM64_SME)) + vec_init_vq_map(ARM64_VEC_SME); } if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) @@ -1189,6 +1255,9 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); + taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, + info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); + if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu, info->reg_zcr, boot->reg_zcr); @@ -1196,7 +1265,17 @@ void update_cpu_features(int cpu, /* Probe vector lengths, unless we already gave up on SVE */ if (id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) && !system_capabilities_finalized()) - sve_update_vq_map(); + vec_update_vq_map(ARM64_VEC_SVE); + } + + if (id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { + taint |= check_update_ftr_reg(SYS_SMCR_EL1, cpu, + info->reg_smcr, boot->reg_smcr); + + /* Probe vector lengths, unless we already gave up on SME */ + if (id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1)) && + !system_capabilities_finalized()) + vec_update_vq_map(ARM64_VEC_SME); } /* @@ -1282,6 +1361,7 @@ u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); read_sysreg_case(SYS_ID_AA64ZFR0_EL1); + read_sysreg_case(SYS_ID_AA64SMFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); @@ -1314,22 +1394,47 @@ u64 __read_sysreg_by_encoding(u32 sys_id) static bool feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) { - int val = cpuid_feature_extract_field(reg, entry->field_pos, entry->sign); + int val = cpuid_feature_extract_field_width(reg, entry->field_pos, + entry->field_width, + entry->sign); return val >= entry->min_field_value; } +static u64 +read_scoped_sysreg(const struct arm64_cpu_capabilities *entry, int scope) +{ + WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); + if (scope == SCOPE_SYSTEM) + return read_sanitised_ftr_reg(entry->sys_reg); + else + return __read_sysreg_by_encoding(entry->sys_reg); +} + +static bool +has_user_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +{ + int mask; + struct arm64_ftr_reg *regp; + u64 val = read_scoped_sysreg(entry, scope); + + regp = get_arm64_ftr_reg(entry->sys_reg); + if (!regp) + return false; + + mask = cpuid_feature_extract_unsigned_field_width(regp->user_mask, + entry->field_pos, + entry->field_width); + if (!mask) + return false; + + return feature_matches(val, entry); +} + static bool has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) { - u64 val; - - WARN_ON(scope == SCOPE_LOCAL_CPU && 
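/*
 * feature_matches() now takes the field width from the capability
 * instead of assuming four bits. The immediate motivation is
 * ID_AA64SMFR0_EL1, whose feature fields are one bit wide and include
 * FA64 at bit 63, which an implicit four-bit extraction starting at
 * that shift cannot even represent. A one-bit field is read as:
 *
 *	int fa64 = cpuid_feature_extract_field_width(smfr0,
 *			ID_AA64SMFR0_EL1_FA64_SHIFT, 1, FTR_UNSIGNED);
 *
 * which is why every capability and hwcap entry below gains an
 * explicit .field_width.
 */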
preemptible()); - if (scope == SCOPE_SYSTEM) - val = read_sanitised_ftr_reg(entry->sys_reg); - else - val = __read_sysreg_by_encoding(entry->sys_reg); - + u64 val = read_scoped_sysreg(entry, scope); return feature_matches(val, entry); } @@ -1343,6 +1448,7 @@ const struct cpumask *system_32bit_el0_cpumask(void) return cpu_possible_mask; } +EXPORT_SYMBOL_GPL(system_32bit_el0_cpumask); static int __init parse_32bit_el0_param(char *str) { @@ -1410,7 +1516,7 @@ static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unus u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); return cpuid_feature_extract_signed_field(pfr0, - ID_AA64PFR0_FP_SHIFT) < 0; + ID_AA64PFR0_EL1_FP_SHIFT) < 0; } static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, @@ -1423,7 +1529,7 @@ static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, else ctr = read_cpuid_effective_cachetype(); - return ctr & BIT(CTR_IDC_SHIFT); + return ctr & BIT(CTR_EL0_IDC_SHIFT); } static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unused) @@ -1434,7 +1540,7 @@ static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unu * to the CTR_EL0 on this CPU and emulate it with the real/safe * value. */ - if (!(read_cpuid_cachetype() & BIT(CTR_IDC_SHIFT))) + if (!(read_cpuid_cachetype() & BIT(CTR_EL0_IDC_SHIFT))) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } @@ -1448,7 +1554,7 @@ static bool has_cache_dic(const struct arm64_cpu_capabilities *entry, else ctr = read_cpuid_cachetype(); - return ctr & BIT(CTR_DIC_SHIFT); + return ctr & BIT(CTR_EL0_DIC_SHIFT); } static bool __maybe_unused @@ -1489,7 +1595,7 @@ bool kaslr_requires_kpti(void) if (IS_ENABLED(CONFIG_ARM64_E0PD)) { u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); if (cpuid_feature_extract_unsigned_field(mmfr2, - ID_AA64MMFR2_E0PD_SHIFT)) + ID_AA64MMFR2_EL1_E0PD_SHIFT)) return false; } @@ -1842,21 +1948,27 @@ static bool has_address_auth_cpucap(const struct arm64_cpu_capabilities *entry, /* Now check for the secondary CPUs with SCOPE_LOCAL_CPU scope */ sec_val = cpuid_feature_extract_field(__read_sysreg_by_encoding(entry->sys_reg), entry->field_pos, entry->sign); - return sec_val == boot_val; + return (sec_val >= entry->min_field_value) && (sec_val == boot_val); } static bool has_address_auth_metacap(const struct arm64_cpu_capabilities *entry, int scope) { - return has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH], scope) || - has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope); + bool api = has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope); + bool apa = has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5], scope); + bool apa3 = has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3], scope); + + return apa || apa3 || api; } static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, int __unused) { - return __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH) || - __system_matches_cap(ARM64_HAS_GENERIC_AUTH_IMP_DEF); + bool gpi = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_IMP_DEF); + bool gpa = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5); + bool gpa3 = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3); + + return gpa || gpa3 || gpi; } #endif /* CONFIG_ARM64_PTR_AUTH */ @@ -1928,15 +2040,7 @@ static void elf_hwcap_fixup(void) #ifdef CONFIG_KVM static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused) { - if 
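/*
 * Pointer authentication is now split by algorithm: the old
 * "architected" capabilities become the QARMA5 variants, and new
 * QARMA3 capabilities are driven by ID_AA64ISAR2_EL1; the
 * ARM64_HAS_ADDRESS_AUTH and ARM64_HAS_GENERIC_AUTH meta-capabilities
 * are simply the OR of the three variants. Note as well that
 * has_address_auth_cpucap() now checks a secondary CPU against the
 * capability's minimum in addition to the boot CPU's value:
 *
 *	return (sec_val >= entry->min_field_value) &&
 *	       (sec_val == boot_val);
 *
 * so a late CPU must both implement the algorithm and match what the
 * boot CPU advertised.
 */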
(kvm_get_mode() != KVM_MODE_PROTECTED) - return false; - - if (is_kernel_in_hyp_mode()) { - pr_warn("Protected KVM not available with VHE\n"); - return false; - } - - return true; + return kvm_get_mode() == KVM_MODE_PROTECTED; } #endif /* CONFIG_KVM */ @@ -1966,7 +2070,19 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_useable_gicv3_cpuif, .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_GIC_SHIFT, + .field_pos = ID_AA64PFR0_EL1_GIC_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, + .min_field_value = 1, + }, + { + .desc = "Enhanced Counter Virtualization", + .capability = ARM64_HAS_ECV, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR0_EL1, + .field_pos = ID_AA64MMFR0_EL1_ECV_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 1, }, @@ -1977,7 +2093,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64MMFR1_EL1, - .field_pos = ID_AA64MMFR1_PAN_SHIFT, + .field_pos = ID_AA64MMFR1_EL1_PAN_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 1, .cpu_enable = cpu_enable_pan, @@ -1990,7 +2107,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64MMFR1_EL1, - .field_pos = ID_AA64MMFR1_PAN_SHIFT, + .field_pos = ID_AA64MMFR1_EL1_PAN_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 3, }, @@ -2002,7 +2120,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_ATOMICS_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_ATOMIC_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 2, }, @@ -2026,8 +2145,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_32bit_el0, .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, + .field_pos = ID_AA64PFR0_EL1_EL0_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_ELx_32BIT_64BIT, }, #ifdef CONFIG_KVM { @@ -2037,8 +2157,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_EL1_SHIFT, - .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, + .field_pos = ID_AA64PFR0_EL1_EL1_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_ELx_32BIT_64BIT, }, { .desc = "Protected KVM", @@ -2057,7 +2178,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { * more details. 
*/ .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_CSV3_SHIFT, + .field_pos = ID_AA64PFR0_EL1_CSV3_SHIFT, + .field_width = 4, .min_field_value = 1, .matches = unmap_kernel_at_el0, .cpu_enable = kpti_install_ng_mappings, @@ -2076,7 +2198,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR1_EL1, - .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .field_pos = ID_AA64ISAR1_EL1_DPB_SHIFT, + .field_width = 4, .min_field_value = 1, }, { @@ -2086,7 +2209,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR1_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .field_pos = ID_AA64ISAR1_EL1_DPB_SHIFT, + .field_width = 4, .min_field_value = 2, }, #endif @@ -2097,8 +2221,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_SVE, .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_SVE_SHIFT, - .min_field_value = ID_AA64PFR0_SVE, + .field_pos = ID_AA64PFR0_EL1_SVE_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_SVE_IMP, .matches = has_cpuid_feature, .cpu_enable = sve_kernel_enable, }, @@ -2111,8 +2236,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_RAS_SHIFT, - .min_field_value = ID_AA64PFR0_RAS_V1, + .field_pos = ID_AA64PFR0_EL1_RAS_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_RAS_IMP, .cpu_enable = cpu_clear_disr, }, #endif /* CONFIG_ARM64_RAS_EXTN */ @@ -2129,8 +2255,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_amu, .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_AMU_SHIFT, - .min_field_value = ID_AA64PFR0_AMU, + .field_pos = ID_AA64PFR0_EL1_AMU_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_AMU_IMP, .cpu_enable = cpu_amu_enable, }, #endif /* CONFIG_ARM64_AMU_EXTN */ @@ -2153,7 +2280,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_STAGE2_FWB, .sys_reg = SYS_ID_AA64MMFR2_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR2_FWB_SHIFT, + .field_pos = ID_AA64MMFR2_EL1_FWB_SHIFT, + .field_width = 4, .min_field_value = 1, .matches = has_cpuid_feature, .cpu_enable = cpu_has_fwb, @@ -2164,7 +2292,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_ARMv8_4_TTL, .sys_reg = SYS_ID_AA64MMFR2_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR2_TTL_SHIFT, + .field_pos = ID_AA64MMFR2_EL1_TTL_SHIFT, + .field_width = 4, .min_field_value = 1, .matches = has_cpuid_feature, }, @@ -2174,9 +2303,10 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_TLB_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_TLB_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, - .min_field_value = ID_AA64ISAR0_TLB_RANGE, + .min_field_value = ID_AA64ISAR0_EL1_TLB_RANGE, }, #ifdef CONFIG_ARM64_HW_AFDBM { @@ -2192,7 +2322,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HW_DBM, .sys_reg = SYS_ID_AA64MMFR1_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR1_HADBS_SHIFT, + .field_pos = ID_AA64MMFR1_EL1_HAFDBS_SHIFT, + .field_width = 4, .min_field_value = 2, .matches = 
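/*
 * The renamed minimum values in the entries above and below
 * (ID_AA64PFR0_EL1_SVE_IMP, ID_AA64PFR0_EL1_RAS_IMP,
 * ID_AA64PFR0_EL1_AMU_IMP, ...) come from the generated enumerations,
 * which call the first implemented level of a feature IMP. The
 * numeric values are unchanged, so for instance:
 *
 *	.min_field_value = ID_AA64PFR0_EL1_SVE_IMP,	// still 1
 *
 * requires exactly what the old ID_AA64PFR0_SVE constant did.
 */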
has_hw_dbm, .cpu_enable = cpu_enable_hw_dbm, @@ -2204,7 +2335,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_CRC32_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_CRC32_SHIFT, + .field_width = 4, .min_field_value = 1, }, { @@ -2213,9 +2345,10 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64PFR1_EL1, - .field_pos = ID_AA64PFR1_SSBS_SHIFT, + .field_pos = ID_AA64PFR1_EL1_SSBS_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, - .min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY, + .min_field_value = ID_AA64PFR1_EL1_SSBS_IMP, }, #ifdef CONFIG_ARM64_CNP { @@ -2225,7 +2358,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_useable_cnp, .sys_reg = SYS_ID_AA64MMFR2_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR2_CNP_SHIFT, + .field_pos = ID_AA64MMFR2_EL1_CnP_SHIFT, + .field_width = 4, .min_field_value = 1, .cpu_enable = cpu_enable_cnp, }, @@ -2236,19 +2370,32 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR1_EL1, - .field_pos = ID_AA64ISAR1_SB_SHIFT, + .field_pos = ID_AA64ISAR1_EL1_SB_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 1, }, #ifdef CONFIG_ARM64_PTR_AUTH { - .desc = "Address authentication (architected algorithm)", - .capability = ARM64_HAS_ADDRESS_AUTH_ARCH, + .desc = "Address authentication (architected QARMA5 algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .sys_reg = SYS_ID_AA64ISAR1_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_APA_SHIFT, - .min_field_value = ID_AA64ISAR1_APA_ARCHITECTED, + .field_pos = ID_AA64ISAR1_EL1_APA_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR1_EL1_APA_PAuth, + .matches = has_address_auth_cpucap, + }, + { + .desc = "Address authentication (architected QARMA3 algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .sys_reg = SYS_ID_AA64ISAR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR2_EL1_APA3_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR2_EL1_APA3_PAuth, .matches = has_address_auth_cpucap, }, { @@ -2257,8 +2404,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .sys_reg = SYS_ID_AA64ISAR1_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_API_SHIFT, - .min_field_value = ID_AA64ISAR1_API_IMP_DEF, + .field_pos = ID_AA64ISAR1_EL1_API_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR1_EL1_API_PAuth, .matches = has_address_auth_cpucap, }, { @@ -2267,13 +2415,25 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_address_auth_metacap, }, { - .desc = "Generic authentication (architected algorithm)", - .capability = ARM64_HAS_GENERIC_AUTH_ARCH, + .desc = "Generic authentication (architected QARMA5 algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .sys_reg = SYS_ID_AA64ISAR1_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_GPA_SHIFT, - .min_field_value = ID_AA64ISAR1_GPA_ARCHITECTED, + .field_pos = ID_AA64ISAR1_EL1_GPA_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR1_EL1_GPA_IMP, + .matches = has_cpuid_feature, + }, + { + .desc = 
"Generic authentication (architected QARMA3 algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR2_EL1_GPA3_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR2_EL1_GPA3_IMP, .matches = has_cpuid_feature, }, { @@ -2282,8 +2442,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .sys_reg = SYS_ID_AA64ISAR1_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_GPI_SHIFT, - .min_field_value = ID_AA64ISAR1_GPI_IMP_DEF, + .field_pos = ID_AA64ISAR1_EL1_GPI_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR1_EL1_GPI_IMP, .matches = has_cpuid_feature, }, { @@ -2302,7 +2463,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = can_use_gic_priorities, .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_GIC_SHIFT, + .field_pos = ID_AA64PFR0_EL1_GIC_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 1, }, @@ -2314,7 +2476,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .sys_reg = SYS_ID_AA64MMFR2_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR2_E0PD_SHIFT, + .field_width = 4, + .field_pos = ID_AA64MMFR2_EL1_E0PD_SHIFT, .matches = has_cpuid_feature, .min_field_value = 1, .cpu_enable = cpu_enable_e0pd, @@ -2327,7 +2490,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_RNDR_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_RNDR_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 1, }, @@ -2344,8 +2508,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, .cpu_enable = bti_enable, .sys_reg = SYS_ID_AA64PFR1_EL1, - .field_pos = ID_AA64PFR1_BT_SHIFT, - .min_field_value = ID_AA64PFR1_BT_BTI, + .field_pos = ID_AA64PFR1_EL1_BT_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_EL1_BT_IMP, .sign = FTR_UNSIGNED, }, #endif @@ -2356,11 +2521,23 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64PFR1_EL1, - .field_pos = ID_AA64PFR1_MTE_SHIFT, - .min_field_value = ID_AA64PFR1_MTE, + .field_pos = ID_AA64PFR1_EL1_MTE_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_EL1_MTE_MTE2, .sign = FTR_UNSIGNED, .cpu_enable = cpu_enable_mte, }, + { + .desc = "Asymmetric MTE Tag Check Fault", + .capability = ARM64_MTE_ASYMM, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_EL1_MTE_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_EL1_MTE_MTE3, + .sign = FTR_UNSIGNED, + }, #endif /* CONFIG_ARM64_MTE */ { .desc = "RCpc load-acquire (LDAPR)", @@ -2368,17 +2545,57 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .sys_reg = SYS_ID_AA64ISAR1_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_LRCPC_SHIFT, + .field_pos = ID_AA64ISAR1_EL1_LRCPC_SHIFT, + .field_width = 4, .matches = has_cpuid_feature, .min_field_value = 1, }, +#ifdef CONFIG_ARM64_SME + { + .desc = "Scalable Matrix Extension", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .sign = 
FTR_UNSIGNED, + .field_pos = ID_AA64PFR1_EL1_SME_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_EL1_SME_IMP, + .matches = has_cpuid_feature, + .cpu_enable = sme_kernel_enable, + }, + /* FA64 should be sorted after the base SME capability */ + { + .desc = "FA64", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME_FA64, + .sys_reg = SYS_ID_AA64SMFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64SMFR0_EL1_FA64_SHIFT, + .field_width = 1, + .min_field_value = ID_AA64SMFR0_EL1_FA64_IMP, + .matches = has_cpuid_feature, + .cpu_enable = fa64_kernel_enable, + }, +#endif /* CONFIG_ARM64_SME */ + { + .desc = "WFx with timeout", + .capability = ARM64_HAS_WFXT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR2_EL1_WFxT_SHIFT, + .field_width = 4, + .matches = has_cpuid_feature, + .min_field_value = ID_AA64ISAR2_EL1_WFxT_IMP, + }, {}, }; -#define HWCAP_CPUID_MATCH(reg, field, s, min_value) \ - .matches = has_cpuid_feature, \ +#define HWCAP_CPUID_MATCH(reg, field, width, s, min_value) \ + .matches = has_user_cpuid_feature, \ .sys_reg = reg, \ .field_pos = field, \ + .field_width = width, \ .sign = s, \ .min_field_value = min_value, @@ -2388,10 +2605,10 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .hwcap_type = cap_type, \ .hwcap = cap, \ -#define HWCAP_CAP(reg, field, s, min_value, cap_type, cap) \ +#define HWCAP_CAP(reg, field, width, s, min_value, cap_type, cap) \ { \ __HWCAP_CAP(#cap, cap_type, cap) \ - HWCAP_CPUID_MATCH(reg, field, s, min_value) \ + HWCAP_CPUID_MATCH(reg, field, width, s, min_value) \ } #define HWCAP_MULTI_CAP(list, cap_type, cap) \ @@ -2410,90 +2627,111 @@ static const struct arm64_cpu_capabilities arm64_features[] = { #ifdef CONFIG_ARM64_PTR_AUTH static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = { { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_APA_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_APA_ARCHITECTED) + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_APA_SHIFT, + 4, FTR_UNSIGNED, + ID_AA64ISAR1_EL1_APA_PAuth) }, { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_API_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_API_IMP_DEF) + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_APA3_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_APA3_PAuth) + }, + { + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_API_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_API_PAuth) }, {}, }; static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_GPA_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_GPA_ARCHITECTED) + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_GPA_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_GPA_IMP) }, { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_GPI_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_GPI_IMP_DEF) + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_GPA3_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_GPA3_IMP) + }, + { + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_GPI_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_GPI_IMP) }, {}, }; #endif static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1), - 
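/*
 * With the extra width parameter, a table entry such as
 *
 *	HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4,
 *		  FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512)
 *
 * expands to roughly the following (the desc/type members filled in
 * by __HWCAP_CAP() are elided):
 *
 *	{
 *		.matches = has_user_cpuid_feature,
 *		.sys_reg = SYS_ID_AA64ISAR0_EL1,
 *		.field_pos = ID_AA64ISAR0_EL1_SHA2_SHIFT,
 *		.field_width = 4,
 *		.sign = FTR_UNSIGNED,
 *		.min_field_value = 2,
 *		.hwcap_type = CAP_HWCAP,
 *		.hwcap = KERNEL_HWCAP_SHA512,
 *	}
 *
 * i.e. the hwcap is set when the unsigned four-bit SHA2 field of the
 * sanitised ID_AA64ISAR0_EL1 reads at least 2 and the field is
 * visible to userspace.
 */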
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RNDR_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RNG), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FRINTTS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_BF16_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_BF16), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DGH_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DGH), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_I8MM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_I8MM), - HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1), + 
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_DP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_TS_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_TS_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_FP_SHIFT, 4, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_FP_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_DIT_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_SB_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_BF16), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DGH), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_I8MM), + HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_EL1_AT_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, 
KERNEL_HWCAP_SVE), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SVEVER_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SVEVER_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_AES, CAP_HWCAP, KERNEL_HWCAP_SVEAES), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_AES_PMULL, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_BITPERM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_BITPERM, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_BF16_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_BF16, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SHA3_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SHA3, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SM4_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SM4, CAP_HWCAP, KERNEL_HWCAP_SVESM4), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_I8MM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_I8MM, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_F32MM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_F32MM, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_F64MM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_F64MM, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_SVE_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR0_EL1_SVE_IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SVEver_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BitPerm_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BF16_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SHA3_IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SM4_IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_I8MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_F32MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_F64MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), #endif - HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS), + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_SSBS_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_SSBS_SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI - HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_BT_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_BT_BTI, CAP_HWCAP, KERNEL_HWCAP_BTI), + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_BT_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_BT_IMP, CAP_HWCAP, KERNEL_HWCAP_BTI), #endif #ifdef CONFIG_ARM64_PTR_AUTH HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), #endif #ifdef 
CONFIG_ARM64_MTE - HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_MTE_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_MTE, CAP_HWCAP, KERNEL_HWCAP_MTE), + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_MTE_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_MTE_MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE), + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_MTE_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_MTE_MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3), #endif /* CONFIG_ARM64_MTE */ - HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV), - HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP), - HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_RPRES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES), + HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV), + HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP), + HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES), + HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_WFxT_IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT), +#ifdef CONFIG_ARM64_SME + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_SME_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_SME_IMP, CAP_HWCAP, KERNEL_HWCAP_SME), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_FA64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_EL1_I16I64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F64F64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_EL1_I8I32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F16F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_B16F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F32F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), +#endif /* CONFIG_ARM64_SME */ {}, }; @@ -2522,15 +2760,15 @@ static bool compat_has_neon(const struct arm64_cpu_capabilities *cap, int scope) static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = { #ifdef CONFIG_COMPAT HWCAP_CAP_MATCH(compat_has_neon, CAP_COMPAT_HWCAP, COMPAT_HWCAP_NEON), - HWCAP_CAP(SYS_MVFR1_EL1, MVFR1_SIMDFMAC_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv4), + HWCAP_CAP(SYS_MVFR1_EL1, MVFR1_SIMDFMAC_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv4), /* Arm v8 mandates MVFR0.FPDP == {0, 2}. 
So, piggy back on this for the presence of VFP support */ - HWCAP_CAP(SYS_MVFR0_EL1, MVFR0_FPDP_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFP), - HWCAP_CAP(SYS_MVFR0_EL1, MVFR0_FPDP_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv3), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), + HWCAP_CAP(SYS_MVFR0_EL1, MVFR0_FPDP_SHIFT, 4, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFP), + HWCAP_CAP(SYS_MVFR0_EL1, MVFR0_FPDP_SHIFT, 4, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv3), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, 4, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA1_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA2_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_CRC32_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), #endif {}, }; @@ -2782,7 +3020,7 @@ static void verify_sve_features(void) unsigned int safe_len = safe_zcr & ZCR_ELx_LEN_MASK; unsigned int len = zcr & ZCR_ELx_LEN_MASK; - if (len < safe_len || sve_verify_vq_map()) { + if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SVE)) { pr_crit("CPU%d: SVE: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); @@ -2791,6 +3029,23 @@ static void verify_sve_features(void) /* Add checks on other ZCR bits here if necessary */ } +static void verify_sme_features(void) +{ + u64 safe_smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1); + u64 smcr = read_smcr_features(); + + unsigned int safe_len = safe_smcr & SMCR_ELx_LEN_MASK; + unsigned int len = smcr & SMCR_ELx_LEN_MASK; + + if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SME)) { + pr_crit("CPU%d: SME: vector length support mismatch\n", + smp_processor_id()); + cpu_die_early(); + } + + /* Add checks on other SMCR bits here if necessary */ +} + static void verify_hyp_capabilities(void) { u64 safe_mmfr1, mmfr0, mmfr1; @@ -2814,7 +3069,7 @@ static void verify_hyp_capabilities(void) /* Verify IPA range */ parange = cpuid_feature_extract_unsigned_field(mmfr0, - ID_AA64MMFR0_PARANGE_SHIFT); + ID_AA64MMFR0_EL1_PARANGE_SHIFT); ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange); if (ipa_max < get_kvm_ipa_limit()) { pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id()); @@ -2843,6 +3098,9 @@ static void verify_local_cpu_capabilities(void) if (system_supports_sve()) verify_sve_features(); + if (system_supports_sme()) + verify_sme_features(); + if (is_hyp_mode_available()) verify_hyp_capabilities(); } @@ -2886,6 +3144,7 @@ bool this_cpu_has_cap(unsigned int n) return false; } +EXPORT_SYMBOL_GPL(this_cpu_has_cap); /* * This helper function is used in a narrow window when, @@ -2961,6 +3220,7 @@ void __init setup_cpu_features(void) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); sve_setup(); + sme_setup(); minsigstksz_setup(); /* Advertise 
that we have computed the system capabilities */ @@ -3034,7 +3294,7 @@ static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *c /* * We emulate only the following system register space. - * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 4 - 7] + * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 2 - 7] * See Table C5-6 System instruction encodings for System register accesses, * ARMv8 ARM(ARM DDI 0487A.f) for more details. */ @@ -3044,7 +3304,7 @@ static inline bool __attribute_const__ is_emulated(u32 id) sys_reg_CRn(id) == 0x0 && sys_reg_Op1(id) == 0x0 && (sys_reg_CRm(id) == 0 || - ((sys_reg_CRm(id) >= 4) && (sys_reg_CRm(id) <= 7)))); + ((sys_reg_CRm(id) >= 2) && (sys_reg_CRm(id) <= 7)))); } /* diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index ae90c11f9623..23ddc1a8ef5a 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -34,12 +34,19 @@ DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data); static struct cpuinfo_arm64 boot_cpu_data; -static const char *icache_policy_str[] = { - [ICACHE_POLICY_VPIPT] = "VPIPT", - [ICACHE_POLICY_RESERVED] = "RESERVED/UNKNOWN", - [ICACHE_POLICY_VIPT] = "VIPT", - [ICACHE_POLICY_PIPT] = "PIPT", -}; +static inline const char *icache_policy_str(int l1ip) +{ + switch (l1ip) { + case CTR_EL0_L1Ip_VPIPT: + return "VPIPT"; + case CTR_EL0_L1Ip_VIPT: + return "VIPT"; + case CTR_EL0_L1Ip_PIPT: + return "PIPT"; + default: + return "RESERVED/UNKNOWN"; + } +} unsigned long __icache_flags; @@ -98,6 +105,16 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_ECV] = "ecv", [KERNEL_HWCAP_AFP] = "afp", [KERNEL_HWCAP_RPRES] = "rpres", + [KERNEL_HWCAP_MTE3] = "mte3", + [KERNEL_HWCAP_SME] = "sme", + [KERNEL_HWCAP_SME_I16I64] = "smei16i64", + [KERNEL_HWCAP_SME_F64F64] = "smef64f64", + [KERNEL_HWCAP_SME_I8I32] = "smei8i32", + [KERNEL_HWCAP_SME_F16F32] = "smef16f32", + [KERNEL_HWCAP_SME_B16F32] = "smeb16f32", + [KERNEL_HWCAP_SME_F32F32] = "smef32f32", + [KERNEL_HWCAP_SME_FA64] = "smefa64", + [KERNEL_HWCAP_WFXT] = "wfxt", }; #ifdef CONFIG_COMPAT @@ -357,19 +374,19 @@ static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) u32 l1ip = CTR_L1IP(info->reg_ctr); switch (l1ip) { - case ICACHE_POLICY_PIPT: + case CTR_EL0_L1Ip_PIPT: break; - case ICACHE_POLICY_VPIPT: + case CTR_EL0_L1Ip_VPIPT: set_bit(ICACHEF_VPIPT, &__icache_flags); break; - case ICACHE_POLICY_RESERVED: - case ICACHE_POLICY_VIPT: + case CTR_EL0_L1Ip_VIPT: + default: /* Assume aliasing */ set_bit(ICACHEF_ALIASING, &__icache_flags); break; } - pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu); + pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str(l1ip), cpu); } static void __cpuinfo_store_cpu_32bit(struct cpuinfo_32bit *info) @@ -425,6 +442,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); + info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1); if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) info->reg_gmid = read_cpuid(GMID_EL1); @@ -436,6 +454,10 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) id_aa64pfr0_sve(info->reg_id_aa64pfr0)) info->reg_zcr = read_zcr_features(); + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(info->reg_id_aa64pfr1)) + info->reg_smcr = read_smcr_features(); + cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 
bf9fe71589bc..872218e80fcb 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -28,7 +28,7 @@ u8 debug_monitors_arch(void) { return cpuid_feature_extract_unsigned_field(read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1), - ID_AA64DFR0_DEBUGVER_SHIFT); + ID_AA64DFR0_EL1_DebugVer_SHIFT); } /* @@ -283,21 +283,25 @@ void register_user_break_hook(struct break_hook *hook) { register_debug_hook(&hook->node, &user_break_hook); } +EXPORT_SYMBOL_GPL(register_user_break_hook); void unregister_user_break_hook(struct break_hook *hook) { unregister_debug_hook(&hook->node); } +EXPORT_SYMBOL_GPL(unregister_user_break_hook); void register_kernel_break_hook(struct break_hook *hook) { register_debug_hook(&hook->node, &kernel_break_hook); } +EXPORT_SYMBOL_GPL(register_kernel_break_hook); void unregister_kernel_break_hook(struct break_hook *hook) { unregister_debug_hook(&hook->node); } +EXPORT_SYMBOL_GPL(unregister_kernel_break_hook); static int call_break_hook(struct pt_regs *regs, unsigned long esr) { diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index fc91dad1579a..03f6857e4dd5 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include +#include + /* * Handle IRQ/context state management when entering from kernel mode. * Before this function is called it is not safe to call regular kernel code, @@ -55,6 +58,7 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs) { __enter_from_kernel_mode(regs); mte_check_tfsr_entry(); + mte_disable_tco_entry(current); } /* @@ -102,6 +106,7 @@ static __always_inline void __enter_from_user_mode(void) CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit_irqoff(); trace_hardirqs_off_finish(); + mte_disable_tco_entry(current); } static __always_inline void enter_from_user_mode(struct pt_regs *regs) @@ -283,6 +288,7 @@ static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, vector, smp_processor_id(), esr, esr_get_class_string(esr)); + trace_android_rvh_panic_unhandled(regs, vector, esr); __show_regs(regs); panic("Unhandled exception"); } @@ -517,6 +523,14 @@ static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) exit_to_user_mode(regs); } +static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sme_acc(esr, regs); + exit_to_user_mode(regs); +} + static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { enter_from_user_mode(regs); @@ -625,6 +639,9 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_SVE: el0_sve_acc(regs, esr); break; + case ESR_ELx_EC_SME: + el0_sme_acc(regs, esr); + break; case ESR_ELx_EC_FP_EXC64: el0_fpsimd_exc(regs, esr); break; diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 196e921f61de..229436f33df5 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -38,9 +38,10 @@ SYM_FUNC_END(fpsimd_load_state) * * x0 - pointer to buffer for state * x1 - pointer to storage for FPSR + * x2 - Save FFR if non-zero */ SYM_FUNC_START(sve_save_state) - sve_save 0, x1, 2 + sve_save 0, x1, x2, 3 ret SYM_FUNC_END(sve_save_state) @@ -49,10 +50,10 @@ SYM_FUNC_END(sve_save_state) * * x0 - pointer to buffer for state * x1 - pointer to storage for FPSR - * x2 - VQ-1 + * x2 - Restore FFR if non-zero */ 
SYM_FUNC_START(sve_load_state) - sve_load 0, x1, x2, 3, x4 + sve_load 0, x1, x2, 4 ret SYM_FUNC_END(sve_load_state) @@ -66,35 +67,58 @@ SYM_FUNC_START(sve_set_vq) ret SYM_FUNC_END(sve_set_vq) -/* - * Load SVE state from FPSIMD state. - * - * x0 = pointer to struct fpsimd_state - * x1 = VQ - 1 - * - * Each SVE vector will be loaded with the first 128-bits taken from FPSIMD - * and the rest zeroed. All the other SVE registers will be zeroed. - */ -SYM_FUNC_START(sve_load_from_fpsimd_state) - sve_load_vq x1, x2, x3 - fpsimd_restore x0, 8 - sve_flush_p_ffr - ret -SYM_FUNC_END(sve_load_from_fpsimd_state) - /* * Zero all SVE registers but the first 128-bits of each vector * * VQ must already be configured by caller, any further updates of VQ * will need to ensure that the register state remains valid. * - * x0 = VQ - 1 + * x0 = include FFR? + * x1 = VQ - 1 */ SYM_FUNC_START(sve_flush_live) - cbz x0, 1f // A VQ-1 of 0 is 128 bits so no extra Z state + cbz x1, 1f // A VQ-1 of 0 is 128 bits so no extra Z state sve_flush_z -1: sve_flush_p_ffr - ret +1: sve_flush_p + tbz x0, #0, 2f + sve_flush_ffr +2: ret SYM_FUNC_END(sve_flush_live) #endif /* CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +SYM_FUNC_START(sme_get_vl) + _sme_rdsvl 0, 1 + ret +SYM_FUNC_END(sme_get_vl) + +SYM_FUNC_START(sme_set_vq) + sme_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sme_set_vq) + +/* + * Save the SME state + * + * x0 - pointer to buffer for state + */ +SYM_FUNC_START(za_save_state) + _sme_rdsvl 1, 1 // x1 = VL/8 + sme_save_za 0, x1, 12 + ret +SYM_FUNC_END(za_save_state) + +/* + * Load the SME state + * + * x0 - pointer to buffer for state + */ +SYM_FUNC_START(za_load_state) + _sme_rdsvl 1, 1 // x1 = VL/8 + sme_load_za 0, x1, 12 + ret +SYM_FUNC_END(za_load_state) + +#endif /* CONFIG_ARM64_SME */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 34e38eb00f05..9eb6121d90a6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -175,9 +175,9 @@ alternative_else_nop_endif .macro mte_set_kernel_gcr, tmp, tmp2 #ifdef CONFIG_KASAN_HW_TAGS -alternative_if_not ARM64_MTE +alternative_cb kasan_hw_tags_enable b 1f -alternative_else_nop_endif +alternative_cb_end mov \tmp, KERNEL_GCR_EL1 msr_s SYS_GCR_EL1, \tmp 1: @@ -185,10 +185,10 @@ alternative_else_nop_endif .endm .macro mte_set_user_gcr, tsk, tmp, tmp2 -#ifdef CONFIG_ARM64_MTE -alternative_if_not ARM64_MTE +#ifdef CONFIG_KASAN_HW_TAGS +alternative_cb kasan_hw_tags_enable b 1f -alternative_else_nop_endif +alternative_cb_end ldr \tmp, [\tsk, #THREAD_MTE_CTRL] mte_set_gcr \tmp, \tmp2 @@ -315,13 +315,6 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING msr_s SYS_ICC_PMR_EL1, x20 alternative_else_nop_endif - /* Re-enable tag checking (TCO set on exception entry) */ -#ifdef CONFIG_ARM64_MTE -alternative_if ARM64_MTE - SET_PSTATE_TCO(0) -alternative_else_nop_endif -#endif - /* * Registers that may be useful after this macro is invoked: * diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 7a3fcf21b18a..62ea394d4fbd 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -78,7 +79,11 @@ * indicate whether or not the userland FPSIMD state of the current task is * present in the registers. The flag is set unless the FPSIMD registers of this * CPU currently contain the most recent userland FPSIMD state of the current - * task. + * task. 
If the task is behaving as a VMM, then this will be managed by + * KVM which will clear it to indicate that the vcpu FPSIMD state is currently + * loaded on the CPU, allowing the state to be saved if an FPSIMD-aware + * softirq kicks in. Upon vcpu_put(), KVM will save the vcpu FP state and + * flag the register state as invalid. * * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may * save the task's FPSIMD context back to task_struct from softirq context. @@ -116,49 +121,102 @@ struct fpsimd_last_state_struct { struct user_fpsimd_state *st; void *sve_state; + void *za_state; + u64 *svcr; unsigned int sve_vl; + unsigned int sme_vl; }; static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); -/* Default VL for tasks that don't set it explicitly: */ -static int __sve_default_vl = -1; +__ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { +#ifdef CONFIG_ARM64_SVE + [ARM64_VEC_SVE] = { + .type = ARM64_VEC_SVE, + .name = "SVE", + .min_vl = SVE_VL_MIN, + .max_vl = SVE_VL_MIN, + .max_virtualisable_vl = SVE_VL_MIN, + }, +#endif +#ifdef CONFIG_ARM64_SME + [ARM64_VEC_SME] = { + .type = ARM64_VEC_SME, + .name = "SME", + }, +#endif +}; -static int get_sve_default_vl(void) +static unsigned int vec_vl_inherit_flag(enum vec_type type) { - return READ_ONCE(__sve_default_vl); + switch (type) { + case ARM64_VEC_SVE: + return TIF_SVE_VL_INHERIT; + case ARM64_VEC_SME: + return TIF_SME_VL_INHERIT; + default: + WARN_ON_ONCE(1); + return 0; + } +} + +struct vl_config { + int __default_vl; /* Default VL for tasks */ +}; + +static struct vl_config vl_config[ARM64_VEC_MAX]; + +static inline int get_default_vl(enum vec_type type) +{ + return READ_ONCE(vl_config[type].__default_vl); } #ifdef CONFIG_ARM64_SVE -static void set_sve_default_vl(int val) +static inline int get_sve_default_vl(void) { - WRITE_ONCE(__sve_default_vl, val); + return get_default_vl(ARM64_VEC_SVE); } -/* Maximum supported vector length across all CPUs (initially poisoned) */ -int __ro_after_init sve_max_vl = SVE_VL_MIN; -int __ro_after_init sve_max_virtualisable_vl = SVE_VL_MIN; +static inline void set_default_vl(enum vec_type type, int val) +{ + WRITE_ONCE(vl_config[type].__default_vl, val); +} -/* - * Set of available vector lengths, - * where length vq encoded as bit __vq_to_bit(vq): - */ -__ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); -/* Set of vector lengths present on at least one cpu: */ -static __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); +static inline void set_sve_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SVE, val); +} static void __percpu *efi_sve_state; #else /* ! CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ -extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); -extern __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); extern void __percpu *efi_sve_state; #endif /* !
CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int get_sme_default_vl(void) +{ + return get_default_vl(ARM64_VEC_SME); +} + +static void set_sme_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SME, val); +} + +static void sme_free(struct task_struct *); + +#else + +static inline void sme_free(struct task_struct *t) { } + +#endif + DEFINE_PER_CPU(bool, fpsimd_context_busy); EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy); @@ -228,17 +286,51 @@ static void sve_free(struct task_struct *task) __sve_free(task); } +unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) +{ + return task->thread.vl[type]; +} + +void task_set_vl(struct task_struct *task, enum vec_type type, + unsigned long vl) +{ + task->thread.vl[type] = vl; +} + +unsigned int task_get_vl_onexec(const struct task_struct *task, + enum vec_type type) +{ + return task->thread.vl_onexec[type]; +} + +void task_set_vl_onexec(struct task_struct *task, enum vec_type type, + unsigned long vl) +{ + task->thread.vl_onexec[type] = vl; +} + +/* + * TIF_SME controls whether a task can use SME without trapping while + * in userspace; when TIF_SME is set we must have storage + * allocated in sve_state and za_state to store the contents of both ZA + * and the SVE registers for both streaming and non-streaming modes. + * + * If both SVCR.ZA and SVCR.SM are disabled then at any point we + * may disable TIF_SME and re-enable traps. + */ + + /* * TIF_SVE controls whether a task can use SVE without trapping while - * in userspace, and also the way a task's FPSIMD/SVE state is stored - * in thread_struct. + * in userspace, and also (together with TIF_SME) the way a task's + * FPSIMD/SVE state is stored in thread_struct. * * The kernel uses this flag to track whether a user task is actively * using SVE, and therefore whether full SVE register state needs to * be tracked. If not, the cheaper FPSIMD context handling code can * be used instead of the more costly SVE equivalents. * - * * TIF_SVE set: + * * TIF_SVE or SVCR.SM set: * * The task can execute SVE instructions while in userspace without * trapping to the kernel. @@ -246,7 +338,8 @@ static void sve_free(struct task_struct *task) * When stored, Z0-Z31 (incorporating Vn in bits[127:0] or the * corresponding Zn), P0-P15 and FFR are encoded in * task->thread.sve_state, formatted appropriately for vector - * length task->thread.sve_vl. + * length task->thread.sve_vl or, if SVCR.SM is set, + * task->thread.sme_vl. * * task->thread.sve_state must point to a valid buffer at least * sve_state_size(task) bytes in size.
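The vl_config refactor in the hunks above folds the old __sve_default_vl variable into a per-vector-type table behind get_default_vl()/set_default_vl(), so SVE and SME share one set of accessors for their default vector lengths. The following is a minimal standalone model of that pattern, not kernel code: the volatile accesses merely stand in for READ_ONCE()/WRITE_ONCE(), and the VL values are arbitrary examples.

```c
#include <stdio.h>

enum vec_type { ARM64_VEC_SVE, ARM64_VEC_SME, ARM64_VEC_MAX };

struct vl_config {
	volatile int __default_vl;	/* default VL for new tasks, in bytes */
};

static struct vl_config vl_config[ARM64_VEC_MAX];

static int get_default_vl(enum vec_type type)
{
	return vl_config[type].__default_vl;	/* stands in for READ_ONCE() */
}

static void set_default_vl(enum vec_type type, int val)
{
	vl_config[type].__default_vl = val;	/* stands in for WRITE_ONCE() */
}

int main(void)
{
	/* illustrative defaults; the kernel derives these from probed VQ maps */
	set_default_vl(ARM64_VEC_SVE, 64);
	set_default_vl(ARM64_VEC_SME, 32);
	printf("SVE default VL: %d\n", get_default_vl(ARM64_VEC_SVE));
	printf("SME default VL: %d\n", get_default_vl(ARM64_VEC_SME));
	return 0;
}
```

Keeping both extensions behind the same accessors is what later lets a single vec_proc_do_default_vl() sysctl handler and a single fpsimd_flush_thread_vl() exec-time reset path serve SVE and SME alike.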
@@ -284,48 +377,111 @@ static void sve_free(struct task_struct *task) */ static void task_fpsimd_load(void) { + bool restore_sve_regs = false; + bool restore_ffr; + WARN_ON(!system_supports_fpsimd()); WARN_ON(!have_cpu_fpsimd_context()); - if (IS_ENABLED(CONFIG_ARM64_SVE) && test_thread_flag(TIF_SVE)) + /* Check if we should restore SVE first */ + if (IS_ENABLED(CONFIG_ARM64_SVE) && test_thread_flag(TIF_SVE)) { + sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); + restore_sve_regs = true; + restore_ffr = true; + } + + /* Restore SME, override SVE register configuration if needed */ + if (system_supports_sme()) { + unsigned long sme_vl = task_get_sme_vl(current); + + /* Ensure VL is set up for restoring data */ + if (test_thread_flag(TIF_SME)) + sme_set_vq(sve_vq_from_vl(sme_vl) - 1); + + write_sysreg_s(current->thread.svcr, SYS_SVCR); + + if (thread_za_enabled(&current->thread)) + za_load_state(current->thread.za_state); + + if (thread_sm_enabled(&current->thread)) { + restore_sve_regs = true; + restore_ffr = system_supports_fa64(); + } + } + + if (restore_sve_regs) sve_load_state(sve_pffr(&current->thread), &current->thread.uw.fpsimd_state.fpsr, - sve_vq_from_vl(current->thread.sve_vl) - 1); + restore_ffr); else fpsimd_load_state(&current->thread.uw.fpsimd_state); } /* * Ensure FPSIMD/SVE storage in memory for the loaded context is up to - * date with respect to the CPU registers. + * date with respect to the CPU registers. Note carefully that the + * current context is the context last bound to the CPU stored in + * last; if KVM is involved this may be the guest VM context rather + * than the host thread for the VM pointed to by current. This means + * that we must always reference the state storage via last rather + * than via current, other than the TIF_ flags which KVM will + * carefully maintain for us. */ static void fpsimd_save(void) { struct fpsimd_last_state_struct const *last = this_cpu_ptr(&fpsimd_last_state); /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ + bool save_sve_regs = false; + bool save_ffr; + unsigned int vl; WARN_ON(!system_supports_fpsimd()); WARN_ON(!have_cpu_fpsimd_context()); - if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { - if (IS_ENABLED(CONFIG_ARM64_SVE) && - test_thread_flag(TIF_SVE)) { - if (WARN_ON(sve_get_vl() != last->sve_vl)) { - /* - * Can't save the user regs, so current would - * re-enter user with corrupt state. - * There's no way to recover, so kill it: - */ - force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); - return; - } + if (test_thread_flag(TIF_FOREIGN_FPSTATE)) + return; - sve_save_state((char *)last->sve_state + - sve_ffr_offset(last->sve_vl), - &last->st->fpsr); - } else - fpsimd_save_state(last->st); + if (test_thread_flag(TIF_SVE)) { + save_sve_regs = true; + save_ffr = true; + vl = last->sve_vl; + } + + if (system_supports_sme()) { + u64 *svcr = last->svcr; + *svcr = read_sysreg_s(SYS_SVCR); + + if (*svcr & SVCR_ZA_MASK) + za_save_state(last->za_state); + + /* If we are in streaming mode override regular SVE. */ + if (*svcr & SVCR_SM_MASK) { + save_sve_regs = true; + save_ffr = system_supports_fa64(); + vl = last->sme_vl; + } + } + + if (IS_ENABLED(CONFIG_ARM64_SVE) && save_sve_regs) { + /* Get the configured VL from RDVL, will account for SM */ + if (WARN_ON(sve_get_vl() != vl)) { + /* + * Can't save the user regs, so current would + * re-enter user with corrupt state.
+ * There's no way to recover, so kill it: + */ + force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); + return; + } + + sve_save_state((char *)last->sve_state + + sve_ffr_offset(vl), + &last->st->fpsr, save_ffr); + } else { + fpsimd_save_state(last->st); } } @@ -335,32 +491,38 @@ static void fpsimd_save(void) * If things go wrong there's a bug somewhere, but try to fall back to a * safe choice. */ -static unsigned int find_supported_vector_length(unsigned int vl) +static unsigned int find_supported_vector_length(enum vec_type type, + unsigned int vl) { + struct vl_info *info = &vl_info[type]; int bit; - int max_vl = sve_max_vl; + int max_vl = info->max_vl; if (WARN_ON(!sve_vl_valid(vl))) - vl = SVE_VL_MIN; + vl = info->min_vl; if (WARN_ON(!sve_vl_valid(max_vl))) - max_vl = SVE_VL_MIN; + max_vl = info->min_vl; if (vl > max_vl) vl = max_vl; + if (vl < info->min_vl) + vl = info->min_vl; - bit = find_next_bit(sve_vq_map, SVE_VQ_MAX, + bit = find_next_bit(info->vq_map, SVE_VQ_MAX, __vq_to_bit(sve_vq_from_vl(vl))); return sve_vl_from_vq(__bit_to_vq(bit)); } #if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL) -static int sve_proc_do_default_vl(struct ctl_table *table, int write, +static int vec_proc_do_default_vl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { + struct vl_info *info = table->extra1; + enum vec_type type = info->type; int ret; - int vl = get_sve_default_vl(); + int vl = get_default_vl(type); struct ctl_table tmp_table = { .data = &vl, .maxlen = sizeof(vl), @@ -372,12 +534,12 @@ static int sve_proc_do_default_vl(struct ctl_table *table, int write, /* Writing -1 has the special meaning "set to max": */ if (vl == -1) - vl = sve_max_vl; + vl = info->max_vl; if (!sve_vl_valid(vl)) return -EINVAL; - set_sve_default_vl(find_supported_vector_length(vl)); + set_default_vl(type, find_supported_vector_length(type, vl)); return 0; } @@ -385,7 +547,8 @@ static struct ctl_table sve_default_vl_table[] = { { .procname = "sve_default_vector_length", .mode = 0644, - .proc_handler = sve_proc_do_default_vl, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SVE], }, { } }; @@ -403,6 +566,30 @@ static int __init sve_sysctl_init(void) static int __init sve_sysctl_init(void) { return 0; } #endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ +#if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) +static struct ctl_table sme_default_vl_table[] = { + { + .procname = "sme_default_vector_length", + .mode = 0644, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SME], + }, + { } +}; + +static int __init sme_sysctl_init(void) +{ + if (system_supports_sme()) + if (!register_sysctl("abi", sme_default_vl_table)) + return -EINVAL; + + return 0; +} + +#else /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ +static int __init sme_sysctl_init(void) { return 0; } +#endif /* ! 
(CONFIG_ARM64_SME && CONFIG_SYSCTL) */ + #define ZREG(sve_state, vq, n) ((char *)(sve_state) + \ (SVE_SIG_ZREG_OFFSET(vq, n) - SVE_SIG_REGS_OFFSET)) @@ -456,7 +643,7 @@ static void fpsimd_to_sve(struct task_struct *task) if (!system_supports_sve()) return; - vq = sve_vq_from_vl(task->thread.sve_vl); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); __fpsimd_to_sve(sst, fst, vq); } @@ -473,7 +660,7 @@ static void fpsimd_to_sve(struct task_struct *task) */ static void sve_to_fpsimd(struct task_struct *task) { - unsigned int vq; + unsigned int vq, vl; void const *sst = task->thread.sve_state; struct user_fpsimd_state *fst = &task->thread.uw.fpsimd_state; unsigned int i; @@ -482,7 +669,8 @@ static void sve_to_fpsimd(struct task_struct *task) if (!system_supports_sve()) return; - vq = sve_vq_from_vl(task->thread.sve_vl); + vl = thread_get_cur_vl(&task->thread); + vq = sve_vq_from_vl(vl); for (i = 0; i < SVE_NUM_ZREGS; ++i) { p = (__uint128_t const *)ZREG(sst, vq, i); fst->vregs[i] = arm64_le128_to_cpu(*p); @@ -497,7 +685,14 @@ static void sve_to_fpsimd(struct task_struct *task) */ size_t sve_state_size(struct task_struct const *task) { - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(task->thread.sve_vl)); + unsigned int vl = 0; + + if (system_supports_sve()) + vl = task_get_sve_vl(task); + if (system_supports_sme()) + vl = max(vl, task_get_sme_vl(task)); + + return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); } /* @@ -510,10 +705,12 @@ size_t sve_state_size(struct task_struct const *task) * do_sve_acc() case, there is no ABI requirement to hide stale data * written previously by the task. */ -void sve_alloc(struct task_struct *task) +void sve_alloc(struct task_struct *task, bool flush) { if (task->thread.sve_state) { - memset(task->thread.sve_state, 0, sve_state_size(task)); + if (flush) + memset(task->thread.sve_state, 0, + sve_state_size(task)); return; } @@ -523,6 +720,19 @@ void sve_alloc(struct task_struct *task) } +/* + * Force the FPSIMD state shared with SVE to be updated in the SVE state + * even if the SVE state is the current active state. + * + * This should only be called by ptrace. task must be non-runnable. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + */ +void fpsimd_force_sync_to_sve(struct task_struct *task) +{ + fpsimd_to_sve(task); +} + /* * Ensure that task->thread.sve_state is up to date with respect to * the user task, irrespective of whether SVE is in use or not.
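The sve_state_size() change above reflects that one buffer now backs both regular and streaming-mode SVE registers, so it must be sized for whichever of the task's SVE and SME vector lengths is larger. Below is a rough standalone model of that sizing decision; regs_size() is only a stand-in for SVE_SIG_REGS_SIZE(), not the real signal-frame layout arithmetic, and the struct is a simplified task model.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define SVE_VQ_BYTES 16	/* one vector quadword: 128 bits */

/* illustrative: 32 Z regs of vq quadwords, 17 predicate-sized regs (P0-P15, FFR) */
static size_t regs_size(unsigned int vq)
{
	return 32 * vq * SVE_VQ_BYTES + 17 * vq * (SVE_VQ_BYTES / 8);
}

struct task_model {
	bool has_sve, has_sme;
	unsigned int sve_vl, sme_vl;	/* vector lengths in bytes */
};

static size_t state_size(const struct task_model *t)
{
	unsigned int vl = 0;

	if (t->has_sve)
		vl = t->sve_vl;
	if (t->has_sme && t->sme_vl > vl)
		vl = t->sme_vl;	/* streaming mode may need the larger VL */

	return regs_size(vl / SVE_VQ_BYTES);
}

int main(void)
{
	struct task_model t = { true, true, 32, 64 };

	/* buffer is sized for VL = 64 even though the SVE VL is only 32 */
	printf("buffer bytes: %zu\n", state_size(&t));
	return 0;
}
```

This also explains the new sve_alloc(task, flush) signature: do_sme_acc() can now reuse an existing buffer without wiping FPSIMD data that was just migrated into it.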
@@ -533,7 +743,8 @@ void sve_alloc(struct task_struct *task) */ void fpsimd_sync_to_sve(struct task_struct *task) { - if (!test_tsk_thread_flag(task, TIF_SVE)) + if (!test_tsk_thread_flag(task, TIF_SVE) && + !thread_sm_enabled(&task->thread)) fpsimd_to_sve(task); } @@ -547,7 +758,8 @@ void fpsimd_sync_to_sve(struct task_struct *task) */ void sve_sync_to_fpsimd(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_SVE)) + if (test_tsk_thread_flag(task, TIF_SVE) || + thread_sm_enabled(&task->thread)) sve_to_fpsimd(task); } @@ -572,13 +784,13 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) if (!test_tsk_thread_flag(task, TIF_SVE)) return; - vq = sve_vq_from_vl(task->thread.sve_vl); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); memset(sst, 0, SVE_SIG_REGS_SIZE(vq)); __fpsimd_to_sve(sst, fst, vq); } -int sve_set_vector_length(struct task_struct *task, +int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags) { if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | @@ -589,33 +801,34 @@ int sve_set_vector_length(struct task_struct *task, return -EINVAL; /* - * Clamp to the maximum vector length that VL-agnostic SVE code can - * work with. A flag may be assigned in the future to allow setting - * of larger vector lengths without confusing older software. + * Clamp to the maximum vector length that VL-agnostic code + * can work with. A flag may be assigned in the future to + * allow setting of larger vector lengths without confusing + * older software. */ - if (vl > SVE_VL_ARCH_MAX) - vl = SVE_VL_ARCH_MAX; + if (vl > VL_ARCH_MAX) + vl = VL_ARCH_MAX; - vl = find_supported_vector_length(vl); + vl = find_supported_vector_length(type, vl); if (flags & (PR_SVE_VL_INHERIT | PR_SVE_SET_VL_ONEXEC)) - task->thread.sve_vl_onexec = vl; + task_set_vl_onexec(task, type, vl); else /* Reset VL to system default on next exec: */ - task->thread.sve_vl_onexec = 0; + task_set_vl_onexec(task, type, 0); /* Only actually set the VL if not deferred: */ if (flags & PR_SVE_SET_VL_ONEXEC) goto out; - if (vl == task->thread.sve_vl) + if (vl == task_get_vl(task, type)) goto out; /* * To ensure the FPSIMD bits of the SVE vector registers are preserved, * write any live register state back to task_struct, and convert to a - * non-SVE thread. + * regular FPSIMD thread. */ if (task == current) { get_cpu_fpsimd_context(); @@ -624,22 +837,31 @@ int sve_set_vector_length(struct task_struct *task, } fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE)) + if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || + thread_sm_enabled(&task->thread)) sve_to_fpsimd(task); + if (system_supports_sme() && type == ARM64_VEC_SME) { + task->thread.svcr &= ~(SVCR_SM_MASK | + SVCR_ZA_MASK); + clear_thread_flag(TIF_SME); + } + if (task == current) put_cpu_fpsimd_context(); /* - * Force reallocation of task SVE state to the correct size - * on next use: + * Force reallocation of task SVE and SME state to the correct + * size on next use: */ sve_free(task); + if (system_supports_sme() && type == ARM64_VEC_SME) + sme_free(task); - task->thread.sve_vl = vl; + task_set_vl(task, type, vl); out: - update_tsk_thread_flag(task, TIF_SVE_VL_INHERIT, + update_tsk_thread_flag(task, vec_vl_inherit_flag(type), flags & PR_SVE_VL_INHERIT); return 0; @@ -647,20 +869,21 @@ out: /* * Encode the current vector length and flags for return. 
- * This is only required for prctl(): ptrace has separate fields + * This is only required for prctl(): ptrace has separate fields. + * SVE and SME use the same bits for _ONEXEC and _INHERIT. * - * flags are as for sve_set_vector_length(). + * flags are as for vec_set_vector_length(). */ -static int sve_prctl_status(unsigned long flags) +static int vec_prctl_status(enum vec_type type, unsigned long flags) { int ret; if (flags & PR_SVE_SET_VL_ONEXEC) - ret = current->thread.sve_vl_onexec; + ret = task_get_vl_onexec(current, type); else - ret = current->thread.sve_vl; + ret = task_get_vl(current, type); - if (test_thread_flag(TIF_SVE_VL_INHERIT)) + if (test_thread_flag(vec_vl_inherit_flag(type))) ret |= PR_SVE_VL_INHERIT; return ret; @@ -678,11 +901,11 @@ int sve_set_current_vl(unsigned long arg) if (!system_supports_sve() || is_compat_task()) return -EINVAL; - ret = sve_set_vector_length(current, vl, flags); + ret = vec_set_vector_length(current, ARM64_VEC_SVE, vl, flags); if (ret) return ret; - return sve_prctl_status(flags); + return vec_prctl_status(ARM64_VEC_SVE, flags); } /* PR_SVE_GET_VL */ @@ -691,22 +914,65 @@ int sve_get_current_vl(void) if (!system_supports_sve() || is_compat_task()) return -EINVAL; - return sve_prctl_status(0); + return vec_prctl_status(ARM64_VEC_SVE, 0); } -static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) +#ifdef CONFIG_ARM64_SME +/* PR_SME_SET_VL */ +int sme_set_current_vl(unsigned long arg) +{ + unsigned long vl, flags; + int ret; + + vl = arg & PR_SME_VL_LEN_MASK; + flags = arg & ~vl; + + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + ret = vec_set_vector_length(current, ARM64_VEC_SME, vl, flags); + if (ret) + return ret; + + return vec_prctl_status(ARM64_VEC_SME, flags); +} + +/* PR_SME_GET_VL */ +int sme_get_current_vl(void) +{ + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + return vec_prctl_status(ARM64_VEC_SME, 0); +} +#endif /* CONFIG_ARM64_SME */ + +static void vec_probe_vqs(struct vl_info *info, + DECLARE_BITMAP(map, SVE_VQ_MAX)) { unsigned int vq, vl; - unsigned long zcr; bitmap_zero(map, SVE_VQ_MAX); - zcr = ZCR_ELx_LEN_MASK; - zcr = read_sysreg_s(SYS_ZCR_EL1) & ~zcr; - for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) { - write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */ - vl = sve_get_vl(); + write_vl(info->type, vq - 1); /* self-syncing */ + + switch (info->type) { + case ARM64_VEC_SVE: + vl = sve_get_vl(); + break; + case ARM64_VEC_SME: + vl = sme_get_vl(); + break; + default: + vl = 0; + break; + } + + /* Minimum VL identified? */ + if (sve_vq_from_vl(vl) > vq) + break; + vq = sve_vq_from_vl(vl); /* skip intervening lengths */ set_bit(__vq_to_bit(vq), map); } @@ -716,10 +982,11 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) * Initialise the set of known supported VQs for the boot CPU. * This is called during kernel boot, before secondary CPUs are brought up. */ -void __init sve_init_vq_map(void) +void __init vec_init_vq_map(enum vec_type type) { - sve_probe_vqs(sve_vq_map); - bitmap_copy(sve_vq_partial_map, sve_vq_map, SVE_VQ_MAX); + struct vl_info *info = &vl_info[type]; + vec_probe_vqs(info, info->vq_map); + bitmap_copy(info->vq_partial_map, info->vq_map, SVE_VQ_MAX); } /* @@ -727,30 +994,33 @@ void __init sve_init_vq_map(void) * those not supported by the current CPU. * This function is called during the bring-up of early secondary CPUs only. 
*/ -void sve_update_vq_map(void) +void vec_update_vq_map(enum vec_type type) { + struct vl_info *info = &vl_info[type]; DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); - sve_probe_vqs(tmp_map); - bitmap_and(sve_vq_map, sve_vq_map, tmp_map, SVE_VQ_MAX); - bitmap_or(sve_vq_partial_map, sve_vq_partial_map, tmp_map, SVE_VQ_MAX); + vec_probe_vqs(info, tmp_map); + bitmap_and(info->vq_map, info->vq_map, tmp_map, SVE_VQ_MAX); + bitmap_or(info->vq_partial_map, info->vq_partial_map, tmp_map, + SVE_VQ_MAX); } /* * Check whether the current CPU supports all VQs in the committed set. * This function is called during the bring-up of late secondary CPUs only. */ -int sve_verify_vq_map(void) +int vec_verify_vq_map(enum vec_type type) { + struct vl_info *info = &vl_info[type]; DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); unsigned long b; - sve_probe_vqs(tmp_map); + vec_probe_vqs(info, tmp_map); bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); - if (bitmap_intersects(tmp_map, sve_vq_map, SVE_VQ_MAX)) { - pr_warn("SVE: cpu%d: Required vector length(s) missing\n", - smp_processor_id()); + if (bitmap_intersects(tmp_map, info->vq_map, SVE_VQ_MAX)) { + pr_warn("%s: cpu%d: Required vector length(s) missing\n", + info->name, smp_processor_id()); return -EINVAL; } @@ -766,7 +1036,7 @@ int sve_verify_vq_map(void) /* Recover the set of supported VQs: */ bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); /* Find VQs supported that are not globally supported: */ - bitmap_andnot(tmp_map, tmp_map, sve_vq_map, SVE_VQ_MAX); + bitmap_andnot(tmp_map, tmp_map, info->vq_map, SVE_VQ_MAX); /* Find the lowest such VQ, if any: */ b = find_last_bit(tmp_map, SVE_VQ_MAX); @@ -777,9 +1047,9 @@ int sve_verify_vq_map(void) * Mismatches above sve_max_virtualisable_vl are fine, since * no guest is allowed to configure ZCR_EL2.LEN to exceed this: */ - if (sve_vl_from_vq(__bit_to_vq(b)) <= sve_max_virtualisable_vl) { - pr_warn("SVE: cpu%d: Unsupported vector length(s) present\n", - smp_processor_id()); + if (sve_vl_from_vq(__bit_to_vq(b)) <= info->max_virtualisable_vl) { + pr_warn("%s: cpu%d: Unsupported vector length(s) present\n", + info->name, smp_processor_id()); return -EINVAL; } @@ -788,19 +1058,25 @@ int sve_verify_vq_map(void) static void __init sve_efi_setup(void) { + int max_vl = 0; + int i; + if (!IS_ENABLED(CONFIG_EFI)) return; + for (i = 0; i < ARRAY_SIZE(vl_info); i++) + max_vl = max(vl_info[i].max_vl, max_vl); + /* * alloc_percpu() warns and prints a backtrace if this goes wrong. * This is evidence of a crippled system and we are returning void, * so no attempt is made to handle this situation here. */ - if (!sve_vl_valid(sve_max_vl)) + if (!sve_vl_valid(max_vl)) goto fail; efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(sve_max_vl)), SVE_VQ_BYTES); + SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); if (!efi_sve_state) goto fail; @@ -849,6 +1125,7 @@ u64 read_zcr_features(void) void __init sve_setup(void) { + struct vl_info *info = &vl_info[ARM64_VEC_SVE]; u64 zcr; DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); unsigned long b; @@ -861,49 +1138,52 @@ void __init sve_setup(void) * so sve_vq_map must have at least SVE_VQ_MIN set. 
* If something went wrong, at least try to patch it up: */ - if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), sve_vq_map))) - set_bit(__vq_to_bit(SVE_VQ_MIN), sve_vq_map); + if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map))) + set_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map); zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); - sve_max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1); + info->max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1); /* * Sanity-check that the max VL we determined through CPU features * corresponds properly to sve_vq_map. If not, do our best: */ - if (WARN_ON(sve_max_vl != find_supported_vector_length(sve_max_vl))) - sve_max_vl = find_supported_vector_length(sve_max_vl); + if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SVE, + info->max_vl))) + info->max_vl = find_supported_vector_length(ARM64_VEC_SVE, + info->max_vl); /* * For the default VL, pick the maximum supported value <= 64. * VL == 64 is guaranteed not to grow the signal frame. */ - set_sve_default_vl(find_supported_vector_length(64)); + set_sve_default_vl(find_supported_vector_length(ARM64_VEC_SVE, 64)); - bitmap_andnot(tmp_map, sve_vq_partial_map, sve_vq_map, + bitmap_andnot(tmp_map, info->vq_partial_map, info->vq_map, SVE_VQ_MAX); b = find_last_bit(tmp_map, SVE_VQ_MAX); if (b >= SVE_VQ_MAX) /* No non-virtualisable VLs found */ - sve_max_virtualisable_vl = SVE_VQ_MAX; + info->max_virtualisable_vl = SVE_VQ_MAX; else if (WARN_ON(b == SVE_VQ_MAX - 1)) /* No virtualisable VLs? This is architecturally forbidden. */ - sve_max_virtualisable_vl = SVE_VQ_MIN; + info->max_virtualisable_vl = SVE_VQ_MIN; else /* b + 1 < SVE_VQ_MAX */ - sve_max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1)); + info->max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1)); - if (sve_max_virtualisable_vl > sve_max_vl) - sve_max_virtualisable_vl = sve_max_vl; + if (info->max_virtualisable_vl > info->max_vl) + info->max_virtualisable_vl = info->max_vl; - pr_info("SVE: maximum available vector length %u bytes per vector\n", - sve_max_vl); - pr_info("SVE: default vector length %u bytes per vector\n", - get_sve_default_vl()); + pr_info("%s: maximum available vector length %u bytes per vector\n", + info->name, info->max_vl); + pr_info("%s: default vector length %u bytes per vector\n", + info->name, get_sve_default_vl()); /* KVM decides whether to support mismatched systems. Just warn here: */ - if (sve_max_virtualisable_vl < sve_max_vl) - pr_warn("SVE: unvirtualisable vector lengths present\n"); + if (sve_max_virtualisable_vl() < sve_max_vl()) + pr_warn("%s: unvirtualisable vector lengths present\n", + info->name); sve_efi_setup(); } @@ -915,10 +1195,172 @@ void __init sve_setup(void) void fpsimd_release_task(struct task_struct *dead_task) { __sve_free(dead_task); + sme_free(dead_task); } #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +/* + * Ensure that task->thread.za_state is allocated and sufficiently large. + * + * This function should be used only in preparation for replacing + * task->thread.za_state with new data. The memory is always zeroed + * here to prevent stale data from showing through: this is done in + * the interest of testability and predictability, the architecture + * guarantees that when ZA is enabled it will be zeroed. + */ +void sme_alloc(struct task_struct *task) +{ + if (task->thread.za_state) { + memset(task->thread.za_state, 0, za_state_size(task)); + return; + } + + /* This could potentially be up to 64K. 
*/ + task->thread.za_state = + kzalloc(za_state_size(task), GFP_KERNEL); +} + +static void sme_free(struct task_struct *task) +{ + kfree(task->thread.za_state); + task->thread.za_state = NULL; +} + +void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* Set priority for all PEs to architecturally defined minimum */ + write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK, + SYS_SMPRI_EL1); + + /* Allow SME in kernel */ + write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1); + isb(); + + /* Allow EL0 to access TPIDR2 */ + write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1); + isb(); +} + +/* + * This must be called after sme_kernel_enable(), we rely on the + * feature table being sorted to ensure this. + */ +void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* Allow use of FA64 */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK, + SYS_SMCR_EL1); +} + +/* + * Read the pseudo-SMCR used by cpufeatures to identify the supported + * vector length. + * + * Use only if SME is present. + * This function clobbers the SME vector length. + */ +u64 read_smcr_features(void) +{ + u64 smcr; + unsigned int vq_max; + + sme_kernel_enable(NULL); + sme_smstart_sm(); + + /* + * Set the maximum possible VL. + */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_LEN_MASK, + SYS_SMCR_EL1); + + smcr = read_sysreg_s(SYS_SMCR_EL1); + smcr &= ~(u64)SMCR_ELx_LEN_MASK; /* Only the LEN field */ + vq_max = sve_vq_from_vl(sve_get_vl()); + smcr |= vq_max - 1; /* set LEN field to maximum effective value */ + + sme_smstop_sm(); + + return smcr; +} + +void __init sme_setup(void) +{ + struct vl_info *info = &vl_info[ARM64_VEC_SME]; + u64 smcr; + int min_bit; + + if (!system_supports_sme()) + return; + + /* + * SME doesn't require any particular vector length be + * supported but it does require at least one. We should have + * disabled the feature entirely while bringing up CPUs but + * let's double check here. + */ + WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); + + smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1); + info->max_vl = sve_vl_from_vq((smcr & SMCR_ELx_LEN_MASK) + 1); + + /* + * Sanity-check that the max VL we determined through CPU features + * corresponds properly to sme_vq_map. If not, do our best: + */ + if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SME, + info->max_vl))) + info->max_vl = find_supported_vector_length(ARM64_VEC_SME, + info->max_vl); + + WARN_ON(info->min_vl > info->max_vl); + + /* + * For the default VL, pick the maximum supported value <= 32 + * (256 bits) if there is one since this is guaranteed not to + * grow the signal frame when in streaming mode, otherwise the + * minimum available VL will be used. + */ + set_sme_default_vl(find_supported_vector_length(ARM64_VEC_SME, 32)); + + pr_info("SME: minimum available vector length %u bytes per vector\n", + info->min_vl); + pr_info("SME: maximum available vector length %u bytes per vector\n", + info->max_vl); + pr_info("SME: default vector length %u bytes per vector\n", + get_sme_default_vl()); +} + +#endif /* CONFIG_ARM64_SME */ + +static void sve_init_regs(void) +{ + /* + * Convert the FPSIMD state to SVE, zeroing all the state that + * is not shared with FPSIMD. 
If (as is likely) the current + * state is live in the registers then do this there and + * update our metadata for the current task including + * disabling the trap, otherwise update our in-memory copy. + * We are guaranteed to not be in streaming mode, we can only + * take a SVE trap when not in streaming mode and we can't be + * in streaming mode when taking a SME trap. + */ + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + unsigned long vq_minus_one = + sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_set_vq(vq_minus_one); + sve_flush_live(true, vq_minus_one); + fpsimd_bind_task_to_cpu(); + } else { + fpsimd_to_sve(current); + } +} + /* * Trapped SVE access * @@ -938,7 +1380,7 @@ void do_sve_acc(unsigned long esr, struct pt_regs *regs) return; } - sve_alloc(current); + sve_alloc(current, true); if (!current->thread.sve_state) { force_sig(SIGKILL); return; @@ -950,20 +1392,64 @@ void do_sve_acc(unsigned long esr, struct pt_regs *regs) WARN_ON(1); /* SVE access shouldn't have trapped */ /* - * Convert the FPSIMD state to SVE, zeroing all the state that - * is not shared with FPSIMD. If (as is likely) the current - * state is live in the registers then do this there and - * update our metadata for the current task including - * disabling the trap, otherwise update our in-memory copy. + * Even if the task can have used streaming mode we can only + * generate SVE access traps in normal SVE mode and + * transitioning out of streaming mode may discard any + * streaming mode state. Always clear the high bits to avoid + * any potential errors tracking what is properly initialised. */ + sve_init_regs(); + + put_cpu_fpsimd_context(); +} + +/* + * Trapped SME access + * + * Storage is allocated for the full SVE and SME state, the current + * FPSIMD register contents are migrated to SVE if SVE is not already + * active, and the access trap is disabled. + * + * TIF_SME should be clear on entry: otherwise, fpsimd_restore_current_state() + * would have disabled the SME access trap for userspace during + * ret_to_user, making an SVE access trap impossible in that case. + */ +void do_sme_acc(unsigned long esr, struct pt_regs *regs) +{ + /* Even if we chose not to use SME, the hardware could still trap: */ + if (unlikely(!system_supports_sme()) || WARN_ON(is_compat_task())) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + /* + * If this not a trap due to SME being disabled then something + * is being used in the wrong mode, report as SIGILL. 
+ */ + if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + sve_alloc(current, false); + sme_alloc(current); + if (!current->thread.sve_state || !current->thread.za_state) { + force_sig(SIGKILL); + return; + } + + get_cpu_fpsimd_context(); + + /* With TIF_SME userspace shouldn't generate any traps */ + if (test_and_set_thread_flag(TIF_SME)) + WARN_ON(1); + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { unsigned long vq_minus_one = - sve_vq_from_vl(current->thread.sve_vl) - 1; - sve_set_vq(vq_minus_one); - sve_flush_live(vq_minus_one); + sve_vq_from_vl(task_get_sme_vl(current)) - 1; + sme_set_vq(vq_minus_one); + fpsimd_bind_task_to_cpu(); - } else { - fpsimd_to_sve(current); } put_cpu_fpsimd_context(); @@ -1030,10 +1516,43 @@ void fpsimd_thread_switch(struct task_struct *next) __put_cpu_fpsimd_context(); } -void fpsimd_flush_thread(void) +static void fpsimd_flush_thread_vl(enum vec_type type) { int vl, supported_vl; + /* + * Reset the task vector length as required. This is where we + * ensure that all user tasks have a valid vector length + * configured: no kernel task can become a user task without + * an exec and hence a call to this function. By the time the + * first call to this function is made, all early hardware + * probing is complete, so __sve_default_vl should be valid. + * If a bug causes this to go wrong, we make some noise and + * try to fudge thread.sve_vl to a safe value here. + */ + vl = task_get_vl_onexec(current, type); + if (!vl) + vl = get_default_vl(type); + + if (WARN_ON(!sve_vl_valid(vl))) + vl = vl_info[type].min_vl; + + supported_vl = find_supported_vector_length(type, vl); + if (WARN_ON(supported_vl != vl)) + vl = supported_vl; + + task_set_vl(current, type, vl); + + /* + * If the task is not set to inherit, ensure that the vector + * length will be reset by a subsequent exec: + */ + if (!test_thread_flag(vec_vl_inherit_flag(type))) + task_set_vl_onexec(current, type, 0); +} + +void fpsimd_flush_thread(void) +{ if (!system_supports_fpsimd()) return; @@ -1046,36 +1565,14 @@ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); sve_free(current); + fpsimd_flush_thread_vl(ARM64_VEC_SVE); + } - /* - * Reset the task vector length as required. - * This is where we ensure that all user tasks have a valid - * vector length configured: no kernel task can become a user - * task without an exec and hence a call to this function. - * By the time the first call to this function is made, all - * early hardware probing is complete, so __sve_default_vl - * should be valid. - * If a bug causes this to go wrong, we make some noise and - * try to fudge thread.sve_vl to a safe value here. - */ - vl = current->thread.sve_vl_onexec ? 
- current->thread.sve_vl_onexec : get_sve_default_vl(); - - if (WARN_ON(!sve_vl_valid(vl))) - vl = SVE_VL_MIN; - - supported_vl = find_supported_vector_length(vl); - if (WARN_ON(supported_vl != vl)) - vl = supported_vl; - - current->thread.sve_vl = vl; - - /* - * If the task is not set to inherit, ensure that the vector - * length will be reset by a subsequent exec: - */ - if (!test_thread_flag(TIF_SVE_VL_INHERIT)) - current->thread.sve_vl_onexec = 0; + if (system_supports_sme()) { + clear_thread_flag(TIF_SME); + sme_free(current); + fpsimd_flush_thread_vl(ARM64_VEC_SME); + current->thread.svcr = 0; } put_cpu_fpsimd_context(); @@ -1120,22 +1617,34 @@ static void fpsimd_bind_task_to_cpu(void) WARN_ON(!system_supports_fpsimd()); last->st = &current->thread.uw.fpsimd_state; last->sve_state = current->thread.sve_state; - last->sve_vl = current->thread.sve_vl; + last->za_state = current->thread.za_state; + last->sve_vl = task_get_sve_vl(current); + last->sme_vl = task_get_sme_vl(current); + last->svcr = &current->thread.svcr; current->thread.fpsimd_cpu = smp_processor_id(); + /* + * Toggle SVE and SME trapping for userspace if needed; these + * are serialised by ret_to_user(). + */ + if (system_supports_sme()) { + if (test_thread_flag(TIF_SME)) + sme_user_enable(); + else + sme_user_disable(); + } + if (system_supports_sve()) { - /* Toggle SVE trapping for userspace if needed */ if (test_thread_flag(TIF_SVE)) sve_user_enable(); else sve_user_disable(); - - /* Serialised by exception return to user */ } } void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, - unsigned int sve_vl) + unsigned int sve_vl, void *za_state, + unsigned int sme_vl, u64 *svcr) { struct fpsimd_last_state_struct *last = this_cpu_ptr(&fpsimd_last_state); @@ -1144,8 +1653,11 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, WARN_ON(!in_softirq() && !irqs_disabled()); last->st = st; + last->svcr = svcr; last->sve_state = sve_state; + last->za_state = za_state; last->sve_vl = sve_vl; + last->sme_vl = sme_vl; } /* @@ -1239,6 +1751,15 @@ static void fpsimd_flush_cpu_state(void) { WARN_ON(!system_supports_fpsimd()); __this_cpu_write(fpsimd_last_state.st, NULL); + + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON, and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + + set_thread_flag(TIF_FOREIGN_FPSTATE); } @@ -1316,6 +1837,7 @@ EXPORT_SYMBOL(kernel_neon_end); static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); static DEFINE_PER_CPU(bool, efi_sve_state_used); +static DEFINE_PER_CPU(bool, efi_sm_state); /* * EFI runtime services support functions @@ -1350,11 +1872,33 @@ void __efi_fpsimd_begin(void) */ if (system_supports_sve() && likely(efi_sve_state)) { char *sve_state = this_cpu_ptr(efi_sve_state); + bool ffr = true; + u64 svcr; __this_cpu_write(efi_sve_state_used, true); - sve_save_state(sve_state + sve_ffr_offset(sve_max_vl), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr); + if (system_supports_sme()) { + svcr = read_sysreg_s(SYS_SVCR); + + __this_cpu_write(efi_sm_state, + svcr & SVCR_SM_MASK); + + /* + * Unless we have FA64 FFR does not + * exist in streaming mode.
+ */ + if (!system_supports_fa64()) + ffr = !(svcr & SVCR_SM_MASK); + } + + sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), + &this_cpu_ptr(&efi_fpsimd_state)->fpsr, + ffr); + + if (system_supports_sme()) + sysreg_clear_set_s(SYS_SVCR, + SVCR_SM_MASK, 0); + } else { fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); } @@ -1377,10 +1921,31 @@ void __efi_fpsimd_end(void) if (system_supports_sve() && likely(__this_cpu_read(efi_sve_state_used))) { char const *sve_state = this_cpu_ptr(efi_sve_state); + bool ffr = true; - sve_load_state(sve_state + sve_ffr_offset(sve_max_vl), + /* + * Restore streaming mode; EFI calls are + * normal function calls so should not return in + * streaming mode. + */ + if (system_supports_sme()) { + if (__this_cpu_read(efi_sm_state)) { + sysreg_clear_set_s(SYS_SVCR, + 0, + SVCR_SM_MASK); + + /* + * Unless we have FA64 FFR does not + * exist in streaming mode. + */ + if (!system_supports_fa64()) + ffr = false; + } + } + + sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - sve_vq_from_vl(sve_get_vl()) - 1); + ffr); __this_cpu_write(efi_sve_state_used, false); } else { @@ -1455,6 +2020,13 @@ static int __init fpsimd_init(void) if (!cpu_have_named_feature(ASIMD)) pr_notice("Advanced SIMD is not implemented\n"); - return sve_sysctl_init(); + + if (cpu_have_named_feature(SME) && !cpu_have_named_feature(SVE)) + pr_notice("SME is implemented but not SVE\n"); + + sve_sysctl_init(); + sme_sysctl_init(); + + return 0; } core_initcall(fpsimd_init); diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index dba774f3b8d7..dd61dd776ff4 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -259,8 +259,6 @@ int __init ftrace_dyn_arch_init(void) * on the way back to parent. For this purpose, this function is called * in _mcount() or ftrace_caller() to replace return address (*parent) on * the call stack to return_to_handler. - * - * Note that @frame_pointer is used only for sanity check later. 
*/ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) @@ -278,8 +276,10 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, */ old = *parent; - if (!function_graph_enter(old, self_addr, frame_pointer, NULL)) + if (!function_graph_enter(old, self_addr, frame_pointer, + (void *)frame_pointer)) { *parent = return_hooker; + } } #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ab6566bf1c33..88487de883f6 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -295,7 +295,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) #ifdef CONFIG_ARM64_VA_BITS_52 mrs_s x6, SYS_ID_AA64MMFR2_EL1 - and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT) + and x6, x6, #(0xf << ID_AA64MMFR2_EL1_VARange_SHIFT) mov x5, #52 cbnz x6, 1f #endif @@ -700,10 +700,10 @@ SYM_FUNC_END(__secondary_too_slow) */ SYM_FUNC_START(__enable_mmu) mrs x2, ID_AA64MMFR0_EL1 - ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN + ubfx x2, x2, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x2, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN b.lt __no_granule_support - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX + cmp x2, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX b.gt __no_granule_support update_early_cpu_boot_status 0, x2, x3 adrp x2, idmap_pg_dir @@ -726,7 +726,7 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) b.ne 2f mrs_s x0, SYS_ID_AA64MMFR2_EL1 - and x0, x0, #(0xf << ID_AA64MMFR2_LVA_SHIFT) + and x0, x0, #(0xf << ID_AA64MMFR2_EL1_VARange_SHIFT) cbnz x0, 2f update_early_cpu_boot_status \ diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 46a0b4d6e251..db93ce2b0113 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -326,11 +326,6 @@ static void swsusp_mte_restore_tags(void) unsigned long pfn = xa_state.xa_index; struct page *page = pfn_to_online_page(pfn); - /* - * It is not required to invoke page_kasan_tag_reset(page) - * at this point since the tags stored in page->flags are - * already restored. 
- */ mte_restore_page_tags(page_address(page), tags); mte_free_tag_storage(tags); diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 43d212618834..b466b48a3ce7 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -81,15 +81,15 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) // Needs to be VHE capable, obviously mrs x1, id_aa64mmfr1_el1 - ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4 + ubfx x1, x1, #ID_AA64MMFR1_EL1_VH_SHIFT, #4 cbz x1, 1f // Check whether VHE is disabled from the command line adr_l x1, id_aa64mmfr1_override ldr x2, [x1, FTR_OVR_VAL_OFFSET] ldr x1, [x1, FTR_OVR_MASK_OFFSET] - ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 - ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4 + ubfx x2, x2, #ID_AA64MMFR1_EL1_VH_SHIFT, #4 + ubfx x1, x1, #ID_AA64MMFR1_EL1_VH_SHIFT, #4 cmp x1, xzr and x2, x2, x1 csinv x2, x2, xzr, ne diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index d8e606fe3c21..681b1156ad4d 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -17,7 +17,7 @@ #define FTR_DESC_NAME_LEN 20 #define FTR_DESC_FIELD_LEN 10 #define FTR_ALIAS_NAME_LEN 30 -#define FTR_ALIAS_OPTION_LEN 80 +#define FTR_ALIAS_OPTION_LEN 116 struct ftr_set_desc { char name[FTR_DESC_NAME_LEN]; @@ -44,7 +44,7 @@ static const struct ftr_set_desc mmfr1 __initconst = { .name = "id_aa64mmfr1", .override = &id_aa64mmfr1_override, .fields = { - { "vh", ID_AA64MMFR1_VHE_SHIFT, mmfr1_vh_filter }, + { "vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter }, {} }, }; @@ -53,8 +53,8 @@ static const struct ftr_set_desc pfr1 __initconst = { .name = "id_aa64pfr1", .override = &id_aa64pfr1_override, .fields = { - { "bt", ID_AA64PFR1_BT_SHIFT }, - { "mte", ID_AA64PFR1_MTE_SHIFT}, + { "bt", ID_AA64PFR1_EL1_BT_SHIFT }, + { "mte", ID_AA64PFR1_EL1_MTE_SHIFT }, {} }, }; @@ -63,10 +63,20 @@ static const struct ftr_set_desc isar1 __initconst = { .name = "id_aa64isar1", .override = &id_aa64isar1_override, .fields = { - { "gpi", ID_AA64ISAR1_GPI_SHIFT }, - { "gpa", ID_AA64ISAR1_GPA_SHIFT }, - { "api", ID_AA64ISAR1_API_SHIFT }, - { "apa", ID_AA64ISAR1_APA_SHIFT }, + { "gpi", ID_AA64ISAR1_EL1_GPI_SHIFT }, + { "gpa", ID_AA64ISAR1_EL1_GPA_SHIFT }, + { "api", ID_AA64ISAR1_EL1_API_SHIFT }, + { "apa", ID_AA64ISAR1_EL1_APA_SHIFT }, + {} + }, +}; + +static const struct ftr_set_desc isar2 __initconst = { + .name = "id_aa64isar2", + .override = &id_aa64isar2_override, + .fields = { + { "gpa3", ID_AA64ISAR2_EL1_GPA3_SHIFT }, + { "apa3", ID_AA64ISAR2_EL1_APA3_SHIFT }, {} }, }; @@ -88,6 +98,7 @@ static const struct ftr_set_desc * const regs[] __initconst = { &mmfr1, &pfr1, &isar1, + &isar2, &kaslr, }; @@ -100,7 +111,8 @@ static const struct { { "arm64.nobti", "id_aa64pfr1.bt=0" }, { "arm64.nopauth", "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " - "id_aa64isar1.api=0 id_aa64isar1.apa=0" }, + "id_aa64isar1.api=0 id_aa64isar1.apa=0 " + "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" }, { "arm64.nomte", "id_aa64pfr1.mte=0" }, { "nokaslr", "kaslr.disabled=1" }, }; @@ -214,8 +226,11 @@ static __init void parse_cmdline(void) { const u8 *prop = get_bootargs_cmdline(); - if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || + IS_ENABLED(CONFIG_CMDLINE_FORCE) || + !prop) { __parse_cmdline(CONFIG_CMDLINE, true); + } if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) __parse_cmdline(prop, true); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index e03e60f9482b..8ca6df922da8 100644 --- 
a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -80,9 +80,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); /* Vectors installed by hyp-init on reset HVC. */ KVM_NVHE_ALIAS(__hyp_stub_vectors); -/* Kernel symbol used by icache_is_vpipt(). */ -KVM_NVHE_ALIAS(__icache_flags); - /* Kernel symbols needed for cpus_have_final/const_caps checks. */ KVM_NVHE_ALIAS(arm64_const_caps_ready); KVM_NVHE_ALIAS(cpu_hwcap_keys); @@ -102,11 +99,10 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities); KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); -/* Array containing bases of nVHE per-CPU memory regions. */ -KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); - /* PMU available static key */ +#ifdef CONFIG_HW_PERF_EVENTS KVM_NVHE_ALIAS(kvm_arm_pmu_available); +#endif /* Position-independent library routines */ KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); @@ -119,12 +115,6 @@ KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); #endif -/* Kernel memory sections */ -KVM_NVHE_ALIAS(__start_rodata); -KVM_NVHE_ALIAS(__end_rodata); -KVM_NVHE_ALIAS(__bss_start); -KVM_NVHE_ALIAS(__bss_stop); - /* Hyp memory sections */ KVM_NVHE_ALIAS(__hyp_idmap_text_start); KVM_NVHE_ALIAS(__hyp_idmap_text_end); @@ -132,8 +122,14 @@ KVM_NVHE_ALIAS(__hyp_text_start); KVM_NVHE_ALIAS(__hyp_text_end); KVM_NVHE_ALIAS(__hyp_bss_start); KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_data_start); +KVM_NVHE_ALIAS(__hyp_data_end); KVM_NVHE_ALIAS(__hyp_rodata_start); KVM_NVHE_ALIAS(__hyp_rodata_end); +#ifdef CONFIG_FTRACE +KVM_NVHE_ALIAS(__hyp_event_ids_start); +KVM_NVHE_ALIAS(__hyp_event_ids_end); +#endif /* pKVM static key */ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 309a27553c87..71737f6bdf6b 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -58,12 +58,13 @@ void *module_alloc(unsigned long size) PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } - return p; + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(p); } enum aarch64_reloc_op { @@ -519,14 +520,76 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr, return 0; } +static int module_init_hyp(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod) +{ +#ifdef CONFIG_KVM + const Elf_Shdr *s; + + /* + * If the .hyp.text is missing or empty, this is not a hypervisor + * module so ignore the rest of it. 
+ */ + s = find_section(hdr, sechdrs, ".hyp.text"); + if (!s || !s->sh_size) + return 0; + + mod->arch.hyp.text = (struct pkvm_module_section) { + .start = (void *)s->sh_addr, + .end = (void *)s->sh_addr + s->sh_size, + }; + + s = find_section(hdr, sechdrs, ".hyp.bss"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.bss = (struct pkvm_module_section) { + .start = (void *)s->sh_addr, + .end = (void *)s->sh_addr + s->sh_size, + }; + + s = find_section(hdr, sechdrs, ".hyp.rodata"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.rodata = (struct pkvm_module_section) { + .start = (void *)s->sh_addr, + .end = (void *)s->sh_addr + s->sh_size, + }; + + s = find_section(hdr, sechdrs, ".hyp.data"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.data = (struct pkvm_module_section) { + .start = (void *)s->sh_addr, + .end = (void *)s->sh_addr + s->sh_size, + }; + + s = find_section(hdr, sechdrs, ".hyp.reloc"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.relocs = (void *)s->sh_addr; + mod->arch.hyp.nr_relocs = s->sh_size / sizeof(*mod->arch.hyp.relocs); +#endif + return 0; +} + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { + int err; const Elf_Shdr *s; + s = find_section(hdr, sechdrs, ".altinstructions"); if (s) apply_alternatives_module((void *)s->sh_addr, s->sh_size); - return module_init_ftrace_plt(hdr, sechdrs, me); + err = module_init_ftrace_plt(hdr, sechdrs, me); + if (err) + return err; + + return module_init_hyp(hdr, sechdrs, me); } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index a3898bac5ae6..7b9177e57022 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -26,9 +27,12 @@ static DEFINE_PER_CPU_READ_MOSTLY(u64, mte_tcf_preferred); #ifdef CONFIG_KASAN_HW_TAGS -/* Whether the MTE asynchronous mode is enabled. */ -DEFINE_STATIC_KEY_FALSE(mte_async_mode); -EXPORT_SYMBOL_GPL(mte_async_mode); +/* + * The asynchronous and asymmetric MTE modes have the same behavior for + * store operations. This flag is set when either of these modes is enabled. + */ +DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); +EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); #endif static void mte_sync_page_tags(struct page *page, pte_t old_pte, @@ -44,15 +48,6 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte, if (!pte_is_tagged) return; - page_kasan_tag_reset(page); - /* - * We need smp_wmb() in between setting the flags and clearing the - * tags because if another thread reads page->flags and builds a - * tagged address out of it, there is an actual dependency to the - * memory access, but on the current thread we do not guarantee that - * the new page->flags are visible before the tags were updated. - */ - smp_wmb(); /* * Test PG_mte_tagged again in case it was racing with another * set_pte_at(). @@ -124,7 +119,7 @@ void mte_enable_kernel_sync(void) * Make sure we enter this function when no PE has set * async mode previously. */ - WARN_ONCE(system_uses_mte_async_mode(), + WARN_ONCE(system_uses_mte_async_or_asymm_mode(), "MTE async mode enabled system wide!"); __mte_enable_kernel("synchronous", SCTLR_ELx_TCF_SYNC); @@ -142,8 +137,34 @@ void mte_enable_kernel_async(void) * mode in between sync and async, this strategy needs * to be reviewed. 
*/ - if (!system_uses_mte_async_mode()) - static_branch_enable(&mte_async_mode); + if (!system_uses_mte_async_or_asymm_mode()) + static_branch_enable(&mte_async_or_asymm_mode); +} + +void mte_enable_kernel_asymm(void) +{ + if (cpus_have_cap(ARM64_MTE_ASYMM)) { + __mte_enable_kernel("asymmetric", SCTLR_ELx_TCF_ASYMM); + + /* + * MTE asymm mode behaves as async mode for store + * operations. The mode is set system wide by the + * first PE that executes this function. + * + * Note: If in future KASAN acquires a runtime switching + * mode in between sync and async, this strategy needs + * to be reviewed. + */ + if (!system_uses_mte_async_or_asymm_mode()) + static_branch_enable(&mte_async_or_asymm_mode); + } else { + /* + * If the CPU does not support MTE asymmetric mode the + * kernel falls back on synchronous mode which is the + * default for kasan=on. + */ + mte_enable_kernel_sync(); + } } #endif @@ -165,6 +186,11 @@ void mte_check_tfsr_el1(void) } #endif +/* + * This is where we actually resolve the system and process MTE mode + * configuration into an actual value in SCTLR_EL1 that affects + * userspace. + */ static void mte_update_sctlr_user(struct task_struct *task) { /* @@ -178,15 +204,56 @@ static void mte_update_sctlr_user(struct task_struct *task) unsigned long pref, resolved_mte_tcf; pref = __this_cpu_read(mte_tcf_preferred); + /* + * If there is no overlap between the system preferred and + * program requested values go with what was requested. + */ resolved_mte_tcf = (mte_ctrl & pref) ? pref : mte_ctrl; sctlr &= ~SCTLR_EL1_TCF0_MASK; - if (resolved_mte_tcf & MTE_CTRL_TCF_ASYNC) + /* + * Pick an actual setting. The order in which we check for + * set bits and map into register values determines our + * default order. + */ + if (resolved_mte_tcf & MTE_CTRL_TCF_ASYMM) + sctlr |= SCTLR_EL1_TCF0_ASYMM; + else if (resolved_mte_tcf & MTE_CTRL_TCF_ASYNC) sctlr |= SCTLR_EL1_TCF0_ASYNC; else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC) sctlr |= SCTLR_EL1_TCF0_SYNC; task->thread.sctlr_user = sctlr; } +static void mte_update_gcr_excl(struct task_struct *task) +{ + /* + * SYS_GCR_EL1 will be set to current->thread.mte_ctrl value by + * mte_set_user_gcr() in kernel_exit, but only if KASAN is enabled. + */ + if (kasan_hw_tags_enabled()) + return; + + write_sysreg_s( + ((task->thread.mte_ctrl >> MTE_CTRL_GCR_USER_EXCL_SHIFT) & + SYS_GCR_EL1_EXCL_MASK) | SYS_GCR_EL1_RRND, + SYS_GCR_EL1); +} + +#ifdef CONFIG_KASAN_HW_TAGS +/* Only called from assembly, silence sparse */ +void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst); + +void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); /* Branch -> NOP */ + + if (kasan_hw_tags_enabled()) + *updptr = cpu_to_le32(aarch64_insn_gen_nop()); +} +#endif + void mte_thread_init_user(void) { if (!system_supports_mte()) @@ -206,6 +273,10 @@ void mte_thread_switch(struct task_struct *next) return; mte_update_sctlr_user(next); + mte_update_gcr_excl(next); + + /* TCO may not have been disabled on exception entry for the current task. */ + mte_disable_tco_entry(next); /* * Check if an async tag exception occurred at EL1. @@ -298,10 +369,22 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg) if (arg & PR_MTE_TCF_SYNC) mte_ctrl |= MTE_CTRL_TCF_SYNC; + /* + * If the system supports it and both sync and async modes are + * specified then implicitly enable asymmetric mode. 
+ * Userspace could see a mix of both sync and async anyway due + * to differing or changing defaults on CPUs. + */ + if (cpus_have_cap(ARM64_MTE_ASYMM) && + (arg & PR_MTE_TCF_ASYNC) && + (arg & PR_MTE_TCF_SYNC)) + mte_ctrl |= MTE_CTRL_TCF_ASYMM; + task->thread.mte_ctrl = mte_ctrl; if (task == current) { preempt_disable(); mte_update_sctlr_user(task); + mte_update_gcr_excl(task); update_sctlr_el1(task->thread.sctlr_user); preempt_enable(); } @@ -471,6 +554,8 @@ static ssize_t mte_tcf_preferred_show(struct device *dev, return sysfs_emit(buf, "async\n"); case MTE_CTRL_TCF_SYNC: return sysfs_emit(buf, "sync\n"); + case MTE_CTRL_TCF_ASYMM: + return sysfs_emit(buf, "asymm\n"); default: return sysfs_emit(buf, "???\n"); } @@ -486,6 +571,8 @@ static ssize_t mte_tcf_preferred_store(struct device *dev, tcf = MTE_CTRL_TCF_ASYNC; else if (sysfs_streq(buf, "sync")) tcf = MTE_CTRL_TCF_SYNC; + else if (cpus_have_cap(ARM64_MTE_ASYMM) && sysfs_streq(buf, "asymm")) + tcf = MTE_CTRL_TCF_ASYMM; else return -EINVAL; @@ -513,3 +600,32 @@ static int register_mte_tcf_preferred_sysctl(void) return 0; } subsys_initcall(register_mte_tcf_preferred_sysctl); + +/* + * Return 0 on success, the number of bytes not probed otherwise. + */ +size_t mte_probe_user_range(const char __user *uaddr, size_t size) +{ + const char __user *end = uaddr + size; + int err = 0; + char val; + + __raw_get_user(val, uaddr, err); + if (err) + return size; + + uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE); + while (uaddr < end) { + /* + * A read is sufficient for mte, the caller should have probed + * for the pte write permission if required. + */ + __raw_get_user(val, uaddr, err); + if (err) + return end - uaddr; + uaddr += MTE_GRANULE_SIZE; + } + (void)val; + + return 0; +} diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index 33e0fabc0b79..b336073fe6b1 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -66,16 +66,16 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp) return ret; } -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +static int __kprobes __aarch64_text_write(void *dst, void *src, size_t size) { - void *waddr = addr; - unsigned long flags = 0; + unsigned long flags; + void *waddr; int ret; raw_spin_lock_irqsave(&patch_lock, flags); - waddr = patch_map(addr, FIX_TEXT_POKE0); + waddr = patch_map(dst, FIX_TEXT_POKE0); - ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + ret = copy_to_kernel_nofault(waddr, src, size); patch_unmap(FIX_TEXT_POKE0); raw_spin_unlock_irqrestore(&patch_lock, flags); @@ -85,7 +85,14 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) int __kprobes aarch64_insn_write(void *addr, u32 insn) { - return __aarch64_insn_write(addr, cpu_to_le32(insn)); + __le32 __insn = cpu_to_le32(insn); + + return __aarch64_text_write(addr, &__insn, AARCH64_INSN_SIZE); +} + +int __kprobes aarch64_addr_write(void *addr, u64 dst) +{ + return __aarch64_text_write(addr, &dst, sizeof(dst)); } int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 86d9f2013172..65b196e3ca6c 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -5,10 +5,10 @@ * Copyright (C) 2015 ARM Limited */ #include +#include #include #include -#include struct frame_tail { struct frame_tail __user *fp; @@ -102,9 +102,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, void perf_callchain_user(struct 
perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } @@ -134,50 +132,38 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, } } -/* - * Gets called by walk_stackframe() for every stackframe. This will be called - * whist unwinding the stackframe and is like a subroutine return so we use - * the PC. - */ static bool callchain_trace(void *data, unsigned long pc) { struct perf_callchain_entry_ctx *entry = data; - perf_callchain_store(entry, pc); - return true; + return perf_callchain_store(entry, pc) == 0; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - struct stackframe frame; - - if (guest_cbs && guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } - start_backtrace(&frame, regs->regs[29], regs->pc); - walk_stackframe(current, &frame, callchain_trace, entry); + arch_stack_walk(callchain_trace, entry, current, regs); } unsigned long perf_instruction_pointer(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->is_in_guest()) - return guest_cbs->get_guest_ip(); + if (perf_guest_state()) + return perf_guest_get_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + unsigned int guest_state = perf_guest_state(); int misc = 0; - if (guest_cbs && guest_cbs->is_in_guest()) { - if (guest_cbs->is_user_mode()) + if (guest_state) { + if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index b4044469527e..a132ab4bdef7 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -286,6 +286,9 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = { PMU_FORMAT_ATTR(event, "config:0-15"); PMU_FORMAT_ATTR(long, "config1:0"); +static int sysctl_perf_user_access __read_mostly; +static int sysctl_export_pmu_events __read_mostly; + static inline bool armv8pmu_event_is_64bit(struct perf_event *event) { return event->attr.config1 & 0x1; @@ -371,7 +374,7 @@ static const struct attribute_group armv8_pmuv3_caps_attr_group = { */ static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu) { - return (cpu_pmu->pmuver >= ID_AA64DFR0_PMUVER_8_5); + return (cpu_pmu->pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5); } /* @@ -953,6 +956,17 @@ static int armv8pmu_filter_match(struct perf_event *event) return evtype != ARMV8_PMUV3_PERFCTR_CHAIN; } +static int __init export_pmu_events(char *str) +{ + /* Enable exporting of pmu events at early bootup with kernel + * arguments. 
+ */ + sysctl_export_pmu_events = 1; + return 0; +} + +early_param("export_pmu_events", export_pmu_events); + static void armv8pmu_reset(void *info) { struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; @@ -975,6 +989,9 @@ static void armv8pmu_reset(void *info) if (armv8pmu_has_long_event(cpu_pmu)) pmcr |= ARMV8_PMU_PMCR_LP; + if (sysctl_export_pmu_events) + pmcr |= ARMV8_PMU_PMCR_X; + armv8pmu_pmcr_write(pmcr); } @@ -1054,8 +1071,8 @@ static void __armv8pmu_probe_pmu(void *info) dfr0 = read_sysreg(id_aa64dfr0_el1); pmuver = cpuid_feature_extract_unsigned_field(dfr0, - ID_AA64DFR0_PMUVER_SHIFT); - if (pmuver == ID_AA64DFR0_PMUVER_IMP_DEF || pmuver == 0) + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF || pmuver == 0) return; cpu_pmu->pmuver = pmuver; @@ -1081,7 +1098,7 @@ static void __armv8pmu_probe_pmu(void *info) pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); /* store PMMIR_EL1 register for sysfs */ - if (pmuver >= ID_AA64DFR0_PMUVER_8_4 && (pmceid_raw[1] & BIT(31))) + if (pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4 && (pmceid_raw[1] & BIT(31))) cpu_pmu->reg_pmmir = read_cpuid(PMMIR_EL1); else cpu_pmu->reg_pmmir = 0; @@ -1104,6 +1121,36 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) return probe.present ? 0 : -ENODEV; } +static struct ctl_table armv8_pmu_sysctl_table[] = { + { + .procname = "perf_user_access", + .data = &sysctl_perf_user_access, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "export_pmu_events", + .data = &sysctl_export_pmu_events, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static void armv8_pmu_register_sysctl_table(void) +{ + static u32 tbl_registered = 0; + + if (!cmpxchg_relaxed(&tbl_registered, 0, 1)) + register_sysctl("kernel", armv8_pmu_sysctl_table); +} + static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, int (*map_event)(struct perf_event *event), const struct attribute_group *events, @@ -1136,6 +1183,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ? 
caps : &armv8_pmuv3_caps_attr_group; + armv8_pmu_register_sysctl_table(); return 0; } @@ -1145,17 +1193,32 @@ static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name, return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL); } -static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_pmuv3", - armv8_pmuv3_map_event); +#define PMUV3_INIT_SIMPLE(name) \ +static int name##_pmu_init(struct arm_pmu *cpu_pmu) \ +{ \ + return armv8_pmu_init_nogroups(cpu_pmu, #name, armv8_pmuv3_map_event);\ } -static int armv8_a34_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a34", - armv8_pmuv3_map_event); -} +PMUV3_INIT_SIMPLE(armv8_pmuv3) + +PMUV3_INIT_SIMPLE(armv8_cortex_a34) +PMUV3_INIT_SIMPLE(armv8_cortex_a55) +PMUV3_INIT_SIMPLE(armv8_cortex_a65) +PMUV3_INIT_SIMPLE(armv8_cortex_a75) +PMUV3_INIT_SIMPLE(armv8_cortex_a76) +PMUV3_INIT_SIMPLE(armv8_cortex_a77) +PMUV3_INIT_SIMPLE(armv8_cortex_a78) +PMUV3_INIT_SIMPLE(armv9_cortex_a510) +PMUV3_INIT_SIMPLE(armv9_cortex_a710) +PMUV3_INIT_SIMPLE(armv8_cortex_x1) +PMUV3_INIT_SIMPLE(armv9_cortex_x2) +PMUV3_INIT_SIMPLE(armv8_neoverse_e1) +PMUV3_INIT_SIMPLE(armv8_neoverse_n1) +PMUV3_INIT_SIMPLE(armv9_neoverse_n2) +PMUV3_INIT_SIMPLE(armv8_neoverse_v1) + +PMUV3_INIT_SIMPLE(armv8_nvidia_carmel) +PMUV3_INIT_SIMPLE(armv8_nvidia_denver) static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) { @@ -1169,24 +1232,12 @@ static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) armv8_a53_map_event); } -static int armv8_a55_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a55", - armv8_pmuv3_map_event); -} - static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) { return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57", armv8_a57_map_event); } -static int armv8_a65_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a65", - armv8_pmuv3_map_event); -} - static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) { return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72", @@ -1199,42 +1250,6 @@ static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) armv8_a73_map_event); } -static int armv8_a75_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a75", - armv8_pmuv3_map_event); -} - -static int armv8_a76_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a76", - armv8_pmuv3_map_event); -} - -static int armv8_a77_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a77", - armv8_pmuv3_map_event); -} - -static int armv8_a78_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a78", - armv8_pmuv3_map_event); -} - -static int armv8_e1_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_e1", - armv8_pmuv3_map_event); -} - -static int armv8_n1_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_n1", - armv8_pmuv3_map_event); -} - static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) { return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder", @@ -1248,23 +1263,31 @@ static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) } static const struct of_device_id armv8_pmu_of_device_ids[] = { - {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_init}, - {.compatible = "arm,cortex-a34-pmu", .data = armv8_a34_pmu_init}, + {.compatible = 
"arm,armv8-pmuv3", .data = armv8_pmuv3_pmu_init}, + {.compatible = "arm,cortex-a34-pmu", .data = armv8_cortex_a34_pmu_init}, {.compatible = "arm,cortex-a35-pmu", .data = armv8_a35_pmu_init}, {.compatible = "arm,cortex-a53-pmu", .data = armv8_a53_pmu_init}, - {.compatible = "arm,cortex-a55-pmu", .data = armv8_a55_pmu_init}, + {.compatible = "arm,cortex-a55-pmu", .data = armv8_cortex_a55_pmu_init}, {.compatible = "arm,cortex-a57-pmu", .data = armv8_a57_pmu_init}, - {.compatible = "arm,cortex-a65-pmu", .data = armv8_a65_pmu_init}, + {.compatible = "arm,cortex-a65-pmu", .data = armv8_cortex_a65_pmu_init}, {.compatible = "arm,cortex-a72-pmu", .data = armv8_a72_pmu_init}, {.compatible = "arm,cortex-a73-pmu", .data = armv8_a73_pmu_init}, - {.compatible = "arm,cortex-a75-pmu", .data = armv8_a75_pmu_init}, - {.compatible = "arm,cortex-a76-pmu", .data = armv8_a76_pmu_init}, - {.compatible = "arm,cortex-a77-pmu", .data = armv8_a77_pmu_init}, - {.compatible = "arm,cortex-a78-pmu", .data = armv8_a78_pmu_init}, - {.compatible = "arm,neoverse-e1-pmu", .data = armv8_e1_pmu_init}, - {.compatible = "arm,neoverse-n1-pmu", .data = armv8_n1_pmu_init}, + {.compatible = "arm,cortex-a75-pmu", .data = armv8_cortex_a75_pmu_init}, + {.compatible = "arm,cortex-a76-pmu", .data = armv8_cortex_a76_pmu_init}, + {.compatible = "arm,cortex-a77-pmu", .data = armv8_cortex_a77_pmu_init}, + {.compatible = "arm,cortex-a78-pmu", .data = armv8_cortex_a78_pmu_init}, + {.compatible = "arm,cortex-a510-pmu", .data = armv9_cortex_a510_pmu_init}, + {.compatible = "arm,cortex-a710-pmu", .data = armv9_cortex_a710_pmu_init}, + {.compatible = "arm,cortex-x1-pmu", .data = armv8_cortex_x1_pmu_init}, + {.compatible = "arm,cortex-x2-pmu", .data = armv9_cortex_x2_pmu_init}, + {.compatible = "arm,neoverse-e1-pmu", .data = armv8_neoverse_e1_pmu_init}, + {.compatible = "arm,neoverse-n1-pmu", .data = armv8_neoverse_n1_pmu_init}, + {.compatible = "arm,neoverse-n2-pmu", .data = armv9_neoverse_n2_pmu_init}, + {.compatible = "arm,neoverse-v1-pmu", .data = armv8_neoverse_v1_pmu_init}, {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, {.compatible = "brcm,vulcan-pmu", .data = armv8_vulcan_pmu_init}, + {.compatible = "nvidia,carmel-pmu", .data = armv8_nvidia_carmel_pmu_init}, + {.compatible = "nvidia,denver-pmu", .data = armv8_nvidia_denver_pmu_init}, {}, }; @@ -1287,7 +1310,7 @@ static int __init armv8_pmu_driver_init(void) if (acpi_disabled) return platform_driver_register(&armv8_pmu_driver); else - return arm_pmu_acpi_probe(armv8_pmuv3_init); + return arm_pmu_acpi_probe(armv8_pmuv3_pmu_init); } device_initcall(armv8_pmu_driver_init) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 23efabcb00b8..c592a027a330 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -40,6 +40,9 @@ #include #include #include +#include +#include +#include #include #include @@ -245,10 +248,13 @@ void show_regs(struct pt_regs *regs) __show_regs(regs); dump_backtrace(regs, NULL, KERN_DEFAULT); } +EXPORT_SYMBOL_GPL(show_regs); static void tls_thread_flush(void) { write_sysreg(0, tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(0, SYS_TPIDR2_EL0); if (is_compat_task()) { current->thread.uw.tp_value = 0; @@ -297,16 +303,42 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's sve_state + * get erroneously used or freed prematurely. 
dst's copies * will be allocated on demand later on if dst uses SVE. * For consistency, also clear TIF_SVE here: this could be done * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF_SVE and sve_state in + * maintainers it is best not to leave TIF flags and buffers in * an inconsistent state, even temporarily. */ dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + /* + * In the unlikely event that we create a new thread with ZA + * enabled we should retain the ZA state so duplicate it here. + * This may be freed shortly if we exec() or if CLONE_SETTLS + * is used, but it's simpler to do it here. To avoid confusing + * the rest of the code ensure that we have a sve_state allocated + * whenever za_state is allocated. + */ + if (thread_za_enabled(&src->thread)) { + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + dst->thread.za_state = kmemdup(src->thread.za_state, + za_state_size(src), + GFP_KERNEL); + if (!dst->thread.za_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + } else { + dst->thread.za_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + } + /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@ -342,6 +374,8 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2()) + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (stack_start) { if (is_compat_thread(task_thread_info(p))) @@ -352,10 +386,12 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, /* * If a TLS pointer was passed to clone, use it for the new - * thread. + * thread. We also reset TPIDR2 if it's in use. */ - if (clone_flags & CLONE_SETTLS) + if (clone_flags & CLONE_SETTLS) { p->thread.uw.tp_value = tls; + p->thread.tpidr2_el0 = 0; + } } else { /* * A kthread has no context to ERET to, so ensure any buggy @@ -386,6 +422,8 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, void tls_preserve_current_state(void) { *task_user_tls(current) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2() && !is_compat_task()) + current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); } static void tls_thread_switch(struct task_struct *next) @@ -398,6 +436,8 @@ static void tls_thread_switch(struct task_struct *next) write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(next->thread.tpidr2_el0, SYS_TPIDR2_EL0); } /* @@ -482,7 +522,8 @@ void update_sctlr_el1(u64 sctlr) /* * Thread switching. */ -__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, +__notrace_funcgraph __sched +struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next) { struct task_struct *last; @@ -496,6 +537,12 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, erratum_1418040_thread_switch(next); ptrauth_thread_switch_user(next); + /* + * The vendor hook is needed before the dsb() + * because MPAM is related to cache maintenance. + */ + trace_android_vh_mpam_set(prev, next); + /* * Complete any pending TLB or cache maintenance on this CPU in case * the thread migrates to a different CPU.
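[Aside, not part of the patch: the TPIDR2_EL0 handling added above gives each thread its own TPIDR2 value, saved and restored on context switch and zeroed when CLONE_SETTLS is used. A minimal sketch of how userspace might touch the register directly, assuming an SME-capable CPU and a kernel carrying this series (which enables EL0 access); the helper names are illustrative, only the S3_3_C13_C0_5 sysreg encoding is architectural:]

	/* Illustrative helpers only; S3_3_C13_C0_5 encodes TPIDR2_EL0. */
	static inline unsigned long tpidr2_read(void)
	{
		unsigned long val;

		asm volatile("mrs %0, S3_3_C13_C0_5" : "=r" (val));
		return val;
	}

	static inline void tpidr2_write(unsigned long val)
	{
		asm volatile("msr S3_3_C13_C0_5, %0" : : "r" (val));
	}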
@@ -514,38 +561,48 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.sctlr_user != next->thread.sctlr_user) update_sctlr_el1(next->thread.sctlr_user); + trace_android_vh_is_fpsimd_save(prev, next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); return last; } +struct wchan_info { + unsigned long pc; + int count; +}; + +static bool get_wchan_cb(void *arg, unsigned long pc) +{ + struct wchan_info *wchan_info = arg; + + if (!in_sched_functions(pc)) { + wchan_info->pc = pc; + return false; + } + return wchan_info->count++ < 16; +} + unsigned long get_wchan(struct task_struct *p) { - struct stackframe frame; - unsigned long stack_page, ret = 0; - int count = 0; + struct wchan_info wchan_info = { + .pc = 0, + .count = 0, + }; + if (!p || p == current || task_is_running(p)) return 0; - stack_page = (unsigned long)try_get_task_stack(p); - if (!stack_page) + if (!try_get_task_stack(p)) return 0; - start_backtrace(&frame, thread_saved_fp(p), thread_saved_pc(p)); + arch_stack_walk(get_wchan_cb, &wchan_info, p, NULL); - do { - if (unwind_frame(p, &frame)) - goto out; - if (!in_sched_functions(frame.pc)) { - ret = frame.pc; - goto out; - } - } while (count++ < 16); - -out: put_task_stack(p); - return ret; + + return wchan_info.pc; } unsigned long arch_align_stack(unsigned long sp) @@ -628,7 +685,8 @@ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) return -EINVAL; if (system_supports_mte()) - valid_mask |= PR_MTE_TCF_MASK | PR_MTE_TAG_MASK; + valid_mask |= PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC \ + | PR_MTE_TAG_MASK; if (arg & ~valid_mask) return -EINVAL; diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 428cfabd11c4..b78e5b779ade 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -168,7 +168,7 @@ static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) /* If the CPU has CSV2 set, we're safe */ pfr0 = read_cpuid(ID_AA64PFR0_EL1); - if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT)) return SPECTRE_UNAFFECTED; /* Alternatively, we have a list of unaffected CPUs */ @@ -951,7 +951,7 @@ static bool supports_ecbhb(int scope) mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); return cpuid_feature_extract_unsigned_field(mmfr1, - ID_AA64MMFR1_ECBHB_SHIFT); + ID_AA64MMFR1_EL1_ECBHB_SHIFT); } bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index e26196a33cf4..d29229d6139c 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -714,21 +714,51 @@ static int system_call_set(struct task_struct *target, #ifdef CONFIG_ARM64_SVE static void sve_init_header_from_task(struct user_sve_header *header, - struct task_struct *target) + struct task_struct *target, + enum vec_type type) { unsigned int vq; + bool active; + bool fpsimd_only; + enum vec_type task_type; memset(header, 0, sizeof(*header)); - header->flags = test_tsk_thread_flag(target, TIF_SVE) ? 
- SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD; - if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) - header->flags |= SVE_PT_VL_INHERIT; + /* Check if the requested registers are active for the task */ + if (thread_sm_enabled(&target->thread)) + task_type = ARM64_VEC_SME; + else + task_type = ARM64_VEC_SVE; + active = (task_type == type); - header->vl = target->thread.sve_vl; + switch (type) { + case ARM64_VEC_SVE: + if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); + break; + case ARM64_VEC_SME: + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = false; + break; + default: + WARN_ON_ONCE(1); + return; + } + + if (active) { + if (fpsimd_only) { + header->flags |= SVE_PT_REGS_FPSIMD; + } else { + header->flags |= SVE_PT_REGS_SVE; + } + } + + header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); - header->max_vl = sve_max_vl; + header->max_vl = vec_max_vl(type); header->size = SVE_PT_SIZE(vq, header->flags); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); @@ -739,19 +769,17 @@ static unsigned int sve_size_from_header(struct user_sve_header const *header) return ALIGN(header->size, SVE_VQ_BYTES); } -static int sve_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) +static int sve_get_common(struct task_struct *target, + const struct user_regset *regset, + struct membuf to, + enum vec_type type) { struct user_sve_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) - return -EINVAL; - /* Header */ - sve_init_header_from_task(&header, target); + sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); @@ -759,49 +787,61 @@ static int sve_get(struct task_struct *target, if (target == current) fpsimd_preserve_current_state(); - /* Registers: FPSIMD-only case */ - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) + BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + + switch ((header.flags & SVE_PT_REGS_MASK)) { + case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); - /* Otherwise: full SVE case */ + case SVE_PT_REGS_SVE: + start = SVE_PT_SVE_OFFSET; + end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); + membuf_write(&to, target->thread.sve_state, end - start); - BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); - start = SVE_PT_SVE_OFFSET; - end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); - membuf_write(&to, target->thread.sve_state, end - start); + start = end; + end = SVE_PT_SVE_FPSR_OFFSET(vq); + membuf_zero(&to, end - start); - start = end; - end = SVE_PT_SVE_FPSR_OFFSET(vq); - membuf_zero(&to, end - start); + /* + * Copy fpsr, and fpcr which must follow contiguously in + * struct fpsimd_state: + */ + start = end; + end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; + membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, + end - start); - /* - * Copy fpsr, and fpcr which must follow contiguously in - * struct fpsimd_state: - */ - start = end; - end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; - membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, end - start); + start = end; + end = sve_size_from_header(&header); + return membuf_zero(&to, end - start); - start = end; - end = sve_size_from_header(&header); - return membuf_zero(&to, end - 
start); + default: + return 0; + } } -static int sve_set(struct task_struct *target, +static int sve_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + struct membuf to) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SVE); +} + +static int sve_set_common(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + enum vec_type type) { int ret; struct user_sve_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) - return -EINVAL; - /* Header */ if (count < sizeof(header)) return -EINVAL; @@ -812,15 +852,39 @@ static int sve_set(struct task_struct *target, /* * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by - * sve_set_vector_length(), which will also validate them for us: + * vec_set_vector_length(), which will also validate them for us: */ - ret = sve_set_vector_length(target, header.vl, + ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) goto out; /* Actual VL set may be less than the user asked for: */ - vq = sve_vq_from_vl(target->thread.sve_vl); + vq = sve_vq_from_vl(task_get_vl(target, type)); + + /* Enter/exit streaming mode */ + if (system_supports_sme()) { + u64 old_svcr = target->thread.svcr; + + switch (type) { + case ARM64_VEC_SVE: + target->thread.svcr &= ~SVCR_SM_MASK; + break; + case ARM64_VEC_SME: + target->thread.svcr |= SVCR_SM_MASK; + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + /* + * If we switched then invalidate any existing SVE + * state and ensure there's storage. + */ + if (target->thread.svcr != old_svcr) + sve_alloc(target, true); + } /* Registers: FPSIMD-only case */ @@ -829,10 +893,15 @@ static int sve_set(struct task_struct *target, ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, SVE_PT_FPSIMD_OFFSET); clear_tsk_thread_flag(target, TIF_SVE); + if (type == ARM64_VEC_SME) + fpsimd_force_sync_to_sve(target); goto out; } - /* Otherwise: full SVE case */ + /* + * Otherwise: no registers or full SVE case. For backwards + * compatibility reasons we treat empty flags as SVE registers. + */ /* * If setting a different VL from the requested VL and there is @@ -844,7 +913,7 @@ static int sve_set(struct task_struct *target, goto out; } - sve_alloc(target); + sve_alloc(target, true); if (!target->thread.sve_state) { ret = -ENOMEM; clear_tsk_thread_flag(target, TIF_SVE); @@ -853,8 +922,9 @@ static int sve_set(struct task_struct *target, /* * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing registers - * unmodified. + * FPSIMD regs, so that a short copyin leaves trailing + * registers unmodified. Always enable SVE even if going into + * streaming mode. 
*/ fpsimd_sync_to_sve(target); set_tsk_thread_flag(target, TIF_SVE); @@ -890,8 +960,181 @@ out: return ret; } +static int sve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SVE); +} + #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int ssve_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SME); +} + +static int ssve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SME); +} + +static int za_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + memset(&header, 0, sizeof(header)); + + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header.flags |= ZA_PT_VL_INHERIT; + + header.vl = task_get_sme_vl(target); + vq = sve_vq_from_vl(header.vl); + header.max_vl = sme_max_vl(); + header.max_size = ZA_PT_SIZE(vq); + + /* If ZA is not active there is only the header */ + if (thread_za_enabled(&target->thread)) + header.size = ZA_PT_SIZE(vq); + else + header.size = ZA_PT_ZA_OFFSET; + + membuf_write(&to, &header, sizeof(header)); + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + end = ZA_PT_ZA_OFFSET; + + if (target == current) + fpsimd_preserve_current_state(); + + /* Any register data to include? 
*/ + if (thread_za_enabled(&target->thread)) { + start = end; + end = ZA_PT_SIZE(vq); + membuf_write(&to, target->thread.za_state, end - start); + } + + /* Zero any trailing padding */ + start = end; + end = ALIGN(header.size, SVE_VQ_BYTES); + return membuf_zero(&to, end - start); +} + +static int za_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + if (count < sizeof(header)) + return -EINVAL; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, + 0, sizeof(header)); + if (ret) + goto out; + + /* + * All current ZA_PT_* flags are consumed by + * vec_set_vector_length(), which will also validate them for + * us: + */ + ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl, + ((unsigned long)header.flags) << 16); + if (ret) + goto out; + + /* Actual VL set may be less than the user asked for: */ + vq = sve_vq_from_vl(task_get_sme_vl(target)); + + /* Ensure there is some SVE storage for streaming mode */ + if (!target->thread.sve_state) { + sve_alloc(target, false); + if (!target->thread.sve_state) { + clear_thread_flag(TIF_SME); + ret = -ENOMEM; + goto out; + } + } + + /* Allocate/reinit ZA storage */ + sme_alloc(target); + if (!target->thread.za_state) { + ret = -ENOMEM; + clear_tsk_thread_flag(target, TIF_SME); + goto out; + } + + /* If there is no data then disable ZA */ + if (!count) { + target->thread.svcr &= ~SVCR_ZA_MASK; + goto out; + } + + /* + * If setting a different VL from the requested VL and there is + * register data, the data layout will be wrong: don't even + * try to set the registers in this case. 
+ */ + if (vq != sve_vq_from_vl(header.vl)) { + ret = -EIO; + goto out; + } + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + start = ZA_PT_ZA_OFFSET; + end = ZA_PT_SIZE(vq); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.za_state, + start, end); + if (ret) + goto out; + + /* Mark ZA as active and let userspace use it */ + set_tsk_thread_flag(target, TIF_SME); + target->thread.svcr |= SVCR_ZA_MASK; + +out: + fpsimd_flush_task_state(target); + return ret; +} + +#endif /* CONFIG_ARM64_SME */ + #ifdef CONFIG_ARM64_PTR_AUTH static int pac_mask_get(struct task_struct *target, const struct user_regset *regset, @@ -1109,6 +1352,10 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_SVE REGSET_SVE, #endif +#ifdef CONFIG_ARM64_SME + REGSET_SSVE, + REGSET_ZA, +#endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, REGSET_PAC_ENABLED_KEYS, @@ -1189,6 +1436,25 @@ static const struct user_regset aarch64_regsets[] = { .set = sve_set, }, #endif +#ifdef CONFIG_ARM64_SME + [REGSET_SSVE] = { /* Streaming mode SVE */ + .core_note_type = NT_ARM_SSVE, + .n = DIV_ROUND_UP(SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE), + SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = ssve_get, + .set = ssve_set, + }, + [REGSET_ZA] = { /* SME ZA */ + .core_note_type = NT_ARM_ZA, + .n = DIV_ROUND_UP(ZA_PT_ZA_SIZE(SVE_VQ_MAX), SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = za_get, + .set = za_set, + }, +#endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { .core_note_type = NT_ARM_PAC_MASK, diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index a6d18755652f..68330017d04f 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -9,9 +9,9 @@ #include #include #include +#include #include -#include struct return_address_data { unsigned int level; @@ -35,15 +35,11 @@ NOKPROBE_SYMBOL(save_return_addr); void *return_address(unsigned int level) { struct return_address_data data; - struct stackframe frame; data.level = level + 2; data.addr = NULL; - start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)return_address); - walk_stackframe(current, &frame, save_return_addr, &data); + arch_stack_walk(save_return_addr, &data, current, NULL); if (!data.level) return data.addr; diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 644b7cd38e3e..adb574d0bb0f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include #include #include +#include #include #include @@ -446,3 +448,10 @@ static int __init register_arm64_panic_block(void) return 0; } device_initcall(register_arm64_panic_block); + +void kvm_arm_init_hyp_services(void) +{ + kvm_init_ioremap_services(); + kvm_init_memshare_services(); + kvm_init_memrelinquish_services(); +} diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b3e1beccf458..06dd20b3c88d 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -57,6 +57,7 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; unsigned long sve_offset; + unsigned long za_offset; unsigned long extra_offset; unsigned long end_offset; }; @@ -219,6 +220,7 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) struct user_ctxs { struct fpsimd_context __user *fpsimd; struct sve_context __user *sve; + struct za_context __user *za; };
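[Aside, not part of the patch: the NT_ARM_ZA regset added above is fetched like any other regset; per za_get(), a returned header with size == ZA_PT_ZA_OFFSET means ZA is inactive and no register data follows. A hedged userspace sketch, assuming UAPI headers from a kernel carrying this series; read_za_header() is an illustrative name:]

	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/uio.h>
	#include <linux/elf.h>		/* NT_ARM_ZA */
	#include <asm/ptrace.h>		/* struct user_za_header */

	/* Fetch just the ZA header from a stopped tracee. */
	static long read_za_header(pid_t pid, struct user_za_header *hdr)
	{
		struct iovec iov = {
			.iov_base = hdr,
			.iov_len = sizeof(*hdr),
		};

		return ptrace(PTRACE_GETREGSET, pid, (void *)NT_ARM_ZA, &iov);
	}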
#ifdef CONFIG_ARM64_SVE @@ -227,11 +229,17 @@ static int preserve_sve_context(struct sve_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; - unsigned int vl = current->thread.sve_vl; + u16 flags = 0; + unsigned int vl = task_get_sve_vl(current); unsigned int vq = 0; - if (test_thread_flag(TIF_SVE)) + if (thread_sm_enabled(&current->thread)) { + vl = task_get_sme_vl(current); vq = sve_vq_from_vl(vl); + flags |= SVE_SIG_FLAG_SM; + } else if (test_thread_flag(TIF_SVE)) { + vq = sve_vq_from_vl(vl); + } memset(reserved, 0, sizeof(reserved)); @@ -239,6 +247,7 @@ static int preserve_sve_context(struct sve_context __user *ctx) __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); + __put_user_error(flags, &ctx->flags, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); @@ -259,18 +268,28 @@ static int preserve_sve_context(struct sve_context __user *ctx) static int restore_sve_fpsimd_context(struct user_ctxs *user) { int err; - unsigned int vq; + unsigned int vl, vq; struct user_fpsimd_state fpsimd; struct sve_context sve; if (__copy_from_user(&sve, user->sve, sizeof(sve))) return -EFAULT; - if (sve.vl != current->thread.sve_vl) + if (sve.flags & SVE_SIG_FLAG_SM) { + if (!system_supports_sme()) + return -EINVAL; + + vl = task_get_sme_vl(current); + } else { + vl = task_get_sve_vl(current); + } + + if (sve.vl != vl) return -EINVAL; if (sve.head.size <= sizeof(*user->sve)) { clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; goto fpsimd_only; } @@ -289,7 +308,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) fpsimd_flush_task_state(current); /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sve_alloc(current); + sve_alloc(current, true); if (!current->thread.sve_state) { clear_thread_flag(TIF_SVE); return -ENOMEM; @@ -302,7 +321,10 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return -EFAULT; - set_thread_flag(TIF_SVE); + if (sve.flags & SVE_SIG_FLAG_SM) + current->thread.svcr |= SVCR_SM_MASK; + else + set_thread_flag(TIF_SVE); fpsimd_only: /* copy the FP and status/control registers */ @@ -327,6 +349,101 @@ extern int restore_sve_fpsimd_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int preserve_za_context(struct za_context __user *ctx) +{ + int err = 0; + u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + unsigned int vl = task_get_sme_vl(current); + unsigned int vq; + + if (thread_za_enabled(&current->thread)) + vq = sve_vq_from_vl(vl); + else + vq = 0; + + memset(reserved, 0, sizeof(reserved)); + + __put_user_error(ZA_MAGIC, &ctx->head.magic, err); + __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16), + &ctx->head.size, err); + __put_user_error(vl, &ctx->vl, err); + BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); + err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); + + if (vq) { + /* + * This assumes that the ZA state has already been saved to + * the task struct by calling the function + * fpsimd_signal_preserve_current_state(). + */ + err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, + current->thread.za_state, + ZA_SIG_REGS_SIZE(vq)); + } + + return err ?
-EFAULT : 0; +} + +static int restore_za_context(struct user_ctxs *user) +{ + int err; + unsigned int vq; + struct za_context za; + + if (__copy_from_user(&za, user->za, sizeof(za))) + return -EFAULT; + + if (za.vl != task_get_sme_vl(current)) + return -EINVAL; + + if (za.head.size <= sizeof(*user->za)) { + current->thread.svcr &= ~SVCR_ZA_MASK; + return 0; + } + + vq = sve_vq_from_vl(za.vl); + + if (za.head.size < ZA_SIG_CONTEXT_SIZE(vq)) + return -EINVAL; + + /* + * Careful: we are about to __copy_from_user() directly into + * thread.za_state with preemption enabled, so protection is + * needed to prevent a racing context switch from writing stale + * registers back over the new data. + */ + + fpsimd_flush_task_state(current); + /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ + + sme_alloc(current); + if (!current->thread.za_state) { + current->thread.svcr &= ~SVCR_ZA_MASK; + clear_thread_flag(TIF_SME); + return -ENOMEM; + } + + err = __copy_from_user(current->thread.za_state, + (char __user const *)user->za + + ZA_SIG_REGS_OFFSET, + ZA_SIG_REGS_SIZE(vq)); + if (err) + return -EFAULT; + + set_thread_flag(TIF_SME); + current->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} +#else /* ! CONFIG_ARM64_SME */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_za_context(void __user *ctx); +extern int restore_za_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_SME */ static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) @@ -341,6 +458,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpsimd = NULL; user->sve = NULL; + user->za = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -394,7 +512,7 @@ static int parse_user_sigframe(struct user_ctxs *user, break; case SVE_MAGIC: - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) goto invalid; if (user->sve) @@ -406,6 +524,19 @@ static int parse_user_sigframe(struct user_ctxs *user, user->sve = (struct sve_context __user *)head; break; + case ZA_MAGIC: + if (!system_supports_sme()) + goto invalid; + + if (user->za) + goto invalid; + + if (size < sizeof(*user->za)) + goto invalid; + + user->za = (struct za_context __user *)head; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -529,6 +660,9 @@ static int restore_sigframe(struct pt_regs *regs, } } + if (err == 0 && system_supports_sme() && user.za) + err = restore_za_context(&user); + return err; } @@ -595,11 +729,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, if (system_supports_sve()) { unsigned int vq = 0; - if (add_all || test_thread_flag(TIF_SVE)) { - int vl = sve_max_vl; + if (add_all || test_thread_flag(TIF_SVE) || + thread_sm_enabled(&current->thread)) { + int vl = max(sve_max_vl(), sme_max_vl()); if (!add_all) - vl = current->thread.sve_vl; + vl = thread_get_cur_vl(&current->thread); vq = sve_vq_from_vl(vl); } @@ -610,6 +745,24 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } + if (system_supports_sme()) { + unsigned int vl; + unsigned int vq = 0; + + if (add_all) + vl = sme_max_vl(); + else + vl = task_get_sme_vl(current); + + if (thread_za_enabled(&current->thread)) + vq = sve_vq_from_vl(vl); + + err = sigframe_alloc(user, &user->za_offset, + ZA_SIG_CONTEXT_SIZE(vq)); + if (err) + return err; + } + return sigframe_alloc_end(user); } @@ -650,13 +803,21 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
@@ -650,13 +803,21 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } - /* Scalable Vector Extension state, if present */ - if (system_supports_sve() && err == 0 && user->sve_offset) { + /* Scalable Vector Extension state (including streaming), if present */ + if ((system_supports_sve() || system_supports_sme()) && + err == 0 && user->sve_offset) { struct sve_context __user *sve_ctx = apply_user_offset(user, user->sve_offset); err |= preserve_sve_context(sve_ctx); } + /* ZA state if present */ + if (system_supports_sme() && err == 0 && user->za_offset) { + struct za_context __user *za_ctx = + apply_user_offset(user, user->za_offset); + err |= preserve_za_context(za_ctx); + } + if (err == 0 && user->extra_offset) { char __user *sfp = (char __user *)user->sigframe; char __user *userp =
@@ -760,6 +921,23 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, /* TCO (Tag Check Override) always cleared for signal handlers */ regs->pstate &= ~PSR_TCO_BIT; + /* Signal handlers are invoked with ZA and streaming mode disabled */ + if (system_supports_sme()) { + /* + * If we were in streaming mode the saved register + * state was SVE but we will exit SM and use the + * FPSIMD register state - flush the saved FPSIMD + * register state in case it gets loaded. + */ + if (current->thread.svcr & SVCR_SM_MASK) + memset(&current->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); + + current->thread.svcr &= ~(SVCR_ZA_MASK | + SVCR_SM_MASK); + sme_smstop(); + } + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 3beaa6640ab3..7fec4e38e8e2 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c
@@ -53,9 +53,14 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); EXPORT_PER_CPU_SYMBOL(cpu_number); +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_raise); +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_entry); +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_exit); /* * as from 2.5, kernels no longer have an init_tasks structure
@@ -903,6 +908,7 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_STOP: + trace_android_vh_ipi_stop(get_irq_regs()); local_cpu_stop(); break;
@@ -994,7 +1000,12 @@ void __init set_smp_ipi_range(int ipi_base, int n) WARN_ON(err); ipi_desc[i] = irq_to_desc(ipi_base + i); - irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); + + if (i != IPI_RESCHEDULE) + irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); + else + /* The rescheduling IPI is special... */ + irq_set_status_flags(ipi_base + i, IRQ_HIDDEN|IRQ_RAW); } ipi_irq_base = ipi_base;
@@ -1128,5 +1139,18 @@ bool cpus_are_stuck_in_kernel(void) { bool smp_spin_tables = (num_possible_cpus() > 1 && !have_cpu_die()); - return !!cpus_stuck_in_kernel || smp_spin_tables; + return !!cpus_stuck_in_kernel || smp_spin_tables || + is_protected_kvm_enabled(); } + +int nr_ipi_get(void) +{ + return nr_ipi; +} +EXPORT_SYMBOL_GPL(nr_ipi_get); + +struct irq_desc **ipi_desc_get(void) +{ + return ipi_desc; +} +EXPORT_SYMBOL_GPL(ipi_desc_get);
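nr_ipi_get() and ipi_desc_get() are exported above so that vendor modules can inspect the arm64 IPI descriptors without the arch making nr_ipi or ipi_desc global. A hedged sketch of a consumer (the helper name is ours; per-IRQ statistics live in the kstat_irqs per-CPU counters of struct irq_desc, per <linux/irqdesc.h>):

#include <linux/cpumask.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* Sum delivery counts of every arm64 IPI across all possible CPUs. */
static u64 total_ipi_count(void)
{
	struct irq_desc **descs = ipi_desc_get();
	u64 total = 0;
	int i, cpu;

	for (i = 0; i < nr_ipi_get(); i++) {
		if (!descs[i])
			continue;
		for_each_possible_cpu(cpu)
			total += *per_cpu_ptr(descs[i]->kstat_irqs, cpu);
	}

	return total;
}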
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 3b8dc538a4c4..31aa86480654 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c
@@ -7,55 +7,90 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include /* - * AArch64 PCS assigns the frame pointer to x29. + * Start an unwind from a pt_regs. * - * A simple function prologue looks like this: - * sub sp, sp, #0x10 - * stp x29, x30, [sp] - * mov x29, sp + * The unwind will begin at the PC within the regs. * - * A simple function epilogue looks like this: - * mov sp, x29 - * ldp x29, x30, [sp] - * add sp, sp, #0x10 + * The regs must be on a stack currently owned by the calling task. */ - - -notrace void start_backtrace(struct stackframe *frame, unsigned long fp, - unsigned long pc) +static inline void unwind_init_from_regs(struct unwind_state *state, + struct pt_regs *regs) { - frame->fp = fp; - frame->pc = pc; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame->graph = 0; -#endif + unwind_init_common(state, current); - /* - * Prime the first unwind. - * - * In unwind_frame() we'll check that the FP points to a valid stack, - * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be - * treated as a transition to whichever stack that happens to be. The - * prev_fp value won't be used, but we set it to 0 such that it is - * definitely not an accessible stack address. - */ - bitmap_zero(frame->stacks_done, __NR_STACK_TYPES); - frame->prev_fp = 0; - frame->prev_type = STACK_TYPE_UNKNOWN; + state->fp = regs->regs[29]; + state->pc = regs->pc; +} + +/* + * Start an unwind from a caller. + * + * The unwind will begin at the caller of whichever function this is inlined + * into. + * + * The function which invokes this must be noinline. + */ +static __always_inline void unwind_init_from_caller(struct unwind_state *state) +{ + unwind_init_common(state, current); + + state->fp = (unsigned long)__builtin_frame_address(1); + state->pc = (unsigned long)__builtin_return_address(0); +} + +/* + * Start an unwind from a blocked task. + * + * The unwind will begin at the blocked task's saved PC (i.e. the caller of + * cpu_switch_to()). + * + * The caller should ensure the task is blocked in cpu_switch_to() for the + * duration of the unwind, or the unwind will be bogus. It is never valid to + * call this for the current task. + */ +static inline void unwind_init_from_task(struct unwind_state *state, + struct task_struct *task) +{ + unwind_init_common(state, task); + + state->fp = thread_saved_fp(task); + state->pc = thread_saved_pc(task); +} + +/* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. + */ +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + if (on_task_stack(tsk, sp, size, info)) + return true; + if (tsk != current || preemptible()) + return false; + if (on_irq_stack(sp, size, info)) + return true; + if (on_overflow_stack(sp, size, info)) + return true; + if (on_sdei_stack(sp, size, info)) + return true; + + return false; } -NOKPROBE_SYMBOL(start_backtrace); /* * Unwind from one frame record (A) to the next frame record (B).
@@ -64,112 +99,74 @@ NOKPROBE_SYMBOL(start_backtrace); * records (e.g. a cycle), determined based on the location and fp value of A * and the location (but not the fp value) of B.
*/ -int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) +static int notrace unwind_next(struct unwind_state *state) { - unsigned long fp = frame->fp; + struct task_struct *tsk = state->task; + unsigned long fp = state->fp; struct stack_info info; - - if (!tsk) - tsk = current; + int err; /* Final frame; nothing to unwind */ if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; - if (fp & 0x7) - return -EINVAL; + err = unwind_next_common(state, &info, on_accessible_stack, NULL); + if (err) + return err; - if (!on_accessible_stack(tsk, fp, 16, &info)) - return -EINVAL; - - if (test_bit(info.type, frame->stacks_done)) - return -EINVAL; - - /* - * As stacks grow downward, any valid record on the same stack must be - * at a strictly higher address than the prior record. - * - * Stacks can nest in several valid orders, e.g. - * - * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL - * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW - * - * ... but the nesting itself is strict. Once we transition from one - * stack to another, it's never valid to unwind back to that first - * stack. - */ - if (info.type == frame->prev_type) { - if (fp <= frame->prev_fp) - return -EINVAL; - } else { - set_bit(frame->prev_type, frame->stacks_done); - } - - /* - * Record this frame record's values and location. The prev_fp and - * prev_type are only meaningful to the next unwind_frame() invocation. - */ - frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); - frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); - frame->prev_fp = fp; - frame->prev_type = info.type; + state->pc = ptrauth_strip_insn_pac(state->pc); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (tsk->ret_stack && - (ptrauth_strip_insn_pac(frame->pc) == (unsigned long)return_to_handler)) { - struct ftrace_ret_stack *ret_stack; + (state->pc == (unsigned long)return_to_handler)) { + unsigned long orig_pc; /* * This is a case where function graph tracer has * modified a return address (LR) in a stack frame * to hook a function return. * So replace it to an original value. 
*/ - ret_stack = ftrace_graph_get_ret_stack(tsk, frame->graph++); - if (WARN_ON_ONCE(!ret_stack)) + orig_pc = ftrace_graph_ret_addr(tsk, NULL, state->pc, + (void *)state->fp); + if (WARN_ON_ONCE(state->pc == orig_pc)) return -EINVAL; - frame->pc = ret_stack->ret; + state->pc = orig_pc; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - frame->pc = ptrauth_strip_insn_pac(frame->pc); - return 0; } -NOKPROBE_SYMBOL(unwind_frame); +NOKPROBE_SYMBOL(unwind_next); -void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - bool (*fn)(void *, unsigned long), void *data) +static void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) { while (1) { int ret; - if (!fn(data, frame->pc)) + if (!consume_entry(cookie, state->pc)) break; - ret = unwind_frame(tsk, frame); + ret = unwind_next(state); if (ret < 0) break; } } -NOKPROBE_SYMBOL(walk_stackframe); +NOKPROBE_SYMBOL(unwind); -static void dump_backtrace_entry(unsigned long where, const char *loglvl) +static bool dump_backtrace_entry(void *arg, unsigned long where) { + char *loglvl = arg; printk("%s %pSb\n", loglvl, (void *)where); + return true; } void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl) { - struct stackframe frame; - int skip = 0; - pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); - if (regs) { - if (user_mode(regs)) - return; - skip = 1; - } + if (regs && user_mode(regs)) + return; if (!tsk) tsk = current; @@ -177,39 +174,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, if (!try_get_task_stack(tsk)) return; - if (tsk == current) { - start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)dump_backtrace); - } else { - /* - * task blocked in __switch_to - */ - start_backtrace(&frame, - thread_saved_fp(tsk), - thread_saved_pc(tsk)); - } - printk("%sCall trace:\n", loglvl); - do { - /* skip until specified stack frame */ - if (!skip) { - dump_backtrace_entry(frame.pc, loglvl); - } else if (frame.fp == regs->regs[29]) { - skip = 0; - /* - * Mostly, this is the case where this function is - * called in panic/abort. As exception handler's - * stack frame does not contain the corresponding pc - * at which an exception has taken place, use regs->pc - * instead. 
- */ - dump_backtrace_entry(regs->pc, loglvl); - } - } while (!unwind_frame(tsk, &frame)); + arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } +EXPORT_SYMBOL_GPL(dump_backtrace); void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) {
@@ -217,25 +187,21 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) barrier(); } -#ifdef CONFIG_STACKTRACE - noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) { - struct stackframe frame; + struct unwind_state state; - if (regs) - start_backtrace(&frame, regs->regs[29], regs->pc); - else if (task == current) - start_backtrace(&frame, - (unsigned long)__builtin_frame_address(1), - (unsigned long)__builtin_return_address(0)); - else - start_backtrace(&frame, thread_saved_fp(task), - thread_saved_pc(task)); + if (regs) { + if (task != current) + return; + unwind_init_from_regs(&state, regs); + } else if (task == current) { + unwind_init_from_caller(&state); + } else { + unwind_init_from_task(&state, task); + } - walk_stackframe(task, &frame, consume_entry, cookie); + unwind(&state, consume_entry, cookie); } - -#endif
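With start_backtrace()/walk_stackframe() gone, every unwind now funnels through arch_stack_walk() and a consume callback, as the dump_backtrace() conversion above and profile_pc() below both show. For reference, a minimal sketch of the callback pattern (the buffer type and function names are invented for illustration):

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>

struct trace_buf {
	unsigned long pc[8];
	unsigned int nr;
};

static bool collect_entry(void *cookie, unsigned long pc)
{
	struct trace_buf *buf = cookie;

	buf->pc[buf->nr++] = pc;
	/* Returning false terminates the walk early. */
	return buf->nr < ARRAY_SIZE(buf->pc);
}

static void snapshot_current_stack(struct trace_buf *buf)
{
	buf->nr = 0;
	/* NULL regs + current: the unwind starts from our caller. */
	arch_stack_walk(collect_entry, buf, current, NULL);
}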
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 50a0f1a38e84..78c8e1f5d05e 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c
@@ -158,11 +158,36 @@ trace_exit: syscall_trace_exit(regs); } -static inline void sve_user_discard(void) +/* + * As per the ABI, exit SME streaming mode and clear the SVE state not + * shared with FPSIMD on syscall entry. + */ +static inline void fp_user_discard(void) { + /* + * If SME is active then exit streaming mode. If ZA is active + * then flush the SVE registers but leave userspace access to + * both SVE and SME enabled, otherwise disable SME for the + * task and fall through to disabling SVE too. This means + * that after a syscall we never have any streaming mode + * register state to track; if this changes, the KVM code will + * need updating. + */ + if (system_supports_sme() && test_thread_flag(TIF_SME)) { + u64 svcr = read_sysreg_s(SYS_SVCR); + + if (svcr & SVCR_SM_MASK) + sme_smstop_sm(); + } + if (!system_supports_sve()) return; + /* + * If SME is not active then disable SVE, the registers will + * be cleared when userspace next attempts to access them and + * we do not need to track the SVE register state until then. + */ clear_thread_flag(TIF_SVE); /*
@@ -177,7 +202,7 @@ static inline void sve_user_discard(void) void do_el0_svc(struct pt_regs *regs) { - sve_user_discard(); + fp_user_discard(); el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table); }
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index eebbc8d7123e..b5855eb7435d 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include
@@ -29,25 +30,25 @@ #include #include -#include #include +static bool profile_pc_cb(void *arg, unsigned long pc) +{ + unsigned long *prof_pc = arg; + + if (in_lock_functions(pc)) + return true; + *prof_pc = pc; + return false; +} + unsigned long profile_pc(struct pt_regs *regs) { - struct stackframe frame; + unsigned long prof_pc = 0; - if (!in_lock_functions(regs->pc)) - return regs->pc; + arch_stack_walk(profile_pc_cb, &prof_pc, current, regs); - start_backtrace(&frame, regs->regs[29], regs->pc); - - do { - int ret = unwind_frame(NULL, &frame); - if (ret < 0) - return 0; - } while (in_lock_functions(frame.pc)); - - return frame.pc; + return prof_pc; } EXPORT_SYMBOL(profile_pc);
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 877c68f47282..856a367a8294 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c
@@ -22,6 +22,8 @@ #include #include +#include + #ifdef CONFIG_ACPI static bool __init acpi_cpu_is_threaded(int cpu) {
@@ -163,6 +165,11 @@ static void amu_scale_freq_tick(void) { u64 prev_core_cnt, prev_const_cnt; u64 core_cnt, const_cnt, scale; + bool use_amu_fie = true; + + trace_android_vh_use_amu_fie(&use_amu_fie); + if (!use_amu_fie) + return; prev_const_cnt = this_cpu_read(arch_const_cycles_prev); prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 21e69a991bc8..5e57a0a26924 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c
@@ -46,6 +46,8 @@ #include #include +#include + static bool __kprobes __check_eq(unsigned long pstate) { return (pstate & PSR_Z_BIT) != 0; }
@@ -495,6 +497,7 @@ void do_undefinstr(struct pt_regs *regs) if (call_undef_hook(regs) == 0) return; + trace_android_rvh_do_undefinstr(regs); BUG_ON(!user_mode(regs)); force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); }
@@ -513,6 +516,7 @@ void do_ptrauth_fault(struct pt_regs *regs, unsigned long esr) /* * Unexpected FPAC exception or pointer authentication failure in * the kernel: kill the task before it does any more harm. */ + trace_android_rvh_do_ptrauth_fault(regs, esr); BUG_ON(!user_mode(regs)); force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); }
@@ -585,11 +589,11 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) { /* Hide DIC so that we can trap the unnecessary maintenance...*/ - val &= ~BIT(CTR_DIC_SHIFT); + val &= ~BIT(CTR_EL0_DIC_SHIFT); /* ... and fake IminLine to reduce the number of traps.
*/ - val &= ~CTR_IMINLINE_MASK; - val |= (PAGE_SHIFT - 2) & CTR_IMINLINE_MASK; + val &= ~CTR_EL0_IminLine_MASK; + val |= (PAGE_SHIFT - 2) & CTR_EL0_IminLine_MASK; } pt_regs_write_reg(regs, rt, val); @@ -816,6 +820,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_SVE] = "SVE", [ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB", [ESR_ELx_EC_FPAC] = "FPAC", + [ESR_ELx_EC_SME] = "SME", [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF", [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)", [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)", @@ -898,6 +903,8 @@ void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", smp_processor_id(), esr, esr_get_class_string(esr)); + + trace_android_rvh_arm64_serror_panic(regs, esr); if (regs) __show_regs(regs); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 184abd7c4206..271ec8f29f17 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,16 +13,36 @@ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; -#define HYPERVISOR_DATA_SECTIONS \ +#ifdef CONFIG_TRACING +#define HYPERVISOR_EVENT_IDS \ + . = ALIGN(PAGE_SIZE); \ + __hyp_event_ids_start = .; \ + *(HYP_SECTION_NAME(_hyp_event_ids)) \ + __hyp_event_ids_end = .; +#else +#define HYPERVISOR_EVENT_IDS +#endif + +#define HYPERVISOR_RODATA_SECTIONS \ HYP_SECTION_NAME(.rodata) : { \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_start = .; \ *(HYP_SECTION_NAME(.data..ro_after_init)) \ *(HYP_SECTION_NAME(.rodata)) \ + HYPERVISOR_EVENT_IDS \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_end = .; \ } +#define HYPERVISOR_DATA_SECTION \ + HYP_SECTION_NAME(.data) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_start = .; \ + *(HYP_SECTION_NAME(.data)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_end = .; \ + } + #define HYPERVISOR_PERCPU_SECTION \ . = ALIGN(PAGE_SIZE); \ HYP_SECTION_NAME(.data..percpu) : { \ @@ -42,6 +62,17 @@ . = ALIGN(PAGE_SIZE); \ __hyp_bss_end = .; +#ifdef CONFIG_TRACING +#define HYPERVISOR_EVENTS \ + .hyp.events : { \ + __start_hyp_events = .; \ + *(_hyp_events) \ + __stop_hyp_events = .; \ + } +#else +#define HYPERVISOR_EVENTS +#endif + /* * We require that __hyp_bss_start and __bss_start are aligned, and enforce it * with an assertion. But the BSS_SECTION macro places an empty .sbss section @@ -51,9 +82,11 @@ #define SBSS_ALIGN PAGE_SIZE #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE -#define HYPERVISOR_DATA_SECTIONS +#define HYPERVISOR_RODATA_SECTIONS +#define HYPERVISOR_DATA_SECTION #define HYPERVISOR_PERCPU_SECTION #define HYPERVISOR_RELOC_SECTION +#define HYPERVISOR_EVENTS #define SBSS_ALIGN 0 #endif @@ -181,7 +214,9 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) - HYPERVISOR_DATA_SECTIONS + HYPERVISOR_EVENTS + + HYPERVISOR_RODATA_SECTIONS idmap_pg_dir = .; . 
+= IDMAP_DIR_SIZE; @@ -262,6 +297,8 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + HYPERVISOR_DATA_SECTION + /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback diff --git a/arch/arm64/kvm/.gitignore b/arch/arm64/kvm/.gitignore new file mode 100644 index 000000000000..6182aefb8302 --- /dev/null +++ b/arch/arm64/kvm/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +hyp_constants.h diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index d7eec0b43744..b5bc9d8ee413 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -4,6 +4,7 @@ # source "virt/lib/Kconfig" +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -19,7 +20,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" - depends on OF + depends on HAVE_KVM select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT @@ -38,17 +39,15 @@ menuconfig KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS help Support hosting virtualized guest machines. If unsure, say N. -if KVM - -source "virt/kvm/Kconfig" - config NVHE_EL2_DEBUG bool "Debug mode for non-VHE EL2 object" + depends on KVM help Say Y here to enable the debug mode for the non-VHE KVM EL2 object. Failure reports will BUG() in the hypervisor. This is intended for @@ -56,6 +55,17 @@ config NVHE_EL2_DEBUG If unsure, say N. -endif # KVM +config PROTECTED_NVHE_STACKTRACE + bool "Protected KVM hypervisor stacktraces" + depends on NVHE_EL2_DEBUG + default n + help + Say Y here to enable pKVM hypervisor stacktraces on hyp_panic() + + If using protected nVHE mode, but cannot afford the associated + memory cost (less than 0.75 page per CPU) of pKVM stacktraces, + say N. + + If unsure, or not using protected nVHE (pKVM), say N. 
endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 989bb5dad2c8..21d8980762fc 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -11,12 +11,12 @@ obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM) += hyp/ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ - $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \ - arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ + $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o iommu.o \ + arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ - guest.o debug.o reset.o sys_regs.o \ - vgic-sys-reg-v3.o fpsimd.o pmu.o \ - arch_timer.o trng.o\ + guest.o debug.o reset.o sys_regs.o stacktrace.o \ + vgic-sys-reg-v3.o fpsimd.o pkvm.o \ + arch_timer.o trng.o vmid.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ @@ -24,4 +24,22 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ vgic/vgic-its.o vgic/vgic-debug.o -kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o +kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o + +kvm-$(CONFIG_TRACING) += hyp_events.o hyp_trace.o + +always-y := hyp_constants.h hyp-constants.s + +define rule_gen_hyp_constants + $(call filechk,offsets,__HYP_CONSTANTS_H__) +endef + +CFLAGS_hyp-constants.o = -I $(srctree)/$(src)/hyp/include +$(obj)/hyp-constants.s: $(src)/hyp/hyp-constants.c FORCE + $(call if_changed_dep,cc_s_c) + +$(obj)/hyp_constants.h: $(obj)/hyp-constants.s FORCE + $(call if_changed_rule,gen_hyp_constants) + +obj-kvm := $(addprefix $(obj)/, $(kvm-y)) +$(obj-kvm): $(obj)/hyp_constants.h diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 3df67c127489..bbc6461c9045 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -88,7 +88,9 @@ static u64 timer_get_offset(struct arch_timer_context *ctxt) switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: - return __vcpu_sys_reg(vcpu, CNTVOFF_EL2); + if (likely(!kvm_vm_is_protected(vcpu->kvm))) + return __vcpu_sys_reg(vcpu, CNTVOFF_EL2); + fallthrough; default: return 0; } @@ -208,18 +210,16 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) return IRQ_HANDLED; } -static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) +static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx, + u64 val) { - u64 cval, now; + u64 now = kvm_phys_timer_read() - timer_get_offset(timer_ctx); - cval = timer_get_cval(timer_ctx); - now = kvm_phys_timer_read() - timer_get_offset(timer_ctx); - - if (now < cval) { + if (now < val) { u64 ns; ns = cyclecounter_cyc2ns(timecounter->cc, - cval - now, + val - now, timecounter->mask, &timecounter->frac); return ns; @@ -228,6 +228,11 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) return 0; } +static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) +{ + return kvm_counter_compute_delta(timer_ctx, timer_get_cval(timer_ctx)); +} + static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) { WARN_ON(timer_ctx && timer_ctx->loaded); @@ -236,6 +241,20 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) (ARCH_TIMER_CTRL_IT_MASK | ARCH_TIMER_CTRL_ENABLE)) == ARCH_TIMER_CTRL_ENABLE); } +static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu) +{ + return (cpus_have_final_cap(ARM64_HAS_WFXT) && + vcpu_get_flag(vcpu, IN_WFIT)); +} + +static u64 wfit_delay_ns(struct kvm_vcpu 
*vcpu) +{ + struct arch_timer_context *ctx = vcpu_vtimer(vcpu); + u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + return kvm_counter_compute_delta(ctx, val); +} + /* * Returns the earliest expiration time in ns among guest timers. * Note that it will return 0 if none of timers can fire. @@ -253,6 +272,9 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); } + if (vcpu_has_wfit_active(vcpu)) + min_delta = min(min_delta, wfit_delay_ns(vcpu)); + /* If none of timers can fire, then return 0 */ if (min_delta == ULLONG_MAX) return 0; @@ -350,15 +372,9 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) return cval <= now; } -bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - struct timer_map map; - - get_timer_map(vcpu, &map); - - return kvm_timer_should_fire(map.direct_vtimer) || - kvm_timer_should_fire(map.direct_ptimer) || - kvm_timer_should_fire(map.emul_ptimer); + return vcpu_has_wfit_active(vcpu) && wfit_delay_ns(vcpu) == 0; } /* @@ -484,7 +500,8 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) */ if (!kvm_timer_irq_can_fire(map.direct_vtimer) && !kvm_timer_irq_can_fire(map.direct_ptimer) && - !kvm_timer_irq_can_fire(map.emul_ptimer)) + !kvm_timer_irq_can_fire(map.emul_ptimer) && + !vcpu_has_wfit_active(vcpu)) return; /* @@ -750,10 +767,13 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) /* Make the updates of cntvoff for all vtimer contexts atomic */ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) { - int i; + unsigned long i; struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *tmp; + if (unlikely(kvm_vm_is_protected(vcpu->kvm))) + cntvoff = 0; + mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, tmp, kvm) timer_set_offset(vcpu_vtimer(tmp), cntvoff); @@ -1189,8 +1209,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) { - int vtimer_irq, ptimer_irq; - int i, ret; + int vtimer_irq, ptimer_irq, ret; + unsigned long i; vtimer_irq = vcpu_vtimer(vcpu)->irq.irq; ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu)); @@ -1216,6 +1236,9 @@ bool kvm_arch_timer_get_input_level(int vintid) struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); struct arch_timer_context *timer; + if (WARN(!vcpu, "No vcpu context!\n")) + return false; + if (vintid == vcpu_vtimer(vcpu)->irq.irq) timer = vcpu_vtimer(vcpu); else if (vintid == vcpu_ptimer(vcpu)->irq.irq) @@ -1297,7 +1320,7 @@ void kvm_timer_init_vhe(void) static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { vcpu_vtimer(vcpu)->irq.irq = vtimer_irq; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3fe816c244ce..34a909d9b5ae 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -27,6 +27,8 @@ #define CREATE_TRACE_POINTS #include "trace_arm.h" +#include "hyp_trace.h" + #include #include #include @@ -37,6 +39,7 @@ #include #include #include +#include #include #include @@ -49,14 +52,9 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); -static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); - -/* The VMID used in the VTTBR */ -static atomic64_t 
kvm_vmid_gen = ATOMIC64_INIT(1); -static u32 kvm_next_vmid; -static DEFINE_SPINLOCK(kvm_vmid_lock); +DECLARE_KVM_NVHE_PER_CPU(int, hyp_cpu_number); static bool vgic_present; @@ -83,24 +81,42 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, { int r; - if (cap->flags) - return -EINVAL; + /* Capabilities with flags */ + switch (cap->cap) { + case KVM_CAP_ARM_PROTECTED_VM: + return pkvm_vm_ioctl_enable_cap(kvm, cap); + default: + if (cap->flags) + return -EINVAL; + } + /* Capabilities without flags */ switch (cap->cap) { case KVM_CAP_ARM_NISV_TO_USER: - r = 0; - kvm->arch.return_nisv_io_abort_to_user = true; - break; - case KVM_CAP_ARM_MTE: - mutex_lock(&kvm->lock); - if (!system_supports_mte() || kvm->created_vcpus) { + if (kvm_vm_is_protected(kvm)) { r = -EINVAL; } else { r = 0; - kvm->arch.mte_enabled = true; + set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, + &kvm->arch.flags); + } + break; + case KVM_CAP_ARM_MTE: + mutex_lock(&kvm->lock); + if (!system_supports_mte() || + kvm_vm_is_protected(kvm) || + kvm->created_vcpus) { + r = -EINVAL; + } else { + r = 0; + set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); } mutex_unlock(&kvm->lock); break; + case KVM_CAP_ARM_SYSTEM_SUSPEND: + r = 0; + set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); + break; default: r = -EINVAL; break; @@ -138,17 +154,26 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; - ret = kvm_arm_setup_stage2(kvm, type); + if (type & ~KVM_VM_TYPE_MASK) + return -EINVAL; + + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; - ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu); + ret = pkvm_init_host_vm(kvm, type); if (ret) - return ret; + goto err_unshare_kvm; - ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); + if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_unshare_kvm; + } + cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); + + ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type); if (ret) - goto out_free_stage2_pgd; + goto err_free_cpumask; kvm_vgic_early_init(kvm); @@ -156,10 +181,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.max_vcpus = kvm_arm_default_max_vcpus(); set_default_spectre(kvm); + kvm_arm_init_hypercalls(kvm); - return ret; -out_free_stage2_pgd: - kvm_free_stage2_pgd(&kvm->arch.mmu); + return 0; + +err_free_cpumask: + free_cpumask_var(kvm->arch.supported_cpus); +err_unshare_kvm: + kvm_unshare_hyp(kvm, kvm + 1); return ret; } @@ -175,24 +204,27 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) */ void kvm_arch_destroy_vm(struct kvm *kvm) { - int i; - bitmap_free(kvm->arch.pmu_filter); + free_cpumask_var(kvm->arch.supported_cpus); kvm_vgic_destroy(kvm); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_vcpu_destroy(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } - atomic_set(&kvm->online_vcpus, 0); + if (is_protected_kvm_enabled()) + pkvm_destroy_hyp_vm(kvm); + + kvm_destroy_vcpus(kvm); + + if (atomic64_read(&kvm->stat.protected_hyp_mem)) + pr_warn("%lluB of donations to the nVHE hyp are missing\n", + atomic64_read(&kvm->stat.protected_hyp_mem)); + + kvm_unshare_hyp(kvm, kvm + 1); } -int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +static int kvm_check_extension(struct kvm *kvm, long ext) { int r; + switch (ext) { case KVM_CAP_IRQCHIP: r = vgic_present; @@ -210,20 +242,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: - 
case KVM_CAP_ARM_NISV_TO_USER: case KVM_CAP_ARM_INJECT_EXT_DABT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: + case KVM_CAP_ARM_SYSTEM_SUSPEND: r = 1; break; + case KVM_CAP_ARM_NISV_TO_USER: + r = !kvm || !kvm_vm_is_protected(kvm); + break; case KVM_CAP_SET_GUEST_DEBUG2: return KVM_GUESTDBG_VALID_MASK; case KVM_CAP_ARM_SET_DEVICE_ADDR: r = 1; break; case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); + /* + * ARM64 treats KVM_CAP_NR_VCPUS differently from all other + * architectures, as it does not always bound it to + * KVM_CAP_MAX_VCPUS. It should not matter much because + * this is just an advisory value. + */ + r = min_t(unsigned int, num_online_cpus(), + kvm_arm_default_max_vcpus()); break; case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID:
@@ -283,6 +325,75 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } +/* + * Checks whether the extension specified in ext is supported in protected + * mode for the specified vm. + * The capabilities supported by kvm in general are passed in kvm_cap. + */ +static int pkvm_check_extension(struct kvm *kvm, long ext, int kvm_cap) +{ + int r; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_ARM_PSCI: + case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_NR_VCPUS: + case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: + case KVM_CAP_MSI_DEVID: + case KVM_CAP_ARM_VM_IPA_SIZE: + r = kvm_cap; + break; + case KVM_CAP_GUEST_DEBUG_HW_BPS: + r = min(kvm_cap, pkvm_get_max_brps()); + break; + case KVM_CAP_GUEST_DEBUG_HW_WPS: + r = min(kvm_cap, pkvm_get_max_wrps()); + break; + case KVM_CAP_ARM_PMU_V3: + r = kvm_cap && FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + PVM_ID_AA64DFR0_ALLOW); + break; + case KVM_CAP_ARM_SVE: + r = kvm_cap && FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); + break; + case KVM_CAP_ARM_PTRAUTH_ADDRESS: + r = kvm_cap && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), + PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), + PVM_ID_AA64ISAR1_ALLOW); + break; + case KVM_CAP_ARM_PTRAUTH_GENERIC: + r = kvm_cap && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), + PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), + PVM_ID_AA64ISAR1_ALLOW); + break; + case KVM_CAP_ARM_PROTECTED_VM: + r = 1; + break; + default: + r = 0; + break; + } + + return r; +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r = kvm_check_extension(kvm, ext); + + if (kvm && kvm_vm_is_protected(kvm)) + r = pkvm_check_extension(kvm, ext, r); + + return r; +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) {
@@ -291,10 +402,12 @@ long kvm_arch_dev_ioctl(struct file *filp, struct kvm *kvm_arch_alloc_vm(void) { - if (!has_vhe()) - return kzalloc(sizeof(struct kvm), GFP_KERNEL); + size_t sz = sizeof(struct kvm); - return vzalloc(sizeof(struct kvm)); + if (!has_vhe()) + return kzalloc(sz, GFP_KERNEL_ACCOUNT); + + return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm)
@@ -326,6 +439,12 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; + /* + * Default value for the FP state, will be overloaded at load + * time if we support FP (pretty likely) + */ + vcpu->arch.fp_state = FP_STATE_FREE; + /* Set up the timer */ kvm_timer_vcpu_init(vcpu);
@@ -341,7 +460,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (err) return err; - return
create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); + return kvm_share_hyp(vcpu, vcpu + 1); } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -350,44 +469,28 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) + if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm))) static_branch_dec(&userspace_irqchip_in_use); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + if (is_protected_kvm_enabled()) + free_hyp_stage2_memcache(&vcpu->arch.pkvm_memcache, vcpu->kvm); + else + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); kvm_arm_vcpu_destroy(vcpu); } -int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) -{ - return kvm_timer_is_pending(vcpu); -} - void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - /* - * If we're about to block (most likely because we've just hit a - * WFI), we need to sync back the state of the GIC CPU interface - * so that we have the latest PMR and group enables. This ensures - * that kvm_arch_vcpu_runnable has up-to-date data to decide - * whether we have pending interrupts. - * - * For the same reason, we want to tell GICv4 that we need - * doorbells to be signalled, should an interrupt become pending. - */ - preempt_disable(); - kvm_vgic_vmcr_sync(vcpu); - vgic_v4_put(vcpu, true); - preempt_enable(); + } void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - preempt_disable(); - vgic_v4_load(vcpu); - preempt_enable(); + } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -395,6 +498,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct kvm_s2_mmu *mmu; int *last_ran; + if (is_protected_kvm_enabled()) + goto nommu; + mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); @@ -412,6 +518,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) *last_ran = vcpu->vcpu_id; } +nommu: vcpu->cpu = cpu; kvm_vgic_load(vcpu); @@ -431,35 +538,72 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu_has_ptrauth(vcpu)) vcpu_ptrauth_disable(vcpu); kvm_arch_vcpu_load_debug_state_flags(vcpu); + + if (is_protected_kvm_enabled()) { + kvm_call_hyp_nvhe(__pkvm_vcpu_load, + vcpu->kvm->arch.pkvm.handle, + vcpu->vcpu_idx, vcpu->arch.hcr_el2); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + } + + if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus)) + vcpu_set_on_unsupported_cpu(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + if (is_protected_kvm_enabled()) { + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp_nvhe(__pkvm_vcpu_put); + + /* __pkvm_vcpu_put implies a sync of the state */ + if (!kvm_vm_is_protected(vcpu->kvm)) + vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY); + } + kvm_arch_vcpu_put_debug_state_flags(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_sysregs_vhe(vcpu); kvm_timer_vcpu_put(vcpu); - kvm_vgic_put(vcpu); + kvm_vgic_put(vcpu, false); kvm_vcpu_pmu_restore_host(vcpu); + kvm_arm_vmid_clear_active(); + vcpu_clear_on_unsupported_cpu(vcpu); vcpu->cpu = -1; } -static void vcpu_power_off(struct kvm_vcpu *vcpu) +void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) { - vcpu->arch.power_off = true; + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; kvm_make_request(KVM_REQ_SLEEP, vcpu); kvm_vcpu_kick(vcpu); } +bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) +{ + return 
vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED; +} + +static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED; + kvm_make_request(KVM_REQ_SUSPEND, vcpu); + kvm_vcpu_kick(vcpu); +} + +static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (vcpu->arch.power_off) - mp_state->mp_state = KVM_MP_STATE_STOPPED; - else - mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + *mp_state = vcpu->arch.mp_state; return 0; } @@ -471,10 +615,13 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.power_off = false; + vcpu->arch.mp_state = *mp_state; break; case KVM_MP_STATE_STOPPED: - vcpu_power_off(vcpu); + kvm_arm_vcpu_power_off(vcpu); + break; + case KVM_MP_STATE_SUSPENDED: + kvm_arm_vcpu_suspend(vcpu); break; default: ret = -EINVAL; @@ -494,7 +641,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) - && !v->arch.power_off && !v->arch.pause); + && !kvm_arm_vcpu_stopped(v) && !v->arch.pause); } bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) @@ -502,99 +649,38 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return vcpu_mode_priv(vcpu); } -/* Just ensure a guest exit from a particular CPU */ -static void exit_vm_noop(void *info) +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) { + return *vcpu_pc(vcpu); } -void force_vm_exit(const cpumask_t *mask) +static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) { - preempt_disable(); - smp_call_function_many(mask, exit_vm_noop, NULL, true); - preempt_enable(); + return vcpu->arch.target >= 0; } -/** - * need_new_vmid_gen - check that the VMID is still valid - * @vmid: The VMID to check - * - * return true if there is a new generation of VMIDs being used - * - * The hardware supports a limited set of values with the value zero reserved - * for the host, so we check if an assigned value belongs to a previous - * generation, which requires us to assign a new value. If we're the first to - * use a VMID for the new generation, we must flush necessary caches and TLBs - * on all CPUs. +/* + * Handle both the initialisation that is being done when the vcpu is + * run for the first time, as well as the updates that must be + * performed each time we get a new thread dealing with this vcpu. */ -static bool need_new_vmid_gen(struct kvm_vmid *vmid) -{ - u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); - smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ - return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen); -} - -/** - * update_vmid - Update the vmid with a valid VMID for the current generation - * @vmid: The stage-2 VMID information struct - */ -static void update_vmid(struct kvm_vmid *vmid) -{ - if (!need_new_vmid_gen(vmid)) - return; - - spin_lock(&kvm_vmid_lock); - - /* - * We need to re-check the vmid_gen here to ensure that if another vcpu - * already allocated a valid vmid for this vm, then this vcpu should - * use the same vmid. - */ - if (!need_new_vmid_gen(vmid)) { - spin_unlock(&kvm_vmid_lock); - return; - } - - /* First user of a new VMID generation? 
*/ - if (unlikely(kvm_next_vmid == 0)) { - atomic64_inc(&kvm_vmid_gen); - kvm_next_vmid = 1; - - /* - * On SMP we know no other CPUs can use this CPU's or each - * other's VMID after force_vm_exit returns since the - * kvm_vmid_lock blocks them from reentry to the guest. - */ - force_vm_exit(cpu_all_mask); - /* - * Now broadcast TLB + ICACHE invalidation over the inner - * shareable domain to make sure all data structures are - * clean. - */ - kvm_call_hyp(__kvm_flush_vm_context); - } - - WRITE_ONCE(vmid->vmid, kvm_next_vmid); - kvm_next_vmid++; - kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; - - smp_wmb(); - WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen)); - - spin_unlock(&kvm_vmid_lock); -} - -static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - int ret = 0; + int ret; - if (likely(vcpu->arch.has_run_once)) - return 0; + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; if (!kvm_arm_vcpu_is_finalized(vcpu)) return -EPERM; - vcpu->arch.has_run_once = true; + ret = kvm_arch_vcpu_run_map_fp(vcpu); + if (ret) + return ret; + + if (likely(vcpu_has_run_once(vcpu))) + return 0; kvm_arm_vcpu_init_debug(vcpu); @@ -606,12 +692,6 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) ret = kvm_vgic_map_resources(kvm); if (ret) return ret; - } else { - /* - * Tell the rest of the code that there are userspace irqchip - * VMs in the wild. - */ - static_branch_inc(&userspace_irqchip_in_use); } ret = kvm_timer_enable(vcpu); @@ -619,6 +699,29 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) return ret; ret = kvm_arm_pmu_v3_enable(vcpu); + if (ret) + return ret; + + if (is_protected_kvm_enabled()) { + /* Start with the vcpu in a dirty state */ + if (!kvm_vm_is_protected(vcpu->kvm)) + vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY); + ret = pkvm_create_hyp_vm(kvm); + if (ret) + return ret; + } + + if (!irqchip_in_kernel(kvm)) { + /* + * Tell the rest of the code that there are userspace irqchip + * VMs in the wild. + */ + static_branch_inc(&userspace_irqchip_in_use); + } + + mutex_lock(&kvm->lock); + set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); + mutex_unlock(&kvm->lock); return ret; } @@ -630,7 +733,7 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) void kvm_arm_halt_guest(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -640,7 +743,7 @@ void kvm_arm_halt_guest(struct kvm *kvm) void kvm_arm_resume_guest(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -649,15 +752,15 @@ void kvm_arm_resume_guest(struct kvm *kvm) } } -static void vcpu_req_sleep(struct kvm_vcpu *vcpu) +static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu) { struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); rcuwait_wait_event(wait, - (!vcpu->arch.power_off) &&(!vcpu->arch.pause), + (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause), TASK_INTERRUPTIBLE); - if (vcpu->arch.power_off || vcpu->arch.pause) { + if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) { /* Awaken to handle a signal, request we sleep again later. */ kvm_make_request(KVM_REQ_SLEEP, vcpu); } @@ -670,16 +773,86 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu) smp_rmb(); } -static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) +/** + * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior + * @vcpu: The VCPU pointer + * + * Suspend execution of a vCPU until a valid wake event is detected, i.e. until + * the vCPU is runnable. 
The vCPU may or may not be scheduled out, depending + * on when a wake event arrives, e.g. there may already be a pending wake event. + */ +void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) { - return vcpu->arch.target >= 0; + /* + * Sync back the state of the GIC CPU interface so that we have + * the latest PMR and group enables. This ensures that + * kvm_arch_vcpu_runnable has up-to-date data to decide whether + * we have pending interrupts, e.g. when determining if the + * vCPU should block. + * + * For the same reason, we want to tell GICv4 that we need + * doorbells to be signalled, should an interrupt become pending. + */ + preempt_disable(); + kvm_vgic_put(vcpu, true); + preempt_enable(); + + kvm_vcpu_block(vcpu); + vcpu_clear_flag(vcpu, IN_WFIT); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); + + preempt_disable(); + kvm_vgic_load(vcpu); + preempt_enable(); } -static void check_vcpu_requests(struct kvm_vcpu *vcpu) +static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + if (!kvm_arm_vcpu_suspended(vcpu)) + return 1; + + kvm_vcpu_wfi(vcpu); + + /* + * The suspend state is sticky; we do not leave it until userspace + * explicitly marks the vCPU as runnable. Request that we suspend again + * later. + */ + kvm_make_request(KVM_REQ_SUSPEND, vcpu); + + /* + * Check to make sure the vCPU is actually runnable. If so, exit to + * userspace informing it of the wakeup condition. + */ + if (kvm_arch_vcpu_runnable(vcpu)) { + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + return 0; + } + + /* + * Otherwise, we were unblocked to process a different event, such as a + * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to + * process the event. 
+ */ + return 1; +} + +/** + * check_vcpu_requests - check and handle pending vCPU requests + * @vcpu: the VCPU pointer + * + * Return: 1 if we should enter the guest + * 0 if we should exit to userspace + * < 0 if we should exit to userspace, where the return value indicates + * an error + */ +static int check_vcpu_requests(struct kvm_vcpu *vcpu) { if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) - vcpu_req_sleep(vcpu); + kvm_vcpu_sleep(vcpu); if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) kvm_reset_vcpu(vcpu);
@@ -704,7 +877,12 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) kvm_pmu_handle_pmcr(vcpu, __vcpu_sys_reg(vcpu, PMCR_EL0)); + + if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) + return kvm_vcpu_suspend(vcpu); } + + return 1; } static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
@@ -749,8 +927,15 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) } } + if (unlikely(vcpu_on_unsupported_cpu(vcpu))) { + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED; + run->fail_entry.cpu = smp_processor_id(); + *ret = 0; + return true; + } + return kvm_request_pending(vcpu) || - need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || xfer_to_guest_mode_work_pending(); }
@@ -787,13 +972,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; int ret; - if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; - - ret = kvm_vcpu_first_run_init(vcpu); - if (ret) - return ret; - if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_handle_mmio_return(vcpu); if (ret)
@@ -819,9 +997,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (!ret) ret = 1; - update_vmid(&vcpu->arch.hw_mmu->vmid); - - check_vcpu_requests(vcpu); + if (ret > 0) + ret = check_vcpu_requests(vcpu); /* * Preparing the interrupts to be injected also
@@ -830,12 +1007,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ preempt_disable(); + /* + * The VMID allocator only tracks active VMIDs per + * physical CPU, and therefore the VMID allocated may not be + * preserved on VMID roll-over if the task was preempted, + * making a thread's VMID inactive. So we need to call + * kvm_arm_vmid_update() in non-preemptible context. + */ + kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid); + kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); kvm_vgic_flush_hwstate(vcpu); + kvm_pmu_update_vcpu_events(vcpu); + /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check.
@@ -857,6 +1045,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } kvm_arm_setup_debug(vcpu); + kvm_arch_vcpu_ctxflush_fp(vcpu); /************************************************************** * Enter the guest
@@ -908,9 +1097,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * context synchronization event) is necessary to ensure that * pending interrupts are taken. */ - local_irq_enable(); - isb(); - local_irq_disable(); + if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) { + local_irq_enable(); + isb(); + local_irq_disable(); + } guest_timing_exit_irqoff();
@@ -961,8 +1152,8 @@ out: * the vcpu state. Note that this relies on __kvm_adjust_pc() * being preempt-safe on VHE.
*/ - if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_INCREMENT_PC))) + if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) || + vcpu_get_flag(vcpu, INCREMENT_PC))) kvm_call_hyp(__kvm_adjust_pc, vcpu); vcpu_put(vcpu); @@ -1128,7 +1319,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, * need to invalidate the I-cache though, as FWB does *not* * imply CTR_EL0.DIC. */ - if (vcpu->arch.has_run_once) { + if (vcpu_has_run_once(vcpu)) { if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) stage2_unmap_vm(vcpu->kvm); else @@ -1142,9 +1333,9 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, * Handle the "start in power-off" case. */ if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - vcpu_power_off(vcpu); + kvm_arm_vcpu_power_off(vcpu); else - vcpu->arch.power_off = false; + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; return 0; } @@ -1360,18 +1551,11 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { - unsigned long dev_id, type; - - dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >> - KVM_ARM_DEVICE_ID_SHIFT; - type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >> - KVM_ARM_DEVICE_TYPE_SHIFT; - - switch (dev_id) { + switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) { case KVM_ARM_DEVICE_VGIC_V2: if (!vgic_present) return -ENXIO; - return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); + return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr); default: return -ENODEV; } @@ -1401,12 +1585,9 @@ long kvm_arch_vm_ioctl(struct file *filp, return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); } case KVM_ARM_PREFERRED_TARGET: { - int err; struct kvm_vcpu_init init; - err = kvm_vcpu_preferred_target(&init); - if (err) - return err; + kvm_vcpu_preferred_target(&init); if (copy_to_user(argp, &init, sizeof(init))) return -EFAULT; @@ -1474,6 +1655,9 @@ static void cpu_prepare_hyp_mode(int cpu) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; + int *hyp_cpu_number_ptr = per_cpu_ptr_nvhe_sym(hyp_cpu_number, cpu); + + *hyp_cpu_number_ptr = cpu; /* * Calculate the raw per-cpu offset without a translation from the @@ -1505,7 +1689,6 @@ static void cpu_prepare_hyp_mode(int cpu) tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET; params->tcr_el2 = tcr; - params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE); params->pgd_pa = kvm_mmu_get_httbr(); if (is_protected_kvm_enabled()) params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; @@ -1591,25 +1774,33 @@ static void cpu_set_hyp_vector(void) kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); } -static void cpu_hyp_reinit(void) +static void cpu_hyp_init_context(void) { kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); - cpu_hyp_reset(); + if (!is_kernel_in_hyp_mode()) + cpu_init_hyp_mode(); +} + +static void cpu_hyp_init_features(void) +{ + cpu_set_hyp_vector(); + kvm_arm_init_debug(); if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); - else - cpu_init_hyp_mode(); - - cpu_set_hyp_vector(); - - kvm_arm_init_debug(); if (vgic_present) kvm_vgic_init_cpu_hardware(); } +static void cpu_hyp_reinit(void) +{ + cpu_hyp_reset(); + cpu_hyp_init_context(); + cpu_hyp_init_features(); +} + static void _kvm_arch_hardware_enable(void *discard) { if (!__this_cpu_read(kvm_arm_hardware_enabled)) { @@ -1701,7 +1892,7 @@ static void init_cpu_logical_map(void) /* * Copy the MPIDR <-> 
logical CPU ID mapping to hyp. - * Only copy the set of online CPUs whose features have been chacked + * Only copy the set of online CPUs whose features have been checked * against the finalized system capabilities. The hypervisor will not * allow any other CPUs from the `possible` set to boot. */ @@ -1724,6 +1915,7 @@ static bool init_psci_relay(void) } kvm_host_psci_config.version = psci_ops.get_version(); + kvm_host_psci_config.smccc_version = arm_smccc_get_version(); if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); @@ -1773,8 +1965,7 @@ static int init_subsystems(void) if (err) goto out; - kvm_perf_init(); - kvm_sys_reg_table_init(); + kvm_register_perf_callbacks(NULL); out: if (err || !is_protected_kvm_enabled()) @@ -1790,33 +1981,54 @@ static void teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); - free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order()); + free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); } } static int do_pkvm_init(u32 hyp_va_bits) { - void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base); + void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; preempt_disable(); - hyp_install_host_vector(); + cpu_hyp_init_context(); ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, num_possible_cpus(), kern_hyp_va(per_cpu_base), hyp_va_bits); + cpu_hyp_init_features(); + + /* + * The stub hypercalls are now disabled, so set our local flag to + * prevent a later re-init attempt in kvm_arch_hardware_enable(). + */ + __this_cpu_write(kvm_arm_hardware_enabled, 1); preempt_enable(); return ret; } +static void kvm_hyp_init_symbols(void) +{ + kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); + kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); + kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + kvm_nvhe_sym(__icache_flags) = __icache_flags; + kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + kvm_nvhe_sym(smccc_trng_available) = smccc_trng_available; +} + +int kvm_hyp_init_events(void); + static int kvm_hyp_init_protection(u32 hyp_va_bits) { void *addr = phys_to_virt(hyp_mem_base); int ret; - kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); if (ret) return ret; @@ -1883,7 +2095,7 @@ static int init_hyp_mode(void) page_addr = page_address(page); memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); - kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr; + kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; } /* @@ -1896,6 +2108,13 @@ static int init_hyp_mode(void) goto out_err; } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start), + kvm_ksym_ref(__hyp_data_end), PAGE_HYP); + if 
(err) { + kvm_err("Cannot map .hyp.data section\n"); + goto out_err; + } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); if (err) { @@ -1933,18 +2152,50 @@ static int init_hyp_mode(void) * Map the Hyp stack pages */ for_each_possible_cpu(cpu) { + struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); - err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, - PAGE_HYP); + unsigned long hyp_addr; + /* + * Allocate a contiguous HYP private VA range for the stack + * and guard page. The allocation is also aligned based on + * the order of its size. + */ + err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr); + if (err) { + kvm_err("Cannot allocate hyp stack guard page\n"); + goto out_err; + } + + /* + * Since the stack grows downwards, map the stack to the page + * at the higher address and leave the lower guard page + * unbacked. + * + * Any valid stack address now has the PAGE_SHIFT bit as 1 + * and addresses corresponding to the guard page have the + * PAGE_SHIFT bit as 0 - this is used for overflow detection. + */ + err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE, + __pa(stack_page), PAGE_HYP); if (err) { kvm_err("Cannot map hyp stack\n"); goto out_err; } + + /* + * Save the stack PA in nvhe_init_params. This will be needed + * to recreate the stack mapping in protected nVHE mode. + * __hyp_pa() won't do the right thing there, since the stack + * has been mapped in the flexible private VA space. + */ + params->stack_pa = __pa(stack_page); + + params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE); } for_each_possible_cpu(cpu) { - char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; + char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; char *percpu_end = percpu_begin + nvhe_percpu_size(); /* Map Hyp percpu pages */ @@ -1958,6 +2209,13 @@ static int init_hyp_mode(void) cpu_prepare_hyp_mode(cpu); } + kvm_hyp_init_symbols(); + + /* TODO: Real .h interface */ +#ifdef CONFIG_TRACING + kvm_hyp_init_events(); +#endif + if (is_protected_kvm_enabled()) { init_cpu_logical_map(); @@ -1965,9 +2223,7 @@ static int init_hyp_mode(void) err = -ENODEV; goto out_err; } - } - if (is_protected_kvm_enabled()) { err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); @@ -2025,14 +2281,14 @@ static int finalize_hyp_mode(void) * at, which would end badly once inaccessible. 
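The stack mapping above relies on alignment for overflow detection: the two-page private VA range is aligned to its size, the stack occupies the upper page and the lower page stays unbacked, so bit PAGE_SHIFT of every valid stack address is 1 while an address that has overflowed into the guard page has it 0. A minimal standalone model of that predicate, assuming 4K pages (the base address is hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12                       /* assume 4K pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* The 2-page range is aligned to 2 * PAGE_SIZE; the upper page holds
 * the stack, the lower page is the unbacked guard page. */
static bool hyp_stack_addr_ok(uint64_t sp)
{
	return sp & (1UL << PAGE_SHIFT);    /* 1: stack page, 0: guard page */
}

int main(void)
{
	uint64_t hyp_addr = 0x40000000;     /* hypothetical, 2*PAGE_SIZE aligned */
	uint64_t stack_hyp_va = hyp_addr + 2 * PAGE_SIZE;

	printf("%d\n", hyp_stack_addr_ok(stack_hyp_va - 16)); /* 1: in stack page */
	printf("%d\n", hyp_stack_addr_ok(hyp_addr + 16));     /* 0: overflowed */
	return 0;
}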
*/ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); - kmemleak_free_part(__va(hyp_mem_base), hyp_mem_size); + kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); return pkvm_drop_host_privileges(); } struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; mpidr &= MPIDR_HWID_BITMASK; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -2095,6 +2351,17 @@ int kvm_arch_init(void *opaque) return -ENODEV; } + if (kvm_get_mode() == KVM_MODE_NONE) { + kvm_info("KVM disabled from command line\n"); + return -ENODEV; + } + + err = kvm_sys_reg_table_init(); + if (err) { + kvm_info("Error initializing system register tables"); + return err; + } + in_hyp_mode = is_kernel_in_hyp_mode(); if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || @@ -2110,6 +2377,12 @@ int kvm_arch_init(void *opaque) if (err) return err; + err = kvm_arm_vmid_alloc_init(); + if (err) { + kvm_err("Failed to initialize VMID allocator.\n"); + return err; + } + if (!in_hyp_mode) { err = init_hyp_mode(); if (err) @@ -2132,6 +2405,10 @@ int kvm_arch_init(void *opaque) kvm_err("Failed to finalize Hyp protection\n"); goto out_hyp; } + + err = init_hyp_tracefs(); + if (err) + kvm_err("Failed to initialize Hyp tracing\n"); } if (is_protected_kvm_enabled()) { @@ -2149,13 +2426,14 @@ out_hyp: if (!in_hyp_mode) teardown_hyp_mode(); out_err: + kvm_arm_vmid_alloc_free(); return err; } /* NOP: Compiling as a module not supported */ void kvm_arch_exit(void) { - kvm_perf_teardown(); + kvm_unregister_perf_callbacks(); } static int __init early_kvm_mode_cfg(char *arg) @@ -2163,13 +2441,29 @@ static int __init early_kvm_mode_cfg(char *arg) if (!arg) return -EINVAL; - if (strcmp(arg, "protected") == 0) { - kvm_mode = KVM_MODE_PROTECTED; + if (strcmp(arg, "none") == 0) { + kvm_mode = KVM_MODE_NONE; return 0; } - if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) + if (!is_hyp_mode_available()) { + pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n"); return 0; + } + + if (strcmp(arg, "protected") == 0) { + if (!is_kernel_in_hyp_mode()) + kvm_mode = KVM_MODE_PROTECTED; + else + pr_warn_once("Protected KVM not available with VHE\n"); + + return 0; + } + + if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) { + kvm_mode = KVM_MODE_DEFAULT; + return 0; + } return -EINVAL; } diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index db9361338b2a..81218d707b01 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -32,25 +32,33 @@ static DEFINE_PER_CPU(u64, mdcr_el2); * * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled * after we have restored the preserved value to the main context. + * + * When single-step is enabled by userspace, we tweak PSTATE.SS on every + * guest entry. Preserve PSTATE.SS so we can restore the original value + * for the vcpu after the single-step is disabled. 
*/ static void save_guest_debug_regs(struct kvm_vcpu *vcpu) { - u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - - vcpu->arch.guest_debug_preserved.mdscr_el1 = val; + __vcpu_save_guest_debug_regs(vcpu); trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", vcpu->arch.guest_debug_preserved.mdscr_el1); + + vcpu->arch.guest_debug_preserved.pstate_ss = + (*vcpu_cpsr(vcpu) & DBG_SPSR_SS); } static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) { - u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1; - - vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); + __vcpu_restore_guest_debug_regs(vcpu); trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); + + if (vcpu->arch.guest_debug_preserved.pstate_ss) + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; } /** @@ -104,10 +112,12 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) * Trap debug register access when one of the following is true: * - Userspace is using the hardware to debug the guest * (KVM_GUESTDBG_USE_HW is set). - * - The guest is not using debug (KVM_ARM64_DEBUG_DIRTY is clear). + * - The guest is not using debug (DEBUG_DIRTY clear). + * - The guest has enabled the OS Lock (debug exceptions are blocked). */ if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || - !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) + !vcpu_get_flag(vcpu, DEBUG_DIRTY) || + kvm_vcpu_os_lock_enabled(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); @@ -145,8 +155,8 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) * debug related registers. * * Additionally, KVM only traps guest accesses to the debug registers if - * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY - * flag on vcpu->arch.flags). Since the guest must not interfere + * the guest is not actively using them (see the DEBUG_DIRTY + * flag on vcpu->arch.iflags). Since the guest must not interfere * with the hardware state when debugging the guest, we must ensure that * trapping is enabled whenever we are debugging the guest using the * debug registers. @@ -160,8 +170,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) kvm_arm_setup_mdcr_el2(vcpu); - /* Is Guest debugging in effect? */ - if (vcpu->guest_debug) { + /* Check if we need to use the debug registers. */ + if (kvm_vcpu_needs_debug_regs(vcpu)) { /* Save guest debug state */ save_guest_debug_regs(vcpu); @@ -186,7 +196,18 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) * debugging the system. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + /* + * If the software step state at the last guest exit + * was Active-pending, we don't set DBG_SPSR_SS so + * that the state is maintained (to not run another + * single-step until the pending Software Step + * exception is taken). + */ + if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); mdscr |= DBG_MDSCR_SS; vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); @@ -203,9 +224,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) * * We simply switch the debug_ptr to point to our new * external_debug_state which has been populated by the - * debug ioctl. The existing KVM_ARM64_DEBUG_DIRTY - * mechanism ensures the registers are updated on the - * world switch. + * debug ioctl. The existing DEBUG_DIRTY mechanism ensures + * the registers are updated on the world switch. 
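The single-step handling above is a two-state machine: before guest entry, PSTATE.SS is set so the guest executes one instruction and traps (Active-not-pending), unless the previous exit left a Software Step exception pending, in which case SS is kept clear so that pending exception is delivered first. A compact model of that decision (DBG_SPSR_SS is the architectural PSTATE/SPSR.SS bit, bit 21):

#include <stdbool.h>
#include <stdint.h>

#define DBG_SPSR_SS (UINT64_C(1) << 21)   /* PSTATE/SPSR.SS */

/* Mirrors the kvm_arm_setup_debug() decision above: only re-arm the
 * step when no Software Step exception is still pending. */
static uint64_t setup_single_step(uint64_t pstate, bool ss_active_pending)
{
	if (!ss_active_pending)
		pstate |= DBG_SPSR_SS;    /* run one instruction, then trap */
	else
		pstate &= ~DBG_SPSR_SS;   /* let the pending step exception fire */
	return pstate;
}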
*/ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { /* Enable breakpoints/watchpoints */ @@ -214,7 +234,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); trace_kvm_arm_set_regset("BKPTS", get_num_brps(), &vcpu->arch.debug_ptr->dbg_bcr[0], @@ -223,6 +243,19 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), &vcpu->arch.debug_ptr->dbg_wcr[0], &vcpu->arch.debug_ptr->dbg_wvr[0]); + + /* + * The OS Lock blocks debug exceptions in all ELs when it is + * enabled. If the guest has enabled the OS Lock, constrain its + * effects to the guest. Emulate the behavior by clearing + * MDSCR_EL1.MDE. In so doing, we ensure that host debug + * exceptions are unaffected by guest configuration of the OS + * Lock. + */ + } else if (kvm_vcpu_os_lock_enabled(vcpu)) { + mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); + mdscr &= ~DBG_MDSCR_MDE; + vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); } } @@ -231,7 +264,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) /* If KDE or MDE are set, perform a full save/restore cycle. */ if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); /* Write mdcr_el2 changes since vcpu_load on VHE systems */ if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) @@ -244,7 +277,19 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) { trace_kvm_arm_clear_debug(vcpu->guest_debug); - if (vcpu->guest_debug) { + /* + * Restore the guest's debug registers if we were using them. + */ + if (kvm_vcpu_needs_debug_regs(vcpu)) { + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) + /* + * Mark the vcpu as ACTIVE_PENDING + * until Software Step exception is taken. + */ + vcpu_set_flag(vcpu, DBG_SS_ACTIVE_PENDING); + } + restore_guest_debug_regs(vcpu); /* @@ -278,18 +323,18 @@ void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) * If SPE is present on this CPU and is available at current EL, * we may need to check if the host state needs to be saved. 
*/ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_PMSVER_SHIFT) && + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT))) - vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE; + vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE); /* Check if we have TRBE implemented and available at the host */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT) && + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG)) - vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE; + vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE); } void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) { - vcpu->arch.flags &= ~(KVM_ARM64_DEBUG_STATE_SAVE_SPE | - KVM_ARM64_DEBUG_STATE_SAVE_TRBE); + vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE); + vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE); } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 5621020b28de..ec8e4494873d 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -7,7 +7,6 @@ */ #include #include -#include #include #include #include @@ -15,6 +14,19 @@ #include #include +void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu) +{ + struct task_struct *p = vcpu->arch.parent_task; + struct user_fpsimd_state *fpsimd; + + if (!is_protected_kvm_enabled() || !p) + return; + + fpsimd = &p->thread.uw.fpsimd_state; + kvm_unshare_hyp(fpsimd, fpsimd + 1); + put_task_struct(p); +} + /* * Called on entry to KVM_RUN unless this vcpu previously ran at least * once and the most recent prior KVM_RUN for this vcpu was called from @@ -28,36 +40,29 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) { int ret; - struct thread_info *ti = ¤t->thread_info; struct user_fpsimd_state *fpsimd = ¤t->thread.uw.fpsimd_state; + kvm_vcpu_unshare_task_fp(vcpu); + + /* Make sure the host task fpsimd state is visible to hyp: */ + ret = kvm_share_hyp(fpsimd, fpsimd + 1); + if (ret) + return ret; + + vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd); + /* - * Make sure the host task thread flags and fpsimd state are - * visible to hyp: + * We need to keep current's task_struct pinned until its data has been + * unshared with the hypervisor to make sure it is not re-used by the + * kernel and donated to someone else while already shared -- see + * kvm_vcpu_unshare_task_fp() for the matching put_task_struct(). */ - ret = create_hyp_mappings(ti, ti + 1, PAGE_HYP); - if (ret) - goto error; - - ret = create_hyp_mappings(fpsimd, fpsimd + 1, PAGE_HYP); - if (ret) - goto error; - - if (vcpu->arch.sve_state) { - void *sve_end; - - sve_end = vcpu->arch.sve_state + vcpu_sve_state_size(vcpu); - - ret = create_hyp_mappings(vcpu->arch.sve_state, sve_end, - PAGE_HYP); - if (ret) - goto error; + if (is_protected_kvm_enabled()) { + get_task_struct(current); + vcpu->arch.parent_task = current; } - vcpu->arch.host_thread_info = kern_hyp_va(ti); - vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd); -error: - return ret; + return 0; } /* @@ -66,40 +71,75 @@ error: * * Here, we just set the correct metadata to indicate that the FPSIMD * state in the cpu regs (if any) belongs to current on the host. - * - * TIF_SVE is backed up here, since it may get clobbered with guest state. - * This flag is restored by kvm_arch_vcpu_put_fp(vcpu). 
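The SPE/TRBE probing in kvm_arch_vcpu_load_debug_state_flags() above keys off 4-bit fields of ID_AA64DFR0_EL1 (PMSVer at bits [35:32], TraceBuffer at bits [47:44]); a nonzero field means the feature is implemented. A standalone sketch of the field extraction, with shifts taken from the architecture:

#include <stdbool.h>
#include <stdint.h>

#define ID_AA64DFR0_PMSVER_SHIFT      32   /* FEAT_SPE */
#define ID_AA64DFR0_TRACEBUFFER_SHIFT 44   /* FEAT_TRBE */

/* Simplified cpuid_feature_extract_unsigned_field(): ID registers are
 * arrays of 4-bit fields; nonzero means "implemented". */
static unsigned int id_field(uint64_t reg, unsigned int shift)
{
	return (reg >> shift) & 0xf;
}

static bool cpu_has_spe(uint64_t id_aa64dfr0)
{
	return id_field(id_aa64dfr0, ID_AA64DFR0_PMSVER_SHIFT) != 0;
}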
*/ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) { BUG_ON(!current->mm); + BUG_ON(test_thread_flag(TIF_SVE)); - vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | - KVM_ARM64_HOST_SVE_IN_USE | - KVM_ARM64_HOST_SVE_ENABLED); - vcpu->arch.flags |= KVM_ARM64_FP_HOST; + if (!system_supports_fpsimd()) + return; - if (test_thread_flag(TIF_SVE)) - vcpu->arch.flags |= KVM_ARM64_HOST_SVE_IN_USE; + vcpu->arch.fp_state = FP_STATE_HOST_OWNED; + vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED; + vcpu_set_flag(vcpu, HOST_SVE_ENABLED); + + /* + * We don't currently support SME guests but if we leave + * things in streaming mode then when the guest starts running + * FPSIMD or SVE code it may generate SME traps so as a + * special case if we are in streaming mode we force the host + * state to be saved now and exit streaming mode so that we + * don't have to handle any SME traps for valid guest + * operations. Do this for ZA as well for now for simplicity. + */ + if (system_supports_sme()) { + vcpu_clear_flag(vcpu, HOST_SME_ENABLED); + if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) + vcpu_set_flag(vcpu, HOST_SME_ENABLED); + + if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { + vcpu->arch.fp_state = FP_STATE_FREE; + fpsimd_save_and_flush_cpu_state(); + } + } } /* - * If the guest FPSIMD state was loaded, update the host's context - * tracking data mark the CPU FPSIMD regs as dirty and belonging to vcpu - * so that they will be written back if the kernel clobbers them due to - * kernel-mode NEON before re-entry into the guest. + * Called just before entering the guest once we are no longer preemptable + * and interrupts are disabled. If we have managed to run anything using + * FP while we were preemptible (such as off the back of an interrupt), + * then neither the host nor the guest own the FP hardware (and it was the + * responsibility of the code that used FP to save the existing state). + */ +void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) +{ + if (test_thread_flag(TIF_FOREIGN_FPSTATE)) + vcpu->arch.fp_state = FP_STATE_FREE; +} + +/* + * Called just after exiting the guest. If the guest FPSIMD state + * was loaded, update the host's context tracking data mark the CPU + * FPSIMD regs as dirty and belonging to vcpu so that they will be + * written back if the kernel clobbers them due to kernel-mode NEON + * before re-entry into the guest. */ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!irqs_disabled()); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { + /* + * Currently we do not support SME guests so SVCR is + * always 0 and we just need a variable to point to. + */ fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.fp_regs, vcpu->arch.sve_state, - vcpu->arch.sve_max_vl); + vcpu->arch.sve_max_vl, + NULL, 0, &vcpu->arch.svcr); clear_thread_flag(TIF_FOREIGN_FPSTATE); update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu)); @@ -115,13 +155,27 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) { unsigned long flags; - bool host_has_sve = system_supports_sve(); - bool guest_has_sve = vcpu_has_sve(vcpu); local_irq_save(flags); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { - if (guest_has_sve) { + /* + * If we have VHE then the Hyp code will reset CPACR_EL1 to + * CPACR_EL1_DEFAULT and we need to reenable SME. 
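The old flag pair (KVM_ARM64_FP_ENABLED / KVM_ARM64_FP_HOST) is replaced throughout this series by a single three-way owner, vcpu->arch.fp_state. The transitions, pulled out of the hunks above into a standalone model:

#include <stdbool.h>

enum fp_state { FP_STATE_FREE, FP_STATE_HOST_OWNED, FP_STATE_GUEST_OWNED };

/* kvm_arch_vcpu_load_fp(): the host's FP/SIMD state is live. */
static enum fp_state fp_on_vcpu_load(void)
{
	return FP_STATE_HOST_OWNED;
}

/* kvm_arch_vcpu_ctxflush_fp(), just before entry: if the kernel used
 * FP while we were preemptible, nobody owns the registers any more. */
static enum fp_state fp_on_ctxflush(enum fp_state s, bool foreign_fpstate)
{
	return foreign_fpstate ? FP_STATE_FREE : s;
}

/* On the guest's first FP trap: host state (if it owned the regs) is
 * saved, the guest state is loaded, so the guest now owns them. */
static enum fp_state fp_on_guest_trap(void)
{
	return FP_STATE_GUEST_OWNED;
}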
+ */ + if (has_vhe() && system_supports_sme()) { + /* Also restore EL0 state seen on entry */ + if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) + sysreg_clear_set(CPACR_EL1, 0, + CPACR_EL1_SMEN_EL0EN | + CPACR_EL1_SMEN_EL1EN); + else + sysreg_clear_set(CPACR_EL1, + CPACR_EL1_SMEN_EL0EN, + CPACR_EL1_SMEN_EL1EN); + } + + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { + if (vcpu_has_sve(vcpu)) { __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); /* Restore the VL that was saved when bound to the CPU */ @@ -131,7 +185,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) } fpsimd_save_and_flush_cpu_state(); - } else if (has_vhe() && host_has_sve) { + } else if (has_vhe() && system_supports_sve()) { /* * The FPSIMD/SVE state in the CPU has not been touched, and we * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been @@ -139,14 +193,13 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) * for EL0. To avoid spurious traps, restore the trap state * seen by kvm_arch_vcpu_load_fp(): */ - if (vcpu->arch.flags & KVM_ARM64_HOST_SVE_ENABLED) + if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED)) sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); else sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); } - update_thread_flag(TIF_SVE, - vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE); + update_thread_flag(TIF_SVE, 0); local_irq_restore(flags); } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 94108e2e0917..89cb03e2ed06 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -29,7 +29,9 @@ #include "trace.h" const struct _kvm_stats_desc kvm_vm_stats_desc[] = { - KVM_GENERIC_VM_STATS() + KVM_GENERIC_VM_STATS(), + STATS_DESC_ICOUNTER(VM, protected_hyp_mem), + STATS_DESC_ICOUNTER(VM, protected_shared_mem), }; const struct kvm_stats_header kvm_vm_stats_header = { @@ -282,7 +284,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) break; /* - * Otherwide, this is a priviledged mode, and *all* the + * Otherwise, this is a privileged mode, and *all* the * registers must be narrowed to 32bit. */ default: @@ -756,7 +758,9 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return get_core_reg(vcpu, reg); - case KVM_REG_ARM_FW: return kvm_arm_get_fw_reg(vcpu, reg); + case KVM_REG_ARM_FW: + case KVM_REG_ARM_FW_FEAT_BMAP: + return kvm_arm_get_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); } @@ -774,7 +778,9 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); - case KVM_REG_ARM_FW: return kvm_arm_set_fw_reg(vcpu, reg); + case KVM_REG_ARM_FW: + case KVM_REG_ARM_FW_FEAT_BMAP: + return kvm_arm_set_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); } @@ -869,13 +875,10 @@ u32 __attribute_const__ kvm_target_cpu(void) return KVM_ARM_TARGET_GENERIC_V8; } -int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) +void kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) { u32 target = kvm_target_cpu(); - if (target < 0) - return -ENODEV; - memset(init, 0, sizeof(*init)); /* @@ -885,8 +888,6 @@ int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) * target type. 
*/ init->target = (__u32)target; - - return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) @@ -938,6 +939,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, } else { /* If not enabled clear all flags */ vcpu->guest_debug = 0; + vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); } out: diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a5ab5215094e..63e77373b2f8 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -80,25 +81,51 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu) * * @vcpu: the vcpu pointer * - * WFE: Yield the CPU and come back to this vcpu when the scheduler + * WFE[T]: Yield the CPU and come back to this vcpu when the scheduler * decides to. * WFI: Simply call kvm_vcpu_block(), which will halt execution of * world-switches and schedule other host processes until there is an * incoming IRQ or FIQ to the VM. + * WFIT: Same as WFI, with a timed wakeup implemented as a background timer + * + * WF{I,E}T can immediately return if the deadline has already expired. */ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_get_esr(vcpu) & ESR_ELx_WFx_ISS_WFE) { + u64 esr = kvm_vcpu_get_esr(vcpu); + + if (esr & ESR_ELx_WFx_ISS_WFE) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); vcpu->stat.wfe_exit_stat++; - kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); } else { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; - kvm_vcpu_block(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); } + if (esr & ESR_ELx_WFx_ISS_WFxT) { + if (esr & ESR_ELx_WFx_ISS_RV) { + u64 val, now; + + now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT); + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + if (now >= val) + goto out; + } else { + /* Treat WFxT as WFx if RN is invalid */ + esr &= ~ESR_ELx_WFx_ISS_WFxT; + } + } + + if (esr & ESR_ELx_WFx_ISS_WFE) { + kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); + } else { + if (esr & ESR_ELx_WFx_ISS_WFxT) + vcpu_set_flag(vcpu, IN_WFIT); + + kvm_vcpu_wfi(vcpu); + } +out: kvm_incr_pc(vcpu); return 1; @@ -123,8 +150,14 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = esr; - if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW) + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_WATCHPT_LOW: run->debug.arch.far = vcpu->arch.fault.far_el2; + break; + case ESR_ELx_EC_SOFTSTP_LOW: + vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + break; + } return 0; } @@ -165,6 +198,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_CP15_64] = kvm_handle_cp15_64, [ESR_ELx_EC_CP14_MR] = kvm_handle_cp14_32, [ESR_ELx_EC_CP14_LS] = kvm_handle_cp14_load_store, + [ESR_ELx_EC_CP10_ID] = kvm_handle_cp10_id, [ESR_ELx_EC_CP14_64] = kvm_handle_cp14_64, [ESR_ELx_EC_HVC32] = handle_hvc, [ESR_ELx_EC_SMC32] = handle_smc, @@ -201,6 +235,21 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu) { int handled; + /* + * If we run a non-protected VM when protection is enabled + * system-wide, resync the state from the hypervisor and mark + * it as dirty on the host side if it wasn't dirty already + * (which could happen if preemption has taken place). 
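kvm_handle_wfx() above gives WFIT/WFET a fast path: the trapped instruction carries an absolute virtual-counter deadline in Xt, and if that deadline has already passed the instruction completes without blocking at all. The comparison, as a standalone sketch:

#include <stdbool.h>
#include <stdint.h>

/* WFIT/WFET fast path: Xt holds an absolute counter deadline. When
 * ESR_EL2 reports the register as invalid (RV clear), the instruction
 * is instead treated as a plain WFI/WFE, as in the hunk above. */
static bool wfxt_expired(uint64_t cntvct_now, uint64_t xt_deadline)
{
	return cntvct_now >= xt_deadline;
}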
+ */ + if (is_protected_kvm_enabled() && !kvm_vm_is_protected(vcpu->kvm)) { + preempt_disable(); + if (!(vcpu_get_flag(vcpu, PKVM_HOST_STATE_DIRTY))) { + kvm_call_hyp_nvhe(__pkvm_vcpu_sync_state); + vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY); + } + preempt_enable(); + } + /* * See ARM ARM B1.14.1: "Hyp traps on instructions * that fail their condition code check" @@ -246,7 +295,7 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) case ARM_EXCEPTION_HYP_GONE: /* * EL2 has been reset to the hyp-stub. This happens when a guest - * is pre-empted by kvm_reboot()'s shutdown call. + * is pre-emptied by kvm_reboot()'s shutdown call. */ run->exit_reason = KVM_EXIT_FAIL_ENTRY; return 0; @@ -268,6 +317,13 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) /* For exit types that need handling before we can be preempted */ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) { + /* + * We just exited, so the state is clean from a hypervisor + * perspective. + */ + if (is_protected_kvm_enabled()) + vcpu_clear_flag(vcpu, PKVM_HOST_STATE_DIRTY); + if (ARM_SERROR_PENDING(exception_index)) { if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) { u64 disr = kvm_vcpu_get_disr(vcpu); @@ -293,13 +349,8 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr_in_kimg = __phys_to_kimg(elr_phys); u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt; u64 mode = spsr & PSR_MODE_MASK; + u64 panic_addr = elr_virt + hyp_offset; - /* - * The nVHE hyp symbols are not included by kallsyms to avoid issues - * with aliasing. That means that the symbols cannot be printed with the - * "%pS" format specifier, so fall back to the vmlinux address if - * there's no better option. - */ if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) { kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && @@ -319,11 +370,16 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else - kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset); + kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr, + (void *)(panic_addr + kaslr_offset())); } else { - kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset); + kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr, + (void *)(panic_addr + kaslr_offset())); } + /* Dump the nVHE hypervisor backtrace */ + kvm_nvhe_dump_backtrace(hyp_offset); + /* * Hyp has panicked and we're going to handle that by panicking the * kernel. 
The kernel offset will be revealed in the panic so we're diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index b726332eec49..a38dea6186c9 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -5,9 +5,6 @@ incdir := $(srctree)/$(src)/include subdir-asflags-y := -I$(incdir) -subdir-ccflags-y := -I$(incdir) \ - -fno-stack-protector \ - -DDISABLE_BRANCH_PROFILING \ - $(DISABLE_STACKLEAK_PLUGIN) +subdir-ccflags-y := -I$(incdir) -obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o reserved_mem.o +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index aa06e28f2991..b210bcd16811 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -61,12 +61,25 @@ static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) vcpu->arch.ctxt.spsr_und = val; } +unsigned long get_except64_offset(unsigned long psr, unsigned long target_mode, + enum exception_type type) +{ + u64 mode = psr & (PSR_MODE_MASK | PSR_MODE32_BIT); + u64 exc_offset; + + if (mode == target_mode) + exc_offset = CURRENT_EL_SP_ELx_VECTOR; + else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) + exc_offset = CURRENT_EL_SP_EL0_VECTOR; + else if (!(mode & PSR_MODE32_BIT)) + exc_offset = LOWER_EL_AArch64_VECTOR; + else + exc_offset = LOWER_EL_AArch32_VECTOR; + + return exc_offset + type; +} + /* - * This performs the exception entry at a given EL (@target_mode), stashing PC - * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. - * The EL passed to this function *must* be a non-secure, privileged mode with - * bit 0 being set (PSTATE.SP == 1). - * * When an exception is taken, most PSTATE fields are left unchanged in the * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx @@ -78,45 +91,17 @@ static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from * MSB to LSB. 
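get_except64_offset() above picks one of the four AArch64 vector base offsets and adds the per-type offset; all eight constants are architectural (the VBAR_ELx vector table layout), reproduced here for reference in the kernel's own names:

/* Vector table layout relative to VBAR_ELx, as used above. */
#define CURRENT_EL_SP_EL0_VECTOR  0x000  /* from same EL, using SP_EL0 */
#define CURRENT_EL_SP_ELx_VECTOR  0x200  /* from same EL, using SP_ELx */
#define LOWER_EL_AArch64_VECTOR   0x400  /* from lower EL, AArch64 */
#define LOWER_EL_AArch32_VECTOR   0x600  /* from lower EL, AArch32 */

enum exception_type {
	except_type_sync   = 0,
	except_type_irq    = 0x80,
	except_type_fiq    = 0x100,
	except_type_serror = 0x180,
};

/* Example: a synchronous exception taken from EL1h to the EL1 vectors
 * lands at CURRENT_EL_SP_ELx_VECTOR + except_type_sync = 0x200. */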
*/ -static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, - enum exception_type type) +unsigned long get_except64_cpsr(unsigned long old, bool has_mte, + unsigned long sctlr, unsigned long target_mode) { - unsigned long sctlr, vbar, old, new, mode; - u64 exc_offset; - - mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); - - if (mode == target_mode) - exc_offset = CURRENT_EL_SP_ELx_VECTOR; - else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) - exc_offset = CURRENT_EL_SP_EL0_VECTOR; - else if (!(mode & PSR_MODE32_BIT)) - exc_offset = LOWER_EL_AArch64_VECTOR; - else - exc_offset = LOWER_EL_AArch32_VECTOR; - - switch (target_mode) { - case PSR_MODE_EL1h: - vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1); - sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); - __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); - break; - default: - /* Don't do that */ - BUG(); - } - - *vcpu_pc(vcpu) = vbar + exc_offset + type; - - old = *vcpu_cpsr(vcpu); - new = 0; + u64 new = 0; new |= (old & PSR_N_BIT); new |= (old & PSR_Z_BIT); new |= (old & PSR_C_BIT); new |= (old & PSR_V_BIT); - if (kvm_has_mte(kern_hyp_va(vcpu->kvm))) + if (has_mte) new |= PSR_TCO_BIT; new |= (old & PSR_DIT_BIT); @@ -152,6 +137,37 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, new |= target_mode; + return new; +} + +/* + * This performs the exception entry at a given EL (@target_mode), stashing PC + * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. + * The EL passed to this function *must* be a non-secure, privileged mode with + * bit 0 being set (PSTATE.SP == 1). + */ +static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, + enum exception_type type) +{ + u64 offset = get_except64_offset(*vcpu_cpsr(vcpu), target_mode, type); + unsigned long sctlr, vbar, old, new; + + switch (target_mode) { + case PSR_MODE_EL1h: + vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1); + sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); + __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); + break; + default: + /* Don't do that */ + BUG(); + } + + *vcpu_pc(vcpu) = vbar + offset; + + old = *vcpu_cpsr(vcpu); + new = get_except64_cpsr(old, kvm_has_mte(kern_hyp_va(vcpu->kvm)), sctlr, + target_mode); *vcpu_cpsr(vcpu) = new; __vcpu_write_spsr(vcpu, old); } @@ -304,14 +320,14 @@ static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { if (vcpu_el1_is_32bit(vcpu)) { - switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { - case KVM_ARM64_EXCEPT_AA32_UND: + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): enter_exception32(vcpu, PSR_AA32_MODE_UND, 4); break; - case KVM_ARM64_EXCEPT_AA32_IABT: + case unpack_vcpu_flag(EXCEPT_AA32_IABT): enter_exception32(vcpu, PSR_AA32_MODE_ABT, 12); break; - case KVM_ARM64_EXCEPT_AA32_DABT: + case unpack_vcpu_flag(EXCEPT_AA32_DABT): enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16); break; default: @@ -319,9 +335,8 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) break; } } else { - switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { - case (KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_EXCEPT_AA64_EL1): + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); break; default: @@ -341,12 +356,12 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) */ void __kvm_adjust_pc(struct kvm_vcpu *vcpu) { - if 
(vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { + if (vcpu_get_flag(vcpu, PENDING_EXCEPTION)) { kvm_inject_exception(vcpu); - vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_EXCEPT_MASK); - } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { + vcpu_clear_flag(vcpu, PENDING_EXCEPTION); + vcpu_clear_flag(vcpu, EXCEPT_MASK); + } else if (vcpu_get_flag(vcpu, INCREMENT_PC)) { kvm_skip_instr(vcpu); - vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; + vcpu_clear_flag(vcpu, INCREMENT_PC); } } diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 3c635929771a..e950875e31ce 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -21,11 +21,13 @@ SYM_FUNC_START(__fpsimd_restore_state) SYM_FUNC_END(__fpsimd_restore_state) SYM_FUNC_START(__sve_restore_state) - __sve_load 0, x1, 2 + mov x2, #1 + sve_load 0, x1, x2, 3 ret SYM_FUNC_END(__sve_restore_state) SYM_FUNC_START(__sve_save_state) - sve_save 0, x1, 2 + mov x2, #1 + sve_save 0, x1, x2, 3 ret SYM_FUNC_END(__sve_save_state) diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c new file mode 100644 index 000000000000..9fe0d2a624ef --- /dev/null +++ b/arch/arm64/kvm/hyp/hyp-constants.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +int main(void) +{ + DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page)); + DEFINE(PKVM_HYP_VM_SIZE, sizeof(struct pkvm_hyp_vm)); + DEFINE(PKVM_HYP_VCPU_SIZE, sizeof(struct pkvm_hyp_vcpu)); +#ifdef CONFIG_FTRACE + DEFINE(STRUCT_HYP_BUFFER_PAGE_SIZE, sizeof(struct hyp_buffer_page)); +#endif + return 0; +} diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 4ebe9f558f3a..961bbef104a6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -132,7 +132,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) + if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) return; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; @@ -151,7 +151,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) + if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) return; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; @@ -162,7 +162,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - vcpu->arch.flags &= ~KVM_ARM64_DEBUG_DIRTY; + vcpu_clear_flag(vcpu, DEBUG_DIRTY); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h new file mode 100644 index 000000000000..1b8a2dcd712f --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier + */ + +#ifndef __ARM64_KVM_HYP_FAULT_H__ +#define __ARM64_KVM_HYP_FAULT_H__ + +#include +#include +#include +#include + +static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) +{ + u64 par, tmp; + + /* + * Resolve the IPA the hard way using the guest VA. + * + * Stage-1 translation already validated the memory access + * rights. 
As such, we can use the EL1 translation regime, and + * don't have to distinguish between EL0 and EL1 access. + * + * We do need to save/restore PAR_EL1 though, as we haven't + * saved the guest context yet, and we may return early... + */ + par = read_sysreg_par(); + if (!__kvm_at("s1e1r", far)) + tmp = read_sysreg_par(); + else + tmp = SYS_PAR_EL1_F; /* back to the guest */ + write_sysreg(par, par_el1); + + if (unlikely(tmp & SYS_PAR_EL1_F)) + return false; /* Translation failed, back to guest */ + + /* Convert PAR to HPFAR format */ + *hpfar = PAR_TO_HPFAR(tmp); + return true; +} + +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +{ + u64 hpfar, far; + + far = read_sysreg_el2(SYS_FAR); + + /* + * The HPFAR can be invalid if the stage 2 fault did not + * happen during a stage 1 page table walk (the ESR_EL2.S1PTW + * bit is clear) and one of the two following cases are true: + * 1. The fault was due to a permission fault + * 2. The processor carries errata 834220 + * + * Therefore, for all non S1PTW faults where we either have a + * permission fault or the errata workaround is enabled, we + * resolve the IPA using the AT instruction. + */ + if (!(esr & ESR_ELx_S1PTW) && + (cpus_have_final_cap(ARM64_WORKAROUND_834220) || + (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { + if (!__translate_far_to_hpfar(far, &hpfar)) + return false; + } else { + hpfar = read_sysreg(hpfar_el2); + } + + fault->far_el2 = far; + fault->hpfar_el2 = hpfar; + return true; +} + +#endif diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index ecd41844eda0..71a9b7e1e03d 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -8,6 +8,7 @@ #define __ARM64_KVM_HYP_SWITCH_H__ #include +#include #include #include @@ -28,27 +29,14 @@ #include #include #include -#include extern struct exception_table_entry __start___kvm_ex_table; extern struct exception_table_entry __stop___kvm_ex_table; -/* Check whether the FP regs were dirtied while in the host-side run loop: */ -static inline bool update_fp_enabled(struct kvm_vcpu *vcpu) +/* Check whether the FP regs are owned by the guest */ +static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu) { - /* - * When the system doesn't support FP/SIMD, we cannot rely on - * the _TIF_FOREIGN_FPSTATE flag. However, we always inject an - * abort on the very first access to FP and thus we should never - * see KVM_ARM64_FP_ENABLED. For added safety, make sure we always - * trap the accesses. 
- */ - if (!system_supports_fpsimd() || - vcpu->arch.host_thread_info->flags & _TIF_FOREIGN_FPSTATE) - vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | - KVM_ARM64_FP_HOST); - - return !!(vcpu->arch.flags & KVM_ARM64_FP_ENABLED); + return vcpu->arch.fp_state == FP_STATE_GUEST_OWNED; } /* Save the 32-bit only FPSIMD system register state */ @@ -95,6 +83,17 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); + + if (cpus_have_final_cap(ARM64_SME)) { + sysreg_clear_set_s(SYS_HFGRTR_EL2, + HFGxTR_EL2_nSMPRI_EL1_MASK | + HFGxTR_EL2_nTPIDR2_EL0_MASK, + 0); + sysreg_clear_set_s(SYS_HFGWTR_EL2, + HFGxTR_EL2_nSMPRI_EL1_MASK | + HFGxTR_EL2_nTPIDR2_EL0_MASK, + 0); + } } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) @@ -104,6 +103,15 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(0, hstr_el2); if (kvm_arm_support_pmu_v3()) write_sysreg(0, pmuserenr_el0); + + if (cpus_have_final_cap(ARM64_SME)) { + sysreg_clear_set_s(SYS_HFGRTR_EL2, 0, + HFGxTR_EL2_nSMPRI_EL1_MASK | + HFGxTR_EL2_nTPIDR2_EL0_MASK); + sysreg_clear_set_s(SYS_HFGWTR_EL2, 0, + HFGxTR_EL2_nSMPRI_EL1_MASK | + HFGxTR_EL2_nTPIDR2_EL0_MASK); + } } static inline void ___activate_traps(struct kvm_vcpu *vcpu) @@ -133,88 +141,9 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) } } -static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) -{ - u64 par, tmp; - - /* - * Resolve the IPA the hard way using the guest VA. - * - * Stage-1 translation already validated the memory access - * rights. As such, we can use the EL1 translation regime, and - * don't have to distinguish between EL0 and EL1 access. - * - * We do need to save/restore PAR_EL1 though, as we haven't - * saved the guest context yet, and we may return early... - */ - par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) - tmp = read_sysreg_par(); - else - tmp = SYS_PAR_EL1_F; /* back to the guest */ - write_sysreg(par, par_el1); - - if (unlikely(tmp & SYS_PAR_EL1_F)) - return false; /* Translation failed, back to guest */ - - /* Convert PAR to HPFAR format */ - *hpfar = PAR_TO_HPFAR(tmp); - return true; -} - -static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) -{ - u64 hpfar, far; - - far = read_sysreg_el2(SYS_FAR); - - /* - * The HPFAR can be invalid if the stage 2 fault did not - * happen during a stage 1 page table walk (the ESR_EL2.S1PTW - * bit is clear) and one of the two following cases are true: - * 1. The fault was due to a permission fault - * 2. The processor carries errata 834220 - * - * Therefore, for all non S1PTW faults where we either have a - * permission fault or the errata workaround is enabled, we - * resolve the IPA using the AT instruction. 
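__translate_far_to_hpfar() above resolves the IPA with an AT S1E1R walk and then repacks PAR_EL1 into HPFAR_EL2 format: both carry IPA bits [47:12], but HPFAR_EL2.FIPA sits 8 bits lower. A standalone model of the kernel's PAR_TO_HPFAR(), assuming a 48-bit PA space:

#include <stdint.h>

/* PAR_EL1 carries the output address in bits [47:12]; HPFAR_EL2 wants
 * the same bits in FIPA, starting at bit 4, i.e. shifted right by 8. */
static uint64_t par_to_hpfar(uint64_t par)
{
	uint64_t ipa_47_12 = par & (((UINT64_C(1) << 48) - 1) & ~UINT64_C(0xfff));

	return ipa_47_12 >> 8;
}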
- */ - if (!(esr & ESR_ELx_S1PTW) && - (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { - if (!__translate_far_to_hpfar(far, &hpfar)) - return false; - } else { - hpfar = read_sysreg(hpfar_el2); - } - - fault->far_el2 = far; - fault->hpfar_el2 = hpfar; - return true; -} - static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) { - u8 ec; - u64 esr; - - esr = vcpu->arch.fault.esr_el2; - ec = ESR_ELx_EC(esr); - - if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) - return true; - - return __get_fault_info(esr, &vcpu->arch.fault); -} - -static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu) -{ - struct thread_struct *thread; - - thread = container_of(vcpu->arch.host_fpsimd_state, struct thread_struct, - uw.fpsimd_state); - - __sve_save_state(sve_pffr(thread), &vcpu->arch.host_fpsimd_state->fpsr); + return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) @@ -225,38 +154,35 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); } -/* Check for an FPSIMD/SVE trap and handle as appropriate */ -static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) +/* + * We trap the first access to the FP/SIMD to save the host context and + * restore the guest context lazily. + * If FP/SIMD is not implemented, handle the trap and inject an undefined + * instruction exception to the guest. Similarly for trapped SVE accesses. + */ +static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { - bool sve_guest, sve_host; + bool sve_guest; u8 esr_ec; u64 reg; if (!system_supports_fpsimd()) return false; - if (system_supports_sve()) { - sve_guest = vcpu_has_sve(vcpu); - sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE; - } else { - sve_guest = false; - sve_host = false; - } - + sve_guest = vcpu_has_sve(vcpu); esr_ec = kvm_vcpu_trap_get_class(vcpu); - if (esr_ec != ESR_ELx_EC_FP_ASIMD && - esr_ec != ESR_ELx_EC_SVE) - return false; /* Don't handle SVE traps for non-SVE vcpus here: */ if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD) return false; /* Valid trap. 
Switch the context: */ + + /* First disable enough traps to allow us to update the registers */ if (has_vhe()) { - reg = CPACR_EL1_FPEN; + reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; if (sve_guest) - reg |= CPACR_EL1_ZEN; + reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; sysreg_clear_set(cpacr_el1, 0, reg); } else { @@ -268,15 +194,11 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) } isb(); - if (vcpu->arch.flags & KVM_ARM64_FP_HOST) { - if (sve_host) - __hyp_sve_save_host(vcpu); - else - __fpsimd_save_state(vcpu->arch.host_fpsimd_state); - - vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; - } + /* Write out the host state if it's in the registers */ + if (vcpu->arch.fp_state == FP_STATE_HOST_OWNED) + __fpsimd_save_state(vcpu->arch.host_fpsimd_state); + /* Restore the guest state */ if (sve_guest) __hyp_sve_restore_guest(vcpu); else @@ -286,7 +208,7 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) if (!(read_sysreg(hcr_el2) & HCR_RW)) write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); - vcpu->arch.flags |= KVM_ARM64_FP_ENABLED; + vcpu->arch.fp_state = FP_STATE_GUEST_OWNED; return true; } @@ -348,14 +270,6 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) static inline bool esr_is_ptrauth_trap(u32 esr) { - u32 ec = ESR_ELx_EC(esr); - - if (ec == ESR_ELx_EC_PAC) - return true; - - if (ec != ESR_ELx_EC_SYS64) - return false; - switch (esr_sys64_to_sysreg(esr)) { case SYS_APIAKEYLO_EL1: case SYS_APIAKEYHI_EL1: @@ -384,13 +298,12 @@ static inline bool esr_is_ptrauth_trap(u32 esr) DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); -static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) +static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code) { struct kvm_cpu_context *ctxt; u64 val; - if (!vcpu_has_ptrauth(vcpu) || - !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) + if (!vcpu_has_ptrauth(vcpu)) return false; ctxt = this_cpu_ptr(&kvm_hyp_ctxt); @@ -409,6 +322,92 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) return true; } +static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && + handle_tx2_tvm(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) + return kvm_hyp_handle_ptrauth(vcpu, exit_code); + + return false; +} + +static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + return false; +} + +static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + return false; +} + +static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { + bool valid; + + valid = kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && + kvm_vcpu_dabt_isvalid(vcpu) && + !kvm_vcpu_abt_issea(vcpu) && + !kvm_vcpu_abt_iss1tw(vcpu); + + if (valid) { + int ret = __vgic_v2_perform_cpuif_access(vcpu); + + if (ret == 1) + return true; + + /* Promote an illegal access to an SError.*/ + if (ret == -1) + *exit_code = ARM_EXCEPTION_EL1_SERROR; + } + } + + return false; +} + +typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); + +static const exit_handler_fn 
*kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); + +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); + +/* + * Allow the hypervisor to handle the exit with an exit handler if it has one. + * + * Returns true if the hypervisor handled the exit, and control should go back + * to the guest, or false if it hasn't. + */ +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + exit_handler_fn fn; + + fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; + + if (fn) + return fn(vcpu, exit_code); + + return false; +} + /* * Return true when we were able to fixup the guest exit and should return to * the guest, false when we should restore the host state and return to the @@ -422,6 +421,12 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) */ vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); + /* + * Check whether we want to repaint the state one way or + * another. + */ + early_exit_filter(vcpu, exit_code); + if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); @@ -450,59 +455,9 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (*exit_code != ARM_EXCEPTION_TRAP) goto exit; - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 && - handle_tx2_tvm(vcpu)) + /* Check if there's an exit handler and allow it to handle the exit. */ + if (kvm_hyp_handle_exit(vcpu, exit_code)) goto guest; - - /* - * We trap the first access to the FP/SIMD to save the host context - * and restore the guest context lazily. - * If FP/SIMD is not implemented, handle the trap and inject an - * undefined instruction exception to the guest. - * Similarly for trapped SVE accesses. 
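The refactoring above replaces the open-coded if-cascade in fixup_guest_exit() with a table of handlers indexed by ESR_EL2.EC, so the VHE and nVHE worlds can install different arrays. The shape of that dispatch, reduced to a standalone sketch (the vcpu type and the single handler are stand-ins):

#include <stdbool.h>
#include <stdint.h>

struct vcpu;                                      /* stand-in */
typedef bool (*exit_handler_fn)(struct vcpu *, uint64_t *exit_code);

#define ESR_EC_MASK 0x3f                          /* EC is a 6-bit field */

static bool handle_fpsimd(struct vcpu *v, uint64_t *exit_code)
{
	return true;                              /* exit consumed */
}

static const exit_handler_fn handlers[ESR_EC_MASK + 1] = {
	[0x07] = handle_fpsimd,                   /* ESR_ELx_EC_FP_ASIMD */
};

/* Returns true if a handler consumed the exit and control should go
 * straight back to the guest, mirroring kvm_hyp_handle_exit() above. */
static bool hyp_handle_exit(struct vcpu *vcpu, uint64_t *exit_code,
			    unsigned int ec)
{
	exit_handler_fn fn = handlers[ec & ESR_EC_MASK];

	return fn ? fn(vcpu, exit_code) : false;
}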
- */ - if (__hyp_handle_fpsimd(vcpu)) - goto guest; - - if (__hyp_handle_ptrauth(vcpu)) - goto guest; - - if (!__populate_fault_info(vcpu)) - goto guest; - - if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { - bool valid; - - valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW && - kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && - kvm_vcpu_dabt_isvalid(vcpu) && - !kvm_vcpu_abt_issea(vcpu) && - !kvm_vcpu_abt_iss1tw(vcpu); - - if (valid) { - int ret = __vgic_v2_perform_cpuif_access(vcpu); - - if (ret == 1) - goto guest; - - /* Promote an illegal access to an SError.*/ - if (ret == -1) - *exit_code = ARM_EXCEPTION_EL1_SERROR; - - goto exit; - } - } - - if (static_branch_unlikely(&vgic_v3_cpuif_trap) && - (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 || - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { - int ret = __vgic_v3_perform_cpuif_access(vcpu); - - if (ret == 1) - goto guest; - } - exit: /* Return to the host kernel and handle the exit */ return false; diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 7ecca8b07851..baa5b9b3dde5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -195,7 +195,7 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); - if (has_vhe() || vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY) + if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); } @@ -212,7 +212,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); - if (has_vhe() || vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY) + if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/clock.h b/arch/arm64/kvm/hyp/include/nvhe/clock.h new file mode 100644 index 000000000000..7e5c2d2b1886 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/clock.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARM64_KVM_HYP_NVHE_CLOCK_H +#define __ARM64_KVM_HYP_NVHE_CLOCK_H +#include + +#include + +#ifdef CONFIG_TRACING +void trace_clock_update(struct kvm_nvhe_clock_data *data); +u64 trace_clock(void); +#else +static inline void trace_clock_update(struct kvm_nvhe_clock_data *data) { } +static inline u64 trace_clock(void) { return 0; } +#endif +#endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h new file mode 100644 index 000000000000..1becb10ecd80 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran + */ +#ifndef __KVM_HYP_FFA_H +#define __KVM_HYP_FFA_H + +#include + +#define FFA_MIN_FUNC_NUM 0x60 +#define FFA_MAX_FUNC_NUM 0x7F + +int hyp_ffa_init(void *pages); +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); + +#endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 0a048dc06a7d..9330b13075f8 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -7,7 +7,7 @@ #include #include -#define HYP_NO_ORDER USHRT_MAX +#define HYP_NO_ORDER 0xff struct hyp_pool { /* 
@@ -19,11 +19,11 @@ struct hyp_pool { struct list_head free_area[MAX_ORDER]; phys_addr_t range_start; phys_addr_t range_end; - unsigned short max_order; + u8 max_order; }; /* Allocation */ -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order); void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h new file mode 100644 index 000000000000..639277053e6d --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ARM64_KVM_NVHE_IOMMU_H__ +#define __ARM64_KVM_NVHE_IOMMU_H__ + +#include +#include + +#include + +struct pkvm_iommu; + +struct pkvm_iommu_ops { + /* + * Global driver initialization called before devices are registered. + * Driver-specific arguments are passed in a buffer shared by the host. + * The buffer memory has been pinned in EL2 but host retains R/W access. + * Extra care must be taken when reading from it to avoid TOCTOU bugs. + * If the driver maintains its own page tables, it is expected to + * initialize them to all memory owned by the host. + * Driver initialization lock held during callback. + */ + int (*init)(void *data, size_t size); + + /* + * Driver-specific validation of a device that is being registered. + * All fields of the device struct have been populated. + * Called with the host lock held. + */ + int (*validate)(struct pkvm_iommu *dev); + + /* + * Validation of a new child device that is being register by + * the parent device the child selected. Called with the host lock held. + */ + int (*validate_child)(struct pkvm_iommu *dev, struct pkvm_iommu *child); + + /* + * Callback to apply a host stage-2 mapping change at driver level. + * Called before 'host_stage2_idmap_apply' with host lock held. + */ + void (*host_stage2_idmap_prepare)(phys_addr_t start, phys_addr_t end, + enum kvm_pgtable_prot prot); + + /* + * Callback to apply a host stage-2 mapping change at device level. + * Called after 'host_stage2_idmap_prepare' with host lock held. + */ + void (*host_stage2_idmap_apply)(struct pkvm_iommu *dev, + phys_addr_t start, phys_addr_t end); + + /* + * Callback to finish a host stage-2 mapping change at device level. + * Called after 'host_stage2_idmap_apply' with host lock held. + */ + void (*host_stage2_idmap_complete)(struct pkvm_iommu *dev); + + /* Power management callbacks. Called with host lock held. */ + int (*suspend)(struct pkvm_iommu *dev); + int (*resume)(struct pkvm_iommu *dev); + + /* + * Host data abort handler callback. Called with host lock held. + * Returns true if the data abort has been handled. + */ + bool (*host_dabt_handler)(struct pkvm_iommu *dev, + struct kvm_cpu_context *host_ctxt, + u32 esr, size_t off); + + /* Amount of memory allocated per-device for use by the driver. 
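The ops table above splits a host stage-2 idmap change into three phases so a driver can batch its own page-table update (prepare), poke each affected device instance (apply), and pay for synchronization only once at the end (complete). The calling convention, as a hypothetical driver-agnostic walk (types and list layout are illustrative, not the pKVM structures):

struct iommu_dev { struct iommu_dev *next; };  /* illustrative */

struct iommu_ops {
	void (*prepare)(unsigned long start, unsigned long end);
	void (*apply)(struct iommu_dev *d, unsigned long start, unsigned long end);
	void (*complete)(struct iommu_dev *d);
};

/* All three phases run with the host lock held. */
static void idmap_update(const struct iommu_ops *ops, struct iommu_dev *devs,
			 unsigned long start, unsigned long end)
{
	struct iommu_dev *d;

	if (ops->prepare)
		ops->prepare(start, end);          /* driver-level change */
	for (d = devs; d; d = d->next)
		if (ops->apply)
			ops->apply(d, start, end); /* per-device change */
	for (d = devs; d; d = d->next)
		if (ops->complete)
			ops->complete(d);          /* e.g. wait for TLB sync */
}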
*/ + size_t data_size; +}; + +struct pkvm_iommu { + struct pkvm_iommu *parent; + struct list_head list; + struct list_head siblings; + struct list_head children; + unsigned long id; + const struct pkvm_iommu_ops *ops; + phys_addr_t pa; + void *va; + size_t size; + bool powered; + char data[]; +}; + +int __pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size); +int __pkvm_iommu_register(unsigned long dev_id, unsigned long drv_id, + phys_addr_t dev_pa, size_t dev_size, + unsigned long parent_id, + void *kern_mem_va, size_t mem_size); +int __pkvm_iommu_pm_notify(unsigned long dev_id, + enum pkvm_iommu_pm_event event); +int __pkvm_iommu_finalize(void); +int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, + phys_addr_t *end); +bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, + phys_addr_t fault_pa); +void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, + enum kvm_pgtable_prot prot); + +#endif /* __ARM64_KVM_NVHE_IOMMU_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b58c910babaf..b4050138ada9 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -8,8 +8,10 @@ #define __KVM_NVHE_MEM_PROTECT__ #include #include +#include #include #include +#include #include /* @@ -24,6 +26,13 @@ enum pkvm_page_state { PKVM_PAGE_OWNED = 0ULL, PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, + __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 | + KVM_PGTABLE_PROT_SW1, + + /* Meta-states which aren't encoded directly in the PTE's SW bits */ + PKVM_NOPAGE = BIT(0), + PKVM_PAGE_RESTRICTED_PROT = BIT(1), + PKVM_MODULE_DONT_TOUCH = BIT(2), }; #define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) @@ -38,29 +47,73 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) return prot & PKVM_PAGE_STATE_PROT_MASK; } -struct host_kvm { +struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; struct kvm_pgtable_mm_ops mm_ops; hyp_spinlock_t lock; }; -extern struct host_kvm host_kvm; +extern struct host_mmu host_mmu; -extern const u8 pkvm_hyp_id; +/* This corresponds to page-table locking order */ +enum pkvm_component_id { + PKVM_ID_HOST, + PKVM_ID_HYP, + PKVM_ID_GUEST, + PKVM_ID_FFA, + PKVM_ID_PROTECTED, + PKVM_ID_MAX = PKVM_ID_PROTECTED, +}; + +extern unsigned long hyp_nr_cpus; int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); +int __pkvm_host_unshare_hyp(u64 pfn); +int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa); +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); +int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); +int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); +int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); +int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, + u64 ipa, u64 *ppa); +int __pkvm_install_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); +int __pkvm_remove_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); +bool __pkvm_check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu); +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); -int 
host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); -int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot, + bool update_iommu); +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, enum pkvm_component_id owner_id); +int host_stage2_protect_pages_locked(phys_addr_t addr, u64 size); +int host_stage2_unmap_reg_locked(phys_addr_t start, u64 size); int kvm_host_prepare_stage2(void *pgt_pool_base); +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); +int hyp_register_host_perm_fault_handler(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); +int hyp_pin_shared_mem(void *from, void *to); +void hyp_unpin_shared_mem(void *from, void *to); +int host_stage2_get_leaf(phys_addr_t phys, kvm_pte_t *ptep, u32 *level); +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc); + +int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot); + +void destroy_hyp_vm_pgt(struct pkvm_hyp_vm *vm); +void drain_hyp_pool(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); + +void psci_mem_protect_inc(u64 n); +void psci_mem_protect_dec(u64 n); + static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 592b7edb3edb..a392021ef0ee 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -7,9 +7,16 @@ #include +/* + * Accesses to struct hyp_page flags are serialized by the host stage-2 + * page-table lock. + */ +#define MODULE_OWNED_PAGE BIT(0) + struct hyp_page { unsigned short refcount; - unsigned short order; + u8 order; + u8 flags; }; extern u64 __hyp_vmemmap; @@ -38,6 +45,10 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +/* + * Refcounting for 'struct hyp_page'. + * hyp_pool::lock must be held if atomic access to the refcount is required. 
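+ *
+ * For illustration (a sketch, not the verbatim allocator code), the helpers
+ * below are used in pairs under that lock:
+ *
+ *	hyp_spin_lock(&pool->lock);
+ *	hyp_page_ref_inc(p);
+ *	...
+ *	if (hyp_page_ref_dec_and_test(p))
+ *		... hand the page back to its hyp_pool ...
+ *	hyp_spin_unlock(&pool->lock);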
+ */ static inline int hyp_page_count(void *addr) { struct hyp_page *p = hyp_virt_to_page(addr); @@ -45,4 +56,27 @@ static inline int hyp_page_count(void *addr) return p->refcount; } +static inline void hyp_page_ref_inc(struct hyp_page *p) +{ + BUG_ON(p->refcount == USHRT_MAX); + p->refcount++; +} + +static inline void hyp_page_ref_dec(struct hyp_page *p) +{ + BUG_ON(!p->refcount); + p->refcount--; +} + +static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +{ + hyp_page_ref_dec(p); + return (p->refcount == 0); +} + +static inline void hyp_set_page_refcounted(struct hyp_page *p) +{ + BUG_ON(p->refcount); + p->refcount = 1; +} #endif /* __KVM_HYP_MEMORY_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index c9a8f535212e..66a6d71bd4e0 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -10,87 +10,28 @@ #include #include -#define HYP_MEMBLOCK_REGIONS 128 -extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; -extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; -extern struct hyp_pool hpool; -extern u64 __io_map_base; +extern const struct pkvm_module_ops module_ops; + +int hyp_create_pcpu_fixmap(void); +void *hyp_fixmap_map(phys_addr_t phys); +void hyp_fixmap_unmap(void); +void hyp_poison_page(phys_addr_t phys); int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); -int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); +int hyp_back_vmemmap(phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); -unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, - enum kvm_pgtable_prot prot); - -static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, - unsigned long *start, unsigned long *end) -{ - unsigned long nr_pages = size >> PAGE_SHIFT; - struct hyp_page *p = hyp_phys_to_page(phys); - - *start = (unsigned long)p; - *end = *start + nr_pages * sizeof(struct hyp_page); - *start = ALIGN_DOWN(*start, PAGE_SIZE); - *end = ALIGN(*end, PAGE_SIZE); -} - -static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) -{ - unsigned long total = 0, i; - - /* Provision the worst case scenario */ - for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { - nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); - total += nr_pages; - } - - return total; -} - -static inline unsigned long __hyp_pgtable_total_pages(void) -{ - unsigned long res = 0, i; - - /* Cover all of memory with page-granularity */ - for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { - struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; - res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); - } - - return res; -} - -static inline unsigned long hyp_s1_pgtable_pages(void) -{ - unsigned long res; - - res = __hyp_pgtable_total_pages(); - - /* Allow 1 GiB for private mappings */ - res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); - - return res; -} - -static inline unsigned long host_s2_pgtable_pages(void) -{ - unsigned long res; - - /* - * Include an extra 16 pages to safely upper-bound the worst case of - * concatenated pgds. 
- */ - res = __hyp_pgtable_total_pages() + 16; - - /* Allow 1 GiB for MMIO mappings */ - res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); - - return res; -} +int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot, + unsigned long *haddr); +int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); +void pkvm_remove_mappings(void *from, void *to); +int __pkvm_map_module_page(u64 pfn, void *va, enum kvm_pgtable_prot prot, bool is_protected); +void __pkvm_unmap_module_page(u64 pfn, void *va); +void *__pkvm_alloc_module_va(u64 nr_pages); #endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h new file mode 100644 index 000000000000..f339d881889c --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -0,0 +1,36 @@ +#include + +#define HCALL_HANDLED 0 +#define HCALL_UNHANDLED -1 + +int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)); +int __pkvm_register_default_trap_handler(bool (*cb)(struct kvm_cpu_context *)); +int __pkvm_register_illegal_abt_notifier(void (*cb)(struct kvm_cpu_context *)); +int __pkvm_register_hyp_panic_notifier(void (*cb)(struct kvm_cpu_context *)); + +enum pkvm_psci_notification; +int __pkvm_register_psci_notifier(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)); + +int reset_pkvm_priv_hcall_limit(void); + +#ifdef CONFIG_MODULES +int __pkvm_init_module(void *module_init); +int __pkvm_register_hcall(unsigned long hfn_hyp_va); +int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt); +void pkvm_modules_lock(void); +void pkvm_modules_unlock(void); +bool pkvm_modules_enabled(void); +int __pkvm_close_module_registration(void); +#else +static inline int __pkvm_init_module(void *module_init) { return -EOPNOTSUPP; } +static inline int +__pkvm_register_hcall(unsigned long hfn_hyp_va) { return -EOPNOTSUPP; } +static inline int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt) +{ + return HCALL_UNHANDLED; +} +static inline void pkvm_modules_lock(void) { } +static inline void pkvm_modules_unlock(void) { } +static inline bool pkvm_modules_enabled(void) { return false; } +static inline int __pkvm_close_module_registration(void) { return -EOPNOTSUPP; } +#endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h new file mode 100644 index 000000000000..c373844cea79 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba + */ + +#ifndef __ARM64_KVM_NVHE_PKVM_H__ +#define __ARM64_KVM_NVHE_PKVM_H__ + +#include + +#include +#include + +/* + * Holds the relevant data for maintaining the vcpu state completely at hyp. + */ +struct pkvm_hyp_vcpu { + struct kvm_vcpu vcpu; + + /* Backpointer to the host's (untrusted) vCPU instance. */ + struct kvm_vcpu *host_vcpu; + + /* + * If this hyp vCPU is loaded, then this is a backpointer to the + * per-cpu pointer tracking us. Otherwise, NULL if not loaded. + */ + struct pkvm_hyp_vcpu **loaded_hyp_vcpu; + + /* Tracks exit code for the protected guest. */ + u32 exit_code; + + /* + * Track the power state transition of a protected vcpu. + * Can be in one of three states: + * PSCI_0_2_AFFINITY_LEVEL_ON + * PSCI_0_2_AFFINITY_LEVEL_OFF + * PSCI_0_2_AFFINITY_LEVEL_PENDING + */ + int power_state; +}; + +/* + * Holds the relevant data for running a protected vm. 
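+ *
+ * Note that host_kvm below points at host-owned, untrusted memory: as the
+ * hyp-main.c handlers later in this series illustrate, the hypervisor only
+ * accesses such host state one field at a time via READ_ONCE()/WRITE_ONCE().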
+ */ +struct pkvm_hyp_vm { + struct kvm kvm; + + /* Backpointer to the host's (untrusted) KVM instance. */ + struct kvm *host_kvm; + + /* The guest's stage-2 page-table managed by the hypervisor. */ + struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; + struct hyp_pool pool; + hyp_spinlock_t lock; + + /* Primary vCPU pending entry to the pvmfw */ + struct pkvm_hyp_vcpu *pvmfw_entry_vcpu; + + /* + * The number of vcpus initialized and ready to run. + * Modifying this is protected by 'vm_table_lock'. + */ + unsigned int nr_vcpus; + + /* + * True when the guest is being torn down. When in this state, the + * guest's vCPUs can't be loaded anymore, but its pages can be + * reclaimed by the host. + */ + bool is_dying; + + /* Array of the hyp vCPU structures for this VM. */ + struct pkvm_hyp_vcpu *vcpus[]; +}; + +static inline struct pkvm_hyp_vm * +pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); +} + +static inline bool vcpu_is_protected(struct kvm_vcpu *vcpu) +{ + if (!is_protected_kvm_enabled()) + return false; + + return vcpu->kvm->arch.pkvm.enabled; +} + +static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return vcpu_is_protected(&hyp_vcpu->vcpu); +} + +extern phys_addr_t pvmfw_base; +extern phys_addr_t pvmfw_size; + +void pkvm_hyp_vm_table_init(void *tbl); + +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva, unsigned long last_ran_hva); +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva); +int __pkvm_start_teardown_vm(pkvm_handle_t handle); +int __pkvm_finalize_teardown_vm(pkvm_handle_t handle); +int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa); + +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx); +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); + +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_reset_pvm_sys_regs(struct kvm_vcpu *vcpu); +int kvm_check_pvm_sysreg_table(void); + +void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); + +bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_hyp_handle_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); + +struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *vm, u64 mpidr); + +static inline bool pkvm_hyp_vm_has_pvmfw(struct pkvm_hyp_vm *vm) +{ + return vm->kvm.arch.pkvm.pvmfw_load_addr != PVMFW_INVALID_LOAD_ADDR; +} + +static inline bool pkvm_ipa_range_has_pvmfw(struct pkvm_hyp_vm *vm, + u64 ipa_start, u64 ipa_end) +{ + struct kvm_protected_vm *pkvm = &vm->kvm.arch.pkvm; + u64 pvmfw_load_end = pkvm->pvmfw_load_addr + pvmfw_size; + + if (!pkvm_hyp_vm_has_pvmfw(vm)) + return false; + + return ipa_end > pkvm->pvmfw_load_addr && ipa_start < pvmfw_load_end; +} + +int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, + u64 size); +void pkvm_poison_pvmfw_pages(void); + +#endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/serial.h b/arch/arm64/kvm/hyp/include/nvhe/serial.h new file mode 100644 index 000000000000..85ff8fdd6c9c --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/serial.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ARM64_KVM_NVHE_SERIAL_H__ 
+#define __ARM64_KVM_NVHE_SERIAL_H__ + +void hyp_puts(const char *s); +void hyp_putx64(u64 x); +void hyp_putc(char c); +int __pkvm_register_serial_driver(void (*driver_cb)(char)); + +#endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 4652fd04bdbe..7c7ea8c55405 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -28,9 +28,17 @@ typedef union hyp_spinlock { }; } hyp_spinlock_t; +#define __HYP_SPIN_LOCK_INITIALIZER \ + { .__val = 0 } + +#define __HYP_SPIN_LOCK_UNLOCKED \ + ((hyp_spinlock_t) __HYP_SPIN_LOCK_INITIALIZER) + +#define DEFINE_HYP_SPINLOCK(x) hyp_spinlock_t x = __HYP_SPIN_LOCK_UNLOCKED + #define hyp_spin_lock_init(l) \ do { \ - *(l) = (hyp_spinlock_t){ .__val = 0 }; \ + *(l) = __HYP_SPIN_LOCK_UNLOCKED; \ } while (0) static inline void hyp_spin_lock(hyp_spinlock_t *lock) diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h new file mode 100644 index 000000000000..ff5d047e8715 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h @@ -0,0 +1,115 @@ +#ifndef __ARM64_KVM_HYP_NVHE_TRACE_H +#define __ARM64_KVM_HYP_NVHE_TRACE_H + +#include + +#include +#include + +#include +#include +#include + +#ifdef CONFIG_TRACING + +struct hyp_buffer_page { + struct list_head list; + struct buffer_data_page *page; + atomic_t write; + atomic_t entries; +}; + +#define HYP_RB_UNUSED 0 +#define HYP_RB_READY 1 +#define HYP_RB_WRITE 2 + +struct hyp_rb_per_cpu { + struct hyp_buffer_page *tail_page; + struct hyp_buffer_page *reader_page; + struct hyp_buffer_page *head_page; + struct hyp_buffer_page *bpages; + unsigned long nr_pages; + atomic64_t write_stamp; + atomic_t pages_touched; + atomic_t nr_entries; + atomic_t status; + atomic_t overrun; +}; + +static inline bool __start_write_hyp_rb(struct hyp_rb_per_cpu *rb) +{ + /* + * Paired with rb_cpu_init() + */ + return atomic_cmpxchg_acquire(&rb->status, HYP_RB_READY, HYP_RB_WRITE) + != HYP_RB_UNUSED; +} + +static inline void __stop_write_hyp_rb(struct hyp_rb_per_cpu *rb) +{ + /* + * Paired with rb_cpu_teardown() + */ + atomic_set_release(&rb->status, HYP_RB_READY); +} + +struct hyp_rb_per_cpu; +DECLARE_PER_CPU(struct hyp_rb_per_cpu, trace_rb); + +void *rb_reserve_trace_entry(struct hyp_rb_per_cpu *cpu_buffer, unsigned long length); + +int __pkvm_start_tracing(unsigned long pack_va, size_t pack_size); +void __pkvm_stop_tracing(void); +int __pkvm_rb_swap_reader_page(int cpu); +int __pkvm_rb_update_footers(int cpu); +int __pkvm_enable_event(unsigned short id, bool enable); + +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + HYP_EVENT_FORMAT(__name, __struct); \ + extern atomic_t __name##_enabled; \ + extern unsigned short hyp_event_id_##__name; \ + static inline void trace_##__name(__proto) \ + { \ + size_t length = sizeof(struct trace_hyp_format_##__name); \ + struct hyp_rb_per_cpu *rb = this_cpu_ptr(&trace_rb); \ + struct trace_hyp_format_##__name *__entry; \ + \ + if (!atomic_read(&__name##_enabled)) \ + return; \ + if (!__start_write_hyp_rb(rb)) \ + return; \ + __entry = rb_reserve_trace_entry(rb, length); \ + __entry->hdr.id = hyp_event_id_##__name; \ + __assign \ + __stop_write_hyp_rb(rb); \ + } + +/* TODO: atomic_t to static_branch */ + +#else +static inline int __pkvm_start_tracing(unsigned long pack_va, size_t pack_size) +{ + return -ENODEV; +} + +static inline void __pkvm_stop_tracing(void) { } + +static inline int __pkvm_rb_swap_reader_page(int cpu) +{ + return -ENODEV; 
+} + +static inline int __pkvm_rb_update_footers(int cpu) +{ + return -ENODEV; +} + +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + static inline void trace_##__name(__proto) {} + +static inline int __pkvm_enable_event(unsigned short id, bool enable) +{ + return -ENODEV; +} +#endif +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore index 5b6c43cc96f8..899547d88045 100644 --- a/arch/arm64/kvm/hyp/nvhe/.gitignore +++ b/arch/arm64/kvm/hyp/nvhe/.gitignore @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -gen-hyprel hyp.lds hyp-reloc.S diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 964c2134ea1e..adb102979ec1 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -1,101 +1,20 @@ # SPDX-License-Identifier: GPL-2.0 -# -# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part -# - -asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS - -hostprogs := gen-hyprel -HOST_EXTRACFLAGS += -I$(objtree)/include - lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ - hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o -obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ +hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ + hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o iommu.o \ + serial.o +hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o -obj-y += $(lib-objs) +hyp-obj-$(CONFIG_TRACING) += clock.o events.o trace.o +hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o +hyp-obj-$(CONFIG_MODULES) += modules.o +hyp-obj-y += $(lib-objs) -## -## Build rules for compiling nVHE hyp code -## Output of this folder is `kvm_nvhe.o`, a partially linked object -## file containing all nVHE hyp code and data. -## - -hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y)) -obj-y := kvm_nvhe.o -extra-y := $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o - -# 1) Compile all source files to `.nvhe.o` object files. The file extension -# avoids file name clashes for files shared with VHE. -$(obj)/%.nvhe.o: $(src)/%.c FORCE - $(call if_changed_rule,cc_o_c) -$(obj)/%.nvhe.o: $(src)/%.S FORCE - $(call if_changed_rule,as_o_S) - -# 2) Compile linker script. $(obj)/hyp.lds: $(src)/hyp.lds.S FORCE $(call if_changed_dep,cpp_lds_S) -# 3) Partially link all '.nvhe.o' files and apply the linker script. -# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'. -# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before -# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to -# keep the dependency on the target while avoiding an error from -# GNU ld if the linker script is passed to it twice. -LDFLAGS_kvm_nvhe.tmp.o := -r -T -$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE - $(call if_changed,ld) - -# 4) Generate list of hyp code/data positions that need to be relocated at -# runtime. Because the hypervisor is part of the kernel binary, relocations -# produce a kernel VA. 
We enumerate relocations targeting hyp at build time -# and convert the kernel VAs at those positions to hyp VAs. -$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE - $(call if_changed,hyprel) - -# 5) Compile hyp-reloc.S and link it into the existing partially linked object. -# The object file now contains a section with pointers to hyp positions that -# will contain kernel VAs at runtime. These pointers have relocations on them -# so that they get updated as the hyp object is linked into `vmlinux`. -LDFLAGS_kvm_nvhe.rel.o := -r -$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE - $(call if_changed,ld) - -# 6) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. -# Prefixes names of ELF symbols with '__kvm_nvhe_'. -$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE - $(call if_changed,hypcopy) - -# The HYPREL command calls `gen-hyprel` to generate an assembly file with -# a list of relocations targeting hyp code/data. -quiet_cmd_hyprel = HYPREL $@ - cmd_hyprel = $(obj)/gen-hyprel $< > $@ - -# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names -# to avoid clashes with VHE code/data. -quiet_cmd_hypcopy = HYPCOPY $@ - cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ - -# Remove ftrace, Shadow Call Stack, and CFI CFLAGS. -# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. -KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) -# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile' -# when profile optimization is applied. gen-hyprel does not support SHT_REL and -# causes a build failure. Remove profile optimization flags. -KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) - -# KVM nVHE code is run at a different exception code with a different map, so -# compiler instrumentation that inserts callbacks or checks into the code may -# cause crashes. Just disable it. -GCOV_PROFILE := n -KASAN_SANITIZE := n -UBSAN_SANITIZE := n -KCOV_INSTRUMENT := n - -# Skip objtool checking for this directory because nVHE code is compiled with -# non-standard build rules. -OBJECT_FILES_NON_STANDARD := y +include $(srctree)/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe +obj-y := kvm_nvhe.o diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile.module b/arch/arm64/kvm/hyp/nvhe/Makefile.module new file mode 100644 index 000000000000..d3ad4468b50d --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/Makefile.module @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +$(obj)/hyp.lds: arch/arm64/kvm/hyp/nvhe/module.lds.S FORCE + $(call if_changed_dep,cpp_lds_S) + +include $(srctree)/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe b/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe new file mode 100644 index 000000000000..b63a14897b0b --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part +# + +asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS + +# Tracepoint and MMIO logging symbols should not be visible at nVHE KVM as +# there is no way to execute them and any such MMIO access from nVHE KVM +# will explode instantly (Words of Marc Zyngier). So introduce a generic flag +# __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM. 
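+#
+# For reference, assuming the generic accessor definitions in
+# include/asm-generic/io.h, the flag is consumed roughly as:
+#
+#   #if IS_ENABLED(CONFIG_TRACE_MMIO_ACCESS) && !(defined(__DISABLE_TRACE_MMIO__))
+#   ... traced readl()/writel() wrappers ...
+#   #endif
+#
+# so defining it below compiles the rwmmio trace hooks out of the hyp object.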
+ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ +ccflags-y += -fno-stack-protector \ + -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) + +HYPREL := arch/arm64/tools/gen-hyprel + +## +## Build rules for compiling nVHE hyp code +## Output of this folder is `kvm_nvhe.o`, a partially linked object +## file containing all nVHE hyp code and data. +## + +hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y)) +targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o + +# 1) Compile all source files to `.nvhe.o` object files. The file extension +# avoids file name clashes for files shared with VHE. +$(obj)/%.nvhe.o: $(src)/%.c FORCE + $(call if_changed_rule,cc_o_c) +$(obj)/%.nvhe.o: $(src)/%.S FORCE + $(call if_changed_rule,as_o_S) + +# 2) Partially link all '.nvhe.o' files and apply the linker script. +# Prefixes names of ELF sections with '.hyp', e.g. '.hyp.text'. +# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before +# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to +# keep the dependency on the target while avoiding an error from +# GNU ld if the linker script is passed to it twice. +LDFLAGS_kvm_nvhe.tmp.o := -r -T +$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE + $(call if_changed,ld) + +# 3) Generate list of hyp code/data positions that need to be relocated at +# runtime. Because the hypervisor is part of the kernel binary, relocations +# produce a kernel VA. We enumerate relocations targeting hyp at build time +# and convert the kernel VAs at those positions to hyp VAs. +$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o FORCE + $(call if_changed,hyprel) + +# 4) Compile hyp-reloc.S and link it into the existing partially linked object. +# The object file now contains a section with pointers to hyp positions that +# will contain kernel VAs at runtime. These pointers have relocations on them +# so that they get updated as the hyp object is linked into `vmlinux`. +LDFLAGS_kvm_nvhe.rel.o := -r +$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE + $(call if_changed,ld) + +# 5) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. +# Prefixes names of ELF symbols with '__kvm_nvhe_'. +$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE + $(call if_changed,hypcopy) + +# The HYPREL command calls `gen-hyprel` to generate an assembly file with +# a list of relocations targeting hyp code/data. +quiet_cmd_hyprel = HYPREL $@ + cmd_hyprel = $(HYPREL) $< > $@ + +# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names +# to avoid clashes with VHE code/data. +quiet_cmd_hypcopy = HYPCOPY $@ + cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ + +# Remove ftrace, Shadow Call Stack, and CFI CFLAGS. +# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) +# Starting from 13.0.0, llvm emits the SHT_REL section '.llvm.call-graph-profile' +# when profile optimization is applied. gen-hyprel does not support SHT_REL and +# causes a build failure. Remove profile optimization flags. +KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) + +# KVM nVHE code is run at a different exception level with a different map, so +# compiler instrumentation that inserts callbacks or checks into the code may +# cause crashes. Just disable it.
+GCOV_PROFILE := n +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +# Skip objtool checking for this directory because nVHE code is compiled with +# non-standard build rules. +OBJECT_FILES_NON_STANDARD := y diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S index 958734f4d6b0..4c447f2a31e1 100644 --- a/arch/arm64/kvm/hyp/nvhe/cache.S +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -11,3 +11,13 @@ SYM_FUNC_START_PI(dcache_clean_inval_poc) dcache_by_line_op civac, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(dcache_clean_inval_poc) + +SYM_FUNC_START_PI(icache_inval_pou) +alternative_if ARM64_HAS_CACHE_DIC + isb + ret +alternative_else_nop_endif + + invalidate_icache_by_line x0, x1, x2, x3 + ret +SYM_FUNC_END_PI(icache_inval_pou) diff --git a/arch/arm64/kvm/hyp/nvhe/clock.c b/arch/arm64/kvm/hyp/nvhe/clock.c new file mode 100644 index 000000000000..4ff87e86787c --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/clock.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include + +static struct kvm_nvhe_clock_data trace_clock_data; + +/* + * Updated without any locks! This is fine because tracing, the sole user of + * this clock, orders the memory accesses and protects against races between + * reads and updates. + */ +void trace_clock_update(struct kvm_nvhe_clock_data *data) +{ + trace_clock_data.mult = data->mult; + trace_clock_data.shift = data->shift; + trace_clock_data.epoch_ns = data->epoch_ns; + trace_clock_data.epoch_cyc = data->epoch_cyc; +} + +/* + * This clock relies on host-provided slope and epoch values to return + * something synchronized with the host. The downside is that we can't trust + * the output, which must not be used for anything other than debugging. + */ +u64 trace_clock(void) +{ + u64 cyc = __arch_counter_get_cntpct() - trace_clock_data.epoch_cyc; + __uint128_t ns; + + /* + * The host kernel can avoid the 64-bit overflow of the multiplication + * by updating the epoch value with a timer (see + * kernel/time/clocksource.c). The hypervisor doesn't have that option, + * so let's do a more costly 128-bit multiplication here.
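+	 *
+	 * In other words, with cyc = cntpct_el0 - epoch_cyc, this computes:
+	 *
+	 *	ns = epoch_ns + (((u128)cyc * mult) >> shift)
+	 *
+	 * A 64-bit product would wrap once cyc * mult exceeds 2^64, i.e.
+	 * after roughly 2^64 / mult counter ticks.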
+ */ + ns = (__uint128_t)cyc * trace_clock_data.mult; + ns >>= trace_clock_data.shift; + + return (u64)ns + trace_clock_data.epoch_ns; +} diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index df361d839902..e17455773b98 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -84,10 +84,10 @@ static void __debug_restore_trace(u64 trfcr_el1) void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); /* Disable and flush Self-Hosted Trace generation */ - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1); } @@ -98,9 +98,9 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1); } diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c index 1306c430ab87..00de04153cc6 100644 --- a/arch/arm64/kvm/hyp/nvhe/early_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/early_alloc.c @@ -43,6 +43,9 @@ void *hyp_early_alloc_page(void *arg) return hyp_early_alloc_contig(1); } +static void hyp_early_alloc_get_page(void *addr) { } +static void hyp_early_alloc_put_page(void *addr) { } + void hyp_early_alloc_init(void *virt, unsigned long size) { base = cur = (unsigned long)virt; @@ -51,4 +54,6 @@ void hyp_early_alloc_init(void *virt, unsigned long size) hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page; hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt; hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys; + hyp_early_alloc_mm_ops.get_page = hyp_early_alloc_get_page; + hyp_early_alloc_mm_ops.put_page = hyp_early_alloc_put_page; } diff --git a/arch/arm64/kvm/hyp/nvhe/events.c b/arch/arm64/kvm/hyp/nvhe/events.c new file mode 100644 index 000000000000..9deb2c31db5e --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/events.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 Google LLC + */ + +#include +#include + +extern struct hyp_event_id __hyp_event_ids_start[]; +extern struct hyp_event_id __hyp_event_ids_end[]; + +#undef HYP_EVENT +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + atomic_t __ro_after_init __name##_enabled = ATOMIC_INIT(0); \ + struct hyp_event_id hyp_event_id_##__name __section("_hyp_event_ids") = { \ + .data = (void *)&__name##_enabled, \ + } + +#include + +int __pkvm_enable_event(unsigned short id, bool enable) +{ + struct hyp_event_id *event_id = __hyp_event_ids_start; + atomic_t *enable_key; + + for (; (unsigned long)event_id < (unsigned long)__hyp_event_ids_end; + event_id++) { + if (event_id->id != id) + continue; + + enable_key = (atomic_t *)event_id->data; + enable_key = hyp_fixmap_map(__hyp_pa(enable_key)); + + atomic_set(enable_key, enable); + + hyp_fixmap_unmap(); + + return 0; + } + + return -EINVAL; +} diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c new file mode 100644 index 
000000000000..4d1c3c85b47d --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -0,0 +1,741 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by + * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware + * Framework for Arm A-profile", which is specified by Arm in document + * number DEN0077. + * + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran + * + * This driver hooks into the SMC trapping logic for the host and intercepts + * all calls falling within the FF-A range. Each call is either: + * + * - Forwarded on unmodified to the SPMD at EL3 + * - Rejected as "unsupported" + * - Accompanied by a host stage-2 page-table check/update and reissued + * + * Consequently, any attempts by the host to make guest memory pages + * accessible to the secure world using FF-A will be detected either here + * (in the case that the memory is already owned by the guest) or during + * donation to the guest (in the case that the memory was previously shared + * with the secure world). + * + * To allow the rolling-back of page-table updates and FF-A calls in the + * event of failure, operations involving the RXTX buffers are locked for + * the duration and are therefore serialised. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * "ID value 0 must be returned at the Non-secure physical FF-A instance" + * We share this ID with the host. + */ +#define HOST_FFA_ID 0 + +/* + * A buffer to hold the maximum descriptor size we can see from the host, + * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP + * when resolving the handle on the reclaim path. + */ +struct kvm_ffa_descriptor_buffer { + void *buf; + size_t len; +}; + +static struct kvm_ffa_descriptor_buffer ffa_desc_buf; + +struct kvm_ffa_buffers { + hyp_spinlock_t lock; + void *tx; + void *rx; +}; + +/* + * Note that we don't currently lock these buffers explicitly, instead + * relying on the locking of the host FFA buffers as we only have one + * client. + */ +static struct kvm_ffa_buffers hyp_buffers; +static struct kvm_ffa_buffers host_buffers; + +static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) +{ + *res = (struct arm_smccc_res) { + .a0 = FFA_ERROR, + .a2 = ffa_errno, + }; +} + +static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) +{ + if (ret == FFA_RET_SUCCESS) { + *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS, + .a2 = prop }; + } else { + ffa_to_smccc_error(res, ret); + } +} + +static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +{ + ffa_to_smccc_res_prop(res, ret, 0); +} + +static void ffa_set_retval(struct kvm_cpu_context *ctxt, + struct arm_smccc_res *res) +{ + cpu_reg(ctxt, 0) = res->a0; + cpu_reg(ctxt, 1) = res->a1; + cpu_reg(ctxt, 2) = res->a2; + cpu_reg(ctxt, 3) = res->a3; +} + +static bool is_ffa_call(u64 func_id) +{ + return ARM_SMCCC_IS_FAST_CALL(func_id) && + ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM && + ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; +} + +static int spmd_map_ffa_buffers(u64 ffa_page_count) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP, + hyp_virt_to_phys(hyp_buffers.tx), + hyp_virt_to_phys(hyp_buffers.rx), + ffa_page_count, + 0, 0, 0, 0, + &res); + + return res.a0 == FFA_SUCCESS ? 
FFA_RET_SUCCESS : res.a2; +} + +static int spmd_unmap_ffa_buffers(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_RXTX_UNMAP, + HOST_FFA_ID, + 0, 0, 0, 0, 0, 0, + &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static void spmd_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 fraglen, u32 endpoint_id) +{ + arm_smccc_1_1_smc(FFA_MEM_FRAG_TX, + handle_lo, handle_hi, fraglen, endpoint_id, + 0, 0, 0, + res); +} + +static void spmd_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 fragoff) +{ + arm_smccc_1_1_smc(FFA_MEM_FRAG_RX, + handle_lo, handle_hi, fragoff, HOST_FFA_ID, + 0, 0, 0, + res); +} + +static void spmd_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, + u32 fraglen) +{ + arm_smccc_1_1_smc(func_id, len, fraglen, + 0, 0, 0, 0, 0, + res); +} + +static void spmd_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 flags) +{ + arm_smccc_1_1_smc(FFA_MEM_RECLAIM, + handle_lo, handle_hi, flags, + 0, 0, 0, 0, + res); +} + +static void spmd_retrieve_req(struct arm_smccc_res *res, u32 len) +{ + arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ, + len, len, + 0, 0, 0, 0, 0, + res); +} + +static void do_ffa_rxtx_map(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(phys_addr_t, tx, ctxt, 1); + DECLARE_REG(phys_addr_t, rx, ctxt, 2); + DECLARE_REG(u32, npages, ctxt, 3); + int ret = 0; + void *rx_virt, *tx_virt; + + if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (host_buffers.tx) { + ret = FFA_RET_DENIED; + goto out_unlock; + } + + ret = spmd_map_ffa_buffers(npages); + if (ret) + goto out_unlock; + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unmap; + } + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_tx; + } + + tx_virt = hyp_phys_to_virt(tx); + ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_rx; + } + + rx_virt = hyp_phys_to_virt(rx); + ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unpin_tx; + } + + host_buffers.tx = tx_virt; + host_buffers.rx = rx_virt; + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); + return; + +err_unpin_tx: + hyp_unpin_shared_mem(tx_virt, tx_virt + 1); +err_unshare_rx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx)); +err_unshare_tx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx)); +err_unmap: + spmd_unmap_ffa_buffers(); + goto out_unlock; +} + +static void do_ffa_rxtx_unmap(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + int ret = 0; + + if (id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx))); + host_buffers.tx = NULL; + + hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx))); + host_buffers.rx = NULL; + + 
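+	/*
+	 * With the host buffers torn down, also drop the hyp TX/RX
+	 * registration with the SPMD (the counterpart of
+	 * spmd_map_ffa_buffers() above).
+	 */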
spmd_unmap_ffa_buffers(); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); +} + +static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nshared = __ffa_host_share_ranges(ranges, nranges); + int ret = 0; + + if (nshared != nranges) { + WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges); + int ret = 0; + + if (nunshared != nranges) { + WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static void do_ffa_mem_frag_tx(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, fraglen, ctxt, 3); + DECLARE_REG(u32, endpoint_id, ctxt, 4); + struct ffa_mem_region_addr_range *buf; + int ret = FFA_RET_INVALID_PARAMETERS; + u32 nr_ranges; + + if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) + goto out; + + if (fraglen % sizeof(*buf)) + goto out; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) + goto out_unlock; + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + nr_ranges = fraglen / sizeof(*buf); + + ret = ffa_host_share_ranges(buf, nr_ranges); + if (ret) { + /* + * We're effectively aborting the transaction, so we need + * to restore the global state to what it was prior to + * transmission of the first fragment. + */ + spmd_mem_reclaim(res, handle_lo, handle_hi, 0); + WARN_ON(res->a0 != FFA_SUCCESS); + goto out_unlock; + } + + spmd_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id); + if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX) + WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges)); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + + /* + * If for any reason this did not succeed, we're in trouble as we have + * now lost the content of the previous fragments and we can't roll back + * the host stage-2 changes. The pages previously marked as shared will + * remain stuck in that state forever, preventing the host from + * sharing/donating them again; this may lead to subsequent + * failures, but it will not compromise confidentiality.
+ */ + return; +} + +static __always_inline void do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, len, ctxt, 1); + DECLARE_REG(u32, fraglen, ctxt, 2); + DECLARE_REG(u64, addr_mbz, ctxt, 3); + DECLARE_REG(u32, npages_mbz, ctxt, 4); + struct ffa_composite_mem_region *reg; + struct ffa_mem_region *buf; + u32 offset, nr_ranges; + int ret = 0; + + BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && + func_id != FFA_FN64_MEM_LEND); + + if (addr_mbz || npages_mbz || fraglen > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (fraglen < sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + + offset = buf->ep_mem_access[0].composite_off; + if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + reg = (void *)buf + offset; + nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents; + if (nr_ranges % sizeof(reg->constituents[0])) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + nr_ranges /= sizeof(reg->constituents[0]); + ret = ffa_host_share_ranges(reg->constituents, nr_ranges); + if (ret) + goto out_unlock; + + spmd_mem_xfer(res, func_id, len, fraglen); + if (fraglen != len) { + if (res->a0 != FFA_MEM_FRAG_RX) + goto err_unshare; + + if (res->a3 != fraglen) + goto err_unshare; + } else if (res->a0 != FFA_SUCCESS) { + goto err_unshare; + } + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + return; + +err_unshare: + WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges)); + goto out_unlock; +} + +static void do_ffa_mem_reclaim(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, flags, ctxt, 3); + struct ffa_composite_mem_region *reg; + u32 offset, len, fraglen, fragoff; + struct ffa_mem_region *buf; + int ret = 0; + u64 handle; + + handle = PACK_HANDLE(handle_lo, handle_hi); + + hyp_spin_lock(&host_buffers.lock); + + buf = hyp_buffers.tx; + *buf = (struct ffa_mem_region) { + .sender_id = HOST_FFA_ID, + .handle = handle, + }; + + spmd_retrieve_req(res, sizeof(*buf)); + buf = hyp_buffers.rx; + if (res->a0 != FFA_MEM_RETRIEVE_RESP) + goto out_unlock; + + len = res->a1; + fraglen = res->a2; + + offset = buf->ep_mem_access[0].composite_off; + /* + * We can trust the SPMD to get this right, but let's at least + * check that we end up with something that doesn't look _completely_ + * bogus. 
+ */ + if (WARN_ON(offset > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { + ret = FFA_RET_ABORTED; + goto out_unlock; + } + + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + + buf = ffa_desc_buf.buf; + memcpy(buf, hyp_buffers.rx, fraglen); + + for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { + spmd_mem_frag_rx(res, handle_lo, handle_hi, fragoff); + if (res->a0 != FFA_MEM_FRAG_TX) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + fraglen = res->a3; + memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + } + + spmd_mem_reclaim(res, handle_lo, handle_hi, flags); + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + reg = (void *)buf + offset; + /* If the SPMD was happy, then we should be too. */ + WARN_ON(ffa_host_unshare_ranges(reg->constituents, + reg->addr_range_cnt)); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); + + if (ret) + ffa_to_smccc_res(res, ret); +} + +static bool ffa_call_unsupported(u64 func_id) +{ + switch (func_id) { + /* Unsupported memory management calls */ + case FFA_FN64_MEM_RETRIEVE_REQ: + case FFA_MEM_RETRIEVE_RESP: + case FFA_MEM_RELINQUISH: + case FFA_MEM_OP_PAUSE: + case FFA_MEM_OP_RESUME: + case FFA_MEM_FRAG_RX: + case FFA_FN64_MEM_DONATE: + /* Indirect message passing via RX/TX buffers */ + case FFA_MSG_SEND: + case FFA_MSG_POLL: + case FFA_MSG_WAIT: + /* 32-bit variants of 64-bit calls */ + case FFA_MSG_SEND_DIRECT_REQ: + case FFA_MSG_SEND_DIRECT_RESP: + case FFA_RXTX_MAP: + case FFA_MEM_DONATE: + case FFA_MEM_RETRIEVE_REQ: + return true; + } + + return false; +} + +static bool do_ffa_features(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + u64 prop = 0; + int ret = 0; + + if (ffa_call_unsupported(id)) { + ret = FFA_RET_NOT_SUPPORTED; + goto out_handled; + } + + switch (id) { + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + ret = FFA_RET_SUCCESS; + prop = 0; /* No support for dynamic buffers */ + goto out_handled; + default: + return false; + } + +out_handled: + ffa_to_smccc_res_prop(res, ret, prop); + return true; +} + +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, func_id, host_ctxt, 0); + struct arm_smccc_res res; + + if (!is_ffa_call(func_id)) + return false; + + switch (func_id) { + case FFA_FEATURES: + if (!do_ffa_features(&res, host_ctxt)) + return false; + goto out_handled; + /* Memory management */ + case FFA_FN64_RXTX_MAP: + do_ffa_rxtx_map(&res, host_ctxt); + goto out_handled; + case FFA_RXTX_UNMAP: + do_ffa_rxtx_unmap(&res, host_ctxt); + goto out_handled; + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt); + goto out_handled; + case FFA_MEM_RECLAIM: + do_ffa_mem_reclaim(&res, host_ctxt); + goto out_handled; + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); + goto out_handled; + case FFA_MEM_FRAG_TX: + do_ffa_mem_frag_tx(&res, host_ctxt); + goto out_handled; + } + + if (!ffa_call_unsupported(func_id)) + return false; /* Pass through */ + + ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); +out_handled: + ffa_set_retval(host_ctxt, &res); + return true; +} + +int hyp_ffa_init(void *pages) +{ + struct arm_smccc_res res; + size_t min_rxtx_sz; + void *tx, *rx; + + if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_1) + return 0; + + arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 == 
FFA_RET_NOT_SUPPORTED) + return 0; + + if (res.a0 != FFA_VERSION_1_0) + return -EOPNOTSUPP; + + arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + if (res.a2 != HOST_FFA_ID) + return -EINVAL; + + arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, + 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + switch (res.a2) { + case FFA_FEAT_RXTX_MIN_SZ_4K: + min_rxtx_sz = SZ_4K; + break; + case FFA_FEAT_RXTX_MIN_SZ_16K: + min_rxtx_sz = SZ_16K; + break; + case FFA_FEAT_RXTX_MIN_SZ_64K: + min_rxtx_sz = SZ_64K; + break; + default: + return -EINVAL; + } + + if (min_rxtx_sz > PAGE_SIZE) + return -EOPNOTSUPP; + + tx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + rx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + + ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) { + .buf = pages, + .len = PAGE_SIZE * + (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)), + }; + + hyp_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + .tx = tx, + .rx = rx, + }; + + host_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + }; + + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index d310d2b2c8b4..b6c0188c4b35 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -80,7 +80,7 @@ SYM_FUNC_START(__hyp_do_panic) mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) msr spsr_el2, lr - ldr lr, =nvhe_hyp_panic_handler + adr_l lr, nvhe_hyp_panic_handler hyp_kimg_va lr, x6 msr elr_el2, lr @@ -110,17 +110,14 @@ SYM_FUNC_START(__hyp_do_panic) b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) -.macro host_el1_sync_vect - .align 7 -.L__vect_start\@: - stp x0, x1, [sp, #-16]! - mrs x0, esr_el2 - ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH - cmp x0, #ESR_ELx_EC_HVC64 - b.ne __host_exit - +SYM_FUNC_START(__host_hvc) ldp x0, x1, [sp] // Don't fixup the stack yet + /* No stub for you, sonny Jim */ +alternative_if ARM64_KVM_PROTECTED_MODE + b __host_exit +alternative_else_nop_endif + /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR b.hs __host_exit @@ -128,15 +125,24 @@ SYM_FUNC_END(__hyp_do_panic) add sp, sp, #16 /* * Compute the idmap address of __kvm_handle_stub_hvc and - * jump there. Since we use kimage_voffset, do not use the - * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead - * (by loading it from the constant pool). + * jump there. * * Preserve x0-x4, which may contain stub parameters. */ - ldr x5, =__kvm_handle_stub_hvc + adr_l x5, __kvm_handle_stub_hvc hyp_pa x5, x6 br x5 +SYM_FUNC_END(__host_hvc) + +.macro host_el1_sync_vect + .align 7 +.L__vect_start\@: + stp x0, x1, [sp, #-16]! + mrs x0, esr_el2 + ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH + cmp x0, #ESR_ELx_EC_HVC64 + b.eq __host_hvc + b __host_exit .L__vect_end\@: .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) .error "host_el1_sync_vect larger than vector entry" @@ -145,6 +151,18 @@ SYM_FUNC_END(__hyp_do_panic) .macro invalid_host_el2_vect .align 7 + + /* + * Test whether the SP has overflowed, without corrupting a GPR. + * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit + * of SP should always be 1. 
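	 *
	 * (Background, assuming the kernel-side stack layout: each per-cpu hyp
	 * stack is one page mapped directly above an unmapped guard page, so
	 * bit PAGE_SHIFT of a valid SP is 1 and flips to 0 if SP slides into
	 * the guard page. The add/sub sequence below recovers both SP and x0
	 * without needing a scratch register.)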
+ */ + add sp, sp, x0 // sp' = sp + x0 + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp + tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@ + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + /* If a guest is loaded, panic out of it. */ stp x0, x1, [sp, #-16]! get_loaded_vcpu x0, x1 @@ -157,6 +175,13 @@ SYM_FUNC_END(__hyp_do_panic) * been partially clobbered by __host_enter. */ b hyp_panic + +.L__hyp_sp_overflow\@: + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 + + b hyp_panic_bad_stack + ASM_BUG() .endm .macro invalid_host_el1_vect @@ -190,15 +215,15 @@ SYM_CODE_START(__kvm_hyp_host_vector) invalid_host_el2_vect // FIQ EL2h invalid_host_el2_vect // Error EL2h - host_el1_sync_vect // Synchronous 64-bit EL1 - invalid_host_el1_vect // IRQ 64-bit EL1 - invalid_host_el1_vect // FIQ 64-bit EL1 - invalid_host_el1_vect // Error 64-bit EL1 + host_el1_sync_vect // Synchronous 64-bit EL1/EL0 + invalid_host_el1_vect // IRQ 64-bit EL1/EL0 + invalid_host_el1_vect // FIQ 64-bit EL1/EL0 + invalid_host_el1_vect // Error 64-bit EL1/EL0 - invalid_host_el1_vect // Synchronous 32-bit EL1 - invalid_host_el1_vect // IRQ 32-bit EL1 - invalid_host_el1_vect // FIQ 32-bit EL1 - invalid_host_el1_vect // Error 32-bit EL1 + host_el1_sync_vect // Synchronous 32-bit EL1/EL0 + invalid_host_el1_vect // IRQ 32-bit EL1/EL0 + invalid_host_el1_vect // FIQ 32-bit EL1/EL0 + invalid_host_el1_vect // Error 32-bit EL1/EL0 SYM_CODE_END(__kvm_hyp_host_vector) /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2da6aa8da868..8d8f7e255dec 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -4,35 +4,905 @@ * Author: Andrew Scull */ -#include +#include + +#include #include #include #include #include #include +#include #include +#include +#include #include +#include #include +#include +#include #include +#include +#include + +#include "../../sys_regs.h" + +/* + * Host FPSIMD state. Written to when the guest accesses its own FPSIMD state, + * and read when the guest state is live and we need to switch back to the host. + * + * Only valid when (fp_state == FP_STATE_GUEST_OWNED) in the hyp vCPU structure. + */ +static DEFINE_PER_CPU(struct user_fpsimd_state, loaded_host_fpsimd_state); + DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static bool (*default_host_smc_handler)(struct kvm_cpu_context *host_ctxt); +static bool (*default_trap_handler)(struct kvm_cpu_context *host_ctxt); + +int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)) +{ + return cmpxchg(&default_host_smc_handler, NULL, cb) ? -EBUSY : 0; +} + +int __pkvm_register_default_trap_handler(bool (*cb)(struct kvm_cpu_context *)) +{ + return cmpxchg(&default_trap_handler, NULL, cb) ? 
-EBUSY : 0; +} + +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + u64 nr_pages = VTCR_EL2_LVLS(hyp_vm->kvm.arch.vtcr) - 1; + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, nr_pages, + &host_vcpu->arch.pkvm_memcache); +} + +typedef void (*hyp_entry_exit_handler_fn)(struct pkvm_hyp_vcpu *); + +static void handle_pvm_entry_wfx(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + if (vcpu_get_flag(hyp_vcpu->host_vcpu, INCREMENT_PC)) { + vcpu_clear_flag(&hyp_vcpu->vcpu, PC_UPDATE_REQ); + kvm_incr_pc(&hyp_vcpu->vcpu); + } +} + +static void handle_pvm_entry_psci(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + u32 psci_fn = smccc_get_function(&hyp_vcpu->vcpu); + u64 ret = READ_ONCE(hyp_vcpu->host_vcpu->arch.ctxt.regs.regs[0]); + + switch (psci_fn) { + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + /* + * Check whether the cpu_on request to the host was successful. + * If not, reset the vcpu state from ON_PENDING to OFF. + * This could happen if this vcpu attempted to turn on another + * vcpu while that vcpu is in the process of turning itself + * off. + */ + if (ret != PSCI_RET_SUCCESS) { + unsigned long cpu_id = smccc_get_arg1(&hyp_vcpu->vcpu); + struct pkvm_hyp_vcpu *target_vcpu; + struct pkvm_hyp_vm *hyp_vm; + + hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + target_vcpu = pkvm_mpidr_to_hyp_vcpu(hyp_vm, cpu_id); + + if (target_vcpu && READ_ONCE(target_vcpu->power_state) == PSCI_0_2_AFFINITY_LEVEL_ON_PENDING) + WRITE_ONCE(target_vcpu->power_state, PSCI_0_2_AFFINITY_LEVEL_OFF); + + ret = PSCI_RET_INTERNAL_FAILURE; + } + + break; + default: + break; + } + + vcpu_set_reg(&hyp_vcpu->vcpu, 0, ret); +} + +static void handle_pvm_entry_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + u32 fn = smccc_get_function(&hyp_vcpu->vcpu); + + switch (fn) { + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + pkvm_refill_memcache(hyp_vcpu); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + fallthrough; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + fallthrough; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + vcpu_set_reg(&hyp_vcpu->vcpu, 0, SMCCC_RET_SUCCESS); + break; + default: + handle_pvm_entry_psci(hyp_vcpu); + break; + } +} + +static void handle_pvm_entry_sys64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* Exceptions have priority over anything else */ + if (vcpu_get_flag(host_vcpu, PENDING_EXCEPTION)) { + /* Exceptions caused by this should be undef exceptions. */ + u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + __vcpu_sys_reg(&hyp_vcpu->vcpu, ESR_EL1) = esr; + kvm_pend_exception(&hyp_vcpu->vcpu, EXCEPT_AA64_EL1_SYNC); + return; + } + + if (vcpu_get_flag(host_vcpu, INCREMENT_PC)) { + vcpu_clear_flag(&hyp_vcpu->vcpu, PC_UPDATE_REQ); + kvm_incr_pc(&hyp_vcpu->vcpu); + } + + if (!esr_sys64_to_params(hyp_vcpu->vcpu.arch.fault.esr_el2).is_write) { + /* r0 as transfer register between the guest and the host. */ + u64 rt_val = READ_ONCE(host_vcpu->arch.ctxt.regs.regs[0]); + int rt = kvm_vcpu_sys_get_rt(&hyp_vcpu->vcpu); + + vcpu_set_reg(&hyp_vcpu->vcpu, rt, rt_val); + } +} + +static void handle_pvm_entry_iabt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + unsigned long cpsr = *vcpu_cpsr(&hyp_vcpu->vcpu); + u32 esr = ESR_ELx_IL; + + if (!vcpu_get_flag(hyp_vcpu->host_vcpu, PENDING_EXCEPTION)) + return; + + /* + * If the host wants to inject an exception, get the syndrome and + * fault address.
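+	 *
+	 * For example, an instruction abort taken from guest EL0 is reported
+	 * to the guest as:
+	 *
+	 *	ESR_EL1 = (ESR_ELx_EC_IABT_LOW << ESR_ELx_EC_SHIFT) |
+	 *		  ESR_ELx_IL | ESR_ELx_FSC_EXTABT;
+	 *	FAR_EL1 = the faulting address taken from HFAR
+	 *
+	 * which is exactly what the code below assembles.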
+ */ + if ((cpsr & PSR_MODE_MASK) == PSR_MODE_EL0t) + esr |= (ESR_ELx_EC_IABT_LOW << ESR_ELx_EC_SHIFT); + else + esr |= (ESR_ELx_EC_IABT_CUR << ESR_ELx_EC_SHIFT); + + esr |= ESR_ELx_FSC_EXTABT; + + __vcpu_sys_reg(&hyp_vcpu->vcpu, ESR_EL1) = esr; + __vcpu_sys_reg(&hyp_vcpu->vcpu, FAR_EL1) = + kvm_vcpu_get_hfar(&hyp_vcpu->vcpu); + + /* Tell the run loop that we want to inject something */ + kvm_pend_exception(&hyp_vcpu->vcpu, EXCEPT_AA64_EL1_SYNC); +} + +static void handle_pvm_entry_dabt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + bool pc_update; + + /* Exceptions have priority over anything else */ + if (vcpu_get_flag(host_vcpu, PENDING_EXCEPTION)) { + unsigned long cpsr = *vcpu_cpsr(&hyp_vcpu->vcpu); + u32 esr = ESR_ELx_IL; + + if ((cpsr & PSR_MODE_MASK) == PSR_MODE_EL0t) + esr |= (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT); + else + esr |= (ESR_ELx_EC_DABT_CUR << ESR_ELx_EC_SHIFT); + + esr |= ESR_ELx_FSC_EXTABT; + + __vcpu_sys_reg(&hyp_vcpu->vcpu, ESR_EL1) = esr; + __vcpu_sys_reg(&hyp_vcpu->vcpu, FAR_EL1) = + kvm_vcpu_get_hfar(&hyp_vcpu->vcpu); + + /* Tell the run loop that we want to inject something */ + kvm_pend_exception(&hyp_vcpu->vcpu, EXCEPT_AA64_EL1_SYNC); + + /* Cancel potential in-flight MMIO */ + hyp_vcpu->vcpu.mmio_needed = false; + return; + } + + /* Handle PC increment on MMIO */ + pc_update = (hyp_vcpu->vcpu.mmio_needed && + vcpu_get_flag(host_vcpu, INCREMENT_PC)); + if (pc_update) { + vcpu_clear_flag(&hyp_vcpu->vcpu, PC_UPDATE_REQ); + kvm_incr_pc(&hyp_vcpu->vcpu); + } + + /* If we were doing an MMIO read access, update the register*/ + if (pc_update && !kvm_vcpu_dabt_iswrite(&hyp_vcpu->vcpu)) { + /* r0 as transfer register between the guest and the host. */ + u64 rd_val = READ_ONCE(host_vcpu->arch.ctxt.regs.regs[0]); + int rd = kvm_vcpu_dabt_get_rd(&hyp_vcpu->vcpu); + + vcpu_set_reg(&hyp_vcpu->vcpu, rd, rd_val); + } + + hyp_vcpu->vcpu.mmio_needed = false; +} + +static void handle_pvm_exit_wfx(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.ctxt.regs.pstate, + hyp_vcpu->vcpu.arch.ctxt.regs.pstate & PSR_MODE_MASK); + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); +} + +static void handle_pvm_exit_sys64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + u32 esr_el2 = hyp_vcpu->vcpu.arch.fault.esr_el2; + + /* r0 as transfer register between the guest and the host. */ + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + esr_el2 & ~ESR_ELx_SYS64_ISS_RT_MASK); + + /* The mode is required for the host to emulate some sysregs */ + WRITE_ONCE(host_vcpu->arch.ctxt.regs.pstate, + hyp_vcpu->vcpu.arch.ctxt.regs.pstate & PSR_MODE_MASK); + + if (esr_sys64_to_params(esr_el2).is_write) { + int rt = kvm_vcpu_sys_get_rt(&hyp_vcpu->vcpu); + u64 rt_val = vcpu_get_reg(&hyp_vcpu->vcpu, rt); + + WRITE_ONCE(host_vcpu->arch.ctxt.regs.regs[0], rt_val); + } +} + +static void handle_pvm_exit_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + int n, i; + + switch (smccc_get_function(&hyp_vcpu->vcpu)) { + /* + * CPU_ON takes 3 arguments, however, to wake up the target vcpu the + * host only needs to know the target's cpu_id, which is passed as the + * first argument. The processing of the reset state is done at hyp. 
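Per the SMCCC, the function ID travels in x0/w0 and call arguments in x1 upwards, so forwarding n registers on exit exposes the ID plus the first n - 1 arguments; for CPU_ON, n = 2 hands the host only the ID and the target MPIDR, keeping the entry point and context ID (the reset state) at hyp. A standalone model of that forwarding, with the register file reduced to a plain array and the architectural PSCI_0_2_FN64_CPU_ON value used as the ID:

```c
#include <stdint.h>
#include <stdio.h>

struct gp_regs { uint64_t regs[31]; };	/* x0..x30 */

/* Forward the function ID (x0) plus the first n - 1 arguments. */
static void forward_hvc_regs(const struct gp_regs *guest, struct gp_regs *host,
			     int n)
{
	for (int i = 0; i < n; i++)
		host->regs[i] = guest->regs[i];
}

int main(void)
{
	struct gp_regs guest = { .regs = { 0xc4000003 /* CPU_ON */,
					   1          /* target MPIDR */,
					   0x80080000 /* entry point */,
					   0          /* context ID */ } };
	struct gp_regs host = { { 0 } };

	forward_hvc_regs(&guest, &host, 2);
	printf("id=%#llx mpidr=%llu entry=%#llx\n",
	       (unsigned long long)host.regs[0],
	       (unsigned long long)host.regs[1],
	       (unsigned long long)host.regs[2]);	/* entry stays 0 */
	return 0;
}
```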
+ */ + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + n = 2; + break; + + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + n = 1; + break; + + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + fallthrough; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + fallthrough; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + n = 4; + break; + + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + n = 3; + break; + + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + n = 3; + break; + + /* + * The rest are either blocked or handled by HYP, so we should + * really never be here. + */ + default: + BUG(); + } + + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); + + /* Pass the hvc function id (r0) as well as any potential arguments. */ + for (i = 0; i < n; i++) { + WRITE_ONCE(host_vcpu->arch.ctxt.regs.regs[i], + vcpu_get_reg(&hyp_vcpu->vcpu, i)); + } +} + +static void handle_pvm_exit_iabt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.hpfar_el2, + hyp_vcpu->vcpu.arch.fault.hpfar_el2); +} + +static void handle_pvm_exit_dabt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.mmio_needed = __pkvm_check_ioguard_page(hyp_vcpu); + + if (hyp_vcpu->vcpu.mmio_needed) { + /* r0 as transfer register between the guest and the host. */ + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2 & ~ESR_ELx_SRT_MASK); + + if (kvm_vcpu_dabt_iswrite(&hyp_vcpu->vcpu)) { + int rt = kvm_vcpu_dabt_get_rd(&hyp_vcpu->vcpu); + u64 rt_val = vcpu_get_reg(&hyp_vcpu->vcpu, rt); + + WRITE_ONCE(host_vcpu->arch.ctxt.regs.regs[0], rt_val); + } + } else { + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2 & ~ESR_ELx_ISV); + } + + WRITE_ONCE(host_vcpu->arch.ctxt.regs.pstate, + hyp_vcpu->vcpu.arch.ctxt.regs.pstate & PSR_MODE_MASK); + WRITE_ONCE(host_vcpu->arch.fault.far_el2, + hyp_vcpu->vcpu.arch.fault.far_el2 & GENMASK(11, 0)); + WRITE_ONCE(host_vcpu->arch.fault.hpfar_el2, + hyp_vcpu->vcpu.arch.fault.hpfar_el2); + WRITE_ONCE(__vcpu_sys_reg(host_vcpu, SCTLR_EL1), + __vcpu_sys_reg(&hyp_vcpu->vcpu, SCTLR_EL1) & + (SCTLR_ELx_EE | SCTLR_EL1_E0E)); +} + +static void handle_vm_entry_generic(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + vcpu_copy_flag(&hyp_vcpu->vcpu, hyp_vcpu->host_vcpu, PC_UPDATE_REQ); +} + +static void handle_vm_exit_generic(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); +} + +static void handle_vm_exit_abt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); + WRITE_ONCE(host_vcpu->arch.fault.far_el2, + hyp_vcpu->vcpu.arch.fault.far_el2); + WRITE_ONCE(host_vcpu->arch.fault.hpfar_el2, + hyp_vcpu->vcpu.arch.fault.hpfar_el2); + WRITE_ONCE(host_vcpu->arch.fault.disr_el1, + hyp_vcpu->vcpu.arch.fault.disr_el1); +} + +static const hyp_entry_exit_handler_fn entry_hyp_pvm_handlers[] = { + [0 ... 
ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_WFx] = handle_pvm_entry_wfx, + [ESR_ELx_EC_HVC64] = handle_pvm_entry_hvc64, + [ESR_ELx_EC_SYS64] = handle_pvm_entry_sys64, + [ESR_ELx_EC_IABT_LOW] = handle_pvm_entry_iabt, + [ESR_ELx_EC_DABT_LOW] = handle_pvm_entry_dabt, +}; + +static const hyp_entry_exit_handler_fn exit_hyp_pvm_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_WFx] = handle_pvm_exit_wfx, + [ESR_ELx_EC_HVC64] = handle_pvm_exit_hvc64, + [ESR_ELx_EC_SYS64] = handle_pvm_exit_sys64, + [ESR_ELx_EC_IABT_LOW] = handle_pvm_exit_iabt, + [ESR_ELx_EC_DABT_LOW] = handle_pvm_exit_dabt, +}; + +static const hyp_entry_exit_handler_fn entry_hyp_vm_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = handle_vm_entry_generic, +}; + +static const hyp_entry_exit_handler_fn exit_hyp_vm_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = handle_vm_exit_generic, + [ESR_ELx_EC_IABT_LOW] = handle_vm_exit_abt, + [ESR_ELx_EC_DABT_LOW] = handle_vm_exit_abt, +}; + +static void flush_hyp_vgic_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + struct vgic_v3_cpu_if *host_cpu_if, *hyp_cpu_if; + unsigned int used_lrs, max_lrs, i; + + host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + + max_lrs = (read_gicreg(ICH_VTR_EL2) & 0xf) + 1; + used_lrs = READ_ONCE(host_cpu_if->used_lrs); + used_lrs = min(used_lrs, max_lrs); + + hyp_cpu_if->vgic_hcr = READ_ONCE(host_cpu_if->vgic_hcr); + /* Should be a one-off */ + hyp_cpu_if->vgic_sre = (ICC_SRE_EL1_DIB | + ICC_SRE_EL1_DFB | + ICC_SRE_EL1_SRE); + hyp_cpu_if->used_lrs = used_lrs; + + for (i = 0; i < used_lrs; i++) + hyp_cpu_if->vgic_lr[i] = READ_ONCE(host_cpu_if->vgic_lr[i]); +} + +static void sync_hyp_vgic_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + struct vgic_v3_cpu_if *host_cpu_if, *hyp_cpu_if; + unsigned int i; + + host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + + WRITE_ONCE(host_cpu_if->vgic_hcr, hyp_cpu_if->vgic_hcr); + + for (i = 0; i < hyp_cpu_if->used_lrs; i++) + WRITE_ONCE(host_cpu_if->vgic_lr[i], hyp_cpu_if->vgic_lr[i]); +} + +static void flush_hyp_timer_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + return; + + /* + * A hyp vcpu has no offset, and sees vtime == ptime. The + * ptimer is fully emulated by EL1 and cannot be trusted. + */ + write_sysreg(0, cntvoff_el2); + isb(); + write_sysreg_el0(__vcpu_sys_reg(&hyp_vcpu->vcpu, CNTV_CVAL_EL0), + SYS_CNTV_CVAL); + write_sysreg_el0(__vcpu_sys_reg(&hyp_vcpu->vcpu, CNTV_CTL_EL0), + SYS_CNTV_CTL); +} + +static void sync_hyp_timer_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + return; + + /* + * Preserve the vtimer state so that it is always correct, + * even if the host tries to make a mess. 
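The used_lrs value read from the host is untrusted, so flush_hyp_vgic_state() clamps it against the number of list registers the hardware actually implements, taken from the low bits of ICH_VTR_EL2 exactly as in the code above. A standalone model of that clamp:

```c
#include <stdint.h>
#include <stdio.h>

/* ICH_VTR_EL2's low bits encode the number of implemented LRs minus one. */
static unsigned int clamp_used_lrs(uint64_t ich_vtr_el2,
				   unsigned int host_used_lrs)
{
	unsigned int max_lrs = (ich_vtr_el2 & 0xf) + 1;

	return host_used_lrs < max_lrs ? host_used_lrs : max_lrs;
}

int main(void)
{
	/* 4 implemented LRs; a misbehaving host claims 16 are in use. */
	printf("%u\n", clamp_used_lrs(0x3, 16));	/* prints 4 */
	return 0;
}
```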
+ */ + __vcpu_sys_reg(&hyp_vcpu->vcpu, CNTV_CVAL_EL0) = + read_sysreg_el0(SYS_CNTV_CVAL); + __vcpu_sys_reg(&hyp_vcpu->vcpu, CNTV_CTL_EL0) = + read_sysreg_el0(SYS_CNTV_CTL); +} + +static void __copy_vcpu_state(const struct kvm_vcpu *from_vcpu, + struct kvm_vcpu *to_vcpu) +{ + int i; + + to_vcpu->arch.ctxt.regs = from_vcpu->arch.ctxt.regs; + to_vcpu->arch.ctxt.spsr_abt = from_vcpu->arch.ctxt.spsr_abt; + to_vcpu->arch.ctxt.spsr_und = from_vcpu->arch.ctxt.spsr_und; + to_vcpu->arch.ctxt.spsr_irq = from_vcpu->arch.ctxt.spsr_irq; + to_vcpu->arch.ctxt.spsr_fiq = from_vcpu->arch.ctxt.spsr_fiq; + + /* + * Copy the sysregs, but don't mess with the timer state which + * is directly handled by EL1 and is expected to be preserved. + */ + for (i = 1; i < NR_SYS_REGS; i++) { + if (i >= CNTVOFF_EL2 && i <= CNTP_CTL_EL0) + continue; + to_vcpu->arch.ctxt.sys_regs[i] = from_vcpu->arch.ctxt.sys_regs[i]; + } +} + +static void __sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + __copy_vcpu_state(&hyp_vcpu->vcpu, hyp_vcpu->host_vcpu); +} + +static void __flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + __copy_vcpu_state(hyp_vcpu->host_vcpu, &hyp_vcpu->vcpu); +} + +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + u64 mdcr_el2 = READ_ONCE(host_vcpu->arch.mdcr_el2); + + /* + * Propagate the monitor debug configuration of the vcpu from host. + * Preserve HPMN, which is set-up by some knowledgeable bootcode. + * Ensure that MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK are clear, + * as guests should not be able to access profiling and trace buffers. + * Ensure that RES0 bits are clear. + */ + mdcr_el2 &= ~(MDCR_EL2_RES0 | + MDCR_EL2_HPMN_MASK | + (MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) | + (MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)); + vcpu->arch.mdcr_el2 = read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK; + vcpu->arch.mdcr_el2 |= mdcr_el2; + + vcpu->arch.pmu = host_vcpu->arch.pmu; + vcpu->guest_debug = READ_ONCE(host_vcpu->guest_debug); + + if (!kvm_vcpu_needs_debug_regs(vcpu)) + return; + + __vcpu_save_guest_debug_regs(vcpu); + + /* Switch debug_ptr to the external_debug_state if done by the host. */ + if (kern_hyp_va(READ_ONCE(host_vcpu->arch.debug_ptr)) == + &host_vcpu->arch.external_debug_state) + vcpu->arch.debug_ptr = &host_vcpu->arch.external_debug_state; + + /* Propagate any special handling for single step from host. */ + vcpu_write_sys_reg(vcpu, vcpu_read_sys_reg(host_vcpu, MDSCR_EL1), + MDSCR_EL1); + *vcpu_cpsr(vcpu) = *vcpu_cpsr(host_vcpu); +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (!kvm_vcpu_needs_debug_regs(vcpu)) + return; + + __vcpu_restore_guest_debug_regs(vcpu); + vcpu->arch.debug_ptr = &host_vcpu->arch.vcpu_debug_state; +} + +static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + hyp_entry_exit_handler_fn ec_handler; + u8 esr_ec; + + if (READ_ONCE(hyp_vcpu->power_state) == PSCI_0_2_AFFINITY_LEVEL_ON_PENDING) + pkvm_reset_vcpu(hyp_vcpu); + + /* + * If we deal with a non-protected guest and the state is potentially + * dirty (from a host perspective), copy the state back into the hyp + * vcpu. 
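A standalone model of the copy-with-a-hole pattern used by __copy_vcpu_state(): every sysreg is copied except a contiguous window of timer registers, which EL1 manages directly and must survive the copy. The array size and window indices below are illustrative, not the kernel's enum ordering:

```c
#include <stdint.h>
#include <stdio.h>

#define NR_SYS_REGS	16	/* illustrative */
#define CNTVOFF_EL2	8	/* illustrative start of the timer window */
#define CNTP_CTL_EL0	11	/* illustrative end of the timer window */

/* Copy everything except the timer registers, which EL1 owns directly. */
static void copy_sysregs(const uint64_t *from, uint64_t *to)
{
	for (int i = 1; i < NR_SYS_REGS; i++) {	/* index 0 is a placeholder */
		if (i >= CNTVOFF_EL2 && i <= CNTP_CTL_EL0)
			continue;
		to[i] = from[i];
	}
}

int main(void)
{
	uint64_t host[NR_SYS_REGS] = { 0 }, guest[NR_SYS_REGS];

	for (int i = 0; i < NR_SYS_REGS; i++)
		guest[i] = 0x1000 + i;		/* pretend the guest wrote these */
	copy_sysregs(host, guest);		/* flush host state into the copy */
	printf("timer preserved: %d\n",
	       guest[CNTVOFF_EL2] == (uint64_t)0x1000 + CNTVOFF_EL2);	/* 1 */
	return 0;
}
```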
+ */ + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + if (vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) + __flush_hyp_vcpu(hyp_vcpu); + + hyp_vcpu->vcpu.arch.iflags = READ_ONCE(host_vcpu->arch.iflags); + flush_debug_state(hyp_vcpu); + + hyp_vcpu->vcpu.arch.hcr_el2 = HCR_GUEST_FLAGS & ~(HCR_RW | HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2); + } + + hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; + + flush_hyp_vgic_state(hyp_vcpu); + flush_hyp_timer_state(hyp_vcpu); + + switch (ARM_EXCEPTION_CODE(hyp_vcpu->exit_code)) { + case ARM_EXCEPTION_IRQ: + case ARM_EXCEPTION_EL1_SERROR: + case ARM_EXCEPTION_IL: + break; + case ARM_EXCEPTION_TRAP: + esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(&hyp_vcpu->vcpu)); + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + ec_handler = entry_hyp_pvm_handlers[esr_ec]; + else + ec_handler = entry_hyp_vm_handlers[esr_ec]; + + if (ec_handler) + ec_handler(hyp_vcpu); + break; + default: + BUG(); + } + + hyp_vcpu->exit_code = 0; +} + +static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u32 exit_reason) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + hyp_entry_exit_handler_fn ec_handler; + u8 esr_ec; + + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + sync_debug_state(hyp_vcpu); + + /* + * Don't sync the vcpu GPR/sysreg state after a run. Instead, + * leave it in the hyp vCPU until someone actually requires it. + */ + sync_hyp_vgic_state(hyp_vcpu); + sync_hyp_timer_state(hyp_vcpu); + + switch (ARM_EXCEPTION_CODE(exit_reason)) { + case ARM_EXCEPTION_IRQ: + break; + case ARM_EXCEPTION_TRAP: + esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(&hyp_vcpu->vcpu)); + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + ec_handler = exit_hyp_pvm_handlers[esr_ec]; + else + ec_handler = exit_hyp_vm_handlers[esr_ec]; + + if (ec_handler) + ec_handler(hyp_vcpu); + break; + case ARM_EXCEPTION_EL1_SERROR: + case ARM_EXCEPTION_IL: + break; + default: + BUG(); + } + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + vcpu_clear_flag(host_vcpu, PC_UPDATE_REQ); + else + host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; + + hyp_vcpu->exit_code = exit_reason; +} + +static void __hyp_sve_save_guest(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr); + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL1); +} + +static void fpsimd_host_restore(void) +{ + sysreg_clear_set(cptr_el2, CPTR_EL2_TZ | CPTR_EL2_TFP, 0); + isb(); + + if (unlikely(is_protected_kvm_enabled())) { + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + struct user_fpsimd_state *host_fpsimd_state; + + host_fpsimd_state = this_cpu_ptr(&loaded_host_fpsimd_state); + + if (vcpu_has_sve(&hyp_vcpu->vcpu)) + __hyp_sve_save_guest(hyp_vcpu); + else + __fpsimd_save_state(&hyp_vcpu->vcpu.arch.ctxt.fp_regs); + + __fpsimd_restore_state(host_fpsimd_state); + + hyp_vcpu->vcpu.arch.fp_state = FP_STATE_HOST_OWNED; + } + + if (system_supports_sve()) + sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); +} + +static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); + DECLARE_REG(u64, hcr_el2, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + int __percpu *last_vcpu_ran; + int *last_ran; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); + if (!hyp_vcpu) + 
return; + + /* + * Guarantee that both TLBs and I-cache are private to each vcpu. If a + * vcpu from the same VM has previously run on the same physical CPU, + * nuke the relevant contexts. + */ + last_vcpu_ran = hyp_vcpu->vcpu.arch.hw_mmu->last_vcpu_ran; + last_ran = (__force int *) &last_vcpu_ran[hyp_smp_processor_id()]; + if (*last_ran != hyp_vcpu->vcpu.vcpu_id) { + __kvm_flush_cpu_context(hyp_vcpu->vcpu.arch.hw_mmu); + *last_ran = hyp_vcpu->vcpu.vcpu_id; + } + + hyp_vcpu->vcpu.arch.host_fpsimd_state = this_cpu_ptr(&loaded_host_fpsimd_state); + hyp_vcpu->vcpu.arch.fp_state = FP_STATE_HOST_OWNED; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + /* Propagate WFx trapping flags, trap ptrauth */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI | + HCR_API | HCR_APK); + hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } +} + +static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (hyp_vcpu->vcpu.arch.fp_state == FP_STATE_GUEST_OWNED) + fpsimd_host_restore(); + + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu) && + !vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) { + __sync_hyp_vcpu(hyp_vcpu); + } + + pkvm_put_hyp_vcpu(hyp_vcpu); + } +} + +static void handle___pkvm_vcpu_sync_state(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + return; + + if (hyp_vcpu->vcpu.arch.fp_state == FP_STATE_GUEST_OWNED) + fpsimd_host_restore(); + + __sync_hyp_vcpu(hyp_vcpu); +} + +static struct kvm_vcpu *__get_host_hyp_vcpus(struct kvm_vcpu *arg, + struct pkvm_hyp_vcpu **hyp_vcpup) +{ + struct kvm_vcpu *host_vcpu = kern_hyp_va(arg); + struct pkvm_hyp_vcpu *hyp_vcpu = NULL; + + if (unlikely(is_protected_kvm_enabled())) { + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + + if (!hyp_vcpu || hyp_vcpu->host_vcpu != host_vcpu) { + hyp_vcpu = NULL; + host_vcpu = NULL; + } + } + + *hyp_vcpup = hyp_vcpu; + return host_vcpu; +} + +#define get_host_hyp_vcpus(ctxt, regnr, hyp_vcpup) \ + ({ \ + DECLARE_REG(struct kvm_vcpu *, __vcpu, ctxt, regnr); \ + __get_host_hyp_vcpus(__vcpu, hyp_vcpup); \ + }) + +#define get_host_hyp_vcpus_from_vgic_v3_cpu_if(ctxt, regnr, hyp_vcpup) \ + ({ \ + DECLARE_REG(struct vgic_v3_cpu_if *, cif, ctxt, regnr); \ + struct kvm_vcpu *__vcpu = container_of(cif, \ + struct kvm_vcpu, \ + arch.vgic_cpu.vgic_v3); \ + \ + __get_host_hyp_vcpus(__vcpu, hyp_vcpup); \ + }) + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm_vcpu *host_vcpu; + int ret; - cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); + host_vcpu = get_host_hyp_vcpus(host_ctxt, 1, &hyp_vcpu); + if (!host_vcpu) { + ret = -EINVAL; + goto out; + } + + if (unlikely(hyp_vcpu)) { + flush_hyp_vcpu(hyp_vcpu); + + ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); + + sync_hyp_vcpu(hyp_vcpu, ret); + + if (hyp_vcpu->vcpu.arch.fp_state == FP_STATE_GUEST_OWNED) { + /* + * The guest has used the FP, trap all accesses + * from the host (both FP and SVE). + */ + u64 reg = CPTR_EL2_TFP; + + if (system_supports_sve()) + reg |= CPTR_EL2_TZ; + + sysreg_clear_set(cptr_el2, 0, reg); + } + } else { + /* The host is fully trusted, run its vCPU directly. 
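A standalone model of the lazy FP/SVE switching around __kvm_vcpu_run(): once the guest has touched the FP registers, host FP accesses are made to trap (modeling the CPTR_EL2.TFP/TZ bits set above), and the trap handler saves the guest state, restores the host's, and hands ownership back:

```c
#include <stdbool.h>
#include <stdio.h>

enum fp_state { FP_STATE_HOST_OWNED, FP_STATE_GUEST_OWNED };

struct cpu_fp {
	enum fp_state owner;
	bool traps_fp;	/* models CPTR_EL2.TFP (and .TZ for SVE) */
};

static void vcpu_run_exit(struct cpu_fp *c, bool guest_used_fp)
{
	if (guest_used_fp) {
		c->owner = FP_STATE_GUEST_OWNED;
		c->traps_fp = true;	/* next host FP insn traps to hyp */
	}
}

static void host_fp_trap(struct cpu_fp *c)
{
	/* ... save the guest's FP/SVE regs, restore the host's ... */
	c->owner = FP_STATE_HOST_OWNED;
	c->traps_fp = false;
}

int main(void)
{
	struct cpu_fp c = { FP_STATE_HOST_OWNED, false };

	vcpu_run_exit(&c, true);	/* guest touched FP during its run */
	host_fp_trap(&c);		/* host's first FP insn after the exit */
	printf("owner=%d traps=%d\n", c.owner, c.traps_fp);	/* 0 0 */
	return 0;
}
```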
*/ + ret = __kvm_vcpu_run(host_vcpu); + } +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_map_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu) + goto out; + + /* Top-up our per-vcpu memcache from the host's */ + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + ret = __pkvm_host_donate_guest(pfn, gfn, hyp_vcpu); + else + ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu); +out: + cpu_reg(host_ctxt, 1) = ret; } static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm_vcpu *host_vcpu; - __kvm_adjust_pc(kern_hyp_va(vcpu)); + host_vcpu = get_host_hyp_vcpus(host_ctxt, 1, &hyp_vcpu); + if (!host_vcpu) + return; + + if (hyp_vcpu) { + /* This only applies to non-protected VMs */ + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + return; + + __kvm_adjust_pc(&hyp_vcpu->vcpu); + } else { + __kvm_adjust_pc(host_vcpu); + } } static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt) @@ -82,16 +952,6 @@ static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } -static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr(); -} - -static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt) -{ - __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1)); -} - static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) { __vgic_v3_init_lrs(); @@ -102,18 +962,65 @@ static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2(); } -static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm_vcpu *host_vcpu; - __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); + host_vcpu = get_host_hyp_vcpus_from_vgic_v3_cpu_if(host_ctxt, 1, + &hyp_vcpu); + if (!host_vcpu) + return; + + if (unlikely(hyp_vcpu)) { + struct vgic_v3_cpu_if *hyp_cpu_if, *host_cpu_if; + int i; + + hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + __vgic_v3_save_vmcr_aprs(hyp_cpu_if); + + host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr; + for (i = 0; i < ARRAY_SIZE(host_cpu_if->vgic_ap0r); i++) { + host_cpu_if->vgic_ap0r[i] = hyp_cpu_if->vgic_ap0r[i]; + host_cpu_if->vgic_ap1r[i] = hyp_cpu_if->vgic_ap1r[i]; + } + } else { + __vgic_v3_save_vmcr_aprs(&host_vcpu->arch.vgic_cpu.vgic_v3); + } } -static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm_vcpu *host_vcpu; - __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); + host_vcpu = get_host_hyp_vcpus_from_vgic_v3_cpu_if(host_ctxt, 1, + &hyp_vcpu); + if (!host_vcpu) + return; + + if (unlikely(hyp_vcpu)) { + struct vgic_v3_cpu_if *hyp_cpu_if, *host_cpu_if; + int i; + + hyp_cpu_if = 
&hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + + hyp_cpu_if->vgic_vmcr = host_cpu_if->vgic_vmcr; + /* Should be a one-off */ + hyp_cpu_if->vgic_sre = (ICC_SRE_EL1_DIB | + ICC_SRE_EL1_DFB | + ICC_SRE_EL1_SRE); + for (i = 0; i < ARRAY_SIZE(host_cpu_if->vgic_ap0r); i++) { + hyp_cpu_if->vgic_ap0r[i] = host_cpu_if->vgic_ap0r[i]; + hyp_cpu_if->vgic_ap1r[i] = host_cpu_if->vgic_ap1r[i]; + } + + __vgic_v3_restore_vmcr_aprs(hyp_cpu_if); + } else { + __vgic_v3_restore_vmcr_aprs(&host_vcpu->arch.vgic_cpu.vgic_v3); + } } static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) @@ -147,54 +1054,300 @@ static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn); } +static void handle___pkvm_host_unshare_hyp(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_host_unshare_hyp(pfn); +} + +static void handle___pkvm_reclaim_dying_guest_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, pfn, host_ctxt, 2); + DECLARE_REG(u64, ipa, host_ctxt, 3); + + cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, pfn, ipa); +} + static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); DECLARE_REG(size_t, size, host_ctxt, 2); DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); - cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot); + /* + * __pkvm_create_private_mapping() populates a pointer with the + * hypervisor start address of the allocation. + * + * However, handle___pkvm_create_private_mapping() hypercall crosses the + * EL1/EL2 boundary so the pointer would not be valid in this context. + * + * Instead pass the allocation address as the return value (or return + * ERR_PTR() on failure). 
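A standalone model of the ERR_PTR() convention relied on here: negative errnos map into the top 4 KiB of the address space, which can never be a valid allocation address, so a single register can carry either a pointer or an error code across the hypercall boundary:

```c
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* Errors occupy the last page of the address space. */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	void *ret = ERR_PTR(-12 /* -ENOMEM */);

	if (IS_ERR(ret))
		printf("hypercall failed: %ld\n", PTR_ERR(ret));
	return 0;
}
```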
+ */ + unsigned long haddr; + int err = __pkvm_create_private_mapping(phys, size, prot, &haddr); + + if (err) + haddr = (unsigned long)ERR_PTR(err); + + cpu_reg(host_ctxt, 1) = haddr; } static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) { cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } + +static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); + DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2); + DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3); + DECLARE_REG(unsigned long, last_ran_hva, host_ctxt, 4); + + host_kvm = kern_hyp_va(host_kvm); + cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva, + last_ran_hva); +} + +static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2); + DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3); + + host_vcpu = kern_hyp_va(host_vcpu); + cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva); +} + +static void handle___pkvm_start_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle); +} + +static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle); +} +static void handle___pkvm_iommu_driver_init(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct pkvm_iommu_driver*, drv, host_ctxt, 1); + DECLARE_REG(void *, data, host_ctxt, 2); + DECLARE_REG(size_t, size, host_ctxt, 3); + + data = kern_hyp_va(data); + + cpu_reg(host_ctxt, 1) = __pkvm_iommu_driver_init(drv, data, size); +} + +static void handle___pkvm_iommu_register(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, dev_id, host_ctxt, 1); + DECLARE_REG(unsigned long, drv_id, host_ctxt, 2); + DECLARE_REG(phys_addr_t, dev_pa, host_ctxt, 3); + DECLARE_REG(size_t, dev_size, host_ctxt, 4); + DECLARE_REG(unsigned long, parent_id, host_ctxt, 5); + DECLARE_REG(void *, mem, host_ctxt, 6); + DECLARE_REG(size_t, mem_size, host_ctxt, 7); + + cpu_reg(host_ctxt, 1) = __pkvm_iommu_register(dev_id, drv_id, dev_pa, + dev_size, parent_id, + mem, mem_size); +} + +static void handle___pkvm_iommu_pm_notify(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, dev_id, host_ctxt, 1); + DECLARE_REG(enum pkvm_iommu_pm_event, event, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __pkvm_iommu_pm_notify(dev_id, event); +} + +static void handle___pkvm_iommu_finalize(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_iommu_finalize(); +} + +static void handle___pkvm_alloc_module_va(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, nr_pages, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = (u64)__pkvm_alloc_module_va(nr_pages); +} + +static void handle___pkvm_map_module_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(void *, va, host_ctxt, 2); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); + + cpu_reg(host_ctxt, 1) = (u64)__pkvm_map_module_page(pfn, va, prot, false); +} + +static void handle___pkvm_unmap_module_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(void *, va, host_ctxt, 2); + + __pkvm_unmap_module_page(pfn, va); +} + +static void handle___pkvm_init_module(struct kvm_cpu_context 
*host_ctxt) +{ + DECLARE_REG(void *, ptr, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_init_module(ptr); +} + +static void handle___pkvm_register_hcall(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, hfn_hyp_va, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_register_hcall(hfn_hyp_va); +} + +static void +handle___pkvm_close_module_registration(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_close_module_registration(); +} + +static void handle___pkvm_start_tracing(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, pack_hva, host_ctxt, 1); + DECLARE_REG(size_t, pack_size, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __pkvm_start_tracing(pack_hva, pack_size); +} + +static void handle___pkvm_stop_tracing(struct kvm_cpu_context *host_ctxt) +{ + __pkvm_stop_tracing(); + + cpu_reg(host_ctxt, 1) = 0; +} + +static void handle___pkvm_rb_swap_reader_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(int, cpu, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_rb_swap_reader_page(cpu); +} + +static void handle___pkvm_rb_update_footers(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(int, cpu, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_rb_update_footers(cpu); +} + +static void handle___pkvm_enable_event(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned short, id, host_ctxt, 1); + DECLARE_REG(bool, enable, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __pkvm_enable_event(id, enable); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x static const hcall_t host_hcall[] = { - HANDLE_FUNC(__kvm_vcpu_run), - HANDLE_FUNC(__kvm_adjust_pc), + /* ___kvm_hyp_init */ + HANDLE_FUNC(__kvm_get_mdcr_el2), + HANDLE_FUNC(__pkvm_init), + HANDLE_FUNC(__pkvm_create_private_mapping), + HANDLE_FUNC(__pkvm_cpu_set_vector), + HANDLE_FUNC(__kvm_enable_ssbs), + HANDLE_FUNC(__vgic_v3_init_lrs), + HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_flush_cpu_context), - HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__kvm_enable_ssbs), - HANDLE_FUNC(__vgic_v3_get_gic_config), - HANDLE_FUNC(__vgic_v3_read_vmcr), - HANDLE_FUNC(__vgic_v3_write_vmcr), - HANDLE_FUNC(__vgic_v3_init_lrs), - HANDLE_FUNC(__kvm_get_mdcr_el2), - HANDLE_FUNC(__vgic_v3_save_aprs), - HANDLE_FUNC(__vgic_v3_restore_aprs), - HANDLE_FUNC(__pkvm_init), - HANDLE_FUNC(__pkvm_cpu_set_vector), - HANDLE_FUNC(__pkvm_host_share_hyp), - HANDLE_FUNC(__pkvm_create_private_mapping), + + HANDLE_FUNC(__pkvm_alloc_module_va), + HANDLE_FUNC(__pkvm_map_module_page), + HANDLE_FUNC(__pkvm_unmap_module_page), + HANDLE_FUNC(__pkvm_init_module), + HANDLE_FUNC(__pkvm_register_hcall), + HANDLE_FUNC(__pkvm_close_module_registration), HANDLE_FUNC(__pkvm_prot_finalize), + + HANDLE_FUNC(__pkvm_host_share_hyp), + HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_map_guest), + HANDLE_FUNC(__kvm_adjust_pc), + HANDLE_FUNC(__kvm_vcpu_run), + HANDLE_FUNC(__kvm_timer_set_cntvoff), + HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), + HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), + HANDLE_FUNC(__pkvm_init_vm), + HANDLE_FUNC(__pkvm_init_vcpu), + HANDLE_FUNC(__pkvm_start_teardown_vm), + HANDLE_FUNC(__pkvm_finalize_teardown_vm), + HANDLE_FUNC(__pkvm_reclaim_dying_guest_page), + HANDLE_FUNC(__pkvm_vcpu_load), + HANDLE_FUNC(__pkvm_vcpu_put), + HANDLE_FUNC(__pkvm_vcpu_sync_state), + HANDLE_FUNC(__pkvm_iommu_driver_init), + 
HANDLE_FUNC(__pkvm_iommu_register), + HANDLE_FUNC(__pkvm_iommu_pm_notify), + HANDLE_FUNC(__pkvm_iommu_finalize), + HANDLE_FUNC(__pkvm_start_tracing), + HANDLE_FUNC(__pkvm_stop_tracing), + HANDLE_FUNC(__pkvm_rb_swap_reader_page), + HANDLE_FUNC(__pkvm_rb_update_footers), + HANDLE_FUNC(__pkvm_enable_event), }; +unsigned long pkvm_priv_hcall_limit __ro_after_init = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + +int reset_pkvm_priv_hcall_limit(void) +{ + unsigned long *addr; + + if (pkvm_priv_hcall_limit == __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize) + return -EACCES; + + addr = hyp_fixmap_map(__hyp_pa(&pkvm_priv_hcall_limit)); + *addr = KVM_HOST_SMCCC_FUNC(__pkvm_prot_finalize); + hyp_fixmap_unmap(); + + return 0; +} + static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(unsigned long, id, host_ctxt, 0); + unsigned long hcall_min = 0; hcall_t hfn; + if (handle_host_dynamic_hcall(host_ctxt) == HCALL_HANDLED) + return; + + /* + * If pKVM has been initialised then reject any calls to the + * early "privileged" hypercalls. Note that we cannot reject + * calls to __pkvm_prot_finalize for two reasons: (1) The static + * key used to determine initialisation must be toggled prior to + * finalisation and (2) finalisation is performed on a per-CPU + * basis. This is all fine, however, since __pkvm_prot_finalize + * returns -EPERM after the first call for a given CPU. + */ + if (static_branch_unlikely(&kvm_protected_mode_initialized)) + hcall_min = pkvm_priv_hcall_limit; + id -= KVM_HOST_SMCCC_ID(0); - if (unlikely(id >= ARRAY_SIZE(host_hcall))) + if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) goto inval; hfn = host_hcall[id]; @@ -204,23 +1357,33 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS; hfn(host_ctxt); + trace_host_hcall(id, 0); + return; inval: + trace_host_hcall(id, 1); cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; } -static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) -{ - __kvm_hyp_host_forward_smc(host_ctxt); -} - static void handle_host_smc(struct kvm_cpu_context *host_ctxt) { + DECLARE_REG(u64, func_id, host_ctxt, 0); + struct pkvm_hyp_vcpu *hyp_vcpu; bool handled; + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu && hyp_vcpu->vcpu.arch.fp_state == FP_STATE_GUEST_OWNED) + fpsimd_host_restore(); + handled = kvm_host_psci_handler(host_ctxt); if (!handled) - default_host_smc_handler(host_ctxt); + handled = kvm_host_ffa_handler(host_ctxt); + if (!handled && READ_ONCE(default_host_smc_handler)) + handled = default_host_smc_handler(host_ctxt); + if (!handled) + __kvm_hyp_host_forward_smc(host_ctxt); + + trace_host_smc(func_id, !handled); /* SMC was trapped, move ELR past the current PC. 
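A standalone model of the ID gating in handle_host_hcall(): once pKVM is finalized, hypercall IDs below pkvm_priv_hcall_limit are rejected, so the early "privileged" setup hypercalls cannot be replayed by a compromised host. The table size and limit value below are illustrative:

```c
#include <stdbool.h>
#include <stdio.h>

#define NR_HCALLS	32	/* illustrative table size */

static unsigned long hcall_min;	/* 0 until pKVM is finalized */

static bool hcall_allowed(unsigned long id)
{
	return id >= hcall_min && id < NR_HCALLS;
}

int main(void)
{
	printf("%d\n", hcall_allowed(3));	/* 1: allowed during boot */
	hcall_min = 10;				/* models finalization */
	printf("%d\n", hcall_allowed(3));	/* 0: privileged ID rejected */
	return 0;
}
```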
*/ kvm_skip_host_instr(); @@ -230,6 +1393,8 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) { u64 esr = read_sysreg_el2(SYS_ESR); + trace_hyp_enter(); + switch (ESR_ELx_EC(esr)) { case ESR_ELx_EC_HVC64: handle_host_hcall(host_ctxt); @@ -237,16 +1402,17 @@ case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; + case ESR_ELx_EC_FP_ASIMD: case ESR_ELx_EC_SVE: - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); - isb(); - sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + fpsimd_host_restore(); break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: handle_host_mem_abort(host_ctxt); break; default: - BUG(); + BUG_ON(!READ_ONCE(default_trap_handler) || !default_trap_handler(host_ctxt)); } + + trace_hyp_exit(); } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index 9f54833af400..9fcb92abd0b5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -8,6 +8,8 @@ #include #include +DEFINE_PER_CPU(int, hyp_cpu_number); + /* * nVHE copy of data structures tracking available CPU cores. * Only entries for CPUs that were online at KVM init are populated. @@ -23,6 +25,8 @@ u64 cpu_logical_map(unsigned int cpu) return hyp_cpu_logical_map[cpu]; } +unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS]; + unsigned long __hyp_per_cpu_offset(unsigned int cpu) { unsigned long *cpu_base_array; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index f4562f417d3f..8254ef9fc81c 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -16,6 +16,10 @@ SECTIONS { HYP_SECTION(.text) HYP_SECTION(.data..ro_after_init) HYP_SECTION(.rodata) +#ifdef CONFIG_TRACING + . = ALIGN(PAGE_SIZE); + HYP_SECTION(_hyp_event_ids) +#endif /* * .hyp..data..percpu needs to be page aligned to maintain the same @@ -25,5 +29,7 @@ SECTIONS { BEGIN_HYP_SECTION(.data..percpu) PERCPU_INPUT(L1_CACHE_BYTES) END_HYP_SECTION + HYP_SECTION(.bss) + HYP_SECTION(.data) } diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c new file mode 100644 index 000000000000..a6073f443c8f --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -0,0 +1,570 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Google LLC + * Author: David Brazdil + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#define DRV_ID(drv_addr) ((unsigned long)drv_addr) + +enum { + IOMMU_DRIVER_NOT_READY = 0, + IOMMU_DRIVER_INITIALIZING, + IOMMU_DRIVER_READY, +}; + +/* List of registered IOMMU drivers, protected with iommu_drv_lock. */ +static LIST_HEAD(iommu_drivers); +/* IOMMU device list. Must only be accessed with host_mmu.lock held. */ +static LIST_HEAD(iommu_list); + +static bool iommu_finalized; +static DEFINE_HYP_SPINLOCK(iommu_registration_lock); +static DEFINE_HYP_SPINLOCK(iommu_drv_lock); + +static void *iommu_mem_pool; +static size_t iommu_mem_remaining; + +static void assert_host_component_locked(void) +{ + hyp_assert_lock_held(&host_mmu.lock); +} + +static void host_lock_component(void) +{ + hyp_spin_lock(&host_mmu.lock); +} + +static void host_unlock_component(void) +{ + hyp_spin_unlock(&host_mmu.lock); +} + +/* + * Find IOMMU driver by its ID. The input ID is treated as untrusted + and is properly validated. 
+ */ +static inline struct pkvm_iommu_driver *get_driver(unsigned long id) +{ + struct pkvm_iommu_driver *drv, *ret = NULL; + + hyp_spin_lock(&iommu_drv_lock); + list_for_each_entry(drv, &iommu_drivers, list) { + if (DRV_ID(drv) == id) { + ret = drv; + break; + } + } + hyp_spin_unlock(&iommu_drv_lock); + return ret; +} + +static inline bool driver_acquire_init(struct pkvm_iommu_driver *drv) +{ + return atomic_cmpxchg_acquire(&drv->state, IOMMU_DRIVER_NOT_READY, + IOMMU_DRIVER_INITIALIZING) + == IOMMU_DRIVER_NOT_READY; +} + +static inline void driver_release_init(struct pkvm_iommu_driver *drv, + bool success) +{ + atomic_set_release(&drv->state, success ? IOMMU_DRIVER_READY + : IOMMU_DRIVER_NOT_READY); +} + +static inline bool is_driver_ready(struct pkvm_iommu_driver *drv) +{ + return atomic_read(&drv->state) == IOMMU_DRIVER_READY; +} + +static size_t __iommu_alloc_size(struct pkvm_iommu_driver *drv) +{ + return ALIGN(sizeof(struct pkvm_iommu) + drv->ops->data_size, + sizeof(unsigned long)); +} + +static bool validate_driver_id_unique(struct pkvm_iommu_driver *drv) +{ + struct pkvm_iommu_driver *cur; + + hyp_assert_lock_held(&iommu_drv_lock); + list_for_each_entry(cur, &iommu_drivers, list) { + if (DRV_ID(drv) == DRV_ID(cur)) + return false; + } + return true; +} + +static int __pkvm_register_iommu_driver(struct pkvm_iommu_driver *drv) +{ + int ret = 0; + + if (!drv) + return -EINVAL; + + hyp_assert_lock_held(&iommu_registration_lock); + hyp_spin_lock(&iommu_drv_lock); + if (validate_driver_id_unique(drv)) + list_add_tail(&drv->list, &iommu_drivers); + else + ret = -EEXIST; + hyp_spin_unlock(&iommu_drv_lock); + return ret; +} + +/* Global memory pool for allocating IOMMU list entry structs. */ +static inline struct pkvm_iommu *alloc_iommu(struct pkvm_iommu_driver *drv, + void *mem, size_t mem_size) +{ + size_t size = __iommu_alloc_size(drv); + void *ptr; + + assert_host_component_locked(); + + /* + * If new memory is being provided, replace the existing pool with it. + * Any remaining memory in the pool is discarded. + */ + if (mem && mem_size) { + iommu_mem_pool = mem; + iommu_mem_remaining = mem_size; + } + + if (size > iommu_mem_remaining) + return NULL; + + ptr = iommu_mem_pool; + iommu_mem_pool += size; + iommu_mem_remaining -= size; + return ptr; +} + +static inline void free_iommu(struct pkvm_iommu_driver *drv, struct pkvm_iommu *ptr) +{ + size_t size = __iommu_alloc_size(drv); + + assert_host_component_locked(); + + if (!ptr) + return; + + /* Only allow freeing the last allocated buffer. */ + if ((void *)ptr + size != iommu_mem_pool) + return; + + iommu_mem_pool -= size; + iommu_mem_remaining += size; +} + +static bool is_overlap(phys_addr_t r1_start, size_t r1_size, + phys_addr_t r2_start, size_t r2_size) +{ + phys_addr_t r1_end = r1_start + r1_size; + phys_addr_t r2_end = r2_start + r2_size; + + return (r1_start < r2_end) && (r2_start < r1_end); +} + +static bool is_mmio_range(phys_addr_t base, size_t size) +{ + struct memblock_region *reg; + phys_addr_t limit = BIT(host_mmu.pgt.ia_bits); + size_t i; + + /* Check against limits of host IPA space. 
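A standalone version of the half-open interval test defined above, with a couple of worked cases; two ranges intersect iff each one starts before the other ends:

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t phys_addr_t;

static bool is_overlap(phys_addr_t r1_start, size_t r1_size,
		       phys_addr_t r2_start, size_t r2_size)
{
	phys_addr_t r1_end = r1_start + r1_size;
	phys_addr_t r2_end = r2_start + r2_size;

	return (r1_start < r2_end) && (r2_start < r1_end);
}

int main(void)
{
	assert(is_overlap(0x1000, 0x1000, 0x1800, 0x1000));	/* partial */
	assert(!is_overlap(0x1000, 0x1000, 0x2000, 0x1000));	/* adjacent */
	return 0;
}
```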
*/ + if ((base >= limit) || !size || (size > limit - base)) + return false; + + for (i = 0; i < hyp_memblock_nr; i++) { + reg = &hyp_memory[i]; + if (is_overlap(base, size, reg->base, reg->size)) + return false; + } + return true; +} + +static int __snapshot_host_stage2(u64 start, u64 pa_max, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flags, + void * const arg) +{ + struct pkvm_iommu_driver * const drv = arg; + u64 end = start + kvm_granule_size(level); + kvm_pte_t pte = *ptep; + + /* + * Valid stage-2 entries are created lazily, invalid ones eagerly. + * Note: In the future we may need to check if [start,end) is MMIO. + * Note: Drivers initialize their PTs to all memory owned by the host, + * so we only call the driver on regions where that is not the case. + */ + if (pte && !kvm_pte_valid(pte)) + drv->ops->host_stage2_idmap_prepare(start, end, /*prot*/ 0); + return 0; +} + +static int snapshot_host_stage2(struct pkvm_iommu_driver * const drv) +{ + struct kvm_pgtable_walker walker = { + .cb = __snapshot_host_stage2, + .arg = drv, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + struct kvm_pgtable *pgt = &host_mmu.pgt; + + if (!drv->ops->host_stage2_idmap_prepare) + return 0; + + return kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker); +} + +static bool validate_against_existing_iommus(struct pkvm_iommu *dev) +{ + struct pkvm_iommu *other; + + assert_host_component_locked(); + + list_for_each_entry(other, &iommu_list, list) { + /* Device ID must be unique. */ + if (dev->id == other->id) + return false; + + /* MMIO regions must not overlap. */ + if (is_overlap(dev->pa, dev->size, other->pa, other->size)) + return false; + } + return true; +} + +static struct pkvm_iommu *find_iommu_by_id(unsigned long id) +{ + struct pkvm_iommu *dev; + + assert_host_component_locked(); + + list_for_each_entry(dev, &iommu_list, list) { + if (dev->id == id) + return dev; + } + return NULL; +} + +/* + * Initialize EL2 IOMMU driver. + * + * This is a common hypercall for driver initialization. Driver-specific + * arguments are passed in a shared memory buffer. The driver is expected to + * initialize its page-table bookkeeping. + */ +int __pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size) +{ + const struct pkvm_iommu_ops *ops; + int ret = 0; + + /* New driver initialization not allowed after __pkvm_iommu_finalize(). */ + hyp_spin_lock(&iommu_registration_lock); + if (iommu_finalized) { + ret = -EPERM; + goto out_unlock; + } + + ret = __pkvm_register_iommu_driver(drv); + if (ret) + goto out_unlock; + + if (!drv->ops) { + ret = -EINVAL; + goto out_unlock; + } + + if (!driver_acquire_init(drv)) { + ret = -EBUSY; + goto out_unlock; + } + + ops = drv->ops; + + /* This can change stage-2 mappings. */ + if (ops->init) { + ret = hyp_pin_shared_mem(data, data + size); + if (!ret) { + ret = ops->init(data, size); + hyp_unpin_shared_mem(data, data + size); + } + if (ret) + goto out_release; + } + + /* + * Walk host stage-2 and pass current mappings to the driver. Start + * accepting host stage-2 updates as soon as the host lock is released. 
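A standalone model of the NOT_READY -> INITIALIZING -> READY state machine behind driver_acquire_init()/driver_release_init(), written with C11 atomics in place of the kernel's atomic_cmpxchg_acquire()/atomic_set_release(): a single compare-and-swap claims the driver, so concurrent initializers fail with -EBUSY instead of racing.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { NOT_READY, INITIALIZING, READY };

static bool acquire_init(atomic_int *state)
{
	int expected = NOT_READY;

	/* Only one caller wins the NOT_READY -> INITIALIZING transition. */
	return atomic_compare_exchange_strong(state, &expected, INITIALIZING);
}

static void release_init(atomic_int *state, bool success)
{
	atomic_store_explicit(state, success ? READY : NOT_READY,
			      memory_order_release);
}

int main(void)
{
	atomic_int state = NOT_READY;

	printf("%d\n", acquire_init(&state));	/* 1: claimed */
	printf("%d\n", acquire_init(&state));	/* 0: the -EBUSY path */
	release_init(&state, true);
	return 0;
}
```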
+ */ + host_lock_component(); + ret = snapshot_host_stage2(drv); + if (!ret) + driver_release_init(drv, /*success=*/true); + host_unlock_component(); + +out_release: + if (ret) + driver_release_init(drv, /*success=*/false); + +out_unlock: + hyp_spin_unlock(&iommu_registration_lock); + return ret; +} + +int __pkvm_iommu_register(unsigned long dev_id, unsigned long drv_id, + phys_addr_t dev_pa, size_t dev_size, + unsigned long parent_id, + void *kern_mem_va, size_t mem_size) +{ + struct pkvm_iommu *dev = NULL; + struct pkvm_iommu_driver *drv; + void *mem_va = NULL; + int ret = 0; + + /* New device registration not allowed after __pkvm_iommu_finalize(). */ + hyp_spin_lock(&iommu_registration_lock); + if (iommu_finalized) { + ret = -EPERM; + goto out_unlock; + } + + drv = get_driver(drv_id); + if (!drv || !is_driver_ready(drv)) { + ret = -ENOENT; + goto out_unlock; + } + + if (!PAGE_ALIGNED(dev_pa) || !PAGE_ALIGNED(dev_size)) { + ret = -EINVAL; + goto out_unlock; + } + + if (!is_mmio_range(dev_pa, dev_size)) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Accept memory donation if the host is providing new memory. + * Note: We do not return the memory even if there is an error later. + */ + if (kern_mem_va && mem_size) { + mem_va = kern_hyp_va(kern_mem_va); + + if (!PAGE_ALIGNED(mem_va) || !PAGE_ALIGNED(mem_size)) { + ret = -EINVAL; + goto out_unlock; + } + + ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn(mem_va), + mem_size >> PAGE_SHIFT); + if (ret) + goto out_unlock; + } + + host_lock_component(); + + /* Allocate memory for the new device entry. */ + dev = alloc_iommu(drv, mem_va, mem_size); + if (!dev) { + ret = -ENOMEM; + goto out_free; + } + + /* Populate the new device entry. */ + *dev = (struct pkvm_iommu){ + .children = LIST_HEAD_INIT(dev->children), + .id = dev_id, + .ops = drv->ops, + .pa = dev_pa, + .size = dev_size, + }; + + if (!validate_against_existing_iommus(dev)) { + ret = -EBUSY; + goto out_free; + } + + if (parent_id) { + dev->parent = find_iommu_by_id(parent_id); + if (!dev->parent) { + ret = -EINVAL; + goto out_free; + } + + if (dev->parent->ops->validate_child) { + ret = dev->parent->ops->validate_child(dev->parent, dev); + if (ret) + goto out_free; + } + } + + if (dev->ops->validate) { + ret = dev->ops->validate(dev); + if (ret) + goto out_free; + } + + /* + * Unmap the device's MMIO range from host stage-2. If registration + * is successful, future attempts to re-map will be blocked by + * pkvm_iommu_host_stage2_adjust_range. + */ + ret = host_stage2_unmap_reg_locked(dev_pa, dev_size); + if (ret) + goto out_free; + + /* Create EL2 mapping for the device. */ + ret = __pkvm_create_private_mapping(dev_pa, dev_size, + PAGE_HYP_DEVICE, (unsigned long *)(&dev->va)); + if (ret){ + goto out_free; + } + + /* Register device and prevent host from mapping the MMIO range. 
*/ + list_add_tail(&dev->list, &iommu_list); + if (dev->parent) + list_add_tail(&dev->siblings, &dev->parent->children); + +out_free: + if (ret) + free_iommu(drv, dev); + host_unlock_component(); + +out_unlock: + hyp_spin_unlock(&iommu_registration_lock); + return ret; +} + +int __pkvm_iommu_finalize(void) +{ + int ret = 0; + + hyp_spin_lock(&iommu_registration_lock); + if (!iommu_finalized) + iommu_finalized = true; + else + ret = -EPERM; + hyp_spin_unlock(&iommu_registration_lock); + return ret; +} + +int __pkvm_iommu_pm_notify(unsigned long dev_id, enum pkvm_iommu_pm_event event) +{ + struct pkvm_iommu *dev; + int ret; + + host_lock_component(); + dev = find_iommu_by_id(dev_id); + if (dev) { + if (event == PKVM_IOMMU_PM_SUSPEND) { + ret = dev->ops->suspend ? dev->ops->suspend(dev) : 0; + if (!ret) + dev->powered = false; + } else if (event == PKVM_IOMMU_PM_RESUME) { + ret = dev->ops->resume ? dev->ops->resume(dev) : 0; + if (!ret) + dev->powered = true; + } else { + ret = -EINVAL; + } + } else { + ret = -ENODEV; + } + host_unlock_component(); + return ret; +} + +/* + * Check host memory access against IOMMUs' MMIO regions. + * Returns -EPERM if the address is within the bounds of a registered device. + * Otherwise returns zero and adjusts boundaries of the new mapping to avoid + * MMIO regions of registered IOMMUs. + */ +int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, + phys_addr_t *end) +{ + struct pkvm_iommu *dev; + phys_addr_t new_start = *start; + phys_addr_t new_end = *end; + phys_addr_t dev_start, dev_end; + + assert_host_component_locked(); + + list_for_each_entry(dev, &iommu_list, list) { + dev_start = dev->pa; + dev_end = dev_start + dev->size; + + if (addr < dev_start) + new_end = min(new_end, dev_start); + else if (addr >= dev_end) + new_start = max(new_start, dev_end); + else + return -EPERM; + } + + *start = new_start; + *end = new_end; + return 0; +} + +bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, + phys_addr_t pa) +{ + struct pkvm_iommu *dev; + + assert_host_component_locked(); + + list_for_each_entry(dev, &iommu_list, list) { + if (pa < dev->pa || pa >= dev->pa + dev->size) + continue; + + /* No 'powered' check - the host assumes it is powered. 
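A standalone model of pkvm_iommu_host_stage2_adjust_range(): the prospective host mapping is shrunk so it stops short of any registered device window, and the access is refused outright if the faulting address lands inside one:

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t phys_addr_t;

struct dev_window { phys_addr_t pa; size_t size; };

static int adjust_range(const struct dev_window *devs, int nr_devs,
			phys_addr_t addr, phys_addr_t *start, phys_addr_t *end)
{
	for (int i = 0; i < nr_devs; i++) {
		phys_addr_t dev_start = devs[i].pa;
		phys_addr_t dev_end = dev_start + devs[i].size;

		if (addr < dev_start)
			*end = *end < dev_start ? *end : dev_start;
		else if (addr >= dev_end)
			*start = *start > dev_end ? *start : dev_end;
		else
			return -1;	/* -EPERM: fault inside the device */
	}
	return 0;
}

int main(void)
{
	struct dev_window dev = { 0x10000, 0x1000 };
	phys_addr_t start = 0x0, end = 0x200000;

	adjust_range(&dev, 1, 0x20000, &start, &end);
	printf("%#llx..%#llx\n", (unsigned long long)start,
	       (unsigned long long)end);	/* 0x11000..0x200000 */
	return 0;
}
```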
*/ + if (!dev->ops->host_dabt_handler || + !dev->ops->host_dabt_handler(dev, host_ctxt, esr, pa - dev->pa)) + return false; + + kvm_skip_host_instr(); + return true; + } + return false; +} + +void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, + enum kvm_pgtable_prot prot) +{ + struct pkvm_iommu_driver *drv; + struct pkvm_iommu *dev; + + assert_host_component_locked(); + hyp_spin_lock(&iommu_drv_lock); + list_for_each_entry(drv, &iommu_drivers, list) { + if (drv && is_driver_ready(drv) && drv->ops->host_stage2_idmap_prepare) + drv->ops->host_stage2_idmap_prepare(start, end, prot); + } + hyp_spin_unlock(&iommu_drv_lock); + + list_for_each_entry(dev, &iommu_list, list) { + if (dev->powered && dev->ops->host_stage2_idmap_apply) + dev->ops->host_stage2_idmap_apply(dev, start, end); + } + + list_for_each_entry(dev, &iommu_list, list) { + if (dev->powered && dev->ops->host_stage2_idmap_complete) + dev->ops->host_stage2_idmap_complete(dev); + } +} diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c new file mode 100644 index 000000000000..d68abd7ea124 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + * Author: Keir Fraser + */ + +#include +#include + +static inline __must_check bool nvhe_check_data_corruption(bool v) +{ + return v; +} + +#define NVHE_CHECK_DATA_CORRUPTION(condition) \ + nvhe_check_data_corruption(({ \ + bool corruption = unlikely(condition); \ + if (corruption) { \ + if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ + BUG_ON(1); \ + } else \ + WARN_ON(1); \ + } \ + corruption; \ + })) + +/* The predicates checked here are taken from lib/list_debug.c. */ + +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) +{ + if (NVHE_CHECK_DATA_CORRUPTION(next->prev != prev) || + NVHE_CHECK_DATA_CORRUPTION(prev->next != next) || + NVHE_CHECK_DATA_CORRUPTION(new == prev || new == next)) + return false; + + return true; +} + +bool __list_del_entry_valid(struct list_head *entry) +{ + struct list_head *prev, *next; + + prev = entry->prev; + next = entry->next; + + if (NVHE_CHECK_DATA_CORRUPTION(next == LIST_POISON1) || + NVHE_CHECK_DATA_CORRUPTION(prev == LIST_POISON2) || + NVHE_CHECK_DATA_CORRUPTION(prev->next != entry) || + NVHE_CHECK_DATA_CORRUPTION(next->prev != entry)) + return false; + + return true; +} diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 34eeb524b686..74ef0d062719 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -7,31 +7,76 @@ #include #include #include +#include #include #include +#include #include -#include +#include +#include #include +#include #include #include #include #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) -extern unsigned long hyp_nr_cpus; -struct host_kvm host_kvm; +struct host_mmu host_mmu; + +struct pkvm_moveable_reg pkvm_moveable_regs[PKVM_NR_MOVEABLE_REGS]; +unsigned int pkvm_moveable_regs_nr; static struct hyp_pool host_s2_pool; -/* - * Copies of the host's CPU features registers holding sanitized values. 
- */ -u64 id_aa64mmfr0_el1_sys_val; -u64 id_aa64mmfr1_el1_sys_val; +static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); +#define current_vm (*this_cpu_ptr(&__current_vm)) -const u8 pkvm_hyp_id = 1; +static struct kvm_pgtable_pte_ops host_s2_pte_ops; +static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot); +static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level); +static bool guest_stage2_force_pte_cb(u64 addr, u64 end, + enum kvm_pgtable_prot prot); +static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level); + +static struct kvm_pgtable_pte_ops guest_s2_pte_ops = { + .force_pte_cb = guest_stage2_force_pte_cb, + .pte_is_counted_cb = guest_stage2_pte_is_counted +}; + +static void guest_lock_component(struct pkvm_hyp_vm *vm) +{ + hyp_spin_lock(&vm->lock); + current_vm = vm; +} + +static void guest_unlock_component(struct pkvm_hyp_vm *vm) +{ + current_vm = NULL; + hyp_spin_unlock(&vm->lock); +} + +static void host_lock_component(void) +{ + hyp_spin_lock(&host_mmu.lock); +} + +static void host_unlock_component(void) +{ + hyp_spin_unlock(&host_mmu.lock); +} + +static void hyp_lock_component(void) +{ + hyp_spin_lock(&pkvm_pgd_lock); +} + +static void hyp_unlock_component(void) +{ + hyp_spin_unlock(&pkvm_pgd_lock); +} static void *host_s2_zalloc_pages_exact(size_t size) { @@ -75,7 +120,7 @@ static int prepare_s2_pool(void *pgt_pool_base) if (ret) return ret; - host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) { + host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, .phys_to_virt = hyp_phys_to_virt, @@ -96,51 +141,239 @@ static void prepare_host_vtcr(void) parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); - host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, + host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, id_aa64mmfr1_el1_sys_val, phys_shift); } -static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); - int kvm_host_prepare_stage2(void *pgt_pool_base) { - struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; int ret; prepare_host_vtcr(); - hyp_spin_lock_init(&host_kvm.lock); + hyp_spin_lock_init(&host_mmu.lock); + mmu->arch = &host_mmu.arch; ret = prepare_s2_pool(pgt_pool_base); if (ret) return ret; - ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch, - &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, - host_stage2_force_pte_cb); + host_s2_pte_ops.force_pte_cb = host_stage2_force_pte; + host_s2_pte_ops.pte_is_counted_cb = host_stage2_pte_is_counted; + + ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu, + &host_mmu.mm_ops, KVM_HOST_S2_FLAGS, + &host_s2_pte_ops); if (ret) return ret; - mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); - mmu->arch = &host_kvm.arch; - mmu->pgt = &host_kvm.pgt; - WRITE_ONCE(mmu->vmid.vmid_gen, 0); - WRITE_ONCE(mmu->vmid.vmid, 0); + mmu->pgd_phys = __hyp_pa(host_mmu.pgt.pgd); + mmu->pgt = &host_mmu.pgt; + atomic64_set(&mmu->vmid.id, 0); return 0; } +static bool guest_stage2_force_pte_cb(u64 addr, u64 end, + enum kvm_pgtable_prot prot) +{ + return true; +} + +static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level) +{ + /* + * The refcount tracks valid entries as well as invalid entries if they + * encode ownership of a page to another entity than the page-table + * owner, whose id is 0. 
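A standalone model of the owner annotation this refcounting rule refers to: software bits [9:2] of an invalid descriptor (GENMASK(9, 2), as used by host_stage2_set_owner_locked() further down) encode the owning component, with ID 0 meaning the host, which is why any non-zero invalid entry counts:

```c
#include <stdint.h>
#include <stdio.h>

typedef uint64_t kvm_pte_t;

#define PTE_VALID		(1ULL << 0)
#define PTE_OWNER_SHIFT		2
#define PTE_OWNER_MASK		(0xffULL << PTE_OWNER_SHIFT)	/* GENMASK(9, 2) */

/* Build an invalid PTE that records which component owns the page. */
static kvm_pte_t init_invalid_leaf_owner(uint8_t owner_id)
{
	return ((kvm_pte_t)owner_id << PTE_OWNER_SHIFT) & PTE_OWNER_MASK;
}

int main(void)
{
	kvm_pte_t pte = init_invalid_leaf_owner(2 /* some non-host component */);

	printf("valid=%d owner=%u counted=%d\n", !!(pte & PTE_VALID),
	       (unsigned int)((pte & PTE_OWNER_MASK) >> PTE_OWNER_SHIFT),
	       !!pte);	/* valid=0 owner=2 counted=1 */
	return 0;
}
```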
+ */ + return !!pte; +} + +static void *guest_s2_zalloc_pages_exact(size_t size) +{ + void *addr = hyp_alloc_pages(&current_vm->pool, get_order(size)); + + WARN_ON(size != (PAGE_SIZE << get_order(size))); + hyp_split_page(hyp_virt_to_page(addr)); + + return addr; +} + +static void guest_s2_free_pages_exact(void *addr, unsigned long size) +{ + u8 order = get_order(size); + unsigned int i; + + for (i = 0; i < (1 << order); i++) + hyp_put_page(&current_vm->pool, addr + (i * PAGE_SIZE)); +} + +static void *guest_s2_zalloc_page(void *mc) +{ + struct hyp_page *p; + void *addr; + + addr = hyp_alloc_pages(&current_vm->pool, 0); + if (addr) + return addr; + + addr = pop_hyp_memcache(mc, hyp_phys_to_virt); + if (!addr) + return addr; + + memset(addr, 0, PAGE_SIZE); + p = hyp_virt_to_page(addr); + memset(p, 0, sizeof(*p)); + p->refcount = 1; + + return addr; +} + +static void guest_s2_get_page(void *addr) +{ + hyp_get_page(&current_vm->pool, addr); +} + +static void guest_s2_put_page(void *addr) +{ + hyp_put_page(&current_vm->pool, addr); +} + +static void clean_dcache_guest_page(void *va, size_t size) +{ + __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); + hyp_fixmap_unmap(); +} + +static void invalidate_icache_guest_page(void *va, size_t size) +{ + __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); + hyp_fixmap_unmap(); +} + +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) +{ + struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu; + unsigned long nr_pages; + int ret; + + nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; + ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0); + if (ret) + return ret; + + hyp_spin_lock_init(&vm->lock); + vm->mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_pages_exact = guest_s2_zalloc_pages_exact, + .free_pages_exact = guest_s2_free_pages_exact, + .zalloc_page = guest_s2_zalloc_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .page_count = hyp_page_count, + .get_page = guest_s2_get_page, + .put_page = guest_s2_put_page, + .dcache_clean_inval_poc = clean_dcache_guest_page, + .icache_inval_pou = invalidate_icache_guest_page, + }; + + guest_lock_component(vm); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, + &guest_s2_pte_ops); + guest_unlock_component(vm); + if (ret) + return ret; + + vm->kvm.arch.mmu.pgd_phys = __hyp_pa(vm->pgt.pgd); + + return 0; +} + +struct relinquish_data { + enum pkvm_page_state expected_state; + u64 pa; +}; + +static int relinquish_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + kvm_pte_t pte = *ptep; + struct relinquish_data *data = arg; + enum pkvm_page_state state; + phys_addr_t phys; + + if (!kvm_pte_valid(pte)) + return 0; + + state = pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); + if (state != data->expected_state) + return -EPERM; + + phys = kvm_pte_to_phys(pte); + if (state == PKVM_PAGE_OWNED) { + hyp_poison_page(phys); + psci_mem_protect_dec(1); + } + + data->pa = phys; + + return 0; +} + +int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, + u64 ipa, u64 *ppa) +{ + struct relinquish_data data; + struct kvm_pgtable_walker walker = { + .cb = relinquish_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &data, + }; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + int ret; + + host_lock_component(); + guest_lock_component(vm); + + /* Expected page state depends on VM type. */ + data.expected_state = pkvm_hyp_vcpu_is_protected(vcpu) ? 
+ PKVM_PAGE_OWNED : + PKVM_PAGE_SHARED_BORROWED; + + /* Set default pa value to "not found". */ + data.pa = 0; + + /* If ipa is mapped: poisons the page, and gets the pa. */ + ret = kvm_pgtable_walk(&vm->pgt, ipa, PAGE_SIZE, &walker); + + /* Zap the guest stage2 pte and return ownership to the host */ + if (!ret && data.pa) { + WARN_ON(host_stage2_set_owner_locked(data.pa, PAGE_SIZE, PKVM_ID_HOST)); + WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE)); + } + + guest_unlock_component(vm); + host_unlock_component(); + + *ppa = data.pa; + return ret; +} + int __pkvm_prot_finalize(void) { - struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + if (params->hcr_el2 & HCR_VM) + return -EPERM; + params->vttbr = kvm_get_vttbr(mmu); - params->vtcr = host_kvm.arch.vtcr; + params->vtcr = host_mmu.arch.vtcr; params->hcr_el2 |= HCR_VM; kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); - __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); /* * Make sure to have an ISB before the TLB maintenance below but only @@ -156,21 +389,38 @@ int __pkvm_prot_finalize(void) return 0; } -static int host_stage2_unmap_dev_all(void) +int host_stage2_unmap_reg_locked(phys_addr_t start, u64 size) { - struct kvm_pgtable *pgt = &host_kvm.pgt; - struct memblock_region *reg; + int ret; + + hyp_assert_lock_held(&host_mmu.lock); + + ret = kvm_pgtable_stage2_unmap(&host_mmu.pgt, start, size); + if (ret) + return ret; + + pkvm_iommu_host_stage2_idmap(start, start + size, 0); + return 0; +} + +static int host_stage2_unmap_unmoveable_regs(void) +{ + struct kvm_pgtable *pgt = &host_mmu.pgt; + struct pkvm_moveable_reg *reg; u64 addr = 0; int i, ret; - /* Unmap all non-memory regions to recycle the pages */ - for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) { - reg = &hyp_memory[i]; - ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr); - if (ret) - return ret; + /* Unmap all unmoveable regions to recycle the pages */ + for (i = 0; i < pkvm_moveable_regs_nr; i++) { + reg = &pkvm_moveable_regs[i]; + if (reg->start > addr) { + ret = host_stage2_unmap_reg_locked(addr, reg->start - addr); + if (ret) + return ret; + } + addr = max(addr, reg->start + reg->size); } - return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); + return host_stage2_unmap_reg_locked(addr, BIT(pgt->ia_bits) - addr); } struct kvm_mem_range { @@ -178,7 +428,7 @@ struct kvm_mem_range { u64 end; }; -static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) +static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; struct memblock_region *reg; @@ -201,18 +451,33 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) } else { range->start = reg->base; range->end = end; - return true; + return reg; } } - return false; + return NULL; +} + +static enum kvm_pgtable_prot default_host_prot(bool is_memory) +{ + return is_memory ? 
PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; } bool addr_is_memory(phys_addr_t phys) { struct kvm_mem_range range; - return find_mem_range(phys, &range); + return !!find_mem_range(phys, &range); +} + +static bool addr_is_allowed_memory(phys_addr_t phys) +{ + struct memblock_region *reg; + struct kvm_mem_range range; + + reg = find_mem_range(phys, &range); + + return reg && !(reg->flags & MEMBLOCK_NOMAP); } static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) @@ -231,25 +496,34 @@ static bool range_is_memory(u64 start, u64 end) } static inline int __host_stage2_idmap(u64 start, u64 end, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, + bool update_iommu) { - return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start, - prot, &host_s2_pool); + int ret; + + ret = kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, + prot, &host_s2_pool); + if (ret) + return ret; + + if (update_iommu) + pkvm_iommu_host_stage2_idmap(start, end, prot); + return 0; } /* - * The pool has been provided with enough pages to cover all of memory with - * page granularity, but it is difficult to know how much of the MMIO range - * we will need to cover upfront, so we may need to 'recycle' the pages if we - * run out. + * The pool has been provided with enough pages to cover all of moveable regions + * with page granularity, but it is difficult to know how much of the + * non-moveable regions we will need to cover upfront, so we may need to + * 'recycle' the pages if we run out. */ #define host_stage2_try(fn, ...) \ ({ \ int __ret; \ - hyp_assert_lock_held(&host_kvm.lock); \ + hyp_assert_lock_held(&host_mmu.lock); \ __ret = fn(__VA_ARGS__); \ if (__ret == -ENOMEM) { \ - __ret = host_stage2_unmap_dev_all(); \ + __ret = host_stage2_unmap_unmoveable_regs(); \ if (!__ret) \ __ret = fn(__VA_ARGS__); \ } \ @@ -269,8 +543,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) u32 level; int ret; - hyp_assert_lock_held(&host_kvm.lock); - ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + hyp_assert_lock_held(&host_mmu.lock); + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); if (ret) return ret; @@ -295,22 +569,39 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) } int host_stage2_idmap_locked(phys_addr_t addr, u64 size, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, bool update_iommu) { - hyp_assert_lock_held(&host_kvm.lock); - - return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); + return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot, update_iommu); } -int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) +static kvm_pte_t kvm_init_invalid_leaf_owner(enum pkvm_component_id owner_id) { - hyp_assert_lock_held(&host_kvm.lock); - - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, - addr, size, &host_s2_pool, owner_id); + return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); } -static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, enum pkvm_component_id owner_id) +{ + kvm_pte_t annotation; + enum kvm_pgtable_prot prot; + int ret; + + if (owner_id > PKVM_ID_MAX) + return -EINVAL; + + annotation = kvm_init_invalid_leaf_owner(owner_id); + + ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, + addr, size, &host_s2_pool, annotation); + if (ret) + return ret; + + prot = 
owner_id == PKVM_ID_HOST ? PKVM_HOST_MEM_PROT : 0; + pkvm_iommu_host_stage2_idmap(addr, addr + size, prot); + return 0; +} + +static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot) { /* * Block mappings must be used with care in the host stage-2 as a @@ -326,131 +617,1658 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr * mappings, hence avoiding to lose the state because of side-effects in * kvm_pgtable_stage2_map(). */ - if (range_is_memory(addr, end)) - return prot != PKVM_HOST_MEM_PROT; - else - return prot != PKVM_HOST_MMIO_PROT; + return prot != default_host_prot(range_is_memory(addr, end)); +} + +static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level) +{ + u64 phys; + + if (!kvm_pte_valid(pte)) + return !!pte; + + if (kvm_pte_table(pte, level)) + return true; + + phys = kvm_pte_to_phys(pte); + if (addr_is_memory(phys)) + return (pte & KVM_HOST_S2_DEFAULT_MASK) != + KVM_HOST_S2_DEFAULT_MEM_PTE; + + return (pte & KVM_HOST_S2_DEFAULT_MASK) != KVM_HOST_S2_DEFAULT_MMIO_PTE; } static int host_stage2_idmap(u64 addr) { struct kvm_mem_range range; - bool is_memory = find_mem_range(addr, &range); - enum kvm_pgtable_prot prot; + bool is_memory = !!find_mem_range(addr, &range); + enum kvm_pgtable_prot prot = default_host_prot(is_memory); int ret; - prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; + hyp_assert_lock_held(&host_mmu.lock); + + /* + * Adjust against IOMMU devices first. host_stage2_adjust_range() should + * be called last for proper alignment. + */ + if (!is_memory) { + ret = pkvm_iommu_host_stage2_adjust_range(addr, &range.start, + &range.end); + if (ret) + return ret; + } - hyp_spin_lock(&host_kvm.lock); ret = host_stage2_adjust_range(addr, &range); if (ret) - goto unlock; + return ret; - ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot); -unlock: - hyp_spin_unlock(&host_kvm.lock); - - return ret; + return host_stage2_idmap_locked(range.start, range.end - range.start, prot, false); } -static inline bool check_prot(enum kvm_pgtable_prot prot, - enum kvm_pgtable_prot required, - enum kvm_pgtable_prot denied) +static void (*illegal_abt_notifier)(struct kvm_cpu_context *host_ctxt); + +int __pkvm_register_illegal_abt_notifier(void (*cb)(struct kvm_cpu_context *)) { - return (prot & (required | denied)) == required; + return cmpxchg(&illegal_abt_notifier, NULL, cb) ? -EBUSY : 0; } -int __pkvm_host_share_hyp(u64 pfn) +static void host_inject_abort(struct kvm_cpu_context *host_ctxt) { - phys_addr_t addr = hyp_pfn_to_phys(pfn); - enum kvm_pgtable_prot prot, cur; - void *virt = __hyp_va(addr); - enum pkvm_page_state state; - kvm_pte_t pte; - int ret; + u64 spsr = read_sysreg_el2(SYS_SPSR); + u64 esr = read_sysreg_el2(SYS_ESR); + u64 ventry, ec; - if (!addr_is_memory(addr)) - return -EINVAL; + if (READ_ONCE(illegal_abt_notifier)) + illegal_abt_notifier(host_ctxt); - hyp_spin_lock(&host_kvm.lock); - hyp_spin_lock(&pkvm_pgd_lock); - - ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL); - if (ret) - goto unlock; - if (!pte) - goto map_shared; - - /* - * Check attributes in the host stage-2 PTE. We need the page to be: - * - mapped RWX as we're sharing memory; - * - not borrowed, as that implies absence of ownership. 
- * Otherwise, we can't let it got through - */ - cur = kvm_pgtable_stage2_pte_prot(pte); - prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED); - if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) { - ret = -EPERM; - goto unlock; + /* Repaint the ESR to report a same-level fault if taken from EL1 */ + if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) { + ec = ESR_ELx_EC(esr); + if (ec == ESR_ELx_EC_DABT_LOW) + ec = ESR_ELx_EC_DABT_CUR; + else if (ec == ESR_ELx_EC_IABT_LOW) + ec = ESR_ELx_EC_IABT_CUR; + else + WARN_ON(1); + esr &= ~ESR_ELx_EC_MASK; + esr |= ec << ESR_ELx_EC_SHIFT; } - state = pkvm_getstate(cur); - if (state == PKVM_PAGE_OWNED) - goto map_shared; - /* - * Tolerate double-sharing the same page, but this requires - * cross-checking the hypervisor stage-1. + * Since S1PTW should only ever be set for stage-2 faults, we're pretty + * much guaranteed that it won't be set in ESR_EL1 by the hardware. So, + * let's use that bit to allow the host abort handler to differentiate + * this abort from normal userspace faults. + * + * Note: although S1PTW is RES0 at EL1, it is guaranteed by the + * architecture to be backed by flops, so it should be safe to use. */ - if (state != PKVM_PAGE_SHARED_OWNED) { - ret = -EPERM; - goto unlock; - } + esr |= ESR_ELx_S1PTW; - ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL); - if (ret) - goto unlock; + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(spsr, SYS_SPSR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el1(read_sysreg_el2(SYS_FAR), SYS_FAR); - /* - * If the page has been shared with the hypervisor, it must be - * already mapped as SHARED_BORROWED in its stage-1. - */ - cur = kvm_pgtable_hyp_pte_prot(pte); - prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); - if (!check_prot(cur, prot, ~prot)) - ret = -EPERM; - goto unlock; + ventry = read_sysreg_el1(SYS_VBAR); + ventry += get_except64_offset(spsr, PSR_MODE_EL1h, except_type_sync); + write_sysreg_el2(ventry, SYS_ELR); -map_shared: - /* - * If the page is not yet shared, adjust mappings in both page-tables - * while both locks are held. - */ - prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); - ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot); - BUG_ON(ret); + spsr = get_except64_cpsr(spsr, system_supports_mte(), + read_sysreg_el1(SYS_SCTLR), PSR_MODE_EL1h); + write_sysreg_el2(spsr, SYS_SPSR); +} - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); - ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot); - BUG_ON(ret); +static bool is_dabt(u64 esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_LOW; +} -unlock: - hyp_spin_unlock(&pkvm_pgd_lock); - hyp_spin_unlock(&host_kvm.lock); +static int (*perm_fault_handler)(struct kvm_cpu_context *host_ctxt, u64 esr, u64 addr); - return ret; +int hyp_register_host_perm_fault_handler(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)) +{ + return cmpxchg(&perm_fault_handler, NULL, cb) ? -EBUSY : 0; +} + +static int handle_host_perm_fault(struct kvm_cpu_context *host_ctxt, u64 esr, u64 addr) +{ + int (*cb)(struct kvm_cpu_context *host_ctxt, u64 esr, u64 addr); + + cb = READ_ONCE(perm_fault_handler); + return cb ? 
cb(host_ctxt, esr, addr) : -EPERM; } void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { struct kvm_vcpu_fault_info fault; u64 esr, addr; - int ret = 0; + int ret = -EPERM; esr = read_sysreg_el2(SYS_ESR); BUG_ON(!__get_fault_info(esr, &fault)); addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; - ret = host_stage2_idmap(addr); - BUG_ON(ret && ret != -EAGAIN); + addr |= fault.far_el2 & FAR_MASK; + + host_lock_component(); + + /* Check if an IOMMU device can handle the DABT. */ + if (is_dabt(esr) && !addr_is_memory(addr) && + pkvm_iommu_host_dabt_handler(host_ctxt, esr, addr)) + ret = 0; + + /* If not handled, attempt to map the page. */ + if (ret == -EPERM) + ret = host_stage2_idmap(addr); + + host_unlock_component(); + + if ((esr & ESR_ELx_FSC_TYPE) == FSC_PERM) + ret = handle_host_perm_fault(host_ctxt, esr, addr); + + if (ret == -EPERM) + host_inject_abort(host_ctxt); + else + BUG_ON(ret && ret != -EAGAIN); + + trace_host_mem_abort(esr, addr); +} + +struct pkvm_mem_transition { + u64 nr_pages; + + struct { + enum pkvm_component_id id; + /* Address in the initiator's address space */ + u64 addr; + + union { + struct { + /* Address in the completer's address space */ + u64 completer_addr; + } host; + struct { + u64 completer_addr; + } hyp; + struct { + struct pkvm_hyp_vcpu *hyp_vcpu; + } guest; + }; + } initiator; + + struct { + enum pkvm_component_id id; + + union { + struct { + struct pkvm_hyp_vcpu *hyp_vcpu; + phys_addr_t phys; + } guest; + }; + } completer; +}; + +struct pkvm_mem_share { + const struct pkvm_mem_transition tx; + const enum kvm_pgtable_prot completer_prot; +}; + +struct pkvm_mem_donation { + const struct pkvm_mem_transition tx; +}; + +struct check_walk_data { + enum pkvm_page_state desired; + enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); +}; + +static int __check_page_state_visitor(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct check_walk_data *d = arg; + kvm_pte_t pte = *ptep; + + if (kvm_pte_valid(pte) && !addr_is_allowed_memory(kvm_pte_to_phys(pte))) + return -EINVAL; + + return d->get_page_state(pte, addr) == d->desired ? 
0 : -EPERM; +} + +static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct check_walk_data *data) +{ + struct kvm_pgtable_walker walker = { + .cb = __check_page_state_visitor, + .arg = data, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + +static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr) +{ + bool is_memory = addr_is_memory(addr); + enum pkvm_page_state state = 0; + enum kvm_pgtable_prot prot; + + if (is_memory && hyp_phys_to_page(addr)->flags & MODULE_OWNED_PAGE) + return PKVM_MODULE_DONT_TOUCH; + + if (!kvm_pte_valid(pte) && pte) + return PKVM_NOPAGE; + + prot = kvm_pgtable_stage2_pte_prot(pte); + if (kvm_pte_valid(pte)) { + if ((prot & KVM_PGTABLE_PROT_RWX) != default_host_prot(is_memory)) + state = PKVM_PAGE_RESTRICTED_PROT; + } + + return state | pkvm_getstate(prot); +} + +static int __host_check_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) +{ + struct check_walk_data d = { + .desired = state, + .get_page_state = host_get_page_state, + }; + + hyp_assert_lock_held(&host_mmu.lock); + return check_page_state_range(&host_mmu.pgt, addr, size, &d); +} + +static int __host_set_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) +{ + enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state); + + return host_stage2_idmap_locked(addr, size, prot, true); +} + +static int host_request_owned_transition(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.host.completer_addr; + return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); +} + +static int host_request_unshare(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.host.completer_addr; + return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); +} + +static int host_initiate_share(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.host.completer_addr; + return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); +} + +static int host_initiate_unshare(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.host.completer_addr; + return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); +} + +static int host_initiate_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + enum pkvm_component_id owner_id = tx->completer.id; + u64 size = tx->nr_pages * PAGE_SIZE; + + *completer_addr = tx->initiator.host.completer_addr; + return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); +} + +static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +{ + return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || + tx->initiator.id != PKVM_ID_HYP); +} + +static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, + enum pkvm_page_state state) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__host_ack_skip_pgtable_check(tx)) + return 0; + + return __host_check_page_state_range(addr, size, state); +} + +static int host_ack_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + if (perms != 
PKVM_HOST_MEM_PROT) + return -EPERM; + + return __host_ack_transition(addr, tx, PKVM_NOPAGE); +} + +static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + return __host_ack_transition(addr, tx, PKVM_NOPAGE); +} + +static int host_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) +{ + return __host_ack_transition(addr, tx, PKVM_PAGE_SHARED_BORROWED); +} + +static int host_complete_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + int err; + + err = __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_BORROWED); + if (err) + return err; + + if (tx->initiator.id == PKVM_ID_GUEST) + psci_mem_protect_dec(tx->nr_pages); + + return 0; +} + +static int host_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) +{ + enum pkvm_component_id owner_id = tx->initiator.id; + u64 size = tx->nr_pages * PAGE_SIZE; + + if (tx->initiator.id == PKVM_ID_GUEST) + psci_mem_protect_inc(tx->nr_pages); + + return host_stage2_set_owner_locked(addr, size, owner_id); +} + +static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + enum pkvm_component_id host_id = tx->completer.id; + + return host_stage2_set_owner_locked(addr, size, host_id); +} + +static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) +{ + enum pkvm_page_state state = 0; + enum kvm_pgtable_prot prot; + + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; + + prot = kvm_pgtable_hyp_pte_prot(pte); + if (kvm_pte_valid(pte) && ((prot & KVM_PGTABLE_PROT_RWX) != PAGE_HYP)) + state = PKVM_PAGE_RESTRICTED_PROT; + + return state | pkvm_getstate(prot); +} + +static int __hyp_check_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) +{ + struct check_walk_data d = { + .desired = state, + .get_page_state = hyp_get_page_state, + }; + + hyp_assert_lock_held(&pkvm_pgd_lock); + return check_page_state_range(&pkvm_pgtable, addr, size, &d); +} + +static int hyp_request_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.hyp.completer_addr; + return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED); +} + +static int hyp_initiate_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + int ret; + + *completer_addr = tx->initiator.hyp.completer_addr; + ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); + return (ret != size) ? 
-EFAULT : 0; +} + +static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +{ + return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || + tx->initiator.id != PKVM_ID_HOST); +} + +static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (perms != PAGE_HYP) + return -EPERM; + + if (__hyp_ack_skip_pgtable_check(tx)) + return 0; + + return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); +} + +static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) + return -EBUSY; + + if (__hyp_ack_skip_pgtable_check(tx)) + return 0; + + return __hyp_check_page_state_range(addr, size, + PKVM_PAGE_SHARED_BORROWED); +} + +static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__hyp_ack_skip_pgtable_check(tx)) + return 0; + + return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); +} + +static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); + enum kvm_pgtable_prot prot; + + prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); + return pkvm_create_mappings_locked(start, end, prot); +} + +static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size); + + return (ret != size) ? -EFAULT : 0; +} + +static int hyp_complete_donation(u64 addr, + const struct pkvm_mem_transition *tx) +{ + void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); + enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); + + return pkvm_create_mappings_locked(start, end, prot); +} + +static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) +{ + enum pkvm_page_state state = 0; + enum kvm_pgtable_prot prot; + + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; + + prot = kvm_pgtable_stage2_pte_prot(pte); + if (kvm_pte_valid(pte) && ((prot & KVM_PGTABLE_PROT_RWX) != KVM_PGTABLE_PROT_RWX)) + state = PKVM_PAGE_RESTRICTED_PROT; + + return state | pkvm_getstate(prot); +} + +static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, + u64 size, enum pkvm_page_state state) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct check_walk_data d = { + .desired = state, + .get_page_state = guest_get_page_state, + }; + + hyp_assert_lock_held(&vm->lock); + return check_page_state_range(&vm->pgt, addr, size, &d); +} + +static int guest_ack_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (perms != KVM_PGTABLE_PROT_RWX) + return -EPERM; + + return __guest_check_page_state_range(tx->completer.guest.hyp_vcpu, + addr, size, PKVM_NOPAGE); +} + +static int guest_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + return __guest_check_page_state_range(tx->completer.guest.hyp_vcpu, + addr, size, PKVM_NOPAGE); +} + +static int guest_complete_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + struct pkvm_hyp_vcpu *vcpu = tx->completer.guest.hyp_vcpu; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 size = tx->nr_pages * 
PAGE_SIZE; + enum kvm_pgtable_prot prot; + + prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); + return kvm_pgtable_stage2_map(&vm->pgt, addr, size, tx->completer.guest.phys, + prot, &vcpu->vcpu.arch.pkvm_memcache); +} + +static int guest_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + enum kvm_pgtable_prot prot = pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED); + struct pkvm_hyp_vcpu *vcpu = tx->completer.guest.hyp_vcpu; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + phys_addr_t phys = tx->completer.guest.phys; + u64 size = tx->nr_pages * PAGE_SIZE; + int err; + + if (tx->initiator.id == PKVM_ID_HOST) + psci_mem_protect_inc(tx->nr_pages); + + if (pkvm_ipa_range_has_pvmfw(vm, addr, addr + size)) { + if (WARN_ON(!pkvm_hyp_vcpu_is_protected(vcpu))) { + err = -EPERM; + goto err_undo_psci; + } + + WARN_ON(tx->initiator.id != PKVM_ID_HOST); + err = pkvm_load_pvmfw_pages(vm, addr, phys, size); + if (err) + goto err_undo_psci; + } + + /* + * If this fails, we effectively leak the pages since they're now + * owned by the guest but not mapped into its stage-2 page-table. + */ + return kvm_pgtable_stage2_map(&vm->pgt, addr, size, phys, prot, + &vcpu->vcpu.arch.pkvm_memcache); + +err_undo_psci: + if (tx->initiator.id == PKVM_ID_HOST) + psci_mem_protect_dec(tx->nr_pages); + return err; +} + +static int __guest_get_completer_addr(u64 *completer_addr, phys_addr_t phys, + const struct pkvm_mem_transition *tx) +{ + switch (tx->completer.id) { + case PKVM_ID_HOST: + *completer_addr = phys; + break; + case PKVM_ID_HYP: + *completer_addr = (u64)__hyp_va(phys); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int __guest_request_page_transition(u64 *completer_addr, + const struct pkvm_mem_transition *tx, + enum pkvm_page_state desired) +{ + struct pkvm_hyp_vcpu *vcpu = tx->initiator.guest.hyp_vcpu; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + enum pkvm_page_state state; + phys_addr_t phys; + kvm_pte_t pte; + u32 level; + int ret; + + if (tx->nr_pages != 1) + return -E2BIG; + + ret = kvm_pgtable_get_leaf(&vm->pgt, tx->initiator.addr, &pte, &level); + if (ret) + return ret; + + state = guest_get_page_state(pte, tx->initiator.addr); + if (state == PKVM_NOPAGE) + return -EFAULT; + + if (state != desired) + return -EPERM; + + /* + * We only deal with page granular mappings in the guest for now as + * the pgtable code relies on being able to recreate page mappings + * lazily after zapping a block mapping, which doesn't work once the + * pages have been donated. 
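+ * + * Concretely (assuming a 4KiB granule with four translation levels): only a last-level leaf, i.e. level == KVM_PGTABLE_MAX_LEVELS - 1, maps exactly PAGE_SIZE, so any larger block mapping fails the check below.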
+ */ + if (level != KVM_PGTABLE_MAX_LEVELS - 1) + return -EINVAL; + + phys = kvm_pte_to_phys(pte); + if (!addr_is_allowed_memory(phys)) + return -EINVAL; + + return __guest_get_completer_addr(completer_addr, phys, tx); +} + +static int guest_request_share(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + return __guest_request_page_transition(completer_addr, tx, + PKVM_PAGE_OWNED); +} + +static int guest_request_unshare(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + return __guest_request_page_transition(completer_addr, tx, + PKVM_PAGE_SHARED_OWNED); +} + +static int __guest_initiate_page_transition(u64 *completer_addr, + const struct pkvm_mem_transition *tx, + enum pkvm_page_state state) +{ + struct pkvm_hyp_vcpu *vcpu = tx->initiator.guest.hyp_vcpu; + struct kvm_hyp_memcache *mc = &vcpu->vcpu.arch.pkvm_memcache; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + enum kvm_pgtable_prot prot; + phys_addr_t phys; + kvm_pte_t pte; + int ret; + + ret = kvm_pgtable_get_leaf(&vm->pgt, addr, &pte, NULL); + if (ret) + return ret; + + phys = kvm_pte_to_phys(pte); + prot = pkvm_mkstate(kvm_pgtable_stage2_pte_prot(pte), state); + ret = kvm_pgtable_stage2_map(&vm->pgt, addr, size, phys, prot, mc); + if (ret) + return ret; + + return __guest_get_completer_addr(completer_addr, phys, tx); +} + +static int guest_initiate_share(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + return __guest_initiate_page_transition(completer_addr, tx, + PKVM_PAGE_SHARED_OWNED); +} + +static int guest_initiate_unshare(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + return __guest_initiate_page_transition(completer_addr, tx, + PKVM_PAGE_OWNED); +} + +static int check_share(struct pkvm_mem_share *share) +{ + const struct pkvm_mem_transition *tx = &share->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_request_owned_transition(&completer_addr, tx); + break; + case PKVM_ID_GUEST: + ret = guest_request_share(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_ack_share(completer_addr, tx, share->completer_prot); + break; + case PKVM_ID_HYP: + ret = hyp_ack_share(completer_addr, tx, share->completer_prot); + break; + case PKVM_ID_GUEST: + ret = guest_ack_share(completer_addr, tx, share->completer_prot); + break; + case PKVM_ID_FFA: + /* + * We only check the host; the secure side will check the other + * end when we forward the FFA call. 
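+ * + * Put differently: for a host -> FF-A share, only the host's ownership of the pages is validated here, via host_request_owned_transition(); the secure side is trusted to vet its own state once the call is relayed.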
+ */ + ret = 0; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static int __do_share(struct pkvm_mem_share *share) +{ + const struct pkvm_mem_transition *tx = &share->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_initiate_share(&completer_addr, tx); + break; + case PKVM_ID_GUEST: + ret = guest_initiate_share(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_complete_share(completer_addr, tx, share->completer_prot); + break; + case PKVM_ID_HYP: + ret = hyp_complete_share(completer_addr, tx, share->completer_prot); + break; + case PKVM_ID_GUEST: + ret = guest_complete_share(completer_addr, tx, share->completer_prot); + break; + case PKVM_ID_FFA: + /* + * We're not responsible for any secure page-tables, so there's + * nothing to do here. + */ + ret = 0; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/* + * do_share(): + * + * The page owner grants access to another component with a given set + * of permissions. + * + * Initiator: OWNED => SHARED_OWNED + * Completer: NOPAGE => SHARED_BORROWED + */ +static int do_share(struct pkvm_mem_share *share) +{ + int ret; + + ret = check_share(share); + if (ret) + return ret; + + return WARN_ON(__do_share(share)); +} + +static int check_unshare(struct pkvm_mem_share *share) +{ + const struct pkvm_mem_transition *tx = &share->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_request_unshare(&completer_addr, tx); + break; + case PKVM_ID_GUEST: + ret = guest_request_unshare(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_ack_unshare(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_ack_unshare(completer_addr, tx); + break; + case PKVM_ID_FFA: + /* See check_share() */ + ret = 0; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static int __do_unshare(struct pkvm_mem_share *share) +{ + const struct pkvm_mem_transition *tx = &share->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_initiate_unshare(&completer_addr, tx); + break; + case PKVM_ID_GUEST: + ret = guest_initiate_unshare(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_complete_unshare(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_complete_unshare(completer_addr, tx); + break; + case PKVM_ID_FFA: + /* See __do_share() */ + ret = 0; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/* + * do_unshare(): + * + * The page owner revokes access from another component for a range of + * pages which were previously shared using do_share(). 
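+ * Pages must still be in the shared states established by do_share(); otherwise check_unshare() refuses the transition, typically with -EPERM.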
+ * + * Initiator: SHARED_OWNED => OWNED + * Completer: SHARED_BORROWED => NOPAGE + */ +static int do_unshare(struct pkvm_mem_share *share) +{ + int ret; + + ret = check_unshare(share); + if (ret) + return ret; + + return WARN_ON(__do_unshare(share)); +} + +static int check_donation(struct pkvm_mem_donation *donation) +{ + const struct pkvm_mem_transition *tx = &donation->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_request_owned_transition(&completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_request_donation(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_ack_donation(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_ack_donation(completer_addr, tx); + break; + case PKVM_ID_GUEST: + ret = guest_ack_donation(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static int __do_donate(struct pkvm_mem_donation *donation) +{ + const struct pkvm_mem_transition *tx = &donation->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_initiate_donation(&completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_initiate_donation(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_complete_donation(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_complete_donation(completer_addr, tx); + break; + case PKVM_ID_GUEST: + ret = guest_complete_donation(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/* + * do_donate(): + * + * The page owner transfers ownership to another component, losing access + * as a consequence. 
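+ * Both ends must first acknowledge the transition (see check_donation()); e.g. the completer must hold the target range in the NOPAGE state beforehand.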
+ * + * Initiator: OWNED => NOPAGE + * Completer: NOPAGE => OWNED + */ +static int do_donate(struct pkvm_mem_donation *donation) +{ + int ret; + + ret = check_donation(donation); + if (ret) + return ret; + + return WARN_ON(__do_donate(donation)); +} + +int __pkvm_host_share_hyp(u64 pfn) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = hyp_addr, + }, + }, + .completer = { + .id = PKVM_ID_HYP, + }, + }, + .completer_prot = PAGE_HYP, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_share(&share); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 ipa) +{ + int ret; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_GUEST, + .addr = ipa, + .guest = { + .hyp_vcpu = vcpu, + }, + }, + .completer = { + .id = PKVM_ID_HOST, + }, + }, + .completer_prot = PKVM_HOST_MEM_PROT, + }; + + host_lock_component(); + guest_lock_component(vm); + + ret = do_share(&share); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 ipa) +{ + int ret; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_GUEST, + .addr = ipa, + .guest = { + .hyp_vcpu = vcpu, + }, + }, + .completer = { + .id = PKVM_ID_HOST, + }, + }, + .completer_prot = PKVM_HOST_MEM_PROT, + }; + + host_lock_component(); + guest_lock_component(vm); + + ret = do_unshare(&share); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_unshare_hyp(u64 pfn) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = hyp_addr, + }, + }, + .completer = { + .id = PKVM_ID_HYP, + }, + }, + .completer_prot = PAGE_HYP, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_unshare(&share); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = hyp_addr, + }, + }, + .completer = { + .id = PKVM_ID_HYP, + }, + }, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_donate(&donation); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HYP, + .addr = hyp_addr, + .hyp = { + .completer_addr = host_addr, + }, + }, + .completer = { + .id = PKVM_ID_HOST, + }, + }, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_donate(&donation); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +static 
int restrict_host_page_perms(u64 addr, kvm_pte_t pte, u32 level, enum kvm_pgtable_prot prot) +{ + int ret = 0; + + /* XXX: optimize ... */ + if (kvm_pte_valid(pte) && (level == KVM_PGTABLE_MAX_LEVELS - 1)) + ret = kvm_pgtable_stage2_unmap(&host_mmu.pgt, addr, PAGE_SIZE); + if (!ret) + ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot, false); + + return ret; +} + +int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) +{ + u64 addr = hyp_pfn_to_phys(pfn); + struct hyp_page *page; + kvm_pte_t pte; + u32 level; + int ret; + + if ((prot & KVM_PGTABLE_PROT_RWX) != prot || !addr_is_memory(addr)) + return -EINVAL; + + host_lock_component(); + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); + if (ret) + goto unlock; + + ret = -EPERM; + page = hyp_phys_to_page(addr); + + /* + * Modules can only relax permissions of pages they own, and restrict + * permissions of pristine pages. + */ + if (prot == KVM_PGTABLE_PROT_RWX) { + if (!(page->flags & MODULE_OWNED_PAGE)) + goto unlock; + } else if (host_get_page_state(pte, addr) != PKVM_PAGE_OWNED) { + goto unlock; + } + + if (prot == KVM_PGTABLE_PROT_RWX) + ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_HOST); + else if (!prot) + ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_PROTECTED); + else + ret = restrict_host_page_perms(addr, pte, level, prot); + + if (ret) + goto unlock; + + if (prot != KVM_PGTABLE_PROT_RWX) + hyp_phys_to_page(addr)->flags |= MODULE_OWNED_PAGE; + else + hyp_phys_to_page(addr)->flags &= ~MODULE_OWNED_PAGE; + +unlock: + host_unlock_component(); + + return ret; +} + +int hyp_pin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 size = end - start; + int ret; + + host_lock_component(); + hyp_lock_component(); + + ret = __host_check_page_state_range(__hyp_pa(start), size, + PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + + ret = __hyp_check_page_state_range(start, size, + PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_inc(hyp_virt_to_page(cur)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +void hyp_unpin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + + host_lock_component(); + hyp_lock_component(); + + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_dec(hyp_virt_to_page(cur)); + + hyp_unlock_component(); + host_unlock_component(); +} + +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 guest_addr = hyp_pfn_to_phys(gfn); + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = guest_addr, + }, + }, + .completer = { + .id = PKVM_ID_GUEST, + .guest = { + .hyp_vcpu = vcpu, + .phys = host_addr, + }, + }, + }, + .completer_prot = KVM_PGTABLE_PROT_RWX, + }; + + host_lock_component(); + guest_lock_component(vm); + + ret = do_share(&share); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 guest_addr = hyp_pfn_to_phys(gfn); + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct pkvm_mem_donation 
donation = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = guest_addr, + }, + }, + .completer = { + .id = PKVM_ID_GUEST, + .guest = { + .hyp_vcpu = vcpu, + .phys = host_addr, + }, + }, + }, + }; + + host_lock_component(); + guest_lock_component(vm); + + ret = do_donate(&donation); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) +{ + int ret; + struct pkvm_mem_share share = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = hyp_pfn_to_phys(pfn), + }, + .completer = { + .id = PKVM_ID_FFA, + }, + }, + }; + + host_lock_component(); + ret = do_share(&share); + host_unlock_component(); + + return ret; +} + + +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) +{ + int ret; + struct pkvm_mem_share share = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = hyp_pfn_to_phys(pfn), + }, + .completer = { + .id = PKVM_ID_FFA, + }, + }, + }; + + host_lock_component(); + ret = do_unshare(&share); + host_unlock_component(); + + return ret; +} + +void hyp_poison_page(phys_addr_t phys) +{ + void *addr = hyp_fixmap_map(phys); + + memset(addr, 0, PAGE_SIZE); + /* + * Prefer kvm_flush_dcache_to_poc() over __clean_dcache_guest_page() + * here as the latter may elide the CMO under the assumption that FWB + * will be enabled on CPUs that support it. This is incorrect for the + * host stage-2 and would otherwise lead to a malicious host potentially + * being able to read the contents of newly reclaimed guest pages. + */ + kvm_flush_dcache_to_poc(addr, PAGE_SIZE); + hyp_fixmap_unmap(); +} + +void destroy_hyp_vm_pgt(struct pkvm_hyp_vm *vm) +{ + guest_lock_component(vm); + kvm_pgtable_stage2_destroy(&vm->pgt); + guest_unlock_component(vm); +} + +void drain_hyp_pool(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) +{ + void *addr = hyp_alloc_pages(&vm->pool, 0); + + while (addr) { + memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + push_hyp_memcache(mc, addr, hyp_virt_to_phys); + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); + addr = hyp_alloc_pages(&vm->pool, 0); + } +} + +int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa) +{ + phys_addr_t phys = hyp_pfn_to_phys(pfn); + kvm_pte_t pte; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, NULL); + if (ret) + goto unlock; + + if (!kvm_pte_valid(pte)) { + ret = -EINVAL; + goto unlock; + } else if (phys != kvm_pte_to_phys(pte)) { + ret = -EPERM; + goto unlock; + } + + /* We could avoid the TLB inval here, as it is done per VMID on the finalize path */ + WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE)); + + switch (guest_get_page_state(pte, ipa)) { + case PKVM_PAGE_OWNED: + WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE)); + hyp_poison_page(phys); + psci_mem_protect_dec(1); + break; + case PKVM_PAGE_SHARED_BORROWED: + WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED)); + break; + case PKVM_PAGE_SHARED_OWNED: + WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED)); + break; + default: + BUG_ON(1); + } + + WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +/* Replace this with something more structured one day */ +#define MMIO_NOTE (('M' << 24 | 'M' << 16 | 'I'
<< 8 | 'O') << 1) + +static bool __check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + kvm_pte_t pte; + u32 level; + int ret; + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + return false; + + /* Must be a PAGE_SIZE mapping with our annotation */ + return (BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)) == PAGE_SIZE && + pte == MMIO_NOTE); +} + +int __pkvm_install_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + kvm_pte_t pte; + u32 level; + int ret; + + if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vm->kvm.arch.flags)) + return -EINVAL; + + if (ipa & ~PAGE_MASK) + return -EINVAL; + + guest_lock_component(vm); + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + goto unlock; + + if (pte && BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)) == PAGE_SIZE) { + /* + * Already flagged as MMIO, let's accept it, and fail + * otherwise + */ + if (pte != MMIO_NOTE) + ret = -EBUSY; + + goto unlock; + } + + ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE, + &hyp_vcpu->vcpu.arch.pkvm_memcache, + MMIO_NOTE); + +unlock: + guest_unlock_component(vm); + return ret; +} + +int __pkvm_remove_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vm->kvm.arch.flags)) + return -EINVAL; + + guest_lock_component(vm); + + if (__check_ioguard_page(hyp_vcpu, ipa)) + WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, + ALIGN_DOWN(ipa, PAGE_SIZE), PAGE_SIZE)); + + guest_unlock_component(vm); + return 0; +} + +bool __pkvm_check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + u64 ipa, end; + bool ret; + + if (!kvm_vcpu_dabt_isvalid(&hyp_vcpu->vcpu)) + return false; + + if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vm->kvm.arch.flags)) + return true; + + ipa = kvm_vcpu_get_fault_ipa(&hyp_vcpu->vcpu); + ipa |= kvm_vcpu_get_hfar(&hyp_vcpu->vcpu) & FAR_MASK; + end = ipa + kvm_vcpu_dabt_get_as(&hyp_vcpu->vcpu) - 1; + + guest_lock_component(vm); + ret = __check_ioguard_page(hyp_vcpu, ipa); + if ((end & PAGE_MASK) != (ipa & PAGE_MASK)) + ret &= __check_ioguard_page(hyp_vcpu, end); + guest_unlock_component(vm); + + return ret; +} + +int host_stage2_protect_pages_locked(phys_addr_t addr, u64 size) +{ + int ret; + + hyp_assert_lock_held(&host_mmu.lock); + + ret = __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); + if (!ret) + ret = host_stage2_set_owner_locked(addr, size, PKVM_ID_PROTECTED); + + return ret; +} + +int host_stage2_get_leaf(phys_addr_t phys, kvm_pte_t *ptep, u32 *level) +{ + int ret; + + host_lock_component(); + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, ptep, level); + host_unlock_component(); + + return ret; } diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 5146fb170505..76f26dc73cc7 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -8,21 +8,32 @@ #include #include #include +#include #include #include #include #include +#include #include +#include #include struct kvm_pgtable pkvm_pgtable; hyp_spinlock_t pkvm_pgd_lock; -u64 __io_map_base; struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; unsigned int hyp_memblock_nr; +static u64 __private_range_base; +static u64 __private_range_cur; + +struct hyp_fixmap_slot { + u64 addr; + kvm_pte_t *ptep; +}; +static DEFINE_PER_CPU(struct hyp_fixmap_slot, 
fixmap_slots); + static int __pkvm_create_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot) { @@ -35,36 +46,126 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size, return err; } -unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, - enum kvm_pgtable_prot prot) +/** + * pkvm_alloc_private_va_range - Allocates a private VA range. + * @size: The size of the VA range to reserve. + * @haddr: The hypervisor virtual start address of the allocation. + * + * The private virtual address (VA) range is allocated above __private_range_base + * and aligned based on the order of @size. + * + * Return: 0 on success or negative error code on failure. + */ +int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) +{ + unsigned long cur, addr; + int ret = 0; + + hyp_spin_lock(&pkvm_pgd_lock); + + /* Align the allocation based on the order of its size */ + addr = ALIGN(__private_range_cur, PAGE_SIZE << get_order(size)); + + /* The allocated size is always a multiple of PAGE_SIZE */ + cur = addr + PAGE_ALIGN(size); + + /* Has the private range grown too large ? */ + if (!addr || cur > __hyp_vmemmap || (cur - __private_range_base) > __PKVM_PRIVATE_SZ) { + ret = -ENOMEM; + } else { + __private_range_cur = cur; + *haddr = addr; + } + + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +} + +int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot, + unsigned long *haddr) { unsigned long addr; int err; - hyp_spin_lock(&pkvm_pgd_lock); - size = PAGE_ALIGN(size + offset_in_page(phys)); - addr = __io_map_base; - __io_map_base += size; + err = pkvm_alloc_private_va_range(size, &addr); + if (err) + return err; - /* Are we overflowing on the vmemmap ? */ - if (__io_map_base > __hyp_vmemmap) { - __io_map_base -= size; - addr = (unsigned long)ERR_PTR(-ENOMEM); - goto out; + err = __pkvm_create_mappings(addr, size, phys, prot); + if (err) + return err; + + *haddr = addr + offset_in_page(phys); + return err; +} + +#ifdef CONFIG_NVHE_EL2_DEBUG +static unsigned long mod_range_start = ULONG_MAX; +static unsigned long mod_range_end; +static DEFINE_HYP_SPINLOCK(mod_range_lock); + +static void update_mod_range(unsigned long addr, size_t size) +{ + hyp_spin_lock(&mod_range_lock); + mod_range_start = min(mod_range_start, addr); + mod_range_end = max(mod_range_end, addr + size); + hyp_spin_unlock(&mod_range_lock); +} + +static void assert_in_mod_range(unsigned long addr) +{ + /* + * This is not entirely watertight if there are private range + * allocations between modules being loaded, but in practice that is + * probably going to be allocation initiated by the modules themselves. 
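+ * In other words, the WARN_ON below is a best-effort debug aid, compiled only under CONFIG_NVHE_EL2_DEBUG, rather than a security boundary.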
+ */ + hyp_spin_lock(&mod_range_lock); + WARN_ON(addr < mod_range_start || mod_range_end <= addr); + hyp_spin_unlock(&mod_range_lock); +} +#else +static inline void update_mod_range(unsigned long addr, size_t size) { } +static inline void assert_in_mod_range(unsigned long addr) { } +#endif + +void *__pkvm_alloc_module_va(u64 nr_pages) +{ + size_t size = nr_pages << PAGE_SHIFT; + unsigned long addr = 0; + + if (!pkvm_alloc_private_va_range(size, &addr)) + update_mod_range(addr, size); + + return (void *)addr; +} + +int __pkvm_map_module_page(u64 pfn, void *va, enum kvm_pgtable_prot prot, bool is_protected) +{ + unsigned long addr = (unsigned long)va; + int ret; + + assert_in_mod_range(addr); + + if (!is_protected) { + ret = __pkvm_host_donate_hyp(pfn, 1); + if (ret) + return ret; } - err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot); - if (err) { - addr = (unsigned long)ERR_PTR(err); - goto out; - } + ret = __pkvm_create_mappings(addr, PAGE_SIZE, hyp_pfn_to_phys(pfn), prot); + if (ret && !is_protected) + WARN_ON(__pkvm_hyp_donate_host(pfn, 1)); - addr = addr + offset_in_page(phys); -out: - hyp_spin_unlock(&pkvm_pgd_lock); + return ret; +} - return addr; +void __pkvm_unmap_module_page(u64 pfn, void *va) +{ + WARN_ON(__pkvm_hyp_donate_host(pfn, 1)); + pkvm_remove_mappings(va, va + PAGE_SIZE); } int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) @@ -103,13 +204,45 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return ret; } -int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) +void pkvm_remove_mappings(void *from, void *to) { - unsigned long start, end; + unsigned long size = (unsigned long)to - (unsigned long)from; - hyp_vmemmap_range(phys, size, &start, &end); + hyp_spin_lock(&pkvm_pgd_lock); + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, (u64)from, size) != size); + hyp_spin_unlock(&pkvm_pgd_lock); +} - return __pkvm_create_mappings(start, end - start, back, PAGE_HYP); +int hyp_back_vmemmap(phys_addr_t back) +{ + unsigned long i, start, size, end = 0; + int ret; + + for (i = 0; i < hyp_memblock_nr; i++) { + start = hyp_memory[i].base; + start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE); + /* + * The beginning of the hyp_vmemmap region for the current + * memblock may already be backed by the page backing the end + * of the previous region, so avoid mapping it twice.
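+ * + * Worked example (illustrative numbers only): if the struct pages of region 0 end partway through a vmemmap page and those of region 1 begin in that same page, the start = max(start, end) statement below skips over the part that is already mapped.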
+ */ + start = max(start, end); + + end = hyp_memory[i].base + hyp_memory[i].size; + end = PAGE_ALIGN((u64)hyp_phys_to_page(end)); + if (start >= end) + continue; + + size = end - start; + ret = __pkvm_create_mappings(start, size, back, PAGE_HYP); + if (ret) + return ret; + + memset(hyp_phys_to_virt(back), 0, size); + back += size; + } + + return 0; } static void *__hyp_bp_vect_base; @@ -144,7 +277,8 @@ int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot) int hyp_map_vectors(void) { phys_addr_t phys; - void *bp_base; + unsigned long bp_base; + int ret; if (!kvm_system_needs_idmapped_vectors()) { __hyp_bp_vect_base = __bp_harden_hyp_vecs; @@ -152,13 +286,109 @@ } phys = __hyp_pa(__bp_harden_hyp_vecs); - bp_base = (void *)__pkvm_create_private_mapping(phys, - __BP_HARDEN_HYP_VECS_SZ, - PAGE_HYP_EXEC); - if (IS_ERR_OR_NULL(bp_base)) - return PTR_ERR(bp_base); + ret = __pkvm_create_private_mapping(phys, __BP_HARDEN_HYP_VECS_SZ, + PAGE_HYP_EXEC, &bp_base); + if (ret) + return ret; - __hyp_bp_vect_base = bp_base; + __hyp_bp_vect_base = (void *)bp_base; + + return 0; +} + +void *hyp_fixmap_map(phys_addr_t phys) +{ + struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots); + kvm_pte_t pte, *ptep = slot->ptep; + + pte = *ptep; + pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID); + pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID; + WRITE_ONCE(*ptep, pte); + dsb(ishst); + + return (void *)slot->addr + offset_in_page(phys); +} + +static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) +{ + kvm_pte_t *ptep = slot->ptep; + u64 addr = slot->addr; + + WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); + + /* + * Irritatingly, the architecture requires that we use inner-shareable + * broadcast TLB invalidation here in case another CPU speculates + * through our fixmap and decides to create an "amalgamation of the + * values held in the TLB" due to the apparent lack of a + * break-before-make sequence. + * + * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 + */ + dsb(ishst); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1)); + dsb(ish); + isb(); +} + +void hyp_fixmap_unmap(void) +{ + fixmap_clear_slot(this_cpu_ptr(&fixmap_slots)); +} + +static int __create_fixmap_slot_cb(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)arg); + + if (!kvm_pte_valid(*ptep) || level != KVM_PGTABLE_MAX_LEVELS - 1) + return -EINVAL; + + slot->addr = addr; + slot->ptep = ptep; + + /* + * Clear the PTE, but keep the page-table page refcount elevated to + * prevent it from ever being freed. This lets us manipulate the PTEs + * by hand safely without ever needing to allocate memory.
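+ * + * A later hyp_fixmap_map(phys) then only rewrites this PTE in place (output address plus KVM_PTE_VALID) and never has to walk or extend the table, and hyp_fixmap_unmap() simply clears the valid bit again.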
+ fixmap_clear_slot(slot); + + return 0; +} + +static int create_fixmap_slot(u64 addr, u64 cpu) +{ + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = (void *)cpu, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); +} + +int hyp_create_pcpu_fixmap(void) +{ + unsigned long addr, i; + int ret; + + for (i = 0; i < hyp_nr_cpus; i++) { + ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr); + if (ret) + return ret; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE, + __hyp_pa(__hyp_bss_start), PAGE_HYP); + if (ret) + return ret; + + ret = create_fixmap_slot(addr, i); + if (ret) + return ret; + } return 0; } @@ -181,9 +411,43 @@ int hyp_create_idmap(u32 hyp_va_bits) * with the idmap to place the IOs and the vmemmap. IOs use the lower * half of the quarter and the vmemmap the upper half. */ - __io_map_base = start & BIT(hyp_va_bits - 2); - __io_map_base ^= BIT(hyp_va_bits - 2); - __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); + __private_range_base = start & BIT(hyp_va_bits - 2); + __private_range_base ^= BIT(hyp_va_bits - 2); + __private_range_cur = __private_range_base; + __hyp_vmemmap = __private_range_base | BIT(hyp_va_bits - 3); return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); } + +static void *admit_host_page(void *arg) +{ + struct kvm_hyp_memcache *host_mc = arg; + + if (!host_mc->nr_pages) + return NULL; + + /* + * The host still owns the pages in its memcache, so we need to go + * through a full host-to-hyp donation cycle to change it. Fortunately, + * __pkvm_host_donate_hyp() takes care of races for us, so if it + * succeeds we're good to go. + */ + if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1)) + return NULL; + + return pop_hyp_memcache(host_mc, hyp_phys_to_virt); +} + +/* Refill our local memcache by popping pages from the one provided by the host.
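refill_memcache() below tops up a hypervisor-private stack of pages from a host-provided one, with admit_host_page() performing a host-to-hyp donation per page. The memcache itself is just an intrusive singly linked stack threaded through the free pages; a self-contained toy model (plain pointers instead of the physical addresses and hyp_phys_to_virt() conversions the real code uses):

#include <stddef.h>

struct memcache { void *head; unsigned long nr_pages; };

/* Each free page stores the previous head in its first word. */
static void mc_push(struct memcache *mc, void *page)
{
	*(void **)page = mc->head;
	mc->head = page;
	mc->nr_pages++;
}

static void *mc_pop(struct memcache *mc)
{
	void *page = mc->head;

	if (!page)
		return NULL;
	mc->head = *(void **)page;
	mc->nr_pages--;
	return page;
}

/* The shape of refill_memcache(): move pages until 'dst' holds at
 * least min_pages, minus the per-page ownership transfer that
 * admit_host_page() performs in the real code. */
static int mc_refill(struct memcache *dst, unsigned long min_pages,
		     struct memcache *src)
{
	while (dst->nr_pages < min_pages) {
		void *page = mc_pop(src);

		if (!page)
			return -1;	/* -ENOMEM upstream */
		mc_push(dst, page);
	}
	return 0;
}

Note that refill_memcache() works on a local copy of the host-provided structure and writes it back once at the end, so the host never observes a half-updated memcache.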
*/ +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc) +{ + struct kvm_hyp_memcache tmp = *host_mc; + int ret; + + ret = __topup_hyp_memcache(mc, min_pages, admit_host_page, + hyp_virt_to_phys, &tmp); + *host_mc = tmp; + + return ret; +} diff --git a/arch/arm64/kvm/hyp/nvhe/module.lds.S b/arch/arm64/kvm/hyp/nvhe/module.lds.S new file mode 100644 index 000000000000..645080c681cd --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/module.lds.S @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include + +SECTIONS { + .hyp.text : { + HYP_SECTION_SYMBOL_NAME(.text) = .; + *(.text .text.*) + } + + .hyp.bss : { + HYP_SECTION_SYMBOL_NAME(.bss) = .; + *(.bss .bss.*) + } + + .hyp.rodata : { + HYP_SECTION_SYMBOL_NAME(.rodata) = .; + *(.rodata .rodata.*) + } + + .hyp.data : { + HYP_SECTION_SYMBOL_NAME(.data) = .; + *(.data .data.*) + } +} diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c new file mode 100644 index 000000000000..7eda0724d7f6 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Google LLC + */ +#include +#include + +#include +#include +#include +#include +#include +#include + +static void __kvm_flush_dcache_to_poc(void *addr, size_t size) +{ + kvm_flush_dcache_to_poc((unsigned long)addr, (unsigned long)size); +} + +static atomic_t early_lm_pages; +static void *__pkvm_linear_map_early(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot) +{ + void *addr = NULL; + int ret; + + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return NULL; + + addr = __hyp_va(phys); + ret = pkvm_create_mappings(addr, addr + size, prot); + if (ret) + addr = NULL; + else + atomic_add(size, &early_lm_pages); + + return addr; +} + +static void __pkvm_linear_unmap_early(void *addr, size_t size) +{ + pkvm_remove_mappings(addr, addr + size); + atomic_sub(size, &early_lm_pages); +} + +int __pkvm_close_module_registration(void) +{ + /* + * Page ownership tracking might go out of sync if there are stale + * entries in pKVM's linear map range, so they must really be gone by + * now. + */ + WARN_ON(atomic_read(&early_lm_pages)); + return reset_pkvm_priv_hcall_limit(); + + /* The fuse is blown! 
No way back until reset */ +} + +const struct pkvm_module_ops module_ops = { + .create_private_mapping = __pkvm_create_private_mapping, + .alloc_module_va = __pkvm_alloc_module_va, + .map_module_page = __pkvm_map_module_page, + .register_serial_driver = __pkvm_register_serial_driver, + .puts = hyp_puts, + .putx64 = hyp_putx64, + .fixmap_map = hyp_fixmap_map, + .fixmap_unmap = hyp_fixmap_unmap, + .linear_map_early = __pkvm_linear_map_early, + .linear_unmap_early = __pkvm_linear_unmap_early, + .flush_dcache_to_poc = __kvm_flush_dcache_to_poc, + .register_host_perm_fault_handler = hyp_register_host_perm_fault_handler, + .host_stage2_mod_prot = module_change_host_page_prot, + .host_stage2_get_leaf = host_stage2_get_leaf, + .register_host_smc_handler = __pkvm_register_host_smc_handler, + .register_default_trap_handler = __pkvm_register_default_trap_handler, + .register_illegal_abt_notifier = __pkvm_register_illegal_abt_notifier, + .register_psci_notifier = __pkvm_register_psci_notifier, + .register_hyp_panic_notifier = __pkvm_register_hyp_panic_notifier, + .host_donate_hyp = __pkvm_host_donate_hyp, + .hyp_donate_host = __pkvm_hyp_donate_host, + .memcpy = memcpy, + .memset = memset, + .hyp_pa = hyp_virt_to_phys, + .hyp_va = hyp_phys_to_virt, + .kern_hyp_va = __kern_hyp_va, +}; + +int __pkvm_init_module(void *module_init) +{ + int (*do_module_init)(const struct pkvm_module_ops *ops) = module_init; + + return do_module_init(&module_ops); +} + +#define MAX_DYNAMIC_HCALLS 128 + +atomic_t num_dynamic_hcalls = ATOMIC_INIT(0); +DEFINE_HYP_SPINLOCK(dyn_hcall_lock); + +static dyn_hcall_t host_dynamic_hcalls[MAX_DYNAMIC_HCALLS]; + +int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, id, host_ctxt, 0); + dyn_hcall_t hfn; + int dyn_id; + + /* + * TODO: static key to protect when no dynamic hcall is registered? + */ + + dyn_id = (int)(id - KVM_HOST_SMCCC_ID(0)) - + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls; + if (dyn_id < 0) + return HCALL_UNHANDLED; + + cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; + + /* + * Order access to num_dynamic_hcalls and host_dynamic_hcalls. Paired + * with __pkvm_register_hcall(). + */ + if (dyn_id >= atomic_read_acquire(&num_dynamic_hcalls)) + goto end; + + hfn = READ_ONCE(host_dynamic_hcalls[dyn_id]); + if (!hfn) + goto end; + + cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS; + hfn(host_ctxt); +end: + return HCALL_HANDLED; +} + +int __pkvm_register_hcall(unsigned long hvn_hyp_va) +{ + dyn_hcall_t hfn = (void *)hvn_hyp_va; + int reserved_id, ret; + + hyp_spin_lock(&dyn_hcall_lock); + + reserved_id = atomic_read(&num_dynamic_hcalls); + + if (reserved_id >= MAX_DYNAMIC_HCALLS) { + ret = -ENOMEM; + goto err_hcall_unlock; + } + + WRITE_ONCE(host_dynamic_hcalls[reserved_id], hfn); + + /* + * Order access to num_dynamic_hcalls and host_dynamic_hcalls. Paired + * with handle_host_dynamic_hcall. 
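The registration path publishes the function pointer first and only then bumps num_dynamic_hcalls with release semantics; the dispatch path reads the count with acquire semantics before indexing the array. That pairing is the entire synchronization story for lock-free readers. A standalone C11 model of the same publish pattern (toy names; the real code uses atomic_set_release()/atomic_read_acquire() plus a hyp spinlock to serialize writers):

#include <stdatomic.h>

#define MAX_HCALLS 128

typedef void (*hcall_fn_t)(void);

static hcall_fn_t hcalls[MAX_HCALLS];
static _Atomic int nr_hcalls;

static int register_hcall(hcall_fn_t fn)	/* writers serialized by a lock */
{
	int id = atomic_load_explicit(&nr_hcalls, memory_order_relaxed);

	if (id >= MAX_HCALLS)
		return -1;
	hcalls[id] = fn;			/* 1. publish the slot... */
	atomic_store_explicit(&nr_hcalls, id + 1,
			      memory_order_release);	/* 2. ...then the count */
	return id;
}

static int invoke_hcall(int id)			/* readers take no lock */
{
	if (id < 0 ||
	    id >= atomic_load_explicit(&nr_hcalls, memory_order_acquire))
		return -1;

	hcall_fn_t fn = hcalls[id];

	if (!fn)
		return -1;
	fn();
	return 0;
}

The release store cannot be reordered before the slot write, so any reader that observes the new count also observes the pointer, which is why handle_host_dynamic_hcall() never needs dyn_hcall_lock.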
+ */ + atomic_set_release(&num_dynamic_hcalls, reserved_id + 1); + + ret = reserved_id + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls; +err_hcall_unlock: + hyp_spin_unlock(&dyn_hcall_lock); + + return ret; +}; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 0bd7701ad1df..11b190ff49d1 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -32,7 +32,7 @@ u64 __hyp_vmemmap; */ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { phys_addr_t addr = hyp_page_to_phys(p); @@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, /* Find a buddy page currently available for allocation */ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); @@ -93,16 +93,21 @@ static inline struct hyp_page *node_to_page(struct list_head *node) static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { - unsigned short order = p->order; + phys_addr_t phys = hyp_page_to_phys(p); struct hyp_page *buddy; + u8 order = p->order; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); + /* Skip coalescing for 'external' pages being freed into the pool. */ + if (phys < pool->range_start || phys >= pool->range_end) + goto insert; + /* * Only the first struct hyp_page of a high-order page (otherwise known * as the 'head') should have p->order set. The non-head pages should * have p->order = HYP_NO_ORDER. Here @p may no longer be the head - * after coallescing, so make sure to mark it HYP_NO_ORDER proactively. + * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. 
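__find_buddy_nocheck() earlier in this file locates a block's buddy by flipping the single address bit that separates the two halves of the parent block; the new 'insert' short-circuit skips that search for pages outside [range_start, range_end), whose buddies can never belong to the pool. The address arithmetic in isolation:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

/* Buddy of the block at 'addr' for a given order: flip bit
 * (PAGE_SHIFT + order), the same arithmetic __find_buddy_nocheck()
 * applies to physical addresses. */
static uint64_t buddy_of(uint64_t addr, uint8_t order)
{
	return addr ^ ((uint64_t)1 << (PAGE_SHIFT + order));
}

int main(void)
{
	/* 0x0000 and 0x1000 are order-0 buddies; once merged, the
	 * order-1 block at 0x0000 pairs with the one at 0x2000. */
	printf("%#llx\n", (unsigned long long)buddy_of(0x1000, 0));
	printf("%#llx\n", (unsigned long long)buddy_of(0x0000, 1));
	return 0;
}

The switch from unsigned short to u8 for 'order' is safe because max_order is bounded by MAX_ORDER, which comfortably fits in a byte.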
*/ p->order = HYP_NO_ORDER; for (; (order + 1) < pool->max_order; order++) { @@ -110,12 +115,13 @@ static void __hyp_attach_page(struct hyp_pool *pool, if (!buddy) break; - /* Take the buddy out of its list, and coallesce with @p */ + /* Take the buddy out of its list, and coalesce with @p */ page_remove_from_list(buddy); buddy->order = HYP_NO_ORDER; p = min(p, buddy); } +insert: /* Mark the new head, and insert it */ p->order = order; page_add_to_list(p, &pool->free_area[order]); @@ -123,7 +129,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy; @@ -144,25 +150,6 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, return p; } -static inline void hyp_page_ref_inc(struct hyp_page *p) -{ - BUG_ON(p->refcount == USHRT_MAX); - p->refcount++; -} - -static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) -{ - BUG_ON(!p->refcount); - p->refcount--; - return (p->refcount == 0); -} - -static inline void hyp_set_page_refcounted(struct hyp_page *p) -{ - BUG_ON(p->refcount); - p->refcount = 1; -} - static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p) { if (hyp_page_ref_dec_and_test(p)) @@ -196,7 +183,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) void hyp_split_page(struct hyp_page *p) { - unsigned short order = p->order; + u8 order = p->order; unsigned int i; p->order = 0; @@ -208,10 +195,10 @@ void hyp_split_page(struct hyp_page *p) } } -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order) { - unsigned short i = order; struct hyp_page *p; + u8 i = order; hyp_spin_lock(&pool->lock); @@ -241,7 +228,7 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, int i; hyp_spin_lock_init(&pool->lock); - pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); + pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT)); for (i = 0; i < pool->max_order; i++) INIT_LIST_HEAD(&pool->free_area[i]); pool->range_start = phys; @@ -249,10 +236,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, /* Init the vmemmap portion */ p = hyp_phys_to_page(phys); - for (i = 0; i < nr_pages; i++) { - p[i].order = 0; + for (i = 0; i < nr_pages; i++) hyp_set_page_refcounted(&p[i]); - } /* Attach the unused pages to the buddy tree */ for (i = reserved_pages; i < nr_pages; i++) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c new file mode 100644 index 000000000000..f160bc4306a5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -0,0 +1,1526 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba + */ + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +/* Used by icache_is_vpipt(). */ +unsigned long __icache_flags; + +/* Used by kvm_get_vttbr(). */ +unsigned int kvm_arm_vmid_bits; + +/* + * The currently loaded hyp vCPU for each physical CPU. Used only when + * protected KVM is enabled, but for both protected and non-protected VMs. + */ +static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); + +/* + * Set trap register values based on features in ID_AA64PFR0. 
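Each pvm_init_traps_*() helper below follows the same recipe: read the ID register value exposed to the protected VM, and for every feature that value denies, set the EL2 trap bits so the guest cannot reach the real hardware state. The recipe in miniature (illustrative field position and trap bits, not the architectural encodings):

#include <stdint.h>

#define ID_FIELD(reg, shift)	(((reg) >> (shift)) & 0xf)

#define FAKE_RAS_SHIFT	28		/* hypothetical field position */
#define FAKE_HCR_TERR	(1ULL << 36)	/* hypothetical trap bits */
#define FAKE_HCR_TEA	(1ULL << 37)

/* If the exposed ID register reports a feature as absent (field 0),
 * enable the corresponding traps: the shape of
 * pvm_init_traps_aa64pfr0() below. */
static uint64_t hcr_bits_for(uint64_t id_aa64pfr0)
{
	uint64_t hcr_set = 0;

	if (ID_FIELD(id_aa64pfr0, FAKE_RAS_SHIFT) == 0)
		hcr_set |= FAKE_HCR_TERR | FAKE_HCR_TEA;

	return hcr_set;
}

The BUILD_BUG_ON()s serve the complementary purpose: features Linux guests rely on unconditionally (FP, Advanced SIMD) must never be deniable, so their allowed-mask bits are checked at compile time.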
+ */ +static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + u64 hcr_set = HCR_RW; + u64 hcr_clear = 0; + u64 cptr_set = 0; + + /* Protected KVM does not support AArch32 guests. */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY); + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY); + + /* + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. + */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), + PVM_ID_AA64PFR0_ALLOW)); + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), + PVM_ID_AA64PFR0_ALLOW)); + + /* Trap RAS unless all current versions are supported */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < + ID_AA64PFR0_EL1_RAS_V1P1) { + hcr_set |= HCR_TERR | HCR_TEA; + hcr_clear |= HCR_FIEN; + } + + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) { + hcr_clear |= HCR_AMVOFFEN; + cptr_set |= CPTR_EL2_TAM; + } + + /* Trap SVE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) + cptr_set |= CPTR_EL2_TZ; + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; + vcpu->arch.cptr_el2 |= cptr_set; +} + +/* + * Set trap register values based on features in ID_AA64PFR1. + */ +static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); + u64 hcr_set = 0; + u64 hcr_clear = 0; + + /* Memory Tagging: Trap and Treat as Untagged if not supported. */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) { + hcr_set |= HCR_TID5; + hcr_clear |= HCR_DCT | HCR_ATA; + } + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; +} + +/* + * Set trap register values based on features in ID_AA64DFR0. + */ +static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 mdcr_set = 0; + u64 mdcr_clear = 0; + u64 cptr_set = 0; + + /* Trap/constrain PMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) { + mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | + MDCR_EL2_HPMN_MASK; + } + + /* Trap Debug */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids)) + mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA; + + /* Trap OS Double Lock */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids)) + mdcr_set |= MDCR_EL2_TDOSA; + + /* Trap SPE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { + mdcr_set |= MDCR_EL2_TPMS; + mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + } + + /* Trap Trace Filter */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids)) + mdcr_set |= MDCR_EL2_TTRF; + + /* Trap Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) + cptr_set |= CPTR_EL2_TTA; + + vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.mdcr_el2 &= ~mdcr_clear; + vcpu->arch.cptr_el2 |= cptr_set; +} + +/* + * Set trap register values based on features in ID_AA64MMFR0. 
+ */ +static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); + u64 mdcr_set = 0; + + /* Trap Debug Communications Channel registers */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids)) + mdcr_set |= MDCR_EL2_TDCC; + + vcpu->arch.mdcr_el2 |= mdcr_set; +} + +/* + * Set trap register values based on features in ID_AA64MMFR1. + */ +static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); + u64 hcr_set = 0; + + /* Trap LOR */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids)) + hcr_set |= HCR_TLOR; + + vcpu->arch.hcr_el2 |= hcr_set; +} + +/* + * Set baseline trap register values. + */ +static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) +{ + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS | + HCR_TID3 | HCR_TACR | HCR_TIDCP | HCR_TID1; + + if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID2; +} + +/* + * Initialize trap register values for protected VMs. + */ +static void pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + hyp_vcpu->vcpu.arch.cptr_el2 = CPTR_EL2_DEFAULT; + hyp_vcpu->vcpu.arch.mdcr_el2 = 0; + + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + u64 hcr = READ_ONCE(hyp_vcpu->host_vcpu->arch.hcr_el2); + + hyp_vcpu->vcpu.arch.hcr_el2 = HCR_GUEST_FLAGS | hcr; + return; + } + + pvm_init_trap_regs(&hyp_vcpu->vcpu); + pvm_init_traps_aa64pfr0(&hyp_vcpu->vcpu); + pvm_init_traps_aa64pfr1(&hyp_vcpu->vcpu); + pvm_init_traps_aa64dfr0(&hyp_vcpu->vcpu); + pvm_init_traps_aa64mmfr0(&hyp_vcpu->vcpu); + pvm_init_traps_aa64mmfr1(&hyp_vcpu->vcpu); +} + +/* + * Start the VM table handle at the offset defined instead of at 0. + * Mainly for sanity checking and debugging. + */ +#define HANDLE_OFFSET 0x1000 + +static unsigned int vm_handle_to_idx(pkvm_handle_t handle) +{ + return handle - HANDLE_OFFSET; +} + +static pkvm_handle_t idx_to_vm_handle(unsigned int idx) +{ + return idx + HANDLE_OFFSET; +} + +/* + * Spinlock for protecting state related to the VM table. Protects writes + * to 'vm_table' and 'nr_table_entries' as well as reads and writes to + * 'last_hyp_vcpu_lookup'. + */ +static DEFINE_HYP_SPINLOCK(vm_table_lock); + +/* + * The table of VM entries for protected VMs in hyp. + * Allocated at hyp initialization and setup. + */ +static struct pkvm_hyp_vm **vm_table; + +void pkvm_hyp_vm_table_init(void *tbl) +{ + WARN_ON(vm_table); + vm_table = tbl; +} + +/* + * Return the hyp vm structure corresponding to the handle. 
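Starting handles at HANDLE_OFFSET costs nothing and makes stray small integers from the host fail the lookup immediately; combined with an unsigned index compare, a single branch rejects both too-small and too-large handles. In isolation:

#include <stdio.h>

#define HANDLE_OFFSET	0x1000
#define MAX_VMS		255	/* stand-in for KVM_MAX_PVMS */

static int handle_to_idx(unsigned int handle)
{
	unsigned int idx = handle - HANDLE_OFFSET;

	/* Handles below the offset wrap around to huge indices, so the
	 * one comparison in get_vm_by_handle() below covers both
	 * directions. */
	return idx < MAX_VMS ? (int)idx : -1;
}

int main(void)
{
	printf("%d\n", handle_to_idx(0x1000));	/* 0 */
	printf("%d\n", handle_to_idx(0x0fff));	/* -1: wrapped */
	printf("%d\n", handle_to_idx(0x1100));	/* -1: out of range */
	return 0;
}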
+ */ +static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(idx >= KVM_MAX_PVMS)) + return NULL; + + return vm_table[idx]; +} + +int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa) +{ + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm || !hyp_vm->is_dying) + goto unlock; + + ret = __pkvm_host_reclaim_page(hyp_vm, pfn, ipa); + if (ret) + goto unlock; + + drain_hyp_pool(hyp_vm, &hyp_vm->host_kvm->arch.pkvm.teardown_stage2_mc); +unlock: + hyp_spin_unlock(&vm_table_lock); + + return ret; +} + +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx) +{ + struct pkvm_hyp_vcpu *hyp_vcpu = NULL; + struct pkvm_hyp_vm *hyp_vm; + + /* Cannot load a new vcpu without putting the old one first. */ + if (__this_cpu_read(loaded_hyp_vcpu)) + return NULL; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm || hyp_vm->is_dying || hyp_vm->nr_vcpus <= vcpu_idx) + goto unlock; + + hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + + /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ + if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { + hyp_vcpu = NULL; + goto unlock; + } + + hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); +unlock: + hyp_spin_unlock(&vm_table_lock); + + if (hyp_vcpu) + __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); + return hyp_vcpu; +} + +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + hyp_spin_lock(&vm_table_lock); + hyp_vcpu->loaded_hyp_vcpu = NULL; + __this_cpu_write(loaded_hyp_vcpu, NULL); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) +{ + return __this_cpu_read(loaded_hyp_vcpu); +} + +static void pkvm_vcpu_init_features_from_host(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* No restrictions for non-protected VMs. 
*/ + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + bitmap_copy(hyp_vcpu->vcpu.arch.features, + host_vcpu->arch.features, + KVM_VCPU_MAX_FEATURES); + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* + * For protected VMs, always allow: + * - CPU starting in poweroff state + * - PSCI v0.2 + */ + set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + /* + * Check if remaining features are allowed: + * - Performance Monitoring + * - Scalable Vectors + * - Pointer Authentication + */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_ALLOW)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + bitmap_and(hyp_vcpu->vcpu.arch.features, host_vcpu->arch.features, + allowed_features, KVM_VCPU_MAX_FEATURES); + + /* + * Now sanitise the configuration flags that we have inherited + * from the host, as they may refer to features that protected + * mode doesn't support. + */ + if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) { + vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_SVE); + vcpu_clear_flag(&hyp_vcpu->vcpu, VCPU_SVE_FINALIZED); + } + + if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + !vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) + vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); +} + +static int pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret = 0; + + if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || + test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) + ret = kvm_vcpu_enable_ptrauth(vcpu); + + return ret; +} + +static int pkvm_vcpu_init_psci(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct vcpu_reset_state *reset_state = &hyp_vcpu->vcpu.arch.reset_state; + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + if (test_bit(KVM_ARM_VCPU_POWER_OFF, hyp_vcpu->vcpu.arch.features)) { + reset_state->reset = false; + hyp_vcpu->power_state = PSCI_0_2_AFFINITY_LEVEL_OFF; + } else if (pkvm_hyp_vm_has_pvmfw(hyp_vm)) { + if (hyp_vm->pvmfw_entry_vcpu) + return -EINVAL; + + hyp_vm->pvmfw_entry_vcpu = hyp_vcpu; + reset_state->reset = true; + hyp_vcpu->power_state = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING; + } else { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + reset_state->pc = READ_ONCE(host_vcpu->arch.ctxt.regs.pc); + reset_state->r0 = READ_ONCE(host_vcpu->arch.ctxt.regs.regs[0]); + reset_state->reset = true; + hyp_vcpu->power_state = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING; + } + + return 0; +} + +static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) +{ + if (host_vcpu) + hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); +} + +static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + void *sve_state; + + if (!test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) + return; + + sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + hyp_unpin_shared_mem(sve_state, +
sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); +} + +static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], + unsigned int nr_vcpus) +{ + int i; + + for (i = 0; i < nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; + + unpin_host_vcpu(hyp_vcpu->host_vcpu); + unpin_host_sve_state(hyp_vcpu); + } +} + +static size_t pkvm_get_last_ran_size(void) +{ + return array_size(hyp_nr_cpus, sizeof(int)); +} + +static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, + int *last_ran, unsigned int nr_vcpus) +{ + u64 pvmfw_load_addr = PVMFW_INVALID_LOAD_ADDR; + + hyp_vm->host_kvm = host_kvm; + hyp_vm->kvm.created_vcpus = nr_vcpus; + hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; + hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + + if (hyp_vm->kvm.arch.pkvm.enabled) + pvmfw_load_addr = READ_ONCE(host_kvm->arch.pkvm.pvmfw_load_addr); + hyp_vm->kvm.arch.pkvm.pvmfw_load_addr = pvmfw_load_addr; + + hyp_vm->kvm.arch.mmu.last_vcpu_ran = (int __percpu *)last_ran; + memset(last_ran, -1, pkvm_get_last_ran_size()); +} + +static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, + struct pkvm_hyp_vm *hyp_vm, + struct kvm_vcpu *host_vcpu, + unsigned int vcpu_idx) +{ + int ret = 0; + + if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) + return -EBUSY; + + if (host_vcpu->vcpu_idx != vcpu_idx) { + ret = -EINVAL; + goto done; + } + + hyp_vcpu->host_vcpu = host_vcpu; + + hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; + hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); + hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + + hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; + hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + hyp_vcpu->vcpu.arch.debug_ptr = &host_vcpu->arch.vcpu_debug_state; + + pkvm_vcpu_init_features_from_host(hyp_vcpu); + + ret = pkvm_vcpu_init_ptrauth(hyp_vcpu); + if (ret) + goto done; + + ret = pkvm_vcpu_init_psci(hyp_vcpu); + if (ret) + goto done; + + if (test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) { + size_t sve_state_size; + void *sve_state; + + hyp_vcpu->vcpu.arch.sve_state = READ_ONCE(host_vcpu->arch.sve_state); + hyp_vcpu->vcpu.arch.sve_max_vl = READ_ONCE(host_vcpu->arch.sve_max_vl); + + sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + sve_state_size = vcpu_sve_state_size(&hyp_vcpu->vcpu); + + if (!hyp_vcpu->vcpu.arch.sve_state || !sve_state_size || + hyp_pin_shared_mem(sve_state, sve_state + sve_state_size)) { + clear_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features); + hyp_vcpu->vcpu.arch.sve_state = NULL; + hyp_vcpu->vcpu.arch.sve_max_vl = 0; + ret = -EINVAL; + goto done; + } + } + + pkvm_vcpu_init_traps(hyp_vcpu); + kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu); +done: + if (ret) + unpin_host_vcpu(host_vcpu); + return ret; +} + +static int find_free_vm_table_entry(struct kvm *host_kvm) +{ + int i; + + for (i = 0; i < KVM_MAX_PVMS; ++i) { + if (!vm_table[i]) + return i; + } + + return -ENOMEM; +} + +/* + * Allocate a VM table entry and insert a pointer to the new vm. + * + * Return a unique handle to the protected VM on success, + * negative error code on failure. + */ +static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, + struct pkvm_hyp_vm *hyp_vm) +{ + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. 
+ */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = find_free_vm_table_entry(host_kvm); + if (idx < 0) + return idx; + + hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; + + vm_table[idx] = hyp_vm; + return hyp_vm->kvm.arch.pkvm.handle; +} + +/* + * Deallocate and remove the VM table entry corresponding to the handle. + */ +static void remove_vm_table_entry(pkvm_handle_t handle) +{ + hyp_assert_lock_held(&vm_table_lock); + vm_table[vm_handle_to_idx(handle)] = NULL; +} + +static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus) +{ + return size_add(sizeof(struct pkvm_hyp_vm), + size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus)); +} + +static void *map_donated_memory_noclear(unsigned long host_va, size_t size) +{ + void *va = (void *)kern_hyp_va(host_va); + + if (!PAGE_ALIGNED(va)) + return NULL; + + if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va), + PAGE_ALIGN(size) >> PAGE_SHIFT)) + return NULL; + + return va; +} + +static void *map_donated_memory(unsigned long host_va, size_t size) +{ + void *va = map_donated_memory_noclear(host_va, size); + + if (va) + memset(va, 0, size); + + return va; +} + +static void __unmap_donated_memory(void *va, size_t size) +{ + kvm_flush_dcache_to_poc(va, size); + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), + PAGE_ALIGN(size) >> PAGE_SHIFT)); +} + +static void unmap_donated_memory(void *va, size_t size) +{ + if (!va) + return; + + memset(va, 0, size); + __unmap_donated_memory(va, size); +} + +static void unmap_donated_memory_noclear(void *va, size_t size) +{ + if (!va) + return; + + __unmap_donated_memory(va, size); +} + +/* + * Initialize the hypervisor copy of the protected VM state using the + * memory donated by the host. + * + * Unmaps the donated memory from the host at stage 2. + * + * host_kvm: A pointer to the host's struct kvm. + * vm_hva: The host va of the area being donated for the VM state. + * Must be page aligned. + * pgd_hva: The host va of the area being donated for the stage-2 PGD for + * the VM. Must be page aligned. Its size is implied by the VM's + * VTCR. + * last_ran_hva: The host va of the area being donated for hyp to use to track + * the most recent physical cpu on which each vcpu has run. + * Return a unique handle to the protected VM on success, + * negative error code on failure. 
+ */ +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva, unsigned long last_ran_hva) +{ + struct pkvm_hyp_vm *hyp_vm = NULL; + int *last_ran = NULL; + size_t vm_size, pgd_size, last_ran_size; + unsigned int nr_vcpus; + void *pgd = NULL; + int ret; + + ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1); + if (ret) + return ret; + + nr_vcpus = READ_ONCE(host_kvm->created_vcpus); + if (nr_vcpus < 1) { + ret = -EINVAL; + goto err_unpin_kvm; + } + + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); + last_ran_size = pkvm_get_last_ran_size(); + pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr); + + ret = -ENOMEM; + + hyp_vm = map_donated_memory(vm_hva, vm_size); + if (!hyp_vm) + goto err_remove_mappings; + + last_ran = map_donated_memory(last_ran_hva, last_ran_size); + if (!last_ran) + goto err_remove_mappings; + + pgd = map_donated_memory_noclear(pgd_hva, pgd_size); + if (!pgd) + goto err_remove_mappings; + + init_pkvm_hyp_vm(host_kvm, hyp_vm, last_ran, nr_vcpus); + + hyp_spin_lock(&vm_table_lock); + ret = insert_vm_table_entry(host_kvm, hyp_vm); + if (ret < 0) + goto err_unlock; + + ret = kvm_guest_prepare_stage2(hyp_vm, pgd); + if (ret) + goto err_remove_vm_table_entry; + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm->kvm.arch.pkvm.handle; + +err_remove_vm_table_entry: + remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle); +err_unlock: + hyp_spin_unlock(&vm_table_lock); +err_remove_mappings: + unmap_donated_memory(hyp_vm, vm_size); + unmap_donated_memory(last_ran, last_ran_size); + unmap_donated_memory(pgd, pgd_size); +err_unpin_kvm: + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return ret; +} + +/* + * Initialize the hypervisor copy of the protected vCPU state using the + * memory donated by the host. + * + * handle: The handle for the protected vm. + * host_vcpu: A pointer to the corresponding host vcpu. + * vcpu_hva: The host va of the area being donated for the vcpu state. + * Must be page aligned. The size of the area must be equal to + * the page-aligned size of 'struct pkvm_hyp_vcpu'. + * Return 0 on success, negative error code on failure. 
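All three buffers donated in __pkvm_init_vm() share one discipline: the host passes page-aligned virtual addresses, the hypervisor takes ownership of whole pages via __pkvm_host_donate_hyp(), and sizes are computed so the host can derive them identically. A sketch of the VM-state sizing, with a hypothetical structure standing in for struct pkvm_hyp_vm:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Hypothetical stand-in: the real field sizes come from the kernel. */
struct toy_hyp_vm {
	char core_state[1800];
	void *vcpus[];		/* flexible array, one slot per vcpu */
};

/* Mirrors pkvm_get_hyp_vm_size(): base structure plus one vcpu
 * pointer per vcpu; the donation then covers PAGE_ALIGN() of this. */
static size_t vm_donation_size(unsigned int nr_vcpus)
{
	return PAGE_ALIGN(sizeof(struct toy_hyp_vm) +
			  nr_vcpus * sizeof(void *));
}

int main(void)
{
	printf("%zu\n", vm_donation_size(4));	/* 4096 on this layout */
	printf("%zu\n", vm_donation_size(512));	/* 8192 on this layout */
	return 0;
}

The error path runs the same operations in reverse: unmap_donated_memory() zeroes the pages before handing ownership back, so no hypervisor state leaks to the host.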
+ */ +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + struct pkvm_hyp_vm *hyp_vm; + unsigned int idx; + int ret; + + hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu)); + if (!hyp_vcpu) + return -ENOMEM; + + hyp_spin_lock(&vm_table_lock); + + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + ret = -ENOENT; + goto unlock; + } + + idx = hyp_vm->nr_vcpus; + if (idx >= hyp_vm->kvm.created_vcpus) { + ret = -EINVAL; + goto unlock; + } + + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); + if (ret) + goto unlock; + + hyp_vm->vcpus[idx] = hyp_vcpu; + hyp_vm->nr_vcpus++; +unlock: + hyp_spin_unlock(&vm_table_lock); + + if (ret) + unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + + return ret; +} + +static void +teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) +{ + void *start; + + size = PAGE_ALIGN(size); + memset(addr, 0, size); + + for (start = addr; start < addr + size; start += PAGE_SIZE) + push_hyp_memcache(mc, start, hyp_virt_to_phys); + + unmap_donated_memory_noclear(addr, size); +} + +int __pkvm_start_teardown_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + int ret = 0; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + ret = -ENOENT; + goto unlock; + } else if (WARN_ON(hyp_page_count(hyp_vm))) { + ret = -EBUSY; + goto unlock; + } else if (hyp_vm->is_dying) { + ret = -EINVAL; + goto unlock; + } + + hyp_vm->is_dying = true; + +unlock: + hyp_spin_unlock(&vm_table_lock); + + return ret; +} + +int __pkvm_finalize_teardown_vm(pkvm_handle_t handle) +{ + struct kvm_hyp_memcache *mc, *stage2_mc; + size_t vm_size, last_ran_size; + int __percpu *last_vcpu_ran; + struct pkvm_hyp_vm *hyp_vm; + struct kvm *host_kvm; + unsigned int idx; + int err; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + err = -ENOENT; + goto err_unlock; + } else if (!hyp_vm->is_dying) { + err = -EBUSY; + goto err_unlock; + } + + host_kvm = hyp_vm->host_kvm; + + /* Ensure the VMID is clean before it can be reallocated */ + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); + + mc = &host_kvm->arch.pkvm.teardown_mc; + stage2_mc = &host_kvm->arch.pkvm.teardown_stage2_mc; + + destroy_hyp_vm_pgt(hyp_vm); + drain_hyp_pool(hyp_vm, stage2_mc); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + + /* Push the metadata pages to the teardown memcache */ + for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + struct kvm_hyp_memcache *vcpu_mc; + void *addr; + + vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + while (vcpu_mc->nr_pages) { + addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); + push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); + unmap_donated_memory_noclear(addr, PAGE_SIZE); + } + + teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); + } + + last_vcpu_ran = hyp_vm->kvm.arch.mmu.last_vcpu_ran; + last_ran_size = pkvm_get_last_ran_size(); + teardown_donated_memory(mc, (__force void *)last_vcpu_ran, + last_ran_size); + + vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); + teardown_donated_memory(mc, hyp_vm, vm_size); + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return 0; + +err_unlock: + hyp_spin_unlock(&vm_table_lock); + return err; +} + +int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, + u64 size) +{ + struct kvm_protected_vm *pkvm = 
&vm->kvm.arch.pkvm; + u64 npages, offset = ipa - pkvm->pvmfw_load_addr; + void *src = hyp_phys_to_virt(pvmfw_base) + offset; + + if (offset >= pvmfw_size) + return -EINVAL; + + size = min(size, pvmfw_size - offset); + if (!PAGE_ALIGNED(size) || !PAGE_ALIGNED(src)) + return -EINVAL; + + npages = size >> PAGE_SHIFT; + while (npages--) { + /* + * No need for cache maintenance here, as the pgtable code will + * take care of this when installing the pte in the guest's + * stage-2 page table. + */ + memcpy(hyp_fixmap_map(phys), src, PAGE_SIZE); + hyp_fixmap_unmap(); + + src += PAGE_SIZE; + phys += PAGE_SIZE; + } + + return 0; +} + +void pkvm_poison_pvmfw_pages(void) +{ + u64 npages = pvmfw_size >> PAGE_SHIFT; + phys_addr_t addr = pvmfw_base; + + while (npages--) { + hyp_poison_page(addr); + addr += PAGE_SIZE; + } +} + +/* + * This function sets the registers on the vcpu to their architecturally defined + * reset values. + * + * Note: Can only be called by the vcpu on itself, after it has been turned on. + */ +void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct vcpu_reset_state *reset_state = &hyp_vcpu->vcpu.arch.reset_state; + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + WARN_ON(!reset_state->reset); + + pkvm_vcpu_init_ptrauth(hyp_vcpu); + kvm_reset_vcpu_core(&hyp_vcpu->vcpu); + kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu); + + /* Must be done after resetting sys registers. */ + kvm_reset_vcpu_psci(&hyp_vcpu->vcpu, reset_state); + if (hyp_vm->pvmfw_entry_vcpu == hyp_vcpu) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + u64 entry = hyp_vm->kvm.arch.pkvm.pvmfw_load_addr; + int i; + + /* X0 - X14 provided by the VMM (preserved) */ + for (i = 0; i <= 14; ++i) { + u64 val = vcpu_get_reg(host_vcpu, i); + + vcpu_set_reg(&hyp_vcpu->vcpu, i, val); + } + + /* X15: Boot protocol version */ + vcpu_set_reg(&hyp_vcpu->vcpu, 15, 0); + + /* PC: IPA of pvmfw base */ + *vcpu_pc(&hyp_vcpu->vcpu) = entry; + hyp_vm->pvmfw_entry_vcpu = NULL; + + /* Auto enroll MMIO guard */ + set_bit(KVM_ARCH_FLAG_MMIO_GUARD, &hyp_vm->kvm.arch.flags); + } + + reset_state->reset = false; + + hyp_vcpu->exit_code = 0; + + WARN_ON(hyp_vcpu->power_state != PSCI_0_2_AFFINITY_LEVEL_ON_PENDING); + WRITE_ONCE(hyp_vcpu->vcpu.arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); + WRITE_ONCE(hyp_vcpu->power_state, PSCI_0_2_AFFINITY_LEVEL_ON); +} + +struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *hyp_vm, + u64 mpidr) +{ + int i; + + mpidr &= MPIDR_HWID_BITMASK; + + for (i = 0; i < hyp_vm->nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[i]; + + if (mpidr == kvm_vcpu_get_mpidr_aff(&hyp_vcpu->vcpu)) + return hyp_vcpu; + } + + return NULL; +} + +/* + * Returns true if the hypervisor has handled the PSCI call, and control should + * go back to the guest, or false if the host needs to do some additional work + * (i.e., wake up the vcpu). + */ +static bool pvm_psci_vcpu_on(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + struct vcpu_reset_state *reset_state; + struct pkvm_hyp_vcpu *target; + unsigned long cpu_id, ret; + int power_state; + + cpu_id = smccc_get_arg1(&hyp_vcpu->vcpu); + if (!kvm_psci_valid_affinity(&hyp_vcpu->vcpu, cpu_id)) { + ret = PSCI_RET_INVALID_PARAMS; + goto error; + } + + target = pkvm_mpidr_to_hyp_vcpu(hyp_vm, cpu_id); + if (!target) { + ret = PSCI_RET_INVALID_PARAMS; + goto error; + } + + /* + * Make sure the requested vcpu is not on to begin with.
+ * Atomic to avoid race between vcpus trying to power on the same vcpu. + */ + power_state = cmpxchg(&target->power_state, + PSCI_0_2_AFFINITY_LEVEL_OFF, + PSCI_0_2_AFFINITY_LEVEL_ON_PENDING); + switch (power_state) { + case PSCI_0_2_AFFINITY_LEVEL_ON_PENDING: + ret = PSCI_RET_ON_PENDING; + goto error; + case PSCI_0_2_AFFINITY_LEVEL_ON: + ret = PSCI_RET_ALREADY_ON; + goto error; + case PSCI_0_2_AFFINITY_LEVEL_OFF: + break; + default: + ret = PSCI_RET_INTERNAL_FAILURE; + goto error; + } + + reset_state = &target->vcpu.arch.reset_state; + reset_state->pc = smccc_get_arg2(&hyp_vcpu->vcpu); + reset_state->r0 = smccc_get_arg3(&hyp_vcpu->vcpu); + /* Propagate caller endianness */ + reset_state->be = kvm_vcpu_is_be(&hyp_vcpu->vcpu); + reset_state->reset = true; + + /* + * Return to the host, which should make the KVM_REQ_VCPU_RESET request + * as well as kvm_vcpu_wake_up() to schedule the vcpu. + */ + return false; + +error: + /* If there's an error go back straight to the guest. */ + smccc_set_retval(&hyp_vcpu->vcpu, ret, 0, 0, 0); + return true; +} + +static bool pvm_psci_vcpu_affinity_info(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + unsigned long target_affinity_mask, target_affinity, lowest_affinity_level; + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + unsigned long mpidr, ret; + int i, matching_cpus = 0; + + target_affinity = smccc_get_arg1(vcpu); + lowest_affinity_level = smccc_get_arg2(vcpu); + if (!kvm_psci_valid_affinity(vcpu, target_affinity)) { + ret = PSCI_RET_INVALID_PARAMS; + goto done; + } + + /* Determine target affinity mask */ + target_affinity_mask = psci_affinity_mask(lowest_affinity_level); + if (!target_affinity_mask) { + ret = PSCI_RET_INVALID_PARAMS; + goto done; + } + + /* Ignore other bits of target affinity */ + target_affinity &= target_affinity_mask; + ret = PSCI_0_2_AFFINITY_LEVEL_OFF; + + /* + * If at least one vcpu matching target affinity is ON then return ON, + * then if at least one is PENDING_ON then return PENDING_ON. + * Otherwise, return OFF. + */ + for (i = 0; i < hyp_vm->nr_vcpus; i++) { + struct pkvm_hyp_vcpu *target = hyp_vm->vcpus[i]; + + mpidr = kvm_vcpu_get_mpidr_aff(&target->vcpu); + + if ((mpidr & target_affinity_mask) == target_affinity) { + int power_state; + + matching_cpus++; + power_state = READ_ONCE(target->power_state); + switch (power_state) { + case PSCI_0_2_AFFINITY_LEVEL_ON_PENDING: + ret = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING; + break; + case PSCI_0_2_AFFINITY_LEVEL_ON: + ret = PSCI_0_2_AFFINITY_LEVEL_ON; + goto done; + case PSCI_0_2_AFFINITY_LEVEL_OFF: + break; + default: + ret = PSCI_RET_INTERNAL_FAILURE; + goto done; + } + } + } + + if (!matching_cpus) + ret = PSCI_RET_INVALID_PARAMS; + +done: + /* Nothing to be handled by the host. Go back to the guest. */ + smccc_set_retval(vcpu, ret, 0, 0, 0); + return true; +} + +/* + * Returns true if the hypervisor has handled the PSCI call, and control should + * go back to the guest, or false if the host needs to do some additional work + * (e.g., turn off and update vcpu scheduling status). + */ +static bool pvm_psci_vcpu_off(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + WARN_ON(vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED); + WARN_ON(hyp_vcpu->power_state != PSCI_0_2_AFFINITY_LEVEL_ON); + + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); + WRITE_ONCE(hyp_vcpu->power_state, PSCI_0_2_AFFINITY_LEVEL_OFF); + + /* Return to the host so that it can finish powering off the vcpu. 
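Several vcpus can race to power on the same target, so the OFF to ON_PENDING transition must pick exactly one winner; pvm_psci_vcpu_on() above does that with a single cmpxchg on power_state, and every other observed state maps to a distinct PSCI error. The state machine reduced to its core, in C11 atomics:

#include <stdatomic.h>

enum vcpu_power { PWR_OFF, PWR_ON_PENDING, PWR_ON };

/* Returns 0 if the caller won the right to power the vcpu on;
 * mirrors the cmpxchg and switch in pvm_psci_vcpu_on(). */
static int try_power_on(_Atomic int *power_state)
{
	int expected = PWR_OFF;

	if (atomic_compare_exchange_strong(power_state, &expected,
					   PWR_ON_PENDING))
		return 0;

	/* 'expected' now holds the state the CAS observed. */
	switch (expected) {
	case PWR_ON_PENDING:
		return -1;	/* PSCI_RET_ON_PENDING */
	case PWR_ON:
		return -2;	/* PSCI_RET_ALREADY_ON */
	default:
		return -3;	/* PSCI_RET_INTERNAL_FAILURE */
	}
}

Only the winner fills in the target's reset state, so two callers can never race to half-initialize a vcpu that is already marked ON_PENDING.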
*/ + return false; +} + +static bool pvm_psci_version(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + /* Nothing to be handled by the host. Go back to the guest. */ + smccc_set_retval(&hyp_vcpu->vcpu, KVM_ARM_PSCI_1_1, 0, 0, 0); + return true; +} + +static bool pvm_psci_not_supported(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + /* Nothing to be handled by the host. Go back to the guest. */ + smccc_set_retval(&hyp_vcpu->vcpu, PSCI_RET_NOT_SUPPORTED, 0, 0, 0); + return true; +} + +static bool pvm_psci_features(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u32 feature = smccc_get_arg1(vcpu); + unsigned long val; + + switch (feature) { + case PSCI_0_2_FN_PSCI_VERSION: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN_AFFINITY_INFO: + case PSCI_0_2_FN64_AFFINITY_INFO: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_1_0_FN_PSCI_FEATURES: + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + case ARM_SMCCC_VERSION_FUNC_ID: + val = PSCI_RET_SUCCESS; + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + + /* Nothing to be handled by the host. Go back to the guest. */ + smccc_set_retval(vcpu, val, 0, 0, 0); + return true; +} + +static bool pkvm_handle_psci(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u32 psci_fn = smccc_get_function(vcpu); + + switch (psci_fn) { + case PSCI_0_2_FN_CPU_ON: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_CPU_ON: + return pvm_psci_vcpu_on(hyp_vcpu); + case PSCI_0_2_FN_CPU_OFF: + return pvm_psci_vcpu_off(hyp_vcpu); + case PSCI_0_2_FN_AFFINITY_INFO: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_AFFINITY_INFO: + return pvm_psci_vcpu_affinity_info(hyp_vcpu); + case PSCI_0_2_FN_PSCI_VERSION: + return pvm_psci_version(hyp_vcpu); + case PSCI_1_0_FN_PSCI_FEATURES: + return pvm_psci_features(hyp_vcpu); + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + return false; /* Handled by the host. */ + default: + break; + } + + return pvm_psci_not_supported(hyp_vcpu); +} + +static u64 __pkvm_memshare_page_req(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 elr; + + /* Fake up a data abort (Level 3 translation fault on write) */ + vcpu->arch.fault.esr_el2 = (u32)ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT | + ESR_ELx_WNR | ESR_ELx_FSC_FAULT | + FIELD_PREP(ESR_ELx_FSC_LEVEL, 3); + + /* Shuffle the IPA around into the HPFAR */ + vcpu->arch.fault.hpfar_el2 = (ipa >> 8) & HPFAR_MASK; + + /* This is a virtual address. 0's good. Let's go with 0. */ + vcpu->arch.fault.far_el2 = 0; + + /* Rewind the ELR so we return to the HVC once the IPA is mapped */ + elr = read_sysreg(elr_el2); + elr -= 4; + write_sysreg(elr, elr_el2); + + return ARM_EXCEPTION_TRAP; +} + +static bool pkvm_memshare_call(struct pkvm_hyp_vcpu *hyp_vcpu, u64 *exit_code) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 ipa = smccc_get_arg1(vcpu); + u64 arg2 = smccc_get_arg2(vcpu); + u64 arg3 = smccc_get_arg3(vcpu); + int err; + + if (arg2 || arg3) + goto out_guest_err; + + err = __pkvm_guest_share_host(hyp_vcpu, ipa); + switch (err) { + case 0: + /* Success! Now tell the host. 
*/ + goto out_host; + case -EFAULT: + /* + * Convert the exception into a data abort so that the page + * being shared is mapped into the guest next time. + */ + *exit_code = __pkvm_memshare_page_req(hyp_vcpu, ipa); + goto out_host; + } + +out_guest_err: + smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0); + return true; + +out_host: + return false; +} + +static bool pkvm_memunshare_call(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 ipa = smccc_get_arg1(vcpu); + u64 arg2 = smccc_get_arg2(vcpu); + u64 arg3 = smccc_get_arg3(vcpu); + int err; + + if (arg2 || arg3) + goto out_guest_err; + + err = __pkvm_guest_unshare_host(hyp_vcpu, ipa); + if (err) + goto out_guest_err; + + return false; + +out_guest_err: + smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0); + return true; +} + +static bool pkvm_meminfo_call(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 arg1 = smccc_get_arg1(vcpu); + u64 arg2 = smccc_get_arg2(vcpu); + u64 arg3 = smccc_get_arg3(vcpu); + + if (arg1 || arg2 || arg3) + goto out_guest_err; + + smccc_set_retval(vcpu, PAGE_SIZE, 0, 0, 0); + return true; + +out_guest_err: + smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0); + return true; +} + +static bool pkvm_memrelinquish_call(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 ipa = smccc_get_arg1(vcpu); + u64 arg2 = smccc_get_arg2(vcpu); + u64 arg3 = smccc_get_arg3(vcpu); + u64 pa = 0; + int ret; + + if (arg2 || arg3) + goto out_guest_err; + + ret = __pkvm_guest_relinquish_to_host(hyp_vcpu, ipa, &pa); + if (ret) + goto out_guest_err; + + if (pa != 0) { + /* Now pass to host. */ + return false; + } + + /* This was a NOP as no page was actually mapped at the IPA. */ + smccc_set_retval(vcpu, 0, 0, 0, 0); + return true; + +out_guest_err: + smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0); + return true; +} + +static bool pkvm_install_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 *exit_code) +{ + u64 retval = SMCCC_RET_SUCCESS; + u64 ipa = smccc_get_arg1(&hyp_vcpu->vcpu); + int ret; + + ret = __pkvm_install_ioguard_page(hyp_vcpu, ipa); + if (ret == -ENOMEM) { + /* + * We ran out of memcache, let's ask for more. Cancel + * the effects of the HVC that took us here, and + * forward the hypercall to the host for page donation + * purposes. + */ + write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR); + return false; + } + + if (ret) + retval = SMCCC_RET_INVALID_PARAMETER; + + smccc_set_retval(&hyp_vcpu->vcpu, retval, 0, 0, 0); + return true; +} + +bool smccc_trng_available; + +static bool pkvm_forward_trng(struct kvm_vcpu *vcpu) +{ + u32 fn = smccc_get_function(vcpu); + struct arm_smccc_res res; + unsigned long arg1 = 0; + + /* + * Forward TRNG calls to EL3, as we can't trust the host to handle + * these for us. + */ + switch (fn) { + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + arg1 = smccc_get_arg1(vcpu); + fallthrough; + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_GET_UUID: + arm_smccc_1_1_smc(fn, arg1, &res); + smccc_set_retval(vcpu, res.a0, res.a1, res.a2, res.a3); + memzero_explicit(&res, sizeof(res)); + break; + } + + return true; +} + +/* + * Handler for protected VM HVC calls. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. 
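Guests discover which of these vendor-specific hypercalls exist by invoking the FEATURES function and testing bits in the returned word, one bit per function, as assembled in kvm_handle_pvm_hvc64() below. From the guest's side the probe is just bit tests (illustrative bit numbers, not the real ARM_SMCCC_KVM_FUNC_* constants):

#include <stdint.h>
#include <stdio.h>

enum {	/* hypothetical stand-ins for ARM_SMCCC_KVM_FUNC_* */
	FUNC_FEATURES, FUNC_HYP_MEMINFO, FUNC_MEM_SHARE, FUNC_MEM_UNSHARE,
};

static int hvc_supported(uint64_t features, unsigned int func)
{
	return !!(features & (1ULL << func));
}

int main(void)
{
	/* Word as it might come back from the FEATURES hypercall. */
	uint64_t features = (1ULL << FUNC_FEATURES) |
			    (1ULL << FUNC_HYP_MEMINFO) |
			    (1ULL << FUNC_MEM_SHARE);

	printf("MEM_SHARE:   %d\n", hvc_supported(features, FUNC_MEM_SHARE));
	printf("MEM_UNSHARE: %d\n", hvc_supported(features, FUNC_MEM_UNSHARE));
	return 0;
}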
+ */ +bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 val[4] = { SMCCC_RET_NOT_SUPPORTED }; + u32 fn = smccc_get_function(vcpu); + struct pkvm_hyp_vcpu *hyp_vcpu; + + hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); + + switch (fn) { + case ARM_SMCCC_VERSION_FUNC_ID: + /* Nothing to be handled by the host. Go back to the guest. */ + val[0] = ARM_SMCCC_VERSION_1_1; + break; + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0; + val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1; + val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2; + val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID: + set_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags); + val[0] = SMCCC_RET_SUCCESS; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + return pkvm_install_ioguard_page(hyp_vcpu, exit_code); + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID: + if (__pkvm_remove_ioguard_page(hyp_vcpu, vcpu_get_reg(vcpu, 1))) + val[0] = SMCCC_RET_INVALID_PARAMETER; + else + val[0] = SMCCC_RET_SUCCESS; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: + return pkvm_meminfo_call(hyp_vcpu); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + return pkvm_memshare_call(hyp_vcpu, exit_code); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + return pkvm_memunshare_call(hyp_vcpu); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + return pkvm_memrelinquish_call(hyp_vcpu); + case ARM_SMCCC_TRNG_VERSION ... ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + if (smccc_trng_available) + return pkvm_forward_trng(vcpu); + break; + default: + return pkvm_handle_psci(hyp_vcpu); + } + + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); + return true; +} + +/* + * Handler for non-protected VM HVC calls. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +bool kvm_hyp_handle_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u32 fn = smccc_get_function(vcpu); + struct pkvm_hyp_vcpu *hyp_vcpu; + + hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); + + switch (fn) { + case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: + return pkvm_meminfo_call(hyp_vcpu); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + return pkvm_memrelinquish_call(hyp_vcpu); + } + + return false; +} diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 08508783ec3d..f48321fa6af9 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -6,12 +6,15 @@ #include #include +#include #include #include #include #include +#include #include +#include #include void kvm_hyp_cpu_entry(unsigned long r0); @@ -22,6 +25,20 @@ void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); /* Config options set by the host. 
*/ struct kvm_host_psci_config __ro_after_init kvm_host_psci_config; +static void (*pkvm_psci_notifier)(enum pkvm_psci_notification, struct kvm_cpu_context *); +static void pkvm_psci_notify(enum pkvm_psci_notification notif, struct kvm_cpu_context *host_ctxt) +{ + if (READ_ONCE(pkvm_psci_notifier)) + pkvm_psci_notifier(notif, host_ctxt); +} + +#ifdef CONFIG_MODULES +int __pkvm_register_psci_notifier(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)) +{ + return cmpxchg(&pkvm_psci_notifier, NULL, cb) ? -EBUSY : 0; +} +#endif + #define INVALID_CPU_ID UINT_MAX struct psci_boot_args { @@ -153,6 +170,7 @@ static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) DECLARE_REG(u64, power_state, host_ctxt, 1); DECLARE_REG(unsigned long, pc, host_ctxt, 2); DECLARE_REG(unsigned long, r0, host_ctxt, 3); + int ret; struct psci_boot_args *boot_args; struct kvm_nvhe_init_params *init_params; @@ -167,13 +185,18 @@ static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) boot_args->pc = pc; boot_args->r0 = r0; + pkvm_psci_notify(PKVM_PSCI_CPU_SUSPEND, host_ctxt); + trace_hyp_exit(); /* * Will either return if shallow sleep state, or wake up into the entry * point if it is a deep sleep state. */ - return psci_call(func_id, power_state, - __hyp_pa(&kvm_hyp_cpu_resume), - __hyp_pa(init_params)); + ret = psci_call(func_id, power_state, + __hyp_pa(&kvm_hyp_cpu_resume), + __hyp_pa(init_params)); + trace_hyp_enter(); + + return ret; } static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) @@ -194,6 +217,8 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) boot_args->pc = pc; boot_args->r0 = r0; + pkvm_psci_notify(PKVM_PSCI_SYSTEM_SUSPEND, host_ctxt); + /* Will only return on error. */ return psci_call(func_id, __hyp_pa(&kvm_hyp_cpu_resume), @@ -218,9 +243,49 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) if (is_cpu_on) release_boot_args(boot_args); + pkvm_psci_notify(PKVM_PSCI_CPU_ENTRY, host_ctxt); + __host_enter(host_ctxt); } +static DEFINE_HYP_SPINLOCK(mem_protect_lock); + +static u64 psci_mem_protect(s64 offset) +{ + static u64 cnt; + u64 new = cnt + offset; + + hyp_assert_lock_held(&mem_protect_lock); + + if (!offset || kvm_host_psci_config.version < PSCI_VERSION(1, 1)) + return cnt; + + if (!cnt || !new) + psci_call(PSCI_1_1_FN_MEM_PROTECT, offset < 0 ? 0 : 1, 0, 0); + + cnt = new; + return cnt; +} + +static bool psci_mem_protect_active(void) +{ + return psci_mem_protect(0); +} + +void psci_mem_protect_inc(u64 n) +{ + hyp_spin_lock(&mem_protect_lock); + psci_mem_protect(n); + hyp_spin_unlock(&mem_protect_lock); +} + +void psci_mem_protect_dec(u64 n) +{ + hyp_spin_lock(&mem_protect_lock); + psci_mem_protect(-n); + hyp_spin_unlock(&mem_protect_lock); +} + static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id)) @@ -249,6 +314,9 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ */ case PSCI_0_2_FN_SYSTEM_OFF: case PSCI_0_2_FN_SYSTEM_RESET: + pkvm_poison_pvmfw_pages(); + /* Avoid racing with a MEM_PROTECT call. 
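psci_mem_protect() above is an edge-triggered counter: the firmware MEM_PROTECT call is issued only when the count leaves or returns to zero, so arbitrarily many nested inc/dec pairs cost exactly two firmware round-trips. The counting logic on its own (stand-in firmware hook; the real code issues psci_call(PSCI_1_1_FN_MEM_PROTECT, ...) under mem_protect_lock):

/* Stand-in for the PSCI firmware call. */
static void firmware_mem_protect(int enable)
{
	(void)enable;
}

static unsigned long mem_protect_cnt;

/* Caller must hold the lock that serializes these updates. */
static void mem_protect_adjust(long offset)
{
	unsigned long new_cnt = mem_protect_cnt + offset;

	if (!offset)
		return;

	/* Fire only on the 0 -> nonzero and nonzero -> 0 edges. */
	if (!mem_protect_cnt || !new_cnt)
		firmware_mem_protect(offset > 0);

	mem_protect_cnt = new_cnt;
}

The SYSTEM_OFF/RESET paths below hold the same lock across the forwarded call, so a concurrent MEM_PROTECT toggle cannot slip in between poisoning the pvmfw pages and the reset itself.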
*/ + hyp_spin_lock(&mem_protect_lock); return psci_forward(host_ctxt); case PSCI_0_2_FN64_CPU_SUSPEND: return psci_cpu_suspend(func_id, host_ctxt); @@ -262,9 +330,14 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { switch (func_id) { + case PSCI_1_1_FN64_SYSTEM_RESET2: + pkvm_poison_pvmfw_pages(); + hyp_spin_lock(&mem_protect_lock); + if (psci_mem_protect_active()) + cpu_reg(host_ctxt, 0) = PSCI_0_2_FN_SYSTEM_RESET; + fallthrough; case PSCI_1_0_FN_PSCI_FEATURES: case PSCI_1_0_FN_SET_SUSPEND_MODE: - case PSCI_1_1_FN64_SYSTEM_RESET2: return psci_forward(host_ctxt); case PSCI_1_0_FN64_SYSTEM_SUSPEND: return psci_system_suspend(func_id, host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/serial.c b/arch/arm64/kvm/hyp/nvhe/serial.c new file mode 100644 index 000000000000..0b2cf3b6d6a5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/serial.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + */ + +#include +#include + +static void (*__hyp_putc)(char c); + +static inline void __hyp_putx4(unsigned int x) +{ + x &= 0xf; + if (x <= 9) + x += '0'; + else + x += ('a' - 0xa); + + __hyp_putc(x); +} + +static inline void __hyp_putx4n(unsigned long x, int n) +{ + int i = n >> 2; + + __hyp_putc('0'); + __hyp_putc('x'); + + while (i--) + __hyp_putx4(x >> (4 * i)); + + __hyp_putc('\n'); + __hyp_putc('\r'); +} + +static inline bool hyp_serial_enabled(void) +{ + return !!READ_ONCE(__hyp_putc); +} + +void hyp_puts(const char *s) +{ + if (!hyp_serial_enabled()) + return; + + while (*s) + __hyp_putc(*s++); + + __hyp_putc('\n'); + __hyp_putc('\r'); +} + +void hyp_putx64(u64 x) +{ + if (hyp_serial_enabled()) + __hyp_putx4n(x, 64); +} + +void hyp_putc(char c) +{ + if (hyp_serial_enabled()) + __hyp_putc(c); +} + +int __pkvm_register_serial_driver(void (*cb)(char)) +{ + return cmpxchg(&__hyp_putc, NULL, cb) ? 
-EBUSY : 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 58ad9c5ba311..a3f787859fa5 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -8,37 +8,51 @@ #include #include #include +#include #include +#include #include +#include #include #include #include +#include +#include #include -struct hyp_pool hpool; unsigned long hyp_nr_cpus; +phys_addr_t pvmfw_base; +phys_addr_t pvmfw_size; + #define hyp_percpu_size ((unsigned long)__per_cpu_end - \ (unsigned long)__per_cpu_start) static void *vmemmap_base; +static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; +static void *ffa_proxy_pages; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; +static struct hyp_pool hpool; static int divide_memory_pool(void *virt, unsigned long size) { - unsigned long vstart, vend, nr_pages; + unsigned long nr_pages; hyp_early_alloc_init(virt, size); - hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend); - nr_pages = (vend - vstart) >> PAGE_SHIFT; + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); vmemmap_base = hyp_early_alloc_contig(nr_pages); if (!vmemmap_base) return -ENOMEM; + nr_pages = hyp_vm_table_pages(); + vm_table_base = hyp_early_alloc_contig(nr_pages); + if (!vm_table_base) + return -ENOMEM; + nr_pages = hyp_s1_pgtable_pages(); hyp_pgt_base = hyp_early_alloc_contig(nr_pages); if (!hyp_pgt_base) @@ -49,6 +63,11 @@ static int divide_memory_pool(void *virt, unsigned long size) if (!host_s2_pgt_base) return -ENOMEM; + nr_pages = hyp_ffa_proxy_pages(); + ffa_proxy_pages = hyp_early_alloc_contig(nr_pages); + if (!ffa_proxy_pages) + return -ENOMEM; + return 0; } @@ -76,7 +95,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base)); + ret = hyp_back_vmemmap(hyp_virt_to_phys(vmemmap_base)); if (ret) return ret; @@ -84,6 +103,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP); + if (ret) + return ret; + ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -97,34 +120,63 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; for (i = 0; i < hyp_nr_cpus; i++) { + struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i); + unsigned long hyp_addr; + start = (void *)kern_hyp_va(per_cpu_base[i]); end = start + PAGE_ALIGN(hyp_percpu_size); ret = pkvm_create_mappings(start, end, PAGE_HYP); if (ret) return ret; - end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va; - start = end - PAGE_SIZE; - ret = pkvm_create_mappings(start, end, PAGE_HYP); + /* + * Allocate a contiguous HYP private VA range for the stack + * and guard page. The allocation is also aligned based on + * the order of its size. + */ + ret = pkvm_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr); if (ret) return ret; + + /* + * Since the stack grows downwards, map the stack to the page + * at the higher address and leave the lower guard page + * unbacked. + * + * Any valid stack address now has the PAGE_SHIFT bit as 1 + * and addresses corresponding to the guard page have the + * PAGE_SHIFT bit as 0 - this is used for overflow detection. 
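Because pkvm_alloc_private_va_range() aligns the two-page range on the order of its size, the guard page and the stack page differ in exactly one address bit, so a single test of bit PAGE_SHIFT classifies any fault address. A standalone sketch of that test (4KiB pages assumed; in the kernel the equivalent check is performed on SP in the hyp entry assembly before declaring a stack overflow):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

/* Bit PAGE_SHIFT clear means SP fell off the stack page into the guard. */
static int sp_on_guard_page(uintptr_t sp)
{
        return !(sp & PAGE_SIZE);
}

int main(void)
{
        uintptr_t base = 0x40000000;    /* hypothetical range, 2*PAGE_SIZE aligned */

        assert(!sp_on_guard_page(base + 2 * PAGE_SIZE - 16));   /* valid stack */
        assert(sp_on_guard_page(base + PAGE_SIZE - 16));        /* overflow */
        return 0;
}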
+ */ + hyp_spin_lock(&pkvm_pgd_lock); + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE, + PAGE_SIZE, params->stack_pa, PAGE_HYP); + hyp_spin_unlock(&pkvm_pgd_lock); + if (ret) + return ret; + + /* Update stack_hyp_va to end of the stack's private VA range */ + params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE); } /* - * Map the host's .bss and .rodata sections RO in the hypervisor, but - * transfer the ownership from the host to the hypervisor itself to - * make sure it can't be donated or shared with another entity. + * Map the host sections RO in the hypervisor, but transfer the + * ownership from the host to the hypervisor itself to make sure they + * can't be donated or shared with another entity. * * The ownership transition requires matching changes in the host * stage-2. This will be done later (see finalize_host_mappings()) once * the hyp_vmemmap is addressable. */ prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot); + ret = pkvm_create_mappings(&kvm_vgic_global_state, + &kvm_vgic_global_state + 1, prot); if (ret) return ret; - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot); + start = hyp_phys_to_virt(pvmfw_base); + end = start + pvmfw_size; + prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_OWNED); + ret = pkvm_create_mappings(start, end, prot); if (ret) return ret; @@ -159,10 +211,10 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } -static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int fix_host_ownership_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) { enum kvm_pgtable_prot prot; enum pkvm_page_state state; @@ -186,7 +238,7 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); switch (state) { case PKVM_PAGE_OWNED: - return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); break; @@ -197,13 +249,32 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, return -EINVAL; } - return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); + return host_stage2_idmap_locked(phys, PAGE_SIZE, prot, false); } -static int finalize_host_mappings(void) +static int fix_hyp_pgtable_refcnt_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct kvm_pgtable_mm_ops *mm_ops = arg; + kvm_pte_t pte = *ptep; + + /* + * Fix-up the refcount for the page-table pages as the early allocator + * was unable to access the hyp_vmemmap and so the buddy allocator has + * initialised the refcount to '1'. 
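A toy model of that fix-up: after the walk, every page-table page ends up with its initial refcount of 1 plus one reference per valid PTE it contains. Here get_page() is approximated by a plain increment, and a set low bit stands in for a valid PTE:

#include <assert.h>
#include <stdint.h>

#define PTRS    8

struct table_page {
        uint64_t pte[PTRS];
        int refcount;   /* the buddy allocator initialised this to 1 */
};

static int pte_valid(uint64_t pte)
{
        return pte & 1;
}

static void fixup_refcounts(struct table_page *tp)
{
        for (int i = 0; i < PTRS; i++)
                if (pte_valid(tp->pte[i]))
                        tp->refcount++; /* get_page() on the page holding the PTE */
}

int main(void)
{
        struct table_page tp = { .pte = { 1, 0, 1, 1 }, .refcount = 1 };

        fixup_refcounts(&tp);
        assert(tp.refcount == 4);       /* 1 initial + 3 valid entries */
        return 0;
}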
+ */ + if (kvm_pte_valid(pte)) + mm_ops->get_page(ptep); + + return 0; +} + +static int fix_host_ownership(void) { struct kvm_pgtable_walker walker = { - .cb = finalize_host_mappings_walker, + .cb = fix_host_ownership_walker, .flags = KVM_PGTABLE_WALK_LEAF, }; int i, ret; @@ -220,6 +291,35 @@ static int finalize_host_mappings(void) return 0; } +static int fix_hyp_pgtable_refcnt(void) +{ + struct kvm_pgtable_walker walker = { + .cb = fix_hyp_pgtable_refcnt_walker, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + .arg = pkvm_pgtable.mm_ops, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), + &walker); +} + +static int unmap_protected_regions(void) +{ + struct pkvm_moveable_reg *reg; + int i, ret; + + for (i = 0; i < pkvm_moveable_regs_nr; i++) { + reg = &pkvm_moveable_regs[i]; + if (reg->type != PKVM_MREG_PROTECTED_RANGE) + continue; + ret = host_stage2_protect_pages_locked(reg->start, reg->size); + if (ret) + return ret; + } + + return 0; +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -239,19 +339,37 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; - ret = finalize_host_mappings(); - if (ret) - goto out; - pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_page = hyp_zalloc_hyp_page, .phys_to_virt = hyp_phys_to_virt, .virt_to_phys = hyp_virt_to_phys, .get_page = hpool_get_page, .put_page = hpool_put_page, + .page_count = hyp_page_count, }; pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; + ret = fix_host_ownership(); + if (ret) + goto out; + + ret = fix_hyp_pgtable_refcnt(); + if (ret) + goto out; + + ret = hyp_create_pcpu_fixmap(); + if (ret) + goto out; + + ret = unmap_protected_regions(); + if (ret) + goto out; + + ret = hyp_ffa_init(ffa_proxy_pages); + if (ret) + goto out; + + pkvm_hyp_vm_table_init(vm_table_base); out: /* * We tail-called to here from handle___pkvm_init() and will not return, @@ -270,6 +388,8 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, void (*fn)(phys_addr_t params_pa, void *finalize_fn_va); int ret; + BUG_ON(kvm_check_pvm_sysreg_table()); + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) return -EINVAL; diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c new file mode 100644 index 000000000000..58f645ad66bc --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM nVHE hypervisor stack tracing support. + * + * Copyright (C) 2022 Google LLC + */ +#include +#include +#include +#include + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +DEFINE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); + +/* + * hyp_prepare_backtrace - Prepare non-protected nVHE backtrace. + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Save the information needed by the host to unwind the non-protected + * nVHE hypervisor stack in EL1. 
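For context, a hedged sketch of what an EL1 consumer of this shared structure can then do: a textbook AArch64 frame-pointer walk, where each frame record is a {prev_fp, lr} pair. The real unwinder bounds-checks every step against stack_base and overflow_stack_base and translates hyp VAs; this model elides all of that and simply stops at a NULL frame pointer:

#include <stdint.h>
#include <stdio.h>

struct frame_record {   /* AArch64 ABI: FP points at {previous FP, LR} */
        uint64_t prev_fp;
        uint64_t lr;
};

static void walk(uint64_t fp, uint64_t pc)
{
        printf("pc: %#llx\n", (unsigned long long)pc);
        while (fp) {
                const struct frame_record *rec = (const void *)(uintptr_t)fp;

                printf("lr: %#llx\n", (unsigned long long)rec->lr);
                fp = rec->prev_fp;
        }
}

int main(void)
{
        struct frame_record outer = { 0, 0x1234 };      /* prev_fp == 0 ends the walk */
        struct frame_record inner = { (uint64_t)(uintptr_t)&outer, 0x5678 };

        walk((uint64_t)(uintptr_t)&inner, 0x9abc);
        return 0;
}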
+ */ +static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); + stacktrace_info->fp = fp; + stacktrace_info->pc = pc; +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +#include + +DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); + +static bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + unsigned long high = params->stack_hyp_va; + unsigned long low = high - PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); +} + +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + return (on_overflow_stack(sp, size, info) || + on_hyp_stack(sp, size, info)); +} + +static int unwind_next(struct unwind_state *state) +{ + struct stack_info info; + + return unwind_next_common(state, &info, on_accessible_stack, NULL); +} + +static void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, + void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + +/* + * pkvm_save_backtrace_entry - Saves a protected nVHE HYP stacktrace entry + * + * @arg : index of the entry in the stacktrace buffer + * @where : the program counter corresponding to the stack frame + * + * Save the return address of a stack frame to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static bool pkvm_save_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long *stacktrace = this_cpu_ptr(pkvm_stacktrace); + int *idx = (int *)arg; + + /* + * Need 2 free slots: 1 for current entry and 1 for the + * delimiter. + */ + if (*idx > ARRAY_SIZE(pkvm_stacktrace) - 2) + return false; + + stacktrace[*idx] = where; + stacktrace[++*idx] = 0UL; + + return true; +} + +/* + * pkvm_save_backtrace - Saves the protected nVHE HYP stacktrace + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Save the unwinded stack addresses to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ + struct unwind_state state; + int idx = 0; + + kvm_nvhe_unwind_init(&state, fp, pc); + + unwind(&state, pkvm_save_backtrace_entry, &idx); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace + * + * @fp : frame pointer at which to start the unwinding. 
+ * @pc : program counter at which to start the unwinding. + * + * Saves the information needed by the host to dump the nVHE hypervisor + * backtrace. + */ +void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + if (is_protected_kvm_enabled()) + pkvm_save_backtrace(fp, pc); + else + hyp_prepare_backtrace(fp, pc); +} diff --git a/arch/arm64/kvm/hyp/nvhe/stub.c b/arch/arm64/kvm/hyp/nvhe/stub.c deleted file mode 100644 index c0aa6bbfd79d..000000000000 --- a/arch/arm64/kvm/hyp/nvhe/stub.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Stubs for out-of-line function calls caused by re-using kernel - * infrastructure at EL2. - * - * Copyright (C) 2020 - Google LLC - */ - -#include - -#ifdef CONFIG_DEBUG_LIST -bool __list_add_valid(struct list_head *new, struct list_head *prev, - struct list_head *next) -{ - return true; -} - -bool __list_del_entry_valid(struct list_head *entry) -{ - return true; -} -#endif diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 4db5409f40c4..01694d36e9df 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -21,19 +21,22 @@ #include #include #include +#include #include #include #include #include -#include #include +#include /* Non-VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -43,10 +46,12 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val = vcpu->arch.cptr_el2; val |= CPTR_EL2_TTA | CPTR_EL2_TAM; - if (!update_fp_enabled(vcpu)) { + if (!guest_owns_fp_regs(vcpu)) { val |= CPTR_EL2_TFP | CPTR_EL2_TZ; __activate_traps_fpsimd32(vcpu); } + if (cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; write_sysreg(val, cptr_el2); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); @@ -95,8 +100,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); cptr = CPTR_EL2_DEFAULT; - if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)) + if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) cptr |= CPTR_EL2_TZ; + if (cpus_have_final_cap(ARM64_SME)) + cptr &= ~CPTR_EL2_TSM; write_sysreg(cptr, cptr_el2); write_sysreg(__kvm_hyp_host_vector, vbar_el2); @@ -111,7 +118,7 @@ static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu) } } -/* Restore VGICv3 state on non_VEH systems */ +/* Restore VGICv3 state on non-VHE systems */ static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { @@ -120,16 +127,13 @@ static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) } } -/** +/* * Disable host events, enable guest events */ -static bool __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt) +#ifdef CONFIG_HW_PERF_EVENTS +static bool __pmu_switch_to_guest(struct kvm_vcpu *vcpu) { - struct kvm_host_data *host; - struct kvm_pmu_events *pmu; - - host = container_of(host_ctxt, struct kvm_host_data, host_ctxt); - pmu = &host->pmu_events; + struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events; if (pmu->events_host) write_sysreg(pmu->events_host, pmcntenclr_el0); @@ -140,16 +144,12 @@ static bool __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt) return (pmu->events_host || pmu->events_guest); } -/** +/* * Disable 
guest events, enable host events */ -static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt) +static void __pmu_switch_to_host(struct kvm_vcpu *vcpu) { - struct kvm_host_data *host; - struct kvm_pmu_events *pmu; - - host = container_of(host_ctxt, struct kvm_host_data, host_ctxt); - pmu = &host->pmu_events; + struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events; if (pmu->events_guest) write_sysreg(pmu->events_guest, pmcntenclr_el0); @@ -157,6 +157,85 @@ static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt) if (pmu->events_host) write_sysreg(pmu->events_host, pmcntenset_el0); } +#else +#define __pmu_switch_to_guest(v) ({ false; }) +#define __pmu_switch_to_host(v) do {} while (0) +#endif + +/* + * Handler for protected VM MSR, MRS or System instruction execution in AArch64. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* + * Make sure we handle the exit for workarounds and ptrauth + * before the pKVM handling, as the latter could decide to + * UNDEF. + */ + return (kvm_hyp_handle_sysreg(vcpu, exit_code) || + kvm_handle_pvm_sysreg(vcpu, exit_code)); +} + +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_HVC64] = kvm_hyp_handle_hvc64, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn pvm_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_HVC64] = kvm_handle_pvm_hvc64, + [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64, + [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +{ + if (unlikely(vcpu_is_protected(vcpu))) + return pvm_exit_handlers; + + return hyp_exit_handlers; +} + +/* + * Some guests (e.g., protected VMs) are not allowed to run in AArch32. + * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a + * guest from dropping to AArch32 EL0 if implemented by the CPU. If the + * hypervisor spots a guest in such a state, ensure it is handled, and don't + * trust the host to spot or fix it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + * + * If the guest ran in AArch32 when it shouldn't have, the exit code is + * rewritten so that the run loop exits to the host; otherwise the guest run + * loop can continue. + */ +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { + /* + * As we have caught the guest red-handed, decide that it isn't + * fit for purpose anymore by making the vcpu invalid. The VMM + * can try and fix it by re-initializing the vcpu with + * KVM_ARM_VCPU_INIT, however, this is likely not possible for + * protected VMs.
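The exit_code rewrite in the body just below keeps only the SError flag bit and substitutes ARM_EXCEPTION_IL, so the guest run loop exits to the host instead of re-entering the guest. A self-contained model of that bit manipulation (the two constants are assumed values for this sketch; the real definitions live in asm/kvm_asm.h):

#include <assert.h>
#include <stdint.h>

#define ARM_EXIT_WITH_SERROR_BIT        31      /* assumed for this sketch */
#define ARM_EXCEPTION_IL                3       /* assumed for this sketch */

static uint64_t filter_exit_code(uint64_t exit_code)
{
        exit_code &= 1ULL << ARM_EXIT_WITH_SERROR_BIT;  /* keep the SError flag */
        exit_code |= ARM_EXCEPTION_IL;                  /* force an illegal exit */
        return exit_code;
}

int main(void)
{
        assert(filter_exit_code(0xff) == ARM_EXCEPTION_IL);
        assert(filter_exit_code(1ULL << 31) ==
               ((1ULL << 31) | ARM_EXCEPTION_IL));
        return 0;
}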
+ */ + vcpu->arch.target = -1; + *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); + *exit_code |= ARM_EXCEPTION_IL; + } +} /* Switch to the guest for legacy non-VHE systems */ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) @@ -182,7 +261,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; - pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); + pmu_switch_needed = __pmu_switch_to_guest(vcpu); __sysreg_save_state_nvhe(host_ctxt); /* @@ -217,10 +296,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __debug_switch_to_guest(vcpu); do { + trace_hyp_exit(); + /* Jump in the fire! */ exit_code = __guest_enter(vcpu); /* And we're baaack! */ + trace_hyp_enter(); } while (fixup_guest_exit(vcpu, &exit_code)); __sysreg_save_state_nvhe(guest_ctxt); @@ -233,7 +315,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg_restore_state_nvhe(host_ctxt); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); @@ -244,7 +326,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __debug_restore_host_buffers_nvhe(vcpu); if (pmu_switch_needed) - __pmu_switch_to_host(host_ctxt); + __pmu_switch_to_host(vcpu); /* Returning to host will clear PSR.I, remask PMR if needed */ if (system_uses_irq_prio_masking()) @@ -255,7 +337,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) return exit_code; } -void __noreturn hyp_panic(void) +static void (*hyp_panic_notifier)(struct kvm_cpu_context *host_ctxt); +int __pkvm_register_hyp_panic_notifier(void (*cb)(struct kvm_cpu_context *host_ctxt)) +{ + return cmpxchg(&hyp_panic_notifier, NULL, cb) ? -EBUSY : 0; +} + +asmlinkage void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); @@ -266,6 +354,9 @@ void __noreturn hyp_panic(void) host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; vcpu = host_ctxt->__hyp_running_vcpu; + if (READ_ONCE(hyp_panic_notifier)) + hyp_panic_notifier(host_ctxt); + if (vcpu) { __timer_disable_traps(vcpu); __deactivate_traps(vcpu); @@ -273,10 +364,19 @@ void __noreturn hyp_panic(void) __sysreg_restore_state_nvhe(host_ctxt); } + /* Prepare to dump kvm nvhe hyp stacktrace */ + kvm_nvhe_prepare_backtrace((unsigned long)__builtin_frame_address(0), + _THIS_IP_); + __hyp_do_panic(host_ctxt, spsr, elr, par); unreachable(); } +asmlinkage void __noreturn hyp_panic_bad_stack(void) +{ + hyp_panic(); +} + asmlinkage void kvm_unexpected_el2_exception(void) { __kvm_unexpected_el2_exception(); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c new file mode 100644 index 000000000000..eeee7367f09d --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba + */ + +#include + +#include +#include + +#include + +#include + +#include "../../sys_regs.h" + +/* + * Copies of the host's CPU features registers holding sanitized values at hyp. + */ +u64 id_aa64pfr0_el1_sys_val; +u64 id_aa64pfr1_el1_sys_val; +u64 id_aa64isar0_el1_sys_val; +u64 id_aa64isar1_el1_sys_val; +u64 id_aa64isar2_el1_sys_val; +u64 id_aa64mmfr0_el1_sys_val; +u64 id_aa64mmfr1_el1_sys_val; +u64 id_aa64mmfr2_el1_sys_val; + +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. 
+ */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + +/* + * Returns the restricted features values of the feature register based on the + * limitations in restrict_fields. + * A feature id field value of 0b0000 does not impose any restrictions. + * Note: Use only for unsigned feature field values. + */ +static u64 get_restricted_features_unsigned(u64 sys_reg_val, + u64 restrict_fields) +{ + u64 value = 0UL; + u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + + /* + * According to the Arm Architecture Reference Manual, feature fields + * use increasing values to indicate increases in functionality. + * Iterate over the restricted feature fields and calculate the minimum + * unsigned value between the one supported by the system, and what the + * value is being restricted to. + */ + while (sys_reg_val && restrict_fields) { + value |= min(sys_reg_val & mask, restrict_fields & mask); + sys_reg_val &= ~mask; + restrict_fields &= ~mask; + mask <<= ARM64_FEATURE_FIELD_BITS; + } + + return value; +} + +/* + * Functions that return the value of feature id registers for protected VMs + * based on allowed features, system features, and KVM support. + */ + +static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) +{ + const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); + u64 set_mask = 0; + u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; + + set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); + + /* Spectre and Meltdown mitigation in KVM */ + set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), + (u64)kvm->arch.pfr0_csv2); + set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), + (u64)kvm->arch.pfr0_csv3); + + return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; +} + +static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) +{ + const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); + u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; + + if (!kvm_has_mte(kvm)) + allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); + + return id_aa64pfr1_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for Scalable Vectors, therefore, hyp has no sanitized + * copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for debug, including breakpoints, and watchpoints, + * therefore, pKVM has no sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) +{ + /* + * No support for debug, therefore, hyp has no sanitized copy of the + * feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for implementation defined features, therefore, hyp has no + * sanitized copy of the feature id register. 
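A standalone model of get_restricted_features_unsigned() above: the register is treated as a vector of 4-bit unsigned fields, and the result is the field-wise minimum of the system value and the restriction:

#include <assert.h>
#include <stdint.h>

#define FIELD_BITS      4

static uint64_t restrict_unsigned(uint64_t sys, uint64_t restrict_to)
{
        uint64_t value = 0, mask = (1ULL << FIELD_BITS) - 1;

        while (sys && restrict_to) {
                uint64_t a = sys & mask, b = restrict_to & mask;

                value |= (a < b) ? a : b;       /* minimum of the two fields */
                sys &= ~mask;
                restrict_to &= ~mask;
                mask <<= FIELD_BITS;
        }
        return value;
}

int main(void)
{
        /* field0: min(1, 2) = 1; field1: min(2, 1) = 1 */
        assert(restrict_unsigned(0x21, 0x12) == 0x11);
        return 0;
}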
+ */ + BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +{ + /* + * No support for implementation defined features, therefore, hyp has no + * sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) +{ + return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; +} + +static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) +{ + u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; + + if (!vcpu_has_ptrauth(vcpu)) + allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); + + return id_aa64isar1_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu) +{ + u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW; + + if (!vcpu_has_ptrauth(vcpu)) + allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); + + return id_aa64isar2_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) +{ + u64 set_mask; + + set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, + PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); + + return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; +} + +static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +{ + return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; +} + +static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) +{ + return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; +} + +/* Read a sanitized cpufeature ID register by its encoding */ +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +{ + switch (id) { + case SYS_ID_AA64PFR0_EL1: + return get_pvm_id_aa64pfr0(vcpu); + case SYS_ID_AA64PFR1_EL1: + return get_pvm_id_aa64pfr1(vcpu); + case SYS_ID_AA64ZFR0_EL1: + return get_pvm_id_aa64zfr0(vcpu); + case SYS_ID_AA64DFR0_EL1: + return get_pvm_id_aa64dfr0(vcpu); + case SYS_ID_AA64DFR1_EL1: + return get_pvm_id_aa64dfr1(vcpu); + case SYS_ID_AA64AFR0_EL1: + return get_pvm_id_aa64afr0(vcpu); + case SYS_ID_AA64AFR1_EL1: + return get_pvm_id_aa64afr1(vcpu); + case SYS_ID_AA64ISAR0_EL1: + return get_pvm_id_aa64isar0(vcpu); + case SYS_ID_AA64ISAR1_EL1: + return get_pvm_id_aa64isar1(vcpu); + case SYS_ID_AA64ISAR2_EL1: + return get_pvm_id_aa64isar2(vcpu); + case SYS_ID_AA64MMFR0_EL1: + return get_pvm_id_aa64mmfr0(vcpu); + case SYS_ID_AA64MMFR1_EL1: + return get_pvm_id_aa64mmfr1(vcpu); + case SYS_ID_AA64MMFR2_EL1: + return get_pvm_id_aa64mmfr2(vcpu); + default: + /* Unhandled ID register, RAZ */ + return 0; + } +} + +static u64 read_id_reg(const struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r) +{ + return pvm_read_id_reg(vcpu, reg_to_encoding(r)); +} + +/* Handler to RAZ/WI sysregs */ +static bool pvm_access_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!p->is_write) + p->regval = 0; + + return true; +} + +/* + * Accessor for AArch32 feature id registers. + * + * The value of these registers is "unknown" according to the spec if AArch32 + * isn't supported. + */ +static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + /* + * No support for AArch32 guests, therefore, pKVM has no sanitized copy + * of AArch32 feature id registers. 
+ */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_ELx_64BIT_ONLY); + + return pvm_access_raz_wi(vcpu, p, r); +} + +/* + * Accessor for AArch64 feature id registers. + * + * If access is allowed, set the regval to the protected VM's view of the + * register and return true. + * Otherwise, inject an undefined exception and return false. + */ +static bool pvm_access_id_aarch64(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + p->regval = read_id_reg(vcpu, r); + return true; +} + +static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* pVMs only support GICv3. 'nuf said. */ + if (!p->is_write) + p->regval = ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE; + + return true; +} + +/* Mark the specified system register as an AArch32 feature id register. */ +#define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 } + +/* Mark the specified system register as an AArch64 feature id register. */ +#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 } + +/* + * sys_reg_desc initialiser for architecturally unallocated cpufeature ID + * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 + * (1 <= crm < 8, 0 <= Op2 < 8). + */ +#define ID_UNALLOCATED(crm, op2) { \ + Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ + .access = pvm_access_id_aarch64, \ +} + +/* Mark the specified system register as Read-As-Zero/Write-Ignored */ +#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi } + +/* Mark the specified system register as not being handled in hyp. */ +#define HOST_HANDLED(REG) { SYS_DESC(REG), .access = NULL } + +/* + * Architected system registers. + * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 + * + * NOTE: Anything not explicitly listed here is *restricted by default*, i.e., + * it will lead to injecting an exception into the guest. + */ +static const struct sys_reg_desc pvm_sys_reg_descs[] = { + /* Cache maintenance by set/way operations are restricted. */ + + /* Debug and Trace Registers are restricted. 
*/ + RAZ_WI(SYS_DBGBVRn_EL1(0)), + RAZ_WI(SYS_DBGBCRn_EL1(0)), + RAZ_WI(SYS_DBGWVRn_EL1(0)), + RAZ_WI(SYS_DBGWCRn_EL1(0)), + RAZ_WI(SYS_MDSCR_EL1), + RAZ_WI(SYS_OSLAR_EL1), + RAZ_WI(SYS_OSLSR_EL1), + RAZ_WI(SYS_OSDLR_EL1), + + /* Group 1 ID registers */ + RAZ_WI(SYS_REVIDR_EL1), + + /* AArch64 mappings of the AArch32 ID registers */ + /* CRm=1 */ + AARCH32(SYS_ID_PFR0_EL1), + AARCH32(SYS_ID_PFR1_EL1), + AARCH32(SYS_ID_DFR0_EL1), + AARCH32(SYS_ID_AFR0_EL1), + AARCH32(SYS_ID_MMFR0_EL1), + AARCH32(SYS_ID_MMFR1_EL1), + AARCH32(SYS_ID_MMFR2_EL1), + AARCH32(SYS_ID_MMFR3_EL1), + + /* CRm=2 */ + AARCH32(SYS_ID_ISAR0_EL1), + AARCH32(SYS_ID_ISAR1_EL1), + AARCH32(SYS_ID_ISAR2_EL1), + AARCH32(SYS_ID_ISAR3_EL1), + AARCH32(SYS_ID_ISAR4_EL1), + AARCH32(SYS_ID_ISAR5_EL1), + AARCH32(SYS_ID_MMFR4_EL1), + AARCH32(SYS_ID_ISAR6_EL1), + + /* CRm=3 */ + AARCH32(SYS_MVFR0_EL1), + AARCH32(SYS_MVFR1_EL1), + AARCH32(SYS_MVFR2_EL1), + ID_UNALLOCATED(3,3), + AARCH32(SYS_ID_PFR2_EL1), + AARCH32(SYS_ID_DFR1_EL1), + AARCH32(SYS_ID_MMFR5_EL1), + ID_UNALLOCATED(3,7), + + /* AArch64 ID registers */ + /* CRm=4 */ + AARCH64(SYS_ID_AA64PFR0_EL1), + AARCH64(SYS_ID_AA64PFR1_EL1), + ID_UNALLOCATED(4,2), + ID_UNALLOCATED(4,3), + AARCH64(SYS_ID_AA64ZFR0_EL1), + ID_UNALLOCATED(4,5), + ID_UNALLOCATED(4,6), + ID_UNALLOCATED(4,7), + AARCH64(SYS_ID_AA64DFR0_EL1), + AARCH64(SYS_ID_AA64DFR1_EL1), + ID_UNALLOCATED(5,2), + ID_UNALLOCATED(5,3), + AARCH64(SYS_ID_AA64AFR0_EL1), + AARCH64(SYS_ID_AA64AFR1_EL1), + ID_UNALLOCATED(5,6), + ID_UNALLOCATED(5,7), + AARCH64(SYS_ID_AA64ISAR0_EL1), + AARCH64(SYS_ID_AA64ISAR1_EL1), + AARCH64(SYS_ID_AA64ISAR2_EL1), + ID_UNALLOCATED(6,3), + ID_UNALLOCATED(6,4), + ID_UNALLOCATED(6,5), + ID_UNALLOCATED(6,6), + ID_UNALLOCATED(6,7), + AARCH64(SYS_ID_AA64MMFR0_EL1), + AARCH64(SYS_ID_AA64MMFR1_EL1), + AARCH64(SYS_ID_AA64MMFR2_EL1), + ID_UNALLOCATED(7,3), + ID_UNALLOCATED(7,4), + ID_UNALLOCATED(7,5), + ID_UNALLOCATED(7,6), + ID_UNALLOCATED(7,7), + + /* Scalable Vector Registers are restricted. */ + + RAZ_WI(SYS_ERRIDR_EL1), + RAZ_WI(SYS_ERRSELR_EL1), + RAZ_WI(SYS_ERXFR_EL1), + RAZ_WI(SYS_ERXCTLR_EL1), + RAZ_WI(SYS_ERXSTATUS_EL1), + RAZ_WI(SYS_ERXADDR_EL1), + RAZ_WI(SYS_ERXMISC0_EL1), + RAZ_WI(SYS_ERXMISC1_EL1), + + /* Performance Monitoring Registers are restricted. */ + + /* Limited Ordering Regions Registers are restricted. */ + + HOST_HANDLED(SYS_ICC_SGI1R_EL1), + HOST_HANDLED(SYS_ICC_ASGI1R_EL1), + HOST_HANDLED(SYS_ICC_SGI0R_EL1), + { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, + + HOST_HANDLED(SYS_CCSIDR_EL1), + HOST_HANDLED(SYS_CLIDR_EL1), + HOST_HANDLED(SYS_CSSELR_EL1), + HOST_HANDLED(SYS_CTR_EL0), + + /* Performance Monitoring Registers are restricted. */ + + /* Activity Monitoring Registers are restricted. */ + + HOST_HANDLED(SYS_CNTP_TVAL_EL0), + HOST_HANDLED(SYS_CNTP_CTL_EL0), + HOST_HANDLED(SYS_CNTP_CVAL_EL0), + + /* Performance Monitoring Registers are restricted. */ +}; + +/* A structure to track reset values for system registers in protected vcpus. */ +struct sys_reg_desc_reset { + /* Index into sys_reg[]. */ + int reg; + + /* Reset function. */ + void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc_reset *); + + /* Reset value. 
*/ + u64 value; +}; + +static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc_reset *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = read_sysreg(actlr_el1); +} + +static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc_reset *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = read_sysreg(amair_el1); +} + +static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc_reset *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = calculate_mpidr(vcpu); +} + +static void reset_value(struct kvm_vcpu *vcpu, const struct sys_reg_desc_reset *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = r->value; +} + +/* Specify the register's reset value. */ +#define RESET_VAL(REG, RESET_VAL) { REG, reset_value, RESET_VAL } + +/* Specify a function that calculates the register's reset value. */ +#define RESET_FUNC(REG, RESET_FUNC) { REG, RESET_FUNC, 0 } + +/* + * Architected system registers reset values for Protected VMs. + * Important: Must be sorted ascending by REG (index into sys_reg[]) + */ +static const struct sys_reg_desc_reset pvm_sys_reg_reset_vals[] = { + RESET_FUNC(MPIDR_EL1, reset_mpidr), + RESET_VAL(SCTLR_EL1, 0x00C50078), + RESET_FUNC(ACTLR_EL1, reset_actlr), + RESET_VAL(CPACR_EL1, 0), + RESET_VAL(ZCR_EL1, 0), + RESET_VAL(TCR_EL1, 0), + RESET_VAL(VBAR_EL1, 0), + RESET_VAL(CONTEXTIDR_EL1, 0), + RESET_FUNC(AMAIR_EL1, reset_amair_el1), + RESET_VAL(CNTKCTL_EL1, 0), + RESET_VAL(MDSCR_EL1, 0), + RESET_VAL(MDCCINT_EL1, 0), + RESET_VAL(DISR_EL1, 0), + RESET_VAL(PMCCFILTR_EL0, 0), + RESET_VAL(PMUSERENR_EL0, 0), +}; + +/* + * Sets system registers to reset value + * + * This function finds the right entry and sets the registers on the protected + * vcpu to their architecturally defined reset values. + */ +void kvm_reset_pvm_sys_regs(struct kvm_vcpu *vcpu) +{ + unsigned long i; + + for (i = 0; i < ARRAY_SIZE(pvm_sys_reg_reset_vals); i++) { + const struct sys_reg_desc_reset *r = &pvm_sys_reg_reset_vals[i]; + + r->reset(vcpu, r); + } +} + +/* + * Checks that the sysreg tables are unique and in-order. + * + * Returns 0 if the table is consistent, or 1 otherwise. + */ +int kvm_check_pvm_sysreg_table(void) +{ + unsigned int i; + + for (i = 1; i < ARRAY_SIZE(pvm_sys_reg_descs); i++) { + if (cmp_sys_reg(&pvm_sys_reg_descs[i-1], &pvm_sys_reg_descs[i]) >= 0) + return 1; + } + + for (i = 1; i < ARRAY_SIZE(pvm_sys_reg_reset_vals); i++) { + if (pvm_sys_reg_reset_vals[i-1].reg >= pvm_sys_reg_reset_vals[i].reg) + return 1; + } + + return 0; +} + +/* + * Handler for protected VM MSR, MRS or System instruction execution. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't, to be handled by the host. + */ +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const struct sys_reg_desc *r; + struct sys_reg_params params; + unsigned long esr = kvm_vcpu_get_esr(vcpu); + int Rt = kvm_vcpu_sys_get_rt(vcpu); + + params = esr_sys64_to_params(esr); + params.regval = vcpu_get_reg(vcpu, Rt); + + r = find_reg(¶ms, pvm_sys_reg_descs, ARRAY_SIZE(pvm_sys_reg_descs)); + + /* Undefined (RESTRICTED). */ + if (r == NULL) { + inject_undef64(vcpu); + return true; + } + + /* Handled by the host (HOST_HANDLED) */ + if (r->access == NULL) + return false; + + /* Handled by hyp: skip instruction if instructed to do so. */ + if (r->access(vcpu, ¶ms, r)) + __kvm_skip_instr(vcpu); + + if (!params.is_write) + vcpu_set_reg(vcpu, Rt, params.regval); + + return true; +} + +/* + * Handler for protected VM restricted exceptions. 
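kvm_check_pvm_sysreg_table() above only needs to enforce strict ascending order: misordered entries and duplicates are both rejected, which is what allows find_reg() to search the descriptor table efficiently. The same invariant in miniature:

#include <assert.h>
#include <stddef.h>

static int strictly_sorted(const int *v, size_t n)
{
        for (size_t i = 1; i < n; i++)
                if (v[i - 1] >= v[i])   /* equality rejects duplicates */
                        return 0;
        return 1;
}

int main(void)
{
        int ok[] = { 1, 2, 5 };
        int dup[] = { 1, 1, 2 };

        assert(strictly_sorted(ok, 3));
        assert(!strictly_sorted(dup, 3));
        return 0;
}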
+ * + * Inject an undefined exception into the guest and return true to indicate that + * the hypervisor has handled the exit, and control should go back to the guest. + */ +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + inject_undef64(vcpu); + return true; +} diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index d296d617f589..35092e154614 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -11,26 +11,62 @@ #include struct tlb_inv_context { - u64 tcr; + struct kvm_s2_mmu *mmu; + u64 tcr; + u64 sctlr; }; -static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt) +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt) { + struct kvm_s2_mmu *host_s2_mmu = &host_mmu.arch.mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + cxt->mmu = NULL; + + /* + * If we're already in the desired context, then there's nothing + * to do. + */ + if (vcpu) { + if (mmu == vcpu->arch.hw_mmu || WARN_ON(mmu != host_s2_mmu)) + return; + } else if (mmu == host_s2_mmu) { + return; + } + + cxt->mmu = mmu; if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* * For CPUs that are affected by ARM 1319367, we need to - * avoid a host Stage-1 walk while we have the guest's - * VMID set in the VTTBR in order to invalidate TLBs. - * We're guaranteed that the S1 MMU is enabled, so we can - * simply set the EPD bits to avoid any further TLB fill. + * avoid a Stage-1 walk with the old VMID while we have + * the new VMID set in the VTTBR in order to invalidate TLBs. + * We're guaranteed that the host S1 MMU is enabled, so + * we can simply set the EPD bits to avoid any further + * TLB fill. For guests, we ensure that the S1 MMU is + * temporarily enabled in the next context. */ val = cxt->tcr = read_sysreg_el1(SYS_TCR); val |= TCR_EPD1_MASK | TCR_EPD0_MASK; write_sysreg_el1(val, SYS_TCR); isb(); + + if (vcpu) { + val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); + if (!(val & SCTLR_ELx_M)) { + val |= SCTLR_ELx_M; + write_sysreg_el1(val, SYS_SCTLR); + isb(); + } + } else { + /* The host S1 MMU is always enabled. */ + cxt->sctlr = SCTLR_ELx_M; + } } /* @@ -39,20 +75,44 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * ensuring that we always have an ISB, but not two ISBs back * to back. 
*/ - __load_stage2(mmu, kern_hyp_va(mmu->arch)); + if (vcpu) + __load_host_stage2(); + else + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static void __tlb_switch_to_host(struct tlb_inv_context *cxt) +static void exit_vmid_context(struct tlb_inv_context *cxt) { - __load_host_stage2(); + struct kvm_s2_mmu *mmu = cxt->mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + + if (!mmu) + return; + + if (vcpu) + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + else + __load_host_stage2(); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { - /* Ensure write of the host VMID */ + /* Ensure write of the old VMID */ isb(); - /* Restore the host's TCR_EL1 */ + + if (!(cxt->sctlr & SCTLR_ELx_M)) { + write_sysreg_el1(cxt->sctlr, SYS_SCTLR); + isb(); + } + write_sysreg_el1(cxt->tcr, SYS_TCR); } + + cxt->mmu = NULL; } void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, @@ -63,7 +123,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. @@ -106,7 +166,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, if (icache_is_vpipt()) icache_inval_all_pou(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) @@ -116,13 +176,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) @@ -130,14 +190,14 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c new file mode 100644 index 000000000000..714433b684a0 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/trace.c @@ -0,0 +1,596 @@ +#include +#include +#include +#include + +#include +#include + +#include + +#define HYP_RB_PAGE_HEAD 1UL +#define HYP_RB_PAGE_UPDATE 2UL +#define HYP_RB_FLAG_MASK 3UL + +static struct hyp_buffer_pages_backing hyp_buffer_pages_backing; +DEFINE_PER_CPU(struct hyp_rb_per_cpu, trace_rb); +DEFINE_HYP_SPINLOCK(trace_rb_lock); + +static bool rb_set_flag(struct hyp_buffer_page *bpage, int new_flag) +{ + unsigned long ret, val = (unsigned long)bpage->list.next; + + ret = cmpxchg((unsigned long *)&bpage->list.next, + val, (val & ~HYP_RB_FLAG_MASK) | new_flag); + + return ret == val; +} + +static void rb_set_footer_status(struct hyp_buffer_page *bpage, + unsigned long status, + bool reader) +{ + struct buffer_data_page *page = bpage->page; + struct rb_ext_page_footer *footer; + + footer = rb_ext_page_get_footer(page); + + if (reader) + atomic_set(&footer->reader_status, status); + else + atomic_set(&footer->writer_status, status); +} + +static void rb_footer_writer_status(struct hyp_buffer_page *bpage, + unsigned long status) +{ + rb_set_footer_status(bpage, status, false); +} + +static void rb_footer_reader_status(struct 
hyp_buffer_page *bpage, + unsigned long status) +{ + rb_set_footer_status(bpage, status, true); +} + +static struct hyp_buffer_page *rb_hyp_buffer_page(struct list_head *list) +{ + unsigned long ptr = (unsigned long)list & ~HYP_RB_FLAG_MASK; + + return container_of((struct list_head *)ptr, struct hyp_buffer_page, list); +} + +static struct hyp_buffer_page *rb_next_page(struct hyp_buffer_page *bpage) +{ + return rb_hyp_buffer_page(bpage->list.next); +} + +static bool rb_is_head_page(struct hyp_buffer_page *bpage) +{ + return (unsigned long)bpage->list.prev->next & HYP_RB_PAGE_HEAD; +} + +static struct hyp_buffer_page *rb_set_head_page(struct hyp_rb_per_cpu *cpu_buffer) +{ + struct hyp_buffer_page *bpage, *prev_head; + int cnt = 0; +again: + bpage = prev_head = cpu_buffer->head_page; + do { + if (rb_is_head_page(bpage)) { + cpu_buffer->head_page = bpage; + rb_footer_reader_status(prev_head, 0); + rb_footer_reader_status(bpage, RB_PAGE_FT_HEAD); + return bpage; + } + + bpage = rb_next_page(bpage); + } while (bpage != prev_head); + + cnt++; + + /* We might have raced with the writer; let's try again */ + if (cnt < 3) + goto again; + + return NULL; +} + +static int rb_swap_reader_page(struct hyp_rb_per_cpu *cpu_buffer) +{ + unsigned long *old_head_link, old_link_val, new_link_val, overrun; + struct hyp_buffer_page *head, *reader = cpu_buffer->reader_page; + struct rb_ext_page_footer *footer; + + rb_footer_reader_status(cpu_buffer->reader_page, 0); +spin: + /* Update cpu_buffer->head_page according to HYP_RB_PAGE_HEAD */ + head = rb_set_head_page(cpu_buffer); + if (!head) + return -ENODEV; + + /* Connect the reader page around the header page */ + reader->list.next = head->list.next; + reader->list.prev = head->list.prev; + + /* The reader page points to the new header page */ + rb_set_flag(reader, HYP_RB_PAGE_HEAD); + + /* + * Paired with the cmpxchg in rb_move_tail(). Order the read of the head + * page and overrun. + */ + smp_mb(); + overrun = atomic_read(&cpu_buffer->overrun); + + /* Try to swap the prev head link to the reader page */ + old_head_link = (unsigned long *)&reader->list.prev->next; + old_link_val = (*old_head_link & ~HYP_RB_FLAG_MASK) | HYP_RB_PAGE_HEAD; + new_link_val = (unsigned long)&reader->list; + if (cmpxchg(old_head_link, old_link_val, new_link_val) + != old_link_val) + goto spin; + + cpu_buffer->head_page = rb_hyp_buffer_page(reader->list.next); + cpu_buffer->head_page->list.prev = &reader->list; + cpu_buffer->reader_page = head; + + rb_footer_reader_status(cpu_buffer->reader_page, RB_PAGE_FT_READER); + rb_footer_reader_status(cpu_buffer->head_page, RB_PAGE_FT_HEAD); + + footer = rb_ext_page_get_footer(cpu_buffer->reader_page->page); + footer->stats.overrun = overrun; + + return 0; +} + +static struct hyp_buffer_page * +rb_move_tail(struct hyp_rb_per_cpu *cpu_buffer) +{ + struct hyp_buffer_page *tail_page, *new_tail, *new_head; + + tail_page = cpu_buffer->tail_page; + new_tail = rb_next_page(tail_page); +again: + /* + * We caught the reader ... Let's try to move the head page. + * The writer can only rely on ->next links to check if this is head.
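The HYP_RB_PAGE_HEAD/HYP_RB_PAGE_UPDATE flags above are classic pointer tagging: they occupy the two low bits of a ->next pointer, which the alignment of the pointed-to structure guarantees are otherwise zero, so the head marker travels atomically with the link itself. A minimal model:

#include <assert.h>
#include <stdint.h>

#define RB_PAGE_HEAD    1UL
#define RB_PAGE_UPDATE  2UL
#define RB_FLAG_MASK    3UL

struct node {
        struct node *next;      /* at least 4-byte aligned, low bits free */
};

static struct node *untag(struct node *p)
{
        return (struct node *)((uintptr_t)p & ~RB_FLAG_MASK);
}

static struct node *tag(struct node *p, unsigned long flag)
{
        return (struct node *)(((uintptr_t)p & ~RB_FLAG_MASK) | flag);
}

int main(void)
{
        struct node a, b;

        a.next = tag(&b, RB_PAGE_HEAD);         /* b is the head page */
        assert(untag(a.next) == &b);
        assert((uintptr_t)a.next & RB_PAGE_HEAD);

        a.next = tag(untag(a.next), RB_PAGE_UPDATE);    /* head being moved */
        assert(!((uintptr_t)a.next & RB_PAGE_HEAD));
        return 0;
}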
+ */ + if ((unsigned long)tail_page->list.next & HYP_RB_PAGE_HEAD) { + /* The reader moved the head in between */ + if (!rb_set_flag(tail_page, HYP_RB_PAGE_UPDATE)) + goto again; + + atomic_add(atomic_read(&new_tail->entries), &cpu_buffer->overrun); + + /* Move the head */ + rb_set_flag(new_tail, HYP_RB_PAGE_HEAD); + + /* The new head is in place, reset the update flag */ + rb_set_flag(tail_page, 0); + + new_head = rb_next_page(new_tail); + } + + rb_footer_writer_status(tail_page, 0); + rb_footer_writer_status(new_tail, RB_PAGE_FT_COMMIT); + + local_set(&new_tail->page->commit, 0); + + atomic_set(&new_tail->write, 0); + atomic_set(&new_tail->entries, 0); + + atomic_inc(&cpu_buffer->pages_touched); + + cpu_buffer->tail_page = new_tail; + + return new_tail; +} + +unsigned long rb_event_size(unsigned long length) +{ + struct ring_buffer_event *event; + + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); +} + +static struct ring_buffer_event * +rb_add_ts_extend(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + + return (struct ring_buffer_event *)((unsigned long)event + 8); +} + +static struct ring_buffer_event * +rb_reserve_next(struct hyp_rb_per_cpu *cpu_buffer, unsigned long length) +{ + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); + struct hyp_buffer_page *tail_page = cpu_buffer->tail_page; + struct ring_buffer_event *event; + unsigned long write, prev_write; + u64 ts, time_delta; + + ts = trace_clock(); + + time_delta = ts - atomic64_read(&cpu_buffer->write_stamp); + + if (test_time_stamp(time_delta)) + ts_ext_size = 8; + + prev_write = atomic_read(&tail_page->write); + write = prev_write + event_size + ts_ext_size; + + if (unlikely(write > BUF_EXT_PAGE_SIZE)) + tail_page = rb_move_tail(cpu_buffer); + + if (!atomic_read(&tail_page->entries)) { + tail_page->page->time_stamp = ts; + time_delta = 0; + ts_ext_size = 0; + write = event_size; + prev_write = 0; + } + + atomic_set(&tail_page->write, write); + atomic_inc(&tail_page->entries); + + local_set(&tail_page->page->commit, write); + + atomic_inc(&cpu_buffer->nr_entries); + atomic64_set(&cpu_buffer->write_stamp, ts); + + event = (struct ring_buffer_event *)(tail_page->page->data + + prev_write); + if (ts_ext_size) { + event = rb_add_ts_extend(event, time_delta); + time_delta = 0; + } + + event->type_len = 0; + event->time_delta = time_delta; + event->array[0] = event_size - RB_EVNT_HDR_SIZE; + + return event; +} + +void * +rb_reserve_trace_entry(struct hyp_rb_per_cpu *cpu_buffer, unsigned long length) +{ + struct ring_buffer_event *rb_event; + + rb_event = rb_reserve_next(cpu_buffer, length); + + return &rb_event->array[1]; +} + +static int rb_update_footers(struct hyp_rb_per_cpu *cpu_buffer) +{ + unsigned long entries, pages_touched, overrun; + struct rb_ext_page_footer *footer; + struct buffer_data_page *reader; + + if (!rb_set_head_page(cpu_buffer)) + return -ENODEV; + + reader = cpu_buffer->reader_page->page; + footer = rb_ext_page_get_footer(reader); + + entries = atomic_read(&cpu_buffer->nr_entries); + footer->stats.entries = entries; + pages_touched = atomic_read(&cpu_buffer->pages_touched); + footer->stats.pages_touched = pages_touched; + overrun = atomic_read(&cpu_buffer->overrun); + footer->stats.overrun = overrun; + + return 0; +} + +static int rb_page_init(struct hyp_buffer_page *bpage, unsigned long hva) +{ + void *hyp_va = (void *)kern_hyp_va(hva); + int ret; + + ret = 
hyp_pin_shared_mem(hyp_va, hyp_va + PAGE_SIZE); + if (ret) + return ret; + + INIT_LIST_HEAD(&bpage->list); + bpage->page = (struct buffer_data_page *)hyp_va; + + atomic_set(&bpage->write, 0); + + rb_footer_reader_status(bpage, 0); + rb_footer_writer_status(bpage, 0); + + return 0; +} + +static void rb_cpu_teardown(struct hyp_rb_per_cpu *cpu_buffer) +{ + unsigned int prev_status; + int i; + + /* Wait for release of the buffer */ + do { + prev_status = atomic_cmpxchg_acquire(&cpu_buffer->status, + HYP_RB_READY, + HYP_RB_UNUSED); + } while (prev_status == HYP_RB_WRITE); + + if (prev_status == HYP_RB_READY) + rb_update_footers(cpu_buffer); + + for (i = 0; i < cpu_buffer->nr_pages; i++) { + struct hyp_buffer_page *bpage = &cpu_buffer->bpages[i]; + + if (!bpage->page) + continue; + + hyp_unpin_shared_mem((void *)bpage->page, + (void *)bpage->page + PAGE_SIZE); + } +} + +static bool rb_cpu_fits_backing(unsigned long nr_pages, + struct hyp_buffer_page *start) +{ + unsigned long max = hyp_buffer_pages_backing.start + + hyp_buffer_pages_backing.size; + struct hyp_buffer_page *end = start + nr_pages; + + return (unsigned long)end <= max; +} + +static bool rb_cpu_fits_pack(struct ring_buffer_pack *rb_pack, + unsigned long pack_end) +{ + unsigned long *end; + + /* Check we can at least read nr_pages */ + if ((unsigned long)&rb_pack->nr_pages >= pack_end) + return false; + + end = &rb_pack->page_va[rb_pack->nr_pages]; + + return (unsigned long)end <= pack_end; +} + +static int rb_cpu_init(struct ring_buffer_pack *rb_pack, struct hyp_buffer_page *start, + struct hyp_rb_per_cpu *cpu_buffer) +{ + struct hyp_buffer_page *bpage = start; + int i, ret; + + if (!rb_pack->nr_pages || + !rb_cpu_fits_backing(rb_pack->nr_pages + 1, start)) + return -EINVAL; + + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); + + cpu_buffer->bpages = start; + cpu_buffer->nr_pages = rb_pack->nr_pages + 1; + + /* The reader page is not part of the ring initially */ + ret = rb_page_init(bpage, rb_pack->reader_page_va); + if (ret) + return ret; + + cpu_buffer->reader_page = bpage; + cpu_buffer->tail_page = bpage + 1; + cpu_buffer->head_page = bpage + 1; + + for (i = 0; i < rb_pack->nr_pages; i++) { + ret = rb_page_init(++bpage, rb_pack->page_va[i]); + if (ret) + goto err; + + bpage->list.next = &(bpage + 1)->list; + bpage->list.prev = &(bpage - 1)->list; + } + + /* Close the ring */ + bpage->list.next = &cpu_buffer->tail_page->list; + cpu_buffer->tail_page->list.prev = &bpage->list; + + /* The last init'ed page points to the head page */ + rb_set_flag(bpage, HYP_RB_PAGE_HEAD); + + rb_footer_reader_status(cpu_buffer->reader_page, RB_PAGE_FT_READER); + rb_footer_reader_status(cpu_buffer->head_page, RB_PAGE_FT_HEAD); + rb_footer_writer_status(cpu_buffer->head_page, RB_PAGE_FT_COMMIT); + + atomic_set(&cpu_buffer->overrun, 0); + atomic64_set(&cpu_buffer->write_stamp, 0); + + /* + * Paired with __start_write_hyp_rb() + */ + atomic_set_release(&cpu_buffer->status, HYP_RB_READY); + + return 0; +err: + rb_cpu_teardown(cpu_buffer); + + return ret; +} + +static int rb_setup_bpage_backing(struct hyp_trace_pack *pack) +{ + unsigned long start = kern_hyp_va(pack->backing.start); + size_t size = pack->backing.size; + int ret; + + if (hyp_buffer_pages_backing.size) + return -EBUSY; + + if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size)) + return -EINVAL; + + ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn((void *)start), size >> PAGE_SHIFT); + if (ret) + return ret; + + memset((void *)start, 0, size); + + hyp_buffer_pages_backing.start = start; + 
hyp_buffer_pages_backing.size = size; + + return 0; +} + +static void rb_teardown_bpage_backing(void) +{ + unsigned long start = hyp_buffer_pages_backing.start; + size_t size = hyp_buffer_pages_backing.size; + + if (!size) + return; + + memset((void *)start, 0, size); + + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(start), size >> PAGE_SHIFT)); + + hyp_buffer_pages_backing.start = 0; + hyp_buffer_pages_backing.size = 0; +} + +int __pkvm_rb_update_footers(int cpu) +{ + struct hyp_rb_per_cpu *cpu_buffer; + int ret = 0; + + /* TODO: per-CPU lock for */ + hyp_spin_lock(&trace_rb_lock); + + if (cpu >= hyp_nr_cpus) { + ret = -EINVAL; + goto err; + } + + cpu_buffer = per_cpu_ptr(&trace_rb, cpu); + + if (atomic_read(&cpu_buffer->status) == HYP_RB_UNUSED) { + ret = -ENODEV; + goto err; + } + + ret = rb_update_footers(cpu_buffer); +err: + hyp_spin_unlock(&trace_rb_lock); + + return ret; +} + +int __pkvm_rb_swap_reader_page(int cpu) +{ + struct hyp_rb_per_cpu *cpu_buffer = per_cpu_ptr(&trace_rb, cpu); + int ret = 0; + + /* TODO: per-CPU lock for */ + hyp_spin_lock(&trace_rb_lock); + + if (cpu >= hyp_nr_cpus) { + ret = -EINVAL; + goto err; + } + cpu_buffer = per_cpu_ptr(&trace_rb, cpu); + + if (atomic_read(&cpu_buffer->status) == HYP_RB_UNUSED) { + ret = -ENODEV; + goto err; + } + + ret = rb_swap_reader_page(cpu_buffer); + if (ret) + goto err; +err: + hyp_spin_unlock(&trace_rb_lock); + + return ret; +} + +static void __pkvm_stop_tracing_locked(void) +{ + int cpu; + + hyp_assert_lock_held(&trace_rb_lock); + + for (cpu = 0; cpu < hyp_nr_cpus; cpu++) { + struct hyp_rb_per_cpu *cpu_buffer = per_cpu_ptr(&trace_rb, cpu); + + if (atomic_read(&cpu_buffer->status) == HYP_RB_UNUSED) + continue; + + rb_cpu_teardown(cpu_buffer); + } + + rb_teardown_bpage_backing(); +} + +void __pkvm_stop_tracing(void) +{ + hyp_spin_lock(&trace_rb_lock); + __pkvm_stop_tracing_locked(); + hyp_spin_unlock(&trace_rb_lock); +} + +int __pkvm_start_tracing(unsigned long pack_hva, size_t pack_size) +{ + struct hyp_trace_pack *pack = (struct hyp_trace_pack *)kern_hyp_va(pack_hva); + struct trace_buffer_pack *trace_pack = &pack->trace_buffer_pack; + struct hyp_buffer_page *bpage_backing_start; + struct ring_buffer_pack *rb_pack; + int ret, cpu; + + if (!pack_size || !PAGE_ALIGNED(pack_hva) || !PAGE_ALIGNED(pack_size)) + return -EINVAL; + + ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn((void *)pack), + pack_size >> PAGE_SHIFT); + if (ret) + return ret; + + hyp_spin_lock(&trace_rb_lock); + + ret = rb_setup_bpage_backing(pack); + if (ret) + goto err; + + trace_clock_update(&pack->trace_clock_data); + + bpage_backing_start = (struct hyp_buffer_page *)hyp_buffer_pages_backing.start; + + for_each_ring_buffer_pack(rb_pack, cpu, trace_pack) { + struct hyp_rb_per_cpu *cpu_buffer; + int cpu; + + ret = -EINVAL; + if (!rb_cpu_fits_pack(rb_pack, pack_hva + pack_size)) + break; + + cpu = rb_pack->cpu; + if (cpu >= hyp_nr_cpus) + break; + + cpu_buffer = per_cpu_ptr(&trace_rb, cpu); + + ret = rb_cpu_init(rb_pack, bpage_backing_start, cpu_buffer); + if (ret) + break; + + /* reader page + nr pages in rb */ + bpage_backing_start += 1 + rb_pack->nr_pages; + } +err: + if (ret) + __pkvm_stop_tracing_locked(); + + hyp_spin_unlock(&trace_rb_lock); + + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn((void *)pack), + pack_size >> PAGE_SHIFT)); + return ret; +} diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4c77ff556f0a..0add34ea9ff2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -12,43 +12,10 @@ 
#include -#define KVM_PTE_TYPE BIT(1) -#define KVM_PTE_TYPE_BLOCK 0 -#define KVM_PTE_TYPE_PAGE 1 -#define KVM_PTE_TYPE_TABLE 1 - -#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) - -#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 -#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) - -#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) - #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 - struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; @@ -57,11 +24,9 @@ struct kvm_pgtable_walk_data { u64 end; }; -#define KVM_PHYS_INVALID (-1ULL) - static bool kvm_phys_is_valid(u64 phys) { - return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX)); + return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); } static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) @@ -111,27 +76,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_table(kvm_pte_t pte, u32 level) -{ - if (level == KVM_PGTABLE_MAX_LEVELS - 1) - return false; - - if (!kvm_pte_valid(pte)) - return false; - - return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; -} - -static kvm_pte_t kvm_phys_to_pte(u64 pa) -{ - kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); - - return pte; -} - static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) { return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); @@ -167,11 +111,6 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) return pte; } -static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) -{ - return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); -} - static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag) @@ -334,16 +273,26 @@ struct hyp_map_data { static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { - bool device = prot & KVM_PGTABLE_PROT_DEVICE; - u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; - kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); - u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; u32 ap = (prot & KVM_PGTABLE_PROT_W) ? 
KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; + bool nc = prot & KVM_PGTABLE_PROT_NC; + kvm_pte_t attr; + u32 mtype; - if (!(prot & KVM_PGTABLE_PROT_R)) + if (!(prot & KVM_PGTABLE_PROT_R) || (device && nc)) return -EINVAL; + if (device) + mtype = MT_DEVICE_nGnRnE; + else if (nc) + mtype = MT_NORMAL_NC; + else + mtype = MT_NORMAL; + + attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); + if (prot & KVM_PGTABLE_PROT_X) { if (prot & KVM_PGTABLE_PROT_W) return -EINVAL; @@ -383,21 +332,6 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) return prot; } -static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new) -{ - /* - * Tolerate KVM recreating the exact same mapping, or changing software - * bits if the existing mapping was valid. - */ - if (old == new) - return false; - - if (!kvm_pte_valid(old)) - return true; - - return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW); -} - static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct hyp_map_data *data) { @@ -407,11 +341,16 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (!kvm_block_mapping_supported(addr, end, phys, level)) return false; - new = kvm_init_valid_leaf_pte(phys, data->attr, level); - if (hyp_pte_needs_update(old, new)) - smp_store_release(ptep, new); - data->phys += granule; + new = kvm_init_valid_leaf_pte(phys, data->attr, level); + if (old == new) + return true; + if (!kvm_pte_valid(old)) + data->mm_ops->get_page(ptep); + else if (WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) + return false; + + smp_store_release(ptep, new); return true; } @@ -433,6 +372,7 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return -ENOMEM; kvm_set_table_pte(ptep, childp, mm_ops); + mm_ops->get_page(ptep); return 0; } @@ -460,6 +400,69 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, return ret; } +struct hyp_unmap_data { + u64 unmapped; + struct kvm_pgtable_mm_ops *mm_ops; +}; + +static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + kvm_pte_t pte = *ptep, *childp = NULL; + u64 granule = kvm_granule_size(level); + struct hyp_unmap_data *data = arg; + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + + if (!kvm_pte_valid(pte)) + return -EINVAL; + + if (kvm_pte_table(pte, level)) { + childp = kvm_pte_follow(pte, mm_ops); + + if (mm_ops->page_count(childp) != 1) + return 0; + + kvm_clear_pte(ptep); + dsb(ishst); + __tlbi_level(vae2is, __TLBI_VADDR(addr, 0), level); + } else { + if (end - addr < granule) + return -EINVAL; + + kvm_clear_pte(ptep); + dsb(ishst); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); + data->unmapped += granule; + } + + dsb(ish); + isb(); + mm_ops->put_page(ptep); + + if (childp) + mm_ops->put_page(childp); + + return 0; +} + +u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct hyp_unmap_data unmap_data = { + .mm_ops = pgt->mm_ops, + }; + struct kvm_pgtable_walker walker = { + .cb = hyp_unmap_walker, + .arg = &unmap_data, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + }; + + if (!pgt->mm_ops->page_count) + return 0; + + kvm_pgtable_walk(pgt, addr, size, &walker); + return unmap_data.unmapped; +} + int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops) { @@ -473,7 +476,7 @@ int 
kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; pgt->mm_ops = mm_ops; pgt->mmu = NULL; - pgt->force_pte_cb = NULL; + pgt->pte_ops = NULL; return 0; } @@ -482,8 +485,16 @@ static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { struct kvm_pgtable_mm_ops *mm_ops = arg; + kvm_pte_t pte = *ptep; + + if (!kvm_pte_valid(pte)) + return 0; + + mm_ops->put_page(ptep); + + if (kvm_pte_table(pte, level)) + mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); - mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops)); return 0; } @@ -491,7 +502,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) { struct kvm_pgtable_walker walker = { .cb = hyp_free_walker, - .flags = KVM_PGTABLE_WALK_TABLE_POST, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, .arg = pgt->mm_ops, }; @@ -503,7 +514,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) struct stage2_map_data { u64 phys; kvm_pte_t attr; - u8 owner_id; + u64 annotation; kvm_pte_t *anchor; kvm_pte_t *childp; @@ -562,9 +573,19 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p kvm_pte_t *ptep) { bool device = prot & KVM_PGTABLE_PROT_DEVICE; - kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : - KVM_S2_MEMATTR(pgt, NORMAL); u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + bool nc = prot & KVM_PGTABLE_PROT_NC; + kvm_pte_t attr; + + if (device && nc) + return -EINVAL; + + if (device) + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); + else if (nc) + attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); + else + attr = KVM_S2_MEMATTR(pgt, NORMAL); if (!(prot & KVM_PGTABLE_PROT_X)) attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; @@ -610,14 +631,14 @@ static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); } -static bool stage2_pte_is_counted(kvm_pte_t pte) +static void stage2_clear_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, + u32 level) { - /* - * The refcount tracks valid entries as well as invalid entries if they - * encode ownership of a page to another entity than the page-table - * owner, whose id is 0. - */ - return !!pte; + if (!kvm_pte_valid(*ptep)) + return; + + kvm_clear_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); } static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, @@ -627,23 +648,19 @@ static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, * Clear the existing PTE, and perform break-before-make with * TLB maintenance if it was valid. 
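+	 * The reference on the page-table page is dropped regardless of
+	 * whether the old PTE was valid.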
*/ - if (kvm_pte_valid(*ptep)) { - kvm_clear_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); - } - + stage2_clear_pte(ptep, mmu, addr, level); mm_ops->put_page(ptep); } static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) { u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; - return memattr == KVM_S2_MEMATTR(pgt, NORMAL); + return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL); } static bool stage2_pte_executable(kvm_pte_t pte) { - return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); + return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level, @@ -662,6 +679,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t new, old = *ptep; u64 granule = kvm_granule_size(level), phys = data->phys; struct kvm_pgtable *pgt = data->mmu->pgt; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; if (!stage2_leaf_mapping_allowed(addr, end, level, data)) @@ -670,32 +688,41 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (kvm_phys_is_valid(phys)) new = kvm_init_valid_leaf_pte(phys, data->attr, level); else - new = kvm_init_invalid_leaf_owner(data->owner_id); + new = data->annotation; - if (stage2_pte_is_counted(old)) { - /* - * Skip updating the PTE if we are trying to recreate the exact - * same mapping or only change the access permissions. Instead, - * the vCPU will exit one more time from guest if still needed - * and then go through the path of relaxing permissions. - */ - if (!stage2_pte_needs_update(old, new)) - return -EAGAIN; + /* + * Skip updating the PTE if we are trying to recreate the exact + * same mapping or only change the access permissions. Instead, + * the vCPU will exit one more time from guest if still needed + * and then go through the path of relaxing permissions. + */ + if (!stage2_pte_needs_update(old, new)) + return -EAGAIN; - stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); - } + if (pte_ops->pte_is_counted_cb(old, level)) + mm_ops->put_page(ptep); + + /* + * If we're only changing software bits, then we don't need to + * do anything else. + */ + if (!((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) + goto out_set_pte; + + stage2_clear_pte(ptep, data->mmu, addr, level); /* Perform CMOs before installation of the guest stage-2 PTE */ if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), - granule); - + granule); if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); - smp_store_release(ptep, new); - if (stage2_pte_is_counted(new)) +out_set_pte: + if (pte_ops->pte_is_counted_cb(new, level)) mm_ops->get_page(ptep); + + smp_store_release(ptep, new); if (kvm_phys_is_valid(phys)) data->phys += granule; return 0; @@ -728,11 +755,13 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + struct kvm_pgtable *pgt = data->mmu->pgt; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; kvm_pte_t *childp, pte = *ptep; int ret; if (data->anchor) { - if (stage2_pte_is_counted(pte)) + if (pte_ops->pte_is_counted_cb(pte, level)) mm_ops->put_page(ptep); return 0; @@ -757,8 +786,15 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * a table. 
Accesses beyond 'end' that fall within the new table * will be mapped lazily. */ - if (stage2_pte_is_counted(pte)) + if (pte_ops->pte_is_counted_cb(pte, level)) { stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); + } else { + /* + * On non-refcounted PTEs we just clear them out without + * dropping the refcount. + */ + stage2_clear_pte(ptep, data->mmu, addr, level); + } kvm_set_table_pte(ptep, childp, mm_ops); mm_ops->get_page(ptep); @@ -766,6 +802,35 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return 0; } +static void stage2_coalesce_walk_table_post(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(*ptep, mm_ops); + + /* + * Decrement the refcount only on the set ownership path to avoid a + * loop situation when the following happens: + * 1. We take a host stage2 fault and we create a small mapping which + * has default attributes (is not refcounted). + * 2. On the way back we execute the post handler and we zap the + * table that holds our mapping. + */ + if (kvm_phys_is_valid(data->phys) || + !kvm_level_supports_block_mapping(level)) + return; + + /* + * Free a page that is not referenced anymore and drop the reference + * of the page table page. + */ + if (mm_ops->page_count(childp) == 1) { + stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); + mm_ops->put_page(childp); + } +} + static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -774,8 +839,11 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *childp; int ret = 0; - if (!data->anchor) + if (!data->anchor) { + stage2_coalesce_walk_table_post(addr, end, level, ptep, + data); return 0; + } if (data->anchor == ptep) { childp = data->childp; @@ -833,12 +901,12 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc) { int ret; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; struct stage2_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), .mmu = pgt->mmu, .memcache = mc, .mm_ops = pgt->mm_ops, - .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -848,6 +916,9 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, .arg = &map_data, }; + if (pte_ops->force_pte_cb) + map_data.force_pte = pte_ops->force_pte_cb(addr, addr + size, prot); + if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) return -EINVAL; @@ -860,8 +931,8 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, - void *mc, u8 owner_id) +int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, kvm_pte_t annotation) { int ret; struct stage2_map_data map_data = { @@ -869,8 +940,8 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .mmu = pgt->mmu, .memcache = mc, .mm_ops = pgt->mm_ops, - .owner_id = owner_id, .force_pte = true, + .annotation = annotation, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -880,7 +951,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .arg = &map_data, }; - if (owner_id > KVM_MAX_OWNER_ID) + if (annotation & PTE_VALID) return -EINVAL; ret = kvm_pgtable_walk(pgt, addr, size, &walker); @@ -894,11 +965,12 
@@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct kvm_pgtable *pgt = arg; struct kvm_s2_mmu *mmu = pgt->mmu; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; kvm_pte_t pte = *ptep, *childp = NULL; bool need_flush = false; if (!kvm_pte_valid(pte)) { - if (stage2_pte_is_counted(pte)) { + if (pte_ops->pte_is_counted_cb(pte, level)) { kvm_clear_pte(ptep); mm_ops->put_page(ptep); } @@ -919,7 +991,8 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * block entry and rely on the remaining portions being faulted * back lazily. */ - stage2_put_pte(ptep, mmu, addr, level, mm_ops); + if (pte_ops->pte_is_counted_cb(pte, level)) + stage2_put_pte(ptep, mmu, addr, level, mm_ops); if (need_flush && mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), @@ -1086,7 +1159,7 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t pte = *ptep; - if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) + if (!stage2_pte_cacheable(pgt, pte)) return 0; if (mm_ops->dcache_clean_inval_poc) @@ -1110,13 +1183,13 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) } -int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, - kvm_pgtable_force_pte_cb_t force_pte_cb) + struct kvm_pgtable_pte_ops *pte_ops) { size_t pgd_sz; - u64 vtcr = arch->vtcr; + u64 vtcr = mmu->arch->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; @@ -1129,23 +1202,34 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, pgt->ia_bits = ia_bits; pgt->start_level = start_level; pgt->mm_ops = mm_ops; - pgt->mmu = &arch->mmu; + pgt->mmu = mmu; pgt->flags = flags; - pgt->force_pte_cb = force_pte_cb; + pgt->pte_ops = pte_ops; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); return 0; } +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) +{ + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; +} + static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { - struct kvm_pgtable_mm_ops *mm_ops = arg; + struct kvm_pgtable *pgt = arg; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; kvm_pte_t pte = *ptep; - if (!stage2_pte_is_counted(pte)) + if (!pte_ops->pte_is_counted_cb(pte, level)) return 0; mm_ops->put_page(ptep); @@ -1163,7 +1247,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pgt->mm_ops, + .arg = pgt, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c deleted file mode 100644 index 578670e3f608..000000000000 --- a/arch/arm64/kvm/hyp/reserved_mem.c +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2020 - Google LLC - * Author: Quentin Perret - */ - -#include -#include 
-#include - -#include - -#include -#include - -static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); -static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); - -phys_addr_t hyp_mem_base; -phys_addr_t hyp_mem_size; - -static int cmp_hyp_memblock(const void *p1, const void *p2) -{ - const struct memblock_region *r1 = p1; - const struct memblock_region *r2 = p2; - - return r1->base < r2->base ? -1 : (r1->base > r2->base); -} - -static void __init sort_memblock_regions(void) -{ - sort(hyp_memory, - *hyp_memblock_nr_ptr, - sizeof(struct memblock_region), - cmp_hyp_memblock, - NULL); -} - -static int __init register_memblock_regions(void) -{ - struct memblock_region *reg; - - for_each_mem_region(reg) { - if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) - return -ENOMEM; - - hyp_memory[*hyp_memblock_nr_ptr] = *reg; - (*hyp_memblock_nr_ptr)++; - } - sort_memblock_regions(); - - return 0; -} - -void __init kvm_hyp_reserve(void) -{ - u64 nr_pages, prev, hyp_mem_pages = 0; - int ret; - - if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) - return; - - if (kvm_get_mode() != KVM_MODE_PROTECTED) - return; - - ret = register_memblock_regions(); - if (ret) { - *hyp_memblock_nr_ptr = 0; - kvm_err("Failed to register hyp memblocks: %d\n", ret); - return; - } - - hyp_mem_pages += hyp_s1_pgtable_pages(); - hyp_mem_pages += host_s2_pgtable_pages(); - - /* - * The hyp_vmemmap needs to be backed by pages, but these pages - * themselves need to be present in the vmemmap, so compute the number - * of pages needed by looking for a fixed point. - */ - nr_pages = 0; - do { - prev = nr_pages; - nr_pages = hyp_mem_pages + prev; - nr_pages = DIV_ROUND_UP(nr_pages * sizeof(struct hyp_page), PAGE_SIZE); - nr_pages += __hyp_pgtable_max_pages(nr_pages); - } while (nr_pages != prev); - hyp_mem_pages += nr_pages; - - /* - * Try to allocate a PMD-aligned region to reduce TLB pressure once - * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. 
- */ - hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; - hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), - PMD_SIZE); - if (!hyp_mem_base) - hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); - else - hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); - - if (!hyp_mem_base) { - kvm_err("Failed to reserve hyp memory\n"); - return; - } - - kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, - hyp_mem_base); -} diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 39f8f7f9227c..d5a51ea5459c 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -330,7 +330,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) write_gicreg(0, ICH_HCR_EL2); } -void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -363,7 +363,7 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) } } -void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -455,16 +455,35 @@ u64 __vgic_v3_get_gic_config(void) return val; } -u64 __vgic_v3_read_vmcr(void) +static u64 __vgic_v3_read_vmcr(void) { return read_gicreg(ICH_VMCR_EL2); } -void __vgic_v3_write_vmcr(u32 vmcr) +static void __vgic_v3_write_vmcr(u32 vmcr) { write_gicreg(vmcr, ICH_VMCR_EL2); } +void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + __vgic_v3_save_aprs(cpu_if); + if (cpu_if->vgic_sre) + cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); +} + +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (cpu_if->vgic_sre) + __vgic_v3_write_vmcr(cpu_if->vgic_vmcr); + __vgic_v3_restore_aprs(cpu_if); +} + static int __vgic_v3_bpr_min(void) { /* See Pseudocode for VPriorityGroup */ @@ -695,9 +714,7 @@ static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) goto spurious; lr_val &= ~ICH_LR_STATE; - /* No active state for LPIs */ - if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI) - lr_val |= ICH_LR_ACTIVE_BIT; + lr_val |= ICH_LR_ACTIVE_BIT; __gic_v3_set_lr(lr_val, lr); __vgic_v3_set_active_priority(lr_prio, vmcr, grp); vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); @@ -764,20 +781,18 @@ static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* Drop priority in any case */ act_prio = __vgic_v3_clear_highest_active_priority(); - /* If EOIing an LPI, no deactivate to be performed */ - if (vid >= VGIC_MIN_LPI) - return; - - /* EOImode == 1, nothing to be done here */ - if (vmcr & ICH_VMCR_EOIM_MASK) - return; - lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); if (lr == -1) { - __vgic_v3_bump_eoicount(); + /* Do not bump EOIcount for LPIs that aren't in the LRs */ + if (!(vid >= VGIC_MIN_LPI)) + __vgic_v3_bump_eoicount(); return; } + /* EOImode == 1 and not an LPI, nothing to be done here */ + if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) + return; + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; /* If priorities or group do not match, the guest has fscked-up. 
*/ @@ -988,7 +1003,8 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; /* SEIS */ - val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT; + if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) + val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 813e6e2178c1..1a97391fedd2 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -41,7 +41,8 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val = read_sysreg(cpacr_el1); val |= CPACR_EL1_TTA; - val &= ~CPACR_EL1_ZEN; + val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | + CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); /* * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to @@ -54,11 +55,11 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val |= CPTR_EL2_TAM; - if (update_fp_enabled(vcpu)) { + if (guest_owns_fp_regs(vcpu)) { if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; + val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; } else { - val &= ~CPACR_EL1_FPEN; + val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); __activate_traps_fpsimd32(vcpu); } @@ -101,6 +102,26 @@ void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) __deactivate_traps_common(vcpu); } +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +{ + return hyp_exit_handlers; +} + +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +{ +} + /* Switch to the guest for VHE systems running in EL2 */ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { @@ -146,7 +167,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_restore_host_state_vhe(host_ctxt); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 007a12dd4351..7b44f6b3b547 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -79,7 +79,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) __sysreg_restore_user_state(guest_ctxt); __sysreg_restore_el1_state(guest_ctxt); - vcpu->arch.sysregs_loaded_on_cpu = true; + vcpu_set_flag(vcpu, SYSREGS_ON_CPU); activate_traps_vhe_load(vcpu); } @@ -110,5 +110,5 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); - vcpu->arch.sysregs_loaded_on_cpu = false; + vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } diff --git a/arch/arm64/kvm/hyp_events.c b/arch/arm64/kvm/hyp_events.c new file mode 100644 index 000000000000..64b88dc85093 --- /dev/null +++ b/arch/arm64/kvm/hyp_events.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 Google LLC + */ + +#include +#include + +#include +#include +#include + +#include "hyp_trace.h" + +#define HYP_EVENT_NAME_MAX 32 + +struct hyp_event { + struct 
trace_event_call *call; + char name[HYP_EVENT_NAME_MAX]; + bool *enabled; +}; + +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + HYP_EVENT_FORMAT(__name, __struct); \ + enum print_line_t hyp_event_trace_##__name(struct trace_iterator *iter, \ + int flags, struct trace_event *event) \ + { \ + struct ht_iterator *ht_iter = (struct ht_iterator *)iter; \ + struct trace_hyp_format_##__name __maybe_unused *__entry = \ + (struct trace_hyp_format_##__name *)ht_iter->ent; \ + trace_seq_puts(&ht_iter->seq, #__name); \ + trace_seq_putc(&ht_iter->seq, ' '); \ + trace_seq_printf(&ht_iter->seq, __printk); \ + trace_seq_putc(&ht_iter->seq, '\n'); \ + return TRACE_TYPE_HANDLED; \ + } +#include + +#undef he_field +#define he_field(_type, _item) \ + { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + .is_signed = is_signed_type(_type), \ + }, +#undef HYP_EVENT +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + static struct trace_event_fields hyp_event_fields_##__name[] = { \ + __struct \ + {} \ + }; \ + +#undef __ARM64_KVM_HYPEVENTS_H_ +#include + +#undef HYP_EVENT +#undef HE_PRINTK +#define __entry REC +#define HE_PRINTK(fmt, args...) "\"" fmt "\", " __stringify(args) +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + static char hyp_event_print_fmt_##__name[] = __printk; \ + static struct trace_event_functions hyp_event_funcs_##__name = { \ + .trace = &hyp_event_trace_##__name, \ + }; \ + static struct trace_event_class hyp_event_class_##__name = { \ + .system = "nvhe-hypervisor", \ + .fields_array = hyp_event_fields_##__name, \ + .fields = LIST_HEAD_INIT(hyp_event_class_##__name.fields),\ + }; \ + static struct trace_event_call hyp_event_call_##__name = { \ + .class = &hyp_event_class_##__name, \ + .event.funcs = &hyp_event_funcs_##__name, \ + .print_fmt = hyp_event_print_fmt_##__name, \ + }; \ + static bool hyp_event_enabled_##__name; \ + struct hyp_event __section("_hyp_events") hyp_event_##__name = { \ + .name = #__name, \ + .call = &hyp_event_call_##__name, \ + .enabled = &hyp_event_enabled_##__name, \ + } + +#undef __ARM64_KVM_HYPEVENTS_H_ +#include + +extern struct hyp_event __start_hyp_events[]; +extern struct hyp_event __stop_hyp_events[]; + +/* hyp_event section used by the hypervisor */ +extern struct hyp_event_id __hyp_event_ids_start[]; +extern struct hyp_event_id __hyp_event_ids_end[]; + +static struct hyp_event *find_hyp_event(const char *name) +{ + struct hyp_event *event = __start_hyp_events; + + for (; (unsigned long)event < (unsigned long)__stop_hyp_events; + event++) { + if (!strncmp(name, event->name, HYP_EVENT_NAME_MAX)) + return event; + } + + return NULL; +} + +static int enable_hyp_event(struct hyp_event *event, bool enable) +{ + unsigned short id = event->call->event.type; + int ret; + + if (enable == *event->enabled) + return 0; + + ret = kvm_call_hyp_nvhe(__pkvm_enable_event, id, enable); + if (ret) + return ret; + + *event->enabled = enable; + + return 0; +} + +static ssize_t +hyp_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct seq_file *seq_file = (struct seq_file *)filp->private_data; + struct hyp_event *evt = (struct hyp_event *)seq_file->private; + bool enabling; + int ret; + char c; + + if (cnt != 2) + return -EINVAL; + + if (get_user(c, ubuf)) + return -EFAULT; + + switch (c) { + case '1': + enabling = true; + break; + case '0': + enabling = false; + break; + default: + return -EINVAL; + } + + ret = enable_hyp_event(evt, 
enabling); + if (ret) + return ret; + + return cnt; +} + +static int hyp_event_show(struct seq_file *m, void *v) +{ + struct hyp_event *evt = (struct hyp_event *)m->private; + + /* lock ?? Ain't no time for that ! */ + seq_printf(m, "%d\n", *evt->enabled); + + return 0; +} + +static int hyp_event_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, hyp_event_show, inode->i_private); +} + +static const struct file_operations hyp_event_fops = { + .open = hyp_event_open, + .write = hyp_event_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hyp_event_id_show(struct seq_file *m, void *v) +{ + struct hyp_event *evt = (struct hyp_event *)m->private; + + seq_printf(m, "%d\n", evt->call->event.type); + + return 0; +} + +static int hyp_event_id_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, hyp_event_id_show, inode->i_private); +} + +static const struct file_operations hyp_event_id_fops = { + .open = hyp_event_id_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hyp_event_format_show(struct seq_file *m, void *v) +{ + struct hyp_event *evt = (struct hyp_event *)m->private; + struct trace_event_fields *field; + unsigned int offset = sizeof(struct hyp_entry_hdr); + + seq_printf(m, "name: %s\n", evt->name); + seq_printf(m, "ID: %d\n", evt->call->event.type); + seq_puts(m, "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n"); + seq_puts(m, "\n"); + + field = &evt->call->class->fields_array[0]; + while (field->name) { + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, offset, field->size, + !!field->is_signed); + offset += field->size; + field++; + } + + if (field != &evt->call->class->fields_array[0]) + seq_puts(m, "\n"); + + seq_printf(m, "print fmt: %s\n", evt->call->print_fmt); + + return 0; +} + +static int hyp_event_format_open(struct inode *inode, struct file *file) +{ + return single_open(file, hyp_event_format_show, inode->i_private); +} + +static const struct file_operations hyp_event_format_fops = { + .open = hyp_event_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hyp_header_page_show(struct seq_file *m, void *v) +{ + struct buffer_data_page bpage; + + seq_printf(m, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%lu;\tsigned:%d;\n", + sizeof(bpage.time_stamp), + is_signed_type(u64)); + + seq_printf(m, "\tfield: local_t commit;\t" + "offset:%lu;\tsize:%lu;\tsigned:%d;\n", + offsetof(typeof(bpage), commit), + sizeof(bpage.commit), + is_signed_type(long)); + + seq_printf(m, "\tfield: char data;\t" + "offset:%lu;\tsize:%lu;\tsigned:%d;\n", + offsetof(typeof(bpage), data), + BUF_EXT_PAGE_SIZE, + is_signed_type(char)); + + return 0; +} + +static int hyp_header_page_open(struct inode *inode, struct file *file) +{ + return single_open(file, hyp_header_page_show, NULL); +} + +static const struct file_operations hyp_header_page_fops = { + .open = hyp_header_page_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static char early_events[COMMAND_LINE_SIZE]; + +static __init int setup_hyp_event_early(char *str) +{ + strscpy(early_events, str, COMMAND_LINE_SIZE); + + return 1; +} +__setup("hyp_event=", setup_hyp_event_early); + +bool kvm_hyp_events_enable_early(void) +{ + char *token, *buf = early_events; + bool enabled = false; + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + 
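+		/* Empty tokens (e.g. from consecutive commas) are skipped */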
+		if (*token) {
+			struct hyp_event *event;
+			int ret;
+
+			event = find_hyp_event(token);
+			if (event) {
+				ret = enable_hyp_event(event, true);
+				if (ret)
+					pr_warn("Couldn't enable hyp event %s:%d\n",
+						token, ret);
+				else
+					enabled = true;
+			} else {
+				pr_warn("Couldn't find hyp event %s\n", token);
+			}
+		}
+
+		if (buf)
+			*(buf - 1) = ',';
+	}
+
+	return enabled;
+}
+
+void kvm_hyp_init_events_tracefs(struct dentry *parent)
+{
+	struct hyp_event *event = __start_hyp_events;
+	struct dentry *d, *event_dir;
+
+	parent = tracefs_create_dir("events", parent);
+	if (!parent) {
+		pr_err("Failed to create tracefs folder for hyp events\n");
+		return;
+	}
+
+	d = tracefs_create_file("header_page", 0400, parent, NULL,
+				&hyp_header_page_fops);
+	if (!d)
+		pr_err("Failed to create events/header_page\n");
+
+	for (; (unsigned long)event < (unsigned long)__stop_hyp_events; event++) {
+		event_dir = tracefs_create_dir(event->name, parent);
+		if (!event_dir) {
+			pr_err("Failed to create events/hyp/%s\n", event->name);
+			continue;
+		}
+
+		d = tracefs_create_file("enable", 0700, event_dir, (void *)event,
+					&hyp_event_fops);
+		if (!d)
+			pr_err("Failed to create events/hyp/%s/enable\n", event->name);
+
+		d = tracefs_create_file("id", 0400, event_dir, (void *)event,
+					&hyp_event_id_fops);
+		if (!d)
+			pr_err("Failed to create events/hyp/%s/id\n", event->name);
+
+		d = tracefs_create_file("format", 0400, event_dir, (void *)event,
+					&hyp_event_format_fops);
+		if (!d)
+			pr_err("Failed to create events/hyp/%s/format\n",
+			       event->name);
+	}
+}
+
+/*
+ * Register hyp events and write their id into the hyp section _hyp_event_ids.
+ */
+int kvm_hyp_init_events(void)
+{
+	struct hyp_event *event = __start_hyp_events;
+	struct hyp_event_id *hyp_event_id = __hyp_event_ids_start;
+	int ret, err = -ENODEV;
+
+	/* TODO: BUILD_BUG nr events host side / hyp side */
+
+	for (; (unsigned long)event < (unsigned long)__stop_hyp_events;
+	     event++, hyp_event_id++) {
+		event->call->name = event->name;
+		ret = register_trace_event(&event->call->event);
+		if (!ret) {
+			pr_warn("Couldn't register trace event for %s\n", event->name);
+			continue;
+		}
+
+		/*
+		 * Both the host and the hypervisor rely on the same hyp event
+		 * declarations from kvm_hypevents.h. We then have a 1:1
+		 * mapping.
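+		 * The id written to the hyp section below is therefore the
+		 * trace event type that register_trace_event() assigned on
+		 * the host side.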
+		 */
+		hyp_event_id->id = ret;
+
+		err = 0;
+	}
+
+	return err;
+}
diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c
new file mode 100644
index 000000000000..cabf73099570
--- /dev/null
+++ b/arch/arm64/kvm/hyp_trace.c
@@ -0,0 +1,796 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Google LLC
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "hyp_constants.h"
+#include "hyp_trace.h"
+
+#define RB_POLL_MS 1000
+
+#define TRACEFS_DIR "hyp"
+
+static bool hyp_trace_on;
+static int hyp_trace_readers;
+static struct trace_buffer *hyp_trace_buffer;
+static size_t hyp_trace_buffer_size = 7 << 10;
+static struct hyp_buffer_pages_backing hyp_buffer_pages_backing;
+static DEFINE_MUTEX(hyp_trace_lock);
+static DEFINE_PER_CPU(struct mutex, hyp_trace_reader_lock);
+
+static int bpage_backing_setup(struct hyp_trace_pack *pack)
+{
+	size_t backing_size;
+	void *start;
+
+	if (hyp_buffer_pages_backing.start)
+		return -EBUSY;
+
+	backing_size = STRUCT_HYP_BUFFER_PAGE_SIZE *
+		       pack->trace_buffer_pack.total_pages;
+	backing_size = PAGE_ALIGN(backing_size);
+
+	start = alloc_pages_exact(backing_size, GFP_KERNEL_ACCOUNT);
+	if (!start)
+		return -ENOMEM;
+
+	hyp_buffer_pages_backing.start = (unsigned long)start;
+	hyp_buffer_pages_backing.size = backing_size;
+	pack->backing.start = (unsigned long)start;
+	pack->backing.size = backing_size;
+
+	return 0;
+}
+
+static void bpage_backing_teardown(void)
+{
+	unsigned long backing = hyp_buffer_pages_backing.start;
+
+	if (!hyp_buffer_pages_backing.start)
+		return;
+
+	free_pages_exact((void *)backing, hyp_buffer_pages_backing.size);
+
+	hyp_buffer_pages_backing.start = 0;
+	hyp_buffer_pages_backing.size = 0;
+}
+
+/*
+ * Configure the hyp tracing clock. So far, only one is supported: "boot".
+ * This clock doesn't stop during suspend, which makes it a good candidate.
+ * The downside is that if this clock is corrected by NTP while tracing, the
+ * hyp clock will drift slightly from the host version.
+ */
+static void hyp_clock_setup(struct hyp_trace_pack *pack)
+{
+	struct kvm_nvhe_clock_data *clock_data = &pack->trace_clock_data;
+	struct system_time_snapshot snap;
+
+	ktime_get_snapshot(&snap);
+
+	clock_data->epoch_cyc = snap.cycles;
+	clock_data->epoch_ns = snap.boot;
+	clock_data->mult = snap.mono_mult;
+	clock_data->shift = snap.mono_shift;
+}
+
+static int __swap_reader_page(int cpu)
+{
+	return kvm_call_hyp_nvhe(__pkvm_rb_swap_reader_page, cpu);
+}
+
+static int __update_footers(int cpu)
+{
+	return kvm_call_hyp_nvhe(__pkvm_rb_update_footers, cpu);
+}
+
+struct ring_buffer_ext_cb hyp_cb = {
+	.update_footers = __update_footers,
+	.swap_reader = __swap_reader_page,
+};
+
+static inline int share_page(unsigned long va)
+{
+	return kvm_call_hyp_nvhe(__pkvm_host_share_hyp, virt_to_pfn(va), 1);
+}
+
+static inline int unshare_page(unsigned long va)
+{
+	return kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, virt_to_pfn(va), 1);
+}
+
+static int trace_pack_pages_apply(struct trace_buffer_pack *trace_pack,
+				  int (*func)(unsigned long))
+{
+	struct ring_buffer_pack *rb_pack;
+	int cpu, i, ret;
+
+	for_each_ring_buffer_pack(rb_pack, cpu, trace_pack) {
+		ret = func(rb_pack->reader_page_va);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < rb_pack->nr_pages; i++) {
+			ret = func(rb_pack->page_va[i]);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * hyp_trace_pack size depends on trace_buffer_pack's, so
+ * trace_buffer_setup is in charge of the allocation for the former.
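+ * The caller is expected to release both: the ring buffer via
+ * trace_buffer_teardown() and the pack via free_pages_exact().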
+ */ +static int trace_buffer_setup(struct hyp_trace_pack **pack, size_t *pack_size) +{ + struct trace_buffer_pack *trace_pack; + int ret; + + hyp_trace_buffer = ring_buffer_alloc_ext(hyp_trace_buffer_size, &hyp_cb); + if (!hyp_trace_buffer) + return -ENOMEM; + + *pack_size = offsetof(struct hyp_trace_pack, trace_buffer_pack) + + trace_buffer_pack_size(hyp_trace_buffer); + /* + * The hypervisor will unmap the pack from the host to protect the + * reading. Page granularity for the pack allocation ensures no other + * useful data will be unmapped. + */ + *pack_size = PAGE_ALIGN(*pack_size); + *pack = alloc_pages_exact(*pack_size, GFP_KERNEL); + if (!*pack) { + ret = -ENOMEM; + goto err; + } + + trace_pack = &(*pack)->trace_buffer_pack; + WARN_ON(trace_buffer_pack(hyp_trace_buffer, trace_pack)); + + ret = trace_pack_pages_apply(trace_pack, share_page); + if (ret) { + trace_pack_pages_apply(trace_pack, unshare_page); + free_pages_exact(*pack, *pack_size); + goto err; + } + + return 0; +err: + ring_buffer_free(hyp_trace_buffer); + hyp_trace_buffer = NULL; + + return ret; +} + +static void trace_buffer_teardown(struct trace_buffer_pack *trace_pack) +{ + bool alloc_trace_pack = !trace_pack; + + if (alloc_trace_pack) { + trace_pack = kzalloc(trace_buffer_pack_size(hyp_trace_buffer), GFP_KERNEL); + if (!trace_pack) { + WARN_ON(1); + goto end; + } + } + + WARN_ON(trace_buffer_pack(hyp_trace_buffer, trace_pack)); + WARN_ON(trace_pack_pages_apply(trace_pack, unshare_page)); + + if (alloc_trace_pack) + kfree(trace_pack); +end: + ring_buffer_free(hyp_trace_buffer); + hyp_trace_buffer = NULL; +} + +static void hyp_free_tracing(void) +{ + if (!hyp_trace_buffer) + return; + + trace_buffer_teardown(NULL); + bpage_backing_teardown(); +} + +static int hyp_start_tracing(void) +{ + struct hyp_trace_pack *pack; + size_t pack_size; + int ret = 0; + + if (hyp_trace_on || hyp_trace_readers) + return -EBUSY; + + hyp_free_tracing(); + + ret = trace_buffer_setup(&pack, &pack_size); + if (ret) + return ret; + + hyp_clock_setup(pack); + + ret = bpage_backing_setup(pack); + if (ret) + goto end_buffer_teardown; + + ret = kvm_call_hyp_nvhe(__pkvm_start_tracing, (unsigned long)pack, pack_size); + if (!ret) { + hyp_trace_on = true; + goto end_free_pack; + } + + bpage_backing_teardown(); +end_buffer_teardown: + trace_buffer_teardown(&pack->trace_buffer_pack); +end_free_pack: + free_pages_exact(pack, pack_size); + + return ret; +} + +static void hyp_stop_tracing(void) +{ + int ret; + + if (!hyp_trace_buffer || !hyp_trace_on) + return; + + ret = kvm_call_hyp_nvhe(__pkvm_stop_tracing); + if (ret) { + WARN_ON(1); + return; + } + + hyp_trace_on = false; +} + +static ssize_t +hyp_tracing_on(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int err = 0; + char c; + + if (cnt != 2) + return -EINVAL; + + if (get_user(c, ubuf)) + return -EFAULT; + + mutex_lock(&hyp_trace_lock); + + switch (c) { + case '1': + err = hyp_start_tracing(); + break; + case '0': + hyp_stop_tracing(); + break; + default: + err = -EINVAL; + } + + mutex_unlock(&hyp_trace_lock); + + return err ? 
err : cnt; +} + +static ssize_t hyp_tracing_on_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[3]; + int r; + + mutex_lock(&hyp_trace_lock); + r = sprintf(buf, "%d\n", hyp_trace_on); + mutex_unlock(&hyp_trace_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations hyp_tracing_on_fops = { + .write = hyp_tracing_on, + .read = hyp_tracing_on_read, +}; + +static ssize_t hyp_buffer_size(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (!val) + return -EINVAL; + + mutex_lock(&hyp_trace_lock); + hyp_trace_buffer_size = val << 10; /* KB to B */ + mutex_unlock(&hyp_trace_lock); + + return cnt; +} + +static ssize_t hyp_buffer_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + mutex_lock(&hyp_trace_lock); + r = sprintf(buf, "%lu\n", hyp_trace_buffer_size >> 10); + mutex_unlock(&hyp_trace_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations hyp_buffer_size_fops = { + .write = hyp_buffer_size, + .read = hyp_buffer_size_read, +}; + +static inline void hyp_trace_read_start(int cpu) +{ + mutex_lock(&per_cpu(hyp_trace_reader_lock, cpu)); +} + +static inline void hyp_trace_read_stop(int cpu) +{ + mutex_unlock(&per_cpu(hyp_trace_reader_lock, cpu)); +} + +static void ht_print_trace_time(struct ht_iterator *iter) +{ + unsigned long usecs_rem; + u64 ts_ns = iter->ts; + + do_div(ts_ns, 1000); + usecs_rem = do_div(ts_ns, USEC_PER_SEC); + + trace_seq_printf(&iter->seq, "[%5lu.%06lu] ", + (unsigned long)ts_ns, usecs_rem); +} + +extern struct trace_event *ftrace_find_event(int type); + +static void ht_print_trace_fmt(struct ht_iterator *iter) +{ + struct trace_event *e; + + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + + /* TODO: format bin/hex/raw */ + + ht_print_trace_time(iter); + + e = ftrace_find_event(iter->ent->id); + if (e) { + e->funcs->trace((struct trace_iterator *)iter, 0, e); + return; + } + + trace_seq_printf(&iter->seq, "Unknown event id %d\n", iter->ent->id); +}; + +static void *ht_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ht_iterator *iter = m->private; + struct ring_buffer_event *evt; + u64 ts; + + (*pos)++; + + evt = ring_buffer_iter_peek(iter->buf_iter, &ts); + if (!evt) + return NULL; + + iter->ent = (struct hyp_entry_hdr *)&evt->array[1]; + iter->ts = ts; + iter->ent_size = evt->array[0]; + ring_buffer_iter_advance(iter->buf_iter); + + return iter; +} + +static void *ht_start(struct seq_file *m, loff_t *pos) +{ + struct ht_iterator *iter = m->private; + + if (*pos == 0) { + ring_buffer_iter_reset(iter->buf_iter); + (*pos)++; + iter->ent = NULL; + + return iter; + } + + hyp_trace_read_start(iter->cpu); + + return ht_next(m, NULL, pos); +} + +static void ht_stop(struct seq_file *m, void *v) +{ + struct ht_iterator *iter = m->private; + + hyp_trace_read_stop(iter->cpu); +} + +static int ht_show(struct seq_file *m, void *v) +{ + struct ht_iterator *iter = v; + + if (!iter->ent) { + unsigned long entries, overrun; + + entries = ring_buffer_entries_cpu(hyp_trace_buffer, iter->cpu); + overrun = ring_buffer_overrun_cpu(hyp_trace_buffer, iter->cpu); + + seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu\n", + entries, overrun + entries); + } else { + 
ht_print_trace_fmt(iter); + trace_print_seq(m, &iter->seq); + } + + return 0; +} + +static const struct seq_operations hyp_trace_ops = { + .start = ht_start, + .next = ht_next, + .stop = ht_stop, + .show = ht_show, +}; + +static int hyp_trace_open(struct inode *inode, struct file *file) +{ + unsigned long cpu = (unsigned long)inode->i_private; + struct ht_iterator *iter; + int ret = 0; + + mutex_lock(&hyp_trace_lock); + + if (!hyp_trace_buffer) { + ret = -ENODEV; + goto unlock; + } + + iter = __seq_open_private(file, &hyp_trace_ops, sizeof(*iter)); + if (!iter) { + ret = -ENOMEM; + goto unlock; + } + + iter->buf_iter = ring_buffer_read_prepare(hyp_trace_buffer, cpu, GFP_KERNEL); + if (!iter->buf_iter) { + seq_release_private(inode, file); + ret = -ENOMEM; + goto unlock; + } + + iter->cpu = cpu; + + ring_buffer_read_prepare_sync(); + ring_buffer_read_start(iter->buf_iter); + + hyp_trace_readers++; +unlock: + mutex_unlock(&hyp_trace_lock); + + return ret; +} + +int hyp_trace_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct ht_iterator *iter = m->private; + + ring_buffer_read_finish(iter->buf_iter); + + mutex_lock(&hyp_trace_lock); + hyp_trace_readers--; + mutex_unlock(&hyp_trace_lock); + + return seq_release_private(inode, file); +} + +static const struct file_operations hyp_trace_fops = { + .open = hyp_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = hyp_trace_release, +}; + +/* + * TODO: should be merged with the ring_buffer_iterator version + */ +static void *trace_buffer_peek(struct ht_iterator *iter) +{ + struct ring_buffer_event *event; + + if (ring_buffer_empty_cpu(iter->trace_buffer, iter->cpu)) + return NULL; + + event = ring_buffer_peek(iter->trace_buffer, iter->cpu, &iter->ts, &iter->lost_events); + if (!event) + return NULL; + + iter->ent = (struct hyp_entry_hdr *)&event->array[1]; + iter->ent_size = event->array[0]; + + return iter; +} + +static ssize_t +hyp_trace_pipe_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ht_iterator *iter = (struct ht_iterator *)file->private_data; + struct trace_buffer *trace_buffer = iter->trace_buffer; + int ret; + + trace_seq_init(&iter->seq); +again: + ret = ring_buffer_wait(trace_buffer, iter->cpu, 0); + if (ret < 0) + return ret; + + hyp_trace_read_start(iter->cpu); + while (trace_buffer_peek(iter)) { + unsigned long lost_events; + + ht_print_trace_fmt(iter); + ring_buffer_consume(iter->trace_buffer, iter->cpu, NULL, &lost_events); + } + hyp_trace_read_stop(iter->cpu); + + ret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (ret == -EBUSY) + goto again; + + return ret; +} + +static void __poke_reader(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct ht_iterator *iter; + + iter = container_of(dwork, struct ht_iterator, poke_work); + + WARN_ON_ONCE(ring_buffer_poke(iter->trace_buffer, iter->cpu)); + + schedule_delayed_work((struct delayed_work *)work, + msecs_to_jiffies(RB_POLL_MS)); +} + +static int hyp_trace_pipe_open(struct inode *inode, struct file *file) +{ + unsigned long cpu = (unsigned long)inode->i_private; + struct ht_iterator *iter; + int ret = -EINVAL; + + mutex_lock(&hyp_trace_lock); + + if (!hyp_trace_buffer) + goto unlock; + + ret = ring_buffer_poke(hyp_trace_buffer, cpu); + if (ret) + goto unlock; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + ret = -ENOMEM; + goto unlock; + } + + iter->cpu = cpu; + iter->trace_buffer = hyp_trace_buffer; + + 
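+	/*
+	 * Kick the hypervisor at regular intervals so that partially
+	 * filled pages are published to this reader (see __poke_reader()).
+	 */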
INIT_DELAYED_WORK(&iter->poke_work, __poke_reader); + schedule_delayed_work(&iter->poke_work, msecs_to_jiffies(RB_POLL_MS)); + + file->private_data = iter; + + hyp_trace_readers++; +unlock: + mutex_unlock(&hyp_trace_lock); + + return ret; +} + +static int hyp_trace_pipe_release(struct inode *inode, struct file *file) +{ + struct ht_iterator *iter = file->private_data; + + cancel_delayed_work_sync(&iter->poke_work); + + kfree(iter); + + mutex_lock(&hyp_trace_lock); + hyp_trace_readers--; + mutex_unlock(&hyp_trace_lock); + + return 0; +} + +static const struct file_operations hyp_trace_pipe_fops = { + .open = hyp_trace_pipe_open, + .read = hyp_trace_pipe_read, + .release = hyp_trace_pipe_release, + .llseek = no_llseek, +}; + +static ssize_t +hyp_trace_raw_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ht_iterator *iter = (struct ht_iterator *)file->private_data; + struct trace_buffer *trace_buffer = iter->trace_buffer; + size_t size; + int ret; + + if (iter->copy_leftover) + goto read; +again: + hyp_trace_read_start(iter->cpu); + ret = ring_buffer_read_page(trace_buffer, &iter->spare, + cnt, iter->cpu, 0); + hyp_trace_read_stop(iter->cpu); + if (ret < 0) { + if (!ring_buffer_empty_cpu(iter->trace_buffer, iter->cpu)) + return 0; + + ret = ring_buffer_wait(trace_buffer, iter->cpu, 0); + if (ret < 0) + return ret; + + goto again; + } + + iter->copy_leftover = 0; +read: + size = PAGE_SIZE - iter->copy_leftover; + if (size > cnt) + size = cnt; + + ret = copy_to_user(ubuf, iter->spare + PAGE_SIZE - size, size); + if (ret == size) + return -EFAULT; + + size -= ret; + *ppos += size; + iter->copy_leftover = ret; + + return size; +} + +static int hyp_trace_raw_open(struct inode *inode, struct file *file) +{ + int ret = hyp_trace_pipe_open(inode, file); + struct ht_iterator *iter; + + if (ret) + return ret; + + iter = file->private_data; + iter->spare = ring_buffer_alloc_read_page(iter->trace_buffer, iter->cpu); + if (IS_ERR(iter->spare)) { + ret = PTR_ERR(iter->spare); + iter->spare = NULL; + return ret; + } + + return 0; +} + +static int hyp_trace_raw_release(struct inode *inode, struct file *file) +{ + struct ht_iterator *iter = file->private_data; + + ring_buffer_free_read_page(iter->trace_buffer, iter->cpu, iter->spare); + + return hyp_trace_pipe_release(inode, file); +} + +static const struct file_operations hyp_trace_raw_fops = { + .open = hyp_trace_raw_open, + .read = hyp_trace_raw_read, + .release = hyp_trace_raw_release, + .llseek = no_llseek, +}; + +static void hyp_tracefs_create_cpu_file(const char *file_name, + unsigned long cpu, + const struct file_operations *fops, + struct dentry *parent) +{ + if (!tracefs_create_file(file_name, 0440, parent, (void *)cpu, fops)) + pr_warn("Failed to create tracefs %pd/%s\n", parent, file_name); +} + +void kvm_hyp_init_events_tracefs(struct dentry *parent); +bool kvm_hyp_events_enable_early(void); + +int init_hyp_tracefs(void) +{ + struct dentry *d, *root_dir, *per_cpu_root_dir; + char per_cpu_name[16]; + unsigned long cpu; + int err; + + if (!is_protected_kvm_enabled()) + return 0; + + root_dir = tracefs_create_dir(TRACEFS_DIR, NULL); + if (!root_dir) { + pr_err("Failed to create tracefs "TRACEFS_DIR"/\n"); + return -ENODEV; + } + + d = tracefs_create_file("tracing_on", 0640, root_dir, NULL, + &hyp_tracing_on_fops); + if (!d) { + pr_err("Failed to create tracefs "TRACEFS_DIR"/tracing_on\n"); + return -ENODEV; + } + + d = tracefs_create_file("buffer_size_kb", 0640, root_dir, NULL, + &hyp_buffer_size_fops); + if (!d) + 
pr_err("Failed to create tracefs "TRACEFS_DIR"/buffer_size_kb\n"); + + + per_cpu_root_dir = tracefs_create_dir("per_cpu", root_dir); + if (!per_cpu_root_dir) { + pr_err("Failed to create tracefs "TRACEFS_DIR"/per_cpu/\n"); + return -ENODEV; + } + + for_each_possible_cpu(cpu) { + struct dentry *dir; + + snprintf(per_cpu_name, sizeof(per_cpu_name), "cpu%lu", cpu); + dir = tracefs_create_dir(per_cpu_name, per_cpu_root_dir); + if (!dir) { + pr_warn("Failed to create tracefs "TRACEFS_DIR"/per_cpu/cpu%lu\n", + cpu); + continue; + } + + hyp_tracefs_create_cpu_file("trace", cpu, &hyp_trace_fops, dir); + hyp_tracefs_create_cpu_file("trace_pipe", cpu, + &hyp_trace_pipe_fops, dir); + hyp_tracefs_create_cpu_file("trace_pipe_raw", cpu, + &hyp_trace_raw_fops, dir); + } + + kvm_hyp_init_events_tracefs(root_dir); + if (kvm_hyp_events_enable_early()) { + err = hyp_start_tracing(); + if (err) + pr_warn("Failed to start early events tracing: %d\n", err); + } + + return 0; +} diff --git a/arch/arm64/kvm/hyp_trace.h b/arch/arm64/kvm/hyp_trace.h new file mode 100644 index 000000000000..fb8172e464f2 --- /dev/null +++ b/arch/arm64/kvm/hyp_trace.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ARM64_KVM_HYP_TRACE_H__ +#define __ARM64_KVM_HYP_TRACE_H__ + +#include +#include + +struct ht_iterator { + struct ring_buffer_iter *buf_iter; + struct trace_buffer *trace_buffer; + struct hyp_entry_hdr *ent; + struct trace_seq seq; + u64 ts; + void *spare; + size_t copy_leftover; + size_t ent_size; + struct delayed_work poke_work; + unsigned long lost_events; + int cpu; +}; + +#ifdef CONFIG_TRACING +int init_hyp_tracefs(void); +#else +static inline int init_hyp_tracefs(void) { return 0; } +#endif +#endif diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 202b8c455724..b4712bcc697d 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -5,10 +5,25 @@ #include #include +#include #include #include +#define KVM_ARM_SMCCC_STD_FEATURES \ + GENMASK(KVM_REG_ARM_STD_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_STD_HYP_FEATURES \ + GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES ({ \ + unsigned long f; \ + f = GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0); \ + if (is_protected_kvm_enabled()) { \ + f |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO); \ + f |= BIT(ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH); \ + } \ + f; \ +}) + static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) { struct system_time_snapshot systime_snapshot; @@ -58,13 +73,79 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) val[3] = lower_32_bits(cycles); } +static bool kvm_hvc_call_default_allowed(u32 func_id) +{ + switch (func_id) { + /* + * List of function-ids that are not gated with the bitmapped + * feature firmware registers, and are to be allowed for + * servicing the call by default. 
+ */ + case ARM_SMCCC_VERSION_FUNC_ID: + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + return true; + default: + /* PSCI 0.2 and up is in the 0:0x1f range */ + if (ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) <= 0x1f) + return true; + + /* + * KVM's PSCI 0.1 doesn't comply with SMCCC, and has + * its own function-id base and range + */ + if (func_id >= KVM_PSCI_FN(0) && func_id <= KVM_PSCI_FN(3)) + return true; + + return false; + } +} + +static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id) +{ + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; + + switch (func_id) { + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_GET_UUID: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + return test_bit(KVM_REG_ARM_STD_BIT_TRNG_V1_0, + &smccc_feat->std_bmap); + case ARM_SMCCC_HV_PV_TIME_FEATURES: + case ARM_SMCCC_HV_PV_TIME_ST: + return test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME, + &smccc_feat->std_hyp_bmap); + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT, + &smccc_feat->vendor_hyp_bmap); + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: + return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP, + &smccc_feat->vendor_hyp_bmap); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + return test_bit(ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH, + &smccc_feat->vendor_hyp_bmap); + default: + return kvm_hvc_call_default_allowed(func_id); + } +} + int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) { + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; u32 func_id = smccc_get_function(vcpu); u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; u32 feature; gpa_t gpa; + if (!kvm_hvc_call_allowed(vcpu, func_id)) + goto out; + switch (func_id) { case ARM_SMCCC_VERSION_FUNC_ID: val[0] = ARM_SMCCC_VERSION_1_1; @@ -120,7 +201,9 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val[0] = SMCCC_RET_SUCCESS; + if (test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME, + &smccc_feat->std_hyp_bmap)) + val[0] = SMCCC_RET_SUCCESS; break; } break; @@ -139,12 +222,29 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; break; case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: - val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); - val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP); + val[0] = smccc_feat->vendor_hyp_bmap; break; case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: kvm_ptp_get_time(vcpu, val); break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + if (!kvm_vm_is_protected(vcpu->kvm)) + break; + atomic64_add( + func_id == ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID ? 
+ PAGE_SIZE : -PAGE_SIZE, + &vcpu->kvm->stat.protected_shared_mem); + val[0] = SMCCC_RET_SUCCESS; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + pkvm_host_reclaim_page(vcpu->kvm, smccc_get_arg1(vcpu)); + val[0] = SMCCC_RET_SUCCESS; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + if (kvm_vm_is_protected(vcpu->kvm) && !topup_hyp_memcache(vcpu)) + val[0] = SMCCC_RET_SUCCESS; + break; case ARM_SMCCC_TRNG_VERSION: case ARM_SMCCC_TRNG_FEATURES: case ARM_SMCCC_TRNG_GET_UUID: @@ -155,6 +255,259 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) return kvm_psci_call(vcpu); } +out: smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); return 1; } + +static const u64 kvm_arm_fw_reg_ids[] = { + KVM_REG_ARM_PSCI_VERSION, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3, + KVM_REG_ARM_STD_BMAP, + KVM_REG_ARM_STD_HYP_BMAP, + KVM_REG_ARM_VENDOR_HYP_BMAP, +}; + +void kvm_arm_init_hypercalls(struct kvm *kvm) +{ + struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat; + + smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES; + smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES; + smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; +} + +int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) +{ + return ARRAY_SIZE(kvm_arm_fw_reg_ids); +} + +int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(kvm_arm_fw_reg_ids); i++) { + if (put_user(kvm_arm_fw_reg_ids[i], uindices++)) + return -EFAULT; + } + + return 0; +} + +#define KVM_REG_FEATURE_LEVEL_MASK GENMASK(3, 0) + +/* + * Convert the workaround level into an easy-to-compare number, where higher + * values mean better protection. + */ +static int get_kernel_wa_level(u64 regid) +{ + switch (regid) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + switch (arm64_get_spectre_v2_state()) { + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case SPECTRE_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + switch (arm64_get_spectre_v4_state()) { + case SPECTRE_MITIGATED: + /* + * As for the hypercall discovery, we pretend we + * don't have any FW mitigation if SSBS is there at + * all times. 
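+ * (With SSBS the guest can control speculative store bypass directly, + * so advertising the firmware-level mitigation would be redundant.)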
+ */ + if (cpus_have_final_cap(ARM64_SSBS)) + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + fallthrough; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + } + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + switch (arm64_get_spectre_bhb_state()) { + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; + case SPECTRE_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; + } + + return -EINVAL; +} + +int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + val = kvm_psci_version(vcpu); + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + break; + case KVM_REG_ARM_STD_BMAP: + val = READ_ONCE(smccc_feat->std_bmap); + break; + case KVM_REG_ARM_STD_HYP_BMAP: + val = READ_ONCE(smccc_feat->std_hyp_bmap); + break; + case KVM_REG_ARM_VENDOR_HYP_BMAP: + val = READ_ONCE(smccc_feat->vendor_hyp_bmap); + break; + default: + return -ENOENT; + } + + if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val) +{ + int ret = 0; + struct kvm *kvm = vcpu->kvm; + struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat; + unsigned long *fw_reg_bmap, fw_reg_features; + + switch (reg_id) { + case KVM_REG_ARM_STD_BMAP: + fw_reg_bmap = &smccc_feat->std_bmap; + fw_reg_features = KVM_ARM_SMCCC_STD_FEATURES; + break; + case KVM_REG_ARM_STD_HYP_BMAP: + fw_reg_bmap = &smccc_feat->std_hyp_bmap; + fw_reg_features = KVM_ARM_SMCCC_STD_HYP_FEATURES; + break; + case KVM_REG_ARM_VENDOR_HYP_BMAP: + fw_reg_bmap = &smccc_feat->vendor_hyp_bmap; + fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; + break; + default: + return -ENOENT; + } + + /* Check for unsupported bit */ + if (val & ~fw_reg_features) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) && + val != *fw_reg_bmap) { + ret = -EBUSY; + goto out; + } + + WRITE_ONCE(*fw_reg_bmap, val); +out: + mutex_unlock(&kvm->lock); + return ret; +} + +int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + int wa_level; + + if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + { + bool wants_02; + + wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); + + switch (val) { + case KVM_ARM_PSCI_0_1: + if (wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + case KVM_ARM_PSCI_0_2: + case KVM_ARM_PSCI_1_0: + case KVM_ARM_PSCI_1_1: + if (!wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + } + break; + } + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + if (val & ~KVM_REG_FEATURE_LEVEL_MASK) + return -EINVAL; + + if (get_kernel_wa_level(reg->id) < val) + return 
-EINVAL; + + return 0; + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + if (val & ~(KVM_REG_FEATURE_LEVEL_MASK | + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) + return -EINVAL; + + /* The enabled bit must not be set unless the level is AVAIL. */ + if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) && + (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL) + return -EINVAL; + + /* + * Map all the possible incoming states to the only two we + * really want to deal with. + */ + switch (val & KVM_REG_FEATURE_LEVEL_MASK) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: + wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: + wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; + break; + default: + return -EINVAL; + } + + /* + * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the + * other way around. + */ + if (get_kernel_wa_level(reg->id) < wa_level) + return -EINVAL; + + return 0; + case KVM_REG_ARM_STD_BMAP: + case KVM_REG_ARM_STD_HYP_BMAP: + case KVM_REG_ARM_VENDOR_HYP_BMAP: + return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val); + default: + return -ENOENT; + } + + return -EINVAL; +} diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index b47df73e98d7..9bcc15fb140f 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -20,9 +20,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr bool is_aarch32 = vcpu_mode_is_32bit(vcpu); u32 esr = 0; - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | - KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); vcpu_write_sys_reg(vcpu, addr, FAR_EL1); @@ -52,9 +50,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) { u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | - KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); /* * Build an unknown exception, depending on the instruction @@ -73,8 +69,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) static void inject_undef32(struct kvm_vcpu *vcpu) { - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_UND | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_UND); } /* @@ -97,14 +92,12 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) far = vcpu_read_sys_reg(vcpu, FAR_EL1); if (is_pabt) { - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IABT | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_IABT); far &= GENMASK(31, 0); far |= (u64)addr << 32; vcpu_write_sys_reg(vcpu, fsr, IFSR32_EL2); } else { /* !iabt */ - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_DABT | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_DABT); far &= GENMASK(63, 32); far |= addr; vcpu_write_sys_reg(vcpu, fsr, ESR_EL1); @@ -145,6 +138,34 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) inject_abt64(vcpu, true, addr); } +void kvm_inject_size_fault(struct kvm_vcpu *vcpu) +{ + unsigned long addr, esr; + + addr = kvm_vcpu_get_fault_ipa(vcpu); + addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + + if (kvm_vcpu_trap_is_iabt(vcpu)) + kvm_inject_pabt(vcpu, addr); + else + kvm_inject_dabt(vcpu, addr); + + /* + * If AArch64 or LPAE, set FSC to 0 to indicate an Address + * Size 
Fault at level 0, as if exceeding PARange. + * + * Non-LPAE guests will only get the external abort, as there + * is no way to describe the ASF. + */ + if (vcpu_el1_is_32bit(vcpu) && + !(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE)) + return; + + esr = vcpu_read_sys_reg(vcpu, ESR_EL1); + esr &= ~GENMASK_ULL(5, 0); + vcpu_write_sys_reg(vcpu, esr, ESR_EL1); +} + /** * kvm_inject_undefined - inject an undefined instruction into the guest * @vcpu: The vCPU in which to inject the exception diff --git a/arch/arm64/kvm/iommu.c b/arch/arm64/kvm/iommu.c new file mode 100644 index 000000000000..08b318a2aae9 --- /dev/null +++ b/arch/arm64/kvm/iommu.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + * Author: David Brazdil + */ + +#include + +static unsigned long dev_to_id(struct device *dev) +{ + /* Use the struct device pointer as a unique identifier. */ + return (unsigned long)dev; +} + +int pkvm_iommu_driver_init(u64 drv, void *data, size_t size) +{ + return kvm_call_hyp_nvhe(__pkvm_iommu_driver_init, drv, data, size); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_driver_init); + +int pkvm_iommu_register(struct device *dev, u64 drv, phys_addr_t pa, + size_t size, struct device *parent) +{ + void *mem; + int ret; + + /* + * Hypcall to register the device. It will return -ENOMEM if it needs + * more memory. In that case allocate a page and retry. + * We assume that hyp never allocates more than a page per hypcall. + */ + ret = kvm_call_hyp_nvhe(__pkvm_iommu_register, dev_to_id(dev), + drv, pa, size, dev_to_id(parent), NULL, 0); + if (ret == -ENOMEM) { + mem = (void *)__get_free_page(GFP_KERNEL); + if (!mem) + return -ENOMEM; + + ret = kvm_call_hyp_nvhe(__pkvm_iommu_register, dev_to_id(dev), + drv, pa, size, dev_to_id(parent), + mem, PAGE_SIZE); + } + return ret; +} +EXPORT_SYMBOL_GPL(pkvm_iommu_register); + +int pkvm_iommu_suspend(struct device *dev) +{ + return kvm_call_hyp_nvhe(__pkvm_iommu_pm_notify, dev_to_id(dev), + PKVM_IOMMU_PM_SUSPEND); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_suspend); + +int pkvm_iommu_resume(struct device *dev) +{ + return kvm_call_hyp_nvhe(__pkvm_iommu_pm_notify, dev_to_id(dev), + PKVM_IOMMU_PM_RESUME); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_resume); + +int pkvm_iommu_finalize(void) +{ + return kvm_call_hyp_nvhe(__pkvm_iommu_finalize); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_finalize); diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 3e2d8ba11a02..db6630c70f8b 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -133,9 +133,19 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) /* * No valid syndrome? Ask userspace for help if it has * volunteered to do so, and bail out otherwise. + * + * In the protected VM case, there isn't much userspace can do + * though, so directly deliver an exception to the guest.
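+ * (The data abort injected below carries the faulting address as + * reported by the hardware, so the guest sees a normal external abort.)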
*/ if (!kvm_vcpu_dabt_isvalid(vcpu)) { - if (vcpu->kvm->arch.return_nisv_io_abort_to_user) { + if (is_protected_kvm_enabled() && + kvm_vm_is_protected(vcpu->kvm)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, + &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); run->arm_nisv.fault_ipa = fault_ipa; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 69bd1732a299..5475a91515d6 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -31,6 +31,13 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? boundary : end; +} /* * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, @@ -52,13 +59,13 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, if (!pgt) return -EINVAL; - next = stage2_pgd_addr_end(kvm, addr, end); + next = stage2_range_addr_end(addr, end); ret = fn(pgt, addr, next - addr); if (ret) break; if (resched && next != end) - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); } while (addr = next, addr != end); return ret; @@ -92,9 +99,13 @@ static bool kvm_is_device_pfn(unsigned long pfn) static void *stage2_memcache_zalloc_page(void *arg) { struct kvm_mmu_memory_cache *mc = arg; + void *virt; /* Allocated with __GFP_ZERO, so no need to zero */ - return kvm_mmu_memory_cache_alloc(mc); + virt = kvm_mmu_memory_cache_alloc(mc); + if (virt) + kvm_account_pgtable_pages(virt, 1); + return virt; } static void *kvm_host_zalloc_pages_exact(size_t size) @@ -102,6 +113,21 @@ static void *kvm_host_zalloc_pages_exact(size_t size) return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } +static void *kvm_s2_zalloc_pages_exact(size_t size) +{ + void *virt = kvm_host_zalloc_pages_exact(size); + + if (virt) + kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); + return virt; +} + +static void kvm_s2_free_pages_exact(void *virt, size_t size) +{ + kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); + free_pages_exact(virt, size); +} + static void kvm_host_get_page(void *addr) { get_page(virt_to_page(addr)); @@ -112,6 +138,15 @@ static void kvm_host_put_page(void *addr) put_page(virt_to_page(addr)); } +static void kvm_s2_put_page(void *addr) +{ + struct page *p = virt_to_page(addr); + /* Dropping last refcount, the page will be freed */ + if (page_count(p) == 1) + kvm_account_pgtable_pages(addr, -1); + put_page(p); +} + static int kvm_host_page_count(void *addr) { return page_count(virt_to_page(addr)); @@ -179,7 +214,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); phys_addr_t end = start + size; - assert_spin_locked(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap, may_block)); @@ -190,6 +225,24 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si __unmap_stage2_range(mmu, start, size, true); } +static void pkvm_stage2_flush(struct kvm *kvm) +{ + struct kvm_pinned_page *ppage; + struct rb_node *node; + + /* + * Contrary to stage2_apply_range(), we don't need to check + * 
whether the VM is being torn down, as this is always called + * from a vcpu thread, and the list is only ever freed on VM + * destroy (which only occurs when all vcpu are gone). + */ + for (node = rb_first(&kvm->arch.pkvm.pinned_pages); node; node = rb_next(node)) { + ppage = rb_entry(node, struct kvm_pinned_page, node); + __clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE); + cond_resched_rwlock_write(&kvm->mmu_lock); + } +} + static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -213,13 +266,17 @@ static void stage2_flush_vm(struct kvm *kvm) int idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) - stage2_flush_memslot(kvm, memslot); + if (!is_protected_kvm_enabled()) { + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) + stage2_flush_memslot(kvm, memslot); + } else if (!kvm_vm_is_protected(kvm)) { + pkvm_stage2_flush(kvm); + } - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } @@ -239,6 +296,9 @@ void free_hyp_pgds(void) static bool kvm_host_owns_hyp_mappings(void) { + if (is_kernel_in_hyp_mode()) + return false; + if (static_branch_likely(&kvm_protected_mode_initialized)) return false; @@ -255,8 +315,8 @@ static bool kvm_host_owns_hyp_mappings(void) return true; } -static int __create_hyp_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot) +int __create_hyp_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { int err; @@ -281,14 +341,117 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) } } -static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end) +struct hyp_shared_pfn { + u64 pfn; + int count; + struct rb_node node; +}; + +static DEFINE_MUTEX(hyp_shared_pfns_lock); +static struct rb_root hyp_shared_pfns = RB_ROOT; + +static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, + struct rb_node **parent) { - phys_addr_t addr; + struct hyp_shared_pfn *this; + + *node = &hyp_shared_pfns.rb_node; + *parent = NULL; + while (**node) { + this = container_of(**node, struct hyp_shared_pfn, node); + *parent = **node; + if (this->pfn < pfn) + *node = &((**node)->rb_left); + else if (this->pfn > pfn) + *node = &((**node)->rb_right); + else + return this; + } + + return NULL; +} + +static int share_pfn_hyp(u64 pfn) +{ + struct rb_node **node, *parent; + struct hyp_shared_pfn *this; + int ret = 0; + + mutex_lock(&hyp_shared_pfns_lock); + this = find_shared_pfn(pfn, &node, &parent); + if (this) { + this->count++; + goto unlock; + } + + this = kzalloc(sizeof(*this), GFP_KERNEL); + if (!this) { + ret = -ENOMEM; + goto unlock; + } + + this->pfn = pfn; + this->count = 1; + rb_link_node(&this->node, parent, node); + rb_insert_color(&this->node, &hyp_shared_pfns); + ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1); +unlock: + mutex_unlock(&hyp_shared_pfns_lock); + + return ret; +} + +static int unshare_pfn_hyp(u64 pfn) +{ + struct rb_node **node, *parent; + struct hyp_shared_pfn *this; + int ret = 0; + + mutex_lock(&hyp_shared_pfns_lock); + this = find_shared_pfn(pfn, &node, &parent); + if (WARN_ON(!this)) { + ret = -ENOENT; + goto unlock; + } + + this->count--; + if (this->count) + goto unlock; + + rb_erase(&this->node, &hyp_shared_pfns); + kfree(this); + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1); +unlock: + 
mutex_unlock(&hyp_shared_pfns_lock); + + return ret; +} + +int kvm_share_hyp(void *from, void *to) +{ + phys_addr_t start, end, cur; + u64 pfn; int ret; - for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) { - ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, - __phys_to_pfn(addr)); + if (is_kernel_in_hyp_mode()) + return 0; + + /* + * The share hcall maps things in the 'fixed-offset' region of the hyp + * VA space, so we can only share physically contiguous data-structures + * for now. + */ + if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) + return -EINVAL; + + if (kvm_host_owns_hyp_mappings()) + return create_hyp_mappings(from, to, PAGE_HYP); + + start = ALIGN_DOWN(__pa(from), PAGE_SIZE); + end = PAGE_ALIGN(__pa(to)); + for (cur = start; cur < end; cur += PAGE_SIZE) { + pfn = __phys_to_pfn(cur); + ret = share_pfn_hyp(pfn); if (ret) return ret; } @@ -296,6 +459,22 @@ static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end) return 0; } +void kvm_unshare_hyp(void *from, void *to) +{ + phys_addr_t start, end, cur; + u64 pfn; + + if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) + return; + + start = ALIGN_DOWN(__pa(from), PAGE_SIZE); + end = PAGE_ALIGN(__pa(to)); + for (cur = start; cur < end; cur += PAGE_SIZE) { + pfn = __phys_to_pfn(cur); + WARN_ON(unshare_pfn_hyp(pfn)); + } +} + /** * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range @@ -316,12 +495,8 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) if (is_kernel_in_hyp_mode()) return 0; - if (!kvm_host_owns_hyp_mappings()) { - if (WARN_ON(prot != PAGE_HYP)) - return -EPERM; - return pkvm_share_hyp(kvm_kaddr_to_phys(from), - kvm_kaddr_to_phys(to)); - } + if (!kvm_host_owns_hyp_mappings()) + return -EPERM; start = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -339,23 +514,22 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return 0; } -static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, - unsigned long *haddr, - enum kvm_pgtable_prot prot) + +/** + * hyp_alloc_private_va_range - Allocates a private VA range. + * @size: The size of the VA range to reserve. + * @haddr: The hypervisor virtual start address of the allocation. + * + * The private virtual address (VA) range is allocated below io_map_base + * and aligned based on the order of @size. + * + * Return: 0 on success or negative error code on failure. + */ +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) { unsigned long base; int ret = 0; - if (!kvm_host_owns_hyp_mappings()) { - base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, - phys_addr, size, prot); - if (IS_ERR_OR_NULL((void *)base)) - return PTR_ERR((void *)base); - *haddr = base; - - return 0; - } - mutex_lock(&kvm_hyp_pgd_mutex); /* @@ -366,8 +540,10 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, * * The allocated size is always a multiple of PAGE_SIZE. 
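+ * For example, a 3-page request has get_order() == 2, so the base + * computed below is aligned down to a 4-page boundary.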
*/ - size = PAGE_ALIGN(size + offset_in_page(phys_addr)); - base = io_map_base - size; + base = io_map_base - PAGE_ALIGN(size); + + /* Align the allocation based on the order of its size */ + base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size)); /* * Verify that BIT(VA_BITS - 1) hasn't been flipped by @@ -377,19 +553,40 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, if ((base ^ io_map_base) & BIT(VA_BITS - 1)) ret = -ENOMEM; else - io_map_base = base; + *haddr = io_map_base = base; mutex_unlock(&kvm_hyp_pgd_mutex); - if (ret) - goto out; + return ret; +} - ret = __create_hyp_mappings(base, size, phys_addr, prot); - if (ret) - goto out; +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, + unsigned long *haddr, + enum kvm_pgtable_prot prot) +{ + unsigned long addr; + int ret = 0; - *haddr = base + offset_in_page(phys_addr); -out: + if (!kvm_host_owns_hyp_mappings()) { + addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, + phys_addr, size, prot); + if (IS_ERR_VALUE(addr)) + return addr; + *haddr = addr; + + return 0; + } + + size = PAGE_ALIGN(size + offset_in_page(phys_addr)); + ret = hyp_alloc_private_va_range(size, &addr); + if (ret) + return ret; + + ret = __create_hyp_mappings(addr, size, phys_addr, prot); + if (ret) + return ret; + + *haddr = addr + offset_in_page(phys_addr); return ret; } @@ -407,6 +604,9 @@ int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, unsigned long addr; int ret; + if (is_protected_kvm_enabled()) + return -EPERM; + *kaddr = ioremap(phys_addr, size); if (!*kaddr) return -ENOMEM; @@ -480,12 +680,23 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); } +static bool stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +{ + return true; +} + +static bool stage2_pte_is_counted(kvm_pte_t pte, u32 level) +{ + return !!pte; +} + static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, - .zalloc_pages_exact = kvm_host_zalloc_pages_exact, - .free_pages_exact = free_pages_exact, + .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, + .free_pages_exact = kvm_s2_free_pages_exact, .get_page = kvm_host_get_page, - .put_page = kvm_host_put_page, + .put_page = kvm_s2_put_page, .page_count = kvm_host_page_count, .phys_to_virt = kvm_host_va, .virt_to_phys = kvm_host_pa, @@ -493,30 +704,67 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .icache_inval_pou = invalidate_icache_guest_page, }; +static struct kvm_pgtable_pte_ops kvm_s2_pte_ops = { + .force_pte_cb = stage2_force_pte_cb, + .pte_is_counted_cb = stage2_pte_is_counted, +}; + /** - * kvm_init_stage2_mmu - Initialise a S2 MMU strucrure + * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine * * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called when the VM is * created, which can only be done once.
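+ * + * Return: 0 on success, negative error code on failure.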
*/ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) { + u32 kvm_ipa_limit = get_kvm_ipa_limit(); int cpu, err; struct kvm_pgtable *pgt; + u64 mmfr0, mmfr1; + u32 phys_shift; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (is_protected_kvm_enabled()) { + phys_shift = kvm_ipa_limit; + } else if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < ARM64_MIN_PARANGE_BITS) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } + } + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + kvm->arch.pkvm.pinned_pages = RB_ROOT; + mmu->arch = &kvm->arch; + + if (is_protected_kvm_enabled()) + return 0; if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } - pgt = kzalloc(sizeof(*pgt), GFP_KERNEL); + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; - err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops); + mmu->arch = &kvm->arch; + err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops, + &kvm_s2_pte_ops); if (err) goto out_free_pgtable; @@ -529,10 +777,8 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; - mmu->arch = &kvm->arch; mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); - WRITE_ONCE(mmu->vmid.vmid_gen, 0); return 0; out_destroy_pgtable: @@ -599,13 +845,13 @@ void stage2_unmap_vm(struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); mmap_read_lock(current->mm); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) stage2_unmap_memslot(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); mmap_read_unlock(current->mm); srcu_read_unlock(&kvm->srcu, idx); } @@ -615,14 +861,17 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); struct kvm_pgtable *pgt = NULL; - spin_lock(&kvm->mmu_lock); + if (is_protected_kvm_enabled()) + return; + + write_lock(&kvm->mmu_lock); pgt = mmu->pgt; if (pgt) { mmu->pgd_phys = 0; mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); if (pgt) { kvm_pgtable_stage2_destroy(pgt); @@ -630,6 +879,95 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) } } +static void hyp_mc_free_fn(void *addr, void *args) +{ + bool account_stage2 = (bool)args; + + if (account_stage2) + kvm_account_pgtable_pages(addr, -1); + + free_page((unsigned long)addr); +} + +static void *hyp_mc_alloc_fn(void *unused) +{ + void *addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + + if (addr) + kvm_account_pgtable_pages(addr, 1); + + return addr; +} + +static void account_hyp_memcache(struct kvm_hyp_memcache *mc, + unsigned long prev_nr_pages, + struct kvm *kvm) +{ + unsigned long nr_pages = mc->nr_pages; + + if (prev_nr_pages == nr_pages) + return; + + if (nr_pages > prev_nr_pages) { + atomic64_add((nr_pages - prev_nr_pages) << PAGE_SHIFT, + &kvm->stat.protected_hyp_mem); + } else { + atomic64_sub((prev_nr_pages - nr_pages) << PAGE_SHIFT, + &kvm->stat.protected_hyp_mem); + } +} + +static void __free_account_hyp_memcache(struct kvm_hyp_memcache *mc, + struct kvm *kvm, + 
bool account_stage2) +{ + unsigned long prev_nr_pages; + + if (!is_protected_kvm_enabled()) + return; + + prev_nr_pages = mc->nr_pages; + __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, + (void *)account_stage2); + account_hyp_memcache(mc, prev_nr_pages, kvm); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm) +{ + __free_account_hyp_memcache(mc, kvm, false); +} + +/* + * All pages donated to the hypervisor through kvm_hyp_memcache are for the + * stage-2 page table. However, kvm_hyp_memcache is also a vehicle to retrieve + * metadata from the hypervisor, hence the need for a stage-2 specific free + * function. + */ +void free_hyp_stage2_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm) +{ + __free_account_hyp_memcache(mc, kvm, true); +} + +int topup_hyp_memcache(struct kvm_vcpu *vcpu) +{ + struct kvm_hyp_memcache *mc = &vcpu->arch.pkvm_memcache; + unsigned long prev_nr_pages; + int err; + + if (!is_protected_kvm_enabled()) + return 0; + + prev_nr_pages = mc->nr_pages; + + err = __topup_hyp_memcache(mc, kvm_mmu_cache_min_pages(vcpu->kvm), + hyp_mc_alloc_fn, + kvm_host_pa, NULL); + if (!err) + account_hyp_memcache(mc, prev_nr_pages, vcpu->kvm); + + return err; +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -650,6 +988,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, KVM_PGTABLE_PROT_R | (writable ? KVM_PGTABLE_PROT_W : 0); + if (is_protected_kvm_enabled()) + return -EPERM; + size += offset_in_page(guest_ipa); guest_ipa &= PAGE_MASK; @@ -659,10 +1000,10 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) break; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, &cache); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); if (ret) break; @@ -710,9 +1051,9 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) start = memslot->base_gfn << PAGE_SHIFT; end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); stage2_wp_range(&kvm->arch.mmu, start, end); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } @@ -939,6 +1280,118 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, return 0; } +static int pkvm_host_map_guest(u64 pfn, u64 gfn) +{ + int ret = kvm_call_hyp_nvhe(__pkvm_host_map_guest, pfn, gfn); + + /* + * Getting -EPERM at this point implies that the pfn has already been + * mapped. This should only ever happen when two vCPUs faulted on the + * same page, and the current one lost the race to do the mapping. + */ + return (ret == -EPERM) ?
-EAGAIN : ret; +} + +static int cmp_ppages(struct rb_node *node, const struct rb_node *parent) +{ + struct kvm_pinned_page *a = container_of(node, struct kvm_pinned_page, node); + struct kvm_pinned_page *b = container_of(parent, struct kvm_pinned_page, node); + + if (a->ipa < b->ipa) + return -1; + if (a->ipa > b->ipa) + return 1; + return 0; +} + +static int insert_ppage(struct kvm *kvm, struct kvm_pinned_page *ppage) +{ + if (rb_find_add(&ppage->node, &kvm->arch.pkvm.pinned_pages, cmp_ppages)) + return -EEXIST; + + return 0; +} + +static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + unsigned long hva) +{ + struct mm_struct *mm = current->mm; + unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE; + struct kvm_pinned_page *ppage; + struct kvm *kvm = vcpu->kvm; + struct page *page; + u64 pfn; + int ret; + + ret = topup_hyp_memcache(vcpu); + if (ret) + return -ENOMEM; + + ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT); + if (!ppage) + return -ENOMEM; + + ret = account_locked_vm(mm, 1, true); + if (ret) + goto free_ppage; + + mmap_read_lock(mm); + ret = pin_user_pages(hva, 1, flags, &page, NULL); + mmap_read_unlock(mm); + + if (ret == -EHWPOISON) { + kvm_send_hwpoison_signal(hva, PAGE_SHIFT); + ret = 0; + goto dec_account; + } else if (ret != 1) { + ret = -EFAULT; + goto dec_account; + } else if (!PageSwapBacked(page)) { + /* + * We really can't deal with page-cache pages returned by GUP + * because (a) we may trigger writeback of a page for which we + * no longer have access and (b) page_mkclean() won't find the + * stage-2 mapping in the rmap so we can get out-of-whack with + * the filesystem when marking the page dirty during unpinning + * (see cc5095747edf ("ext4: don't BUG if someone dirty pages + * without asking ext4 first")). + * + * Ideally we'd just restrict ourselves to anonymous pages, but + * we also want to allow memfd (i.e. shmem) pages, so check for + * pages backed by swap in the knowledge that the GUP pin will + * prevent try_to_unmap() from succeeding. 
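+ * (Anonymous and shmem pages are both PageSwapBacked(), so this check + * admits them while rejecting file-backed page-cache pages.)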
+ */ + ret = -EIO; + goto dec_account; + } + + write_lock(&kvm->mmu_lock); + pfn = page_to_pfn(page); + ret = pkvm_host_map_guest(pfn, fault_ipa >> PAGE_SHIFT); + if (ret) { + if (ret == -EAGAIN) + ret = 0; + goto unpin; + } + + ppage->page = page; + ppage->ipa = fault_ipa; + WARN_ON(insert_ppage(kvm, ppage)); + write_unlock(&kvm->mmu_lock); + + return 0; + +unpin: + write_unlock(&kvm->mmu_lock); + unpin_user_pages(&page, 1); +dec_account: + account_locked_vm(mm, 1, false); +free_ppage: + kfree(ppage); + + return ret; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -956,6 +1409,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); + bool use_read_lock = false; unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); unsigned long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; @@ -990,6 +1444,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (logging_active) { force_pte = true; vma_shift = PAGE_SHIFT; + use_read_lock = (fault_status == FSC_PERM && write_fault && + fault_granule == PAGE_SIZE); } else { vma_shift = get_vma_page_shift(vma, hva); } @@ -1088,7 +1544,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; - spin_lock(&kvm->mmu_lock); + /* + * To reduce MMU contention and enhance concurrency during dirty + * logging, only acquire the read lock for permission relaxation. + */ + if (use_read_lock) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; @@ -1135,6 +1599,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (fault_status == FSC_PERM && vma_pagesize == fault_granule) { ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); } else { + WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n"); + ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, memcache); @@ -1147,7 +1613,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: - spin_unlock(&kvm->mmu_lock); + if (use_read_lock) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(pfn); kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; @@ -1162,10 +1631,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) trace_kvm_access_fault(fault_ipa); - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); pte = __pte(kpte); if (pte_valid(pte)) @@ -1198,6 +1667,26 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); + if (fault_status == FSC_FAULT) { + /* Beyond sanitised PARange (which is the IPA limit) */ + if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { + kvm_inject_size_fault(vcpu); + return 1; + } + + /* Falls between the IPA range and the PARange?
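+ * Only possible when userspace configured an IPA space narrower than + * PARange; the fault is reflected back to the guest as an external abort.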
*/ + if (!is_protected_kvm_enabled() && + fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) { + fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + + if (is_iabt) + kvm_inject_pabt(vcpu, fault_ipa); + else + kvm_inject_dabt(vcpu, fault_ipa); + return 1; + } + } + /* Synchronous External Abort? */ if (kvm_vcpu_abt_issea(vcpu)) { /* @@ -1269,7 +1758,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * faulting VA. This is always 12 bits, irrespective * of the page size. */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); + fault_ipa |= kvm_vcpu_get_hfar(vcpu) & FAR_MASK; ret = io_mem_abort(vcpu, fault_ipa); goto out_unlock; } @@ -1283,7 +1772,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + if (is_protected_kvm_enabled()) + ret = pkvm_mem_abort(vcpu, fault_ipa, hva); + else + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + if (ret == 0) ret = 1; out: @@ -1494,6 +1987,19 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva_t reg_end = hva + mem->memory_size; int ret = 0; + if (is_protected_kvm_enabled()) { + /* In protected mode, cannot modify memslots once a VM has run. */ + if ((change == KVM_MR_DELETE || change == KVM_MR_MOVE) && + kvm->arch.pkvm.handle) { + return -EPERM; + } + + if (memslot && + memslot->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) { + return -EPERM; + } + } + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && change != KVM_MR_FLAGS_ONLY) return 0; @@ -1567,9 +2073,13 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, gpa_t gpa = slot->base_gfn << PAGE_SHIFT; phys_addr_t size = slot->npages << PAGE_SHIFT; - spin_lock(&kvm->mmu_lock); + /* Stage-2 is managed by hyp in protected mode. */ + if (is_protected_kvm_enabled()) + return; + + write_lock(&kvm->mmu_lock); unmap_stage2_range(&kvm->arch.mmu, gpa, size); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } /* diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c deleted file mode 100644 index c84fe24b2ea1..000000000000 --- a/arch/arm64/kvm/perf.c +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Based on the x86 implementation. - * - * Copyright (C) 2012 ARM Ltd. 
- * Author: Marc Zyngier - */ - -#include -#include - -#include - -DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - -static int kvm_is_in_guest(void) -{ - return kvm_get_running_vcpu() != NULL; -} - -static int kvm_is_user_mode(void) -{ - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_running_vcpu(); - - if (vcpu) - return !vcpu_mode_priv(vcpu); - - return 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_running_vcpu(); - - if (vcpu) - return *vcpu_pc(vcpu); - - return 0; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, -}; - -int kvm_perf_init(void) -{ - return perf_register_guest_info_callbacks(&kvm_guest_cbs); -} - -int kvm_perf_teardown(void) -{ - return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); -} diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c new file mode 100644 index 000000000000..4ff98340fcb2 --- /dev/null +++ b/arch/arm64/kvm/pkvm.c @@ -0,0 +1,670 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 - Google LLC + * Author: Quentin Perret + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "hyp_constants.h" + +static struct reserved_mem *pkvm_firmware_mem; +static phys_addr_t *pvmfw_base = &kvm_nvhe_sym(pvmfw_base); +static phys_addr_t *pvmfw_size = &kvm_nvhe_sym(pvmfw_size); + +static struct pkvm_moveable_reg *moveable_regs = kvm_nvhe_sym(pkvm_moveable_regs); +static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); +static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); + +phys_addr_t hyp_mem_base; +phys_addr_t hyp_mem_size; + +static int cmp_hyp_memblock(const void *p1, const void *p2) +{ + const struct memblock_region *r1 = p1; + const struct memblock_region *r2 = p2; + + return r1->base < r2->base ? -1 : (r1->base > r2->base); +} + +static void __init sort_memblock_regions(void) +{ + sort(hyp_memory, + *hyp_memblock_nr_ptr, + sizeof(struct memblock_region), + cmp_hyp_memblock, + NULL); +} + +static int __init register_memblock_regions(void) +{ + struct memblock_region *reg; + + for_each_mem_region(reg) { + if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) + return -ENOMEM; + + hyp_memory[*hyp_memblock_nr_ptr] = *reg; + (*hyp_memblock_nr_ptr)++; + } + sort_memblock_regions(); + + return 0; +} + +static int cmp_moveable_reg(const void *p1, const void *p2) +{ + const struct pkvm_moveable_reg *r1 = p1; + const struct pkvm_moveable_reg *r2 = p2; + + /* + * Moveable regions may overlap, so put the largest one first when start + * addresses are equal to allow a simpler walk from e.g. + * host_stage2_unmap_unmoveable_regs(). 
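+ * (Two regions sharing a start address therefore sort with the larger + * one first.)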
+ */ + if (r1->start < r2->start) + return -1; + else if (r1->start > r2->start) + return 1; + else if (r1->size > r2->size) + return -1; + else if (r1->size < r2->size) + return 1; + return 0; +} + +static void __init sort_moveable_regs(void) +{ + sort(moveable_regs, + kvm_nvhe_sym(pkvm_moveable_regs_nr), + sizeof(struct pkvm_moveable_reg), + cmp_moveable_reg, + NULL); +} + +static int __init register_moveable_regions(void) +{ + struct memblock_region *reg; + struct device_node *np; + int i = 0; + + for_each_mem_region(reg) { + if (i >= PKVM_NR_MOVEABLE_REGS) + return -ENOMEM; + moveable_regs[i].start = reg->base; + moveable_regs[i].size = reg->size; + moveable_regs[i].type = PKVM_MREG_MEMORY; + i++; + } + + for_each_compatible_node(np, NULL, "pkvm,protected-region") { + struct resource res; + u64 start, size; + int ret; + + if (i >= PKVM_NR_MOVEABLE_REGS) + return -ENOMEM; + + ret = of_address_to_resource(np, 0, &res); + if (ret) + return ret; + + start = res.start; + size = resource_size(&res); + if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size)) + return -EINVAL; + + moveable_regs[i].start = start; + moveable_regs[i].size = size; + moveable_regs[i].type = PKVM_MREG_PROTECTED_RANGE; + i++; + } + + kvm_nvhe_sym(pkvm_moveable_regs_nr) = i; + sort_moveable_regs(); + + return 0; +} + +void __init kvm_hyp_reserve(void) +{ + u64 hyp_mem_pages = 0; + int ret; + + if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) + return; + + if (kvm_get_mode() != KVM_MODE_PROTECTED) + return; + + ret = register_memblock_regions(); + if (ret) { + *hyp_memblock_nr_ptr = 0; + kvm_err("Failed to register hyp memblocks: %d\n", ret); + return; + } + + ret = register_moveable_regions(); + if (ret) { + *hyp_memblock_nr_ptr = 0; + kvm_err("Failed to register pkvm moveable regions: %d\n", ret); + return; + } + + hyp_mem_pages += hyp_s1_pgtable_pages(); + hyp_mem_pages += host_s2_pgtable_pages(); + hyp_mem_pages += hyp_vm_table_pages(); + hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); + hyp_mem_pages += hyp_ffa_proxy_pages(); + + /* + * Try to allocate a PMD-aligned region to reduce TLB pressure once + * this is unmapped from the host stage-2, and fall back to PAGE_SIZE. + */ + hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; + hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), + PMD_SIZE); + if (!hyp_mem_base) + hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); + else + hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); + + if (!hyp_mem_base) { + kvm_err("Failed to reserve hyp memory\n"); + return; + } + + kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, + hyp_mem_base); +} + +/* + * Allocates and donates memory for hypervisor VM structs at EL2. + * + * Allocates space for the VM state, which includes the hyp vm as well as + * the hyp vcpus. + * + * Stores an opaque handle in the kvm struct for future reference. + * + * Returns 0 on success, negative error code on failure. + */ +static int __pkvm_create_hyp_vm(struct kvm *host_kvm) +{ + size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz, last_ran_sz, total_sz; + struct kvm_vcpu *host_vcpu; + pkvm_handle_t handle; + void *pgd, *hyp_vm, *last_ran; + unsigned long idx; + int ret; + + if (host_kvm->created_vcpus < 1) + return -EINVAL; + + pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); + + /* + * The PGD pages will be reclaimed using a hyp_memcache which implies + * page granularity. So, use alloc_pages_exact() to get individual + * refcounts.
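+ * (alloc_pages_exact() splits its high-order allocation into + * individually refcounted order-0 pages, unlike alloc_pages().)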
+ */ + pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT); + if (!pgd) + return -ENOMEM; + + /* Allocate memory to donate to hyp for vm and vcpu pointers. */ + hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, + size_mul(sizeof(void *), + host_kvm->created_vcpus))); + hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vm) { + ret = -ENOMEM; + goto free_pgd; + } + + /* Allocate memory to donate to hyp for tracking mmu->last_vcpu_ran. */ + last_ran_sz = PAGE_ALIGN(array_size(num_possible_cpus(), sizeof(int))); + last_ran = alloc_pages_exact(last_ran_sz, GFP_KERNEL_ACCOUNT); + if (!last_ran) { + ret = -ENOMEM; + goto free_vm; + } + + /* Donate the VM memory to hyp and let hyp initialize it. */ + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd, last_ran); + if (ret < 0) + goto free_last_ran; + + handle = ret; + + host_kvm->arch.pkvm.handle = handle; + + total_sz = hyp_vm_sz + last_ran_sz + pgd_sz; + + /* Donate memory for the vcpus at hyp and initialize it. */ + hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); + kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { + void *hyp_vcpu; + + /* Indexing of the vcpus to be sequential starting at 0. */ + if (WARN_ON(host_vcpu->vcpu_idx != idx)) { + ret = -EINVAL; + goto destroy_vm; + } + + hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vcpu) { + ret = -ENOMEM; + goto destroy_vm; + } + + total_sz += hyp_vcpu_sz; + + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, + hyp_vcpu); + if (ret) { + free_pages_exact(hyp_vcpu, hyp_vcpu_sz); + goto destroy_vm; + } + } + + atomic64_set(&host_kvm->stat.protected_hyp_mem, total_sz); + kvm_account_pgtable_pages(pgd, pgd_sz >> PAGE_SHIFT); + + return 0; + +destroy_vm: + pkvm_destroy_hyp_vm(host_kvm); + return ret; +free_last_ran: + free_pages_exact(last_ran, last_ran_sz); +free_vm: + free_pages_exact(hyp_vm, hyp_vm_sz); +free_pgd: + free_pages_exact(pgd, pgd_sz); + return ret; +} + +int pkvm_create_hyp_vm(struct kvm *host_kvm) +{ + int ret = 0; + + mutex_lock(&host_kvm->lock); + if (!host_kvm->arch.pkvm.handle) + ret = __pkvm_create_hyp_vm(host_kvm); + mutex_unlock(&host_kvm->lock); + + return ret; +} + +void pkvm_destroy_hyp_vm(struct kvm *host_kvm) +{ + struct kvm_pinned_page *ppage; + struct mm_struct *mm = current->mm; + struct rb_node *node; + + if (!host_kvm->arch.pkvm.handle) + goto out_free; + + WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle)); + + node = rb_first(&host_kvm->arch.pkvm.pinned_pages); + while (node) { + ppage = rb_entry(node, struct kvm_pinned_page, node); + WARN_ON(kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page, + host_kvm->arch.pkvm.handle, + page_to_pfn(ppage->page), + ppage->ipa)); + cond_resched(); + + account_locked_vm(mm, 1, false); + unpin_user_pages_dirty_lock(&ppage->page, 1, true); + node = rb_next(node); + rb_erase(&ppage->node, &host_kvm->arch.pkvm.pinned_pages); + kfree(ppage); + } + + WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle)); + +out_free: + host_kvm->arch.pkvm.handle = 0; + free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc, host_kvm); + free_hyp_stage2_memcache(&host_kvm->arch.pkvm.teardown_stage2_mc, + host_kvm); +} + +int pkvm_init_host_vm(struct kvm *host_kvm, unsigned long type) +{ + mutex_init(&host_kvm->lock); + + if (!(type & KVM_VM_TYPE_ARM_PROTECTED)) + return 0; + + if (!is_protected_kvm_enabled()) + return -EINVAL; + + host_kvm->arch.pkvm.pvmfw_load_addr = PVMFW_INVALID_LOAD_ADDR; + host_kvm->arch.pkvm.enabled = true; + return 0; 
+} + +static int rb_ppage_cmp(const void *key, const struct rb_node *node) +{ + struct kvm_pinned_page *p = container_of(node, struct kvm_pinned_page, node); + phys_addr_t ipa = (phys_addr_t)key; + + return (ipa < p->ipa) ? -1 : (ipa > p->ipa); +} + +void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa) +{ + struct kvm_pinned_page *ppage; + struct mm_struct *mm = current->mm; + struct rb_node *node; + + write_lock(&host_kvm->mmu_lock); + node = rb_find((void *)ipa, &host_kvm->arch.pkvm.pinned_pages, + rb_ppage_cmp); + if (node) + rb_erase(node, &host_kvm->arch.pkvm.pinned_pages); + write_unlock(&host_kvm->mmu_lock); + + WARN_ON(!node); + if (!node) + return; + + ppage = container_of(node, struct kvm_pinned_page, node); + account_locked_vm(mm, 1, false); + unpin_user_pages_dirty_lock(&ppage->page, 1, true); + kfree(ppage); +} + +static int __init pkvm_firmware_rmem_err(struct reserved_mem *rmem, + const char *reason) +{ + phys_addr_t end = rmem->base + rmem->size; + + kvm_err("Ignoring pkvm guest firmware memory reservation [%pa - %pa]: %s\n", + &rmem->base, &end, reason); + return -EINVAL; +} + +static int __init pkvm_firmware_rmem_init(struct reserved_mem *rmem) +{ + unsigned long node = rmem->fdt_node; + + if (pkvm_firmware_mem) + return pkvm_firmware_rmem_err(rmem, "duplicate reservation"); + + if (!of_get_flat_dt_prop(node, "no-map", NULL)) + return pkvm_firmware_rmem_err(rmem, "missing \"no-map\" property"); + + if (of_get_flat_dt_prop(node, "reusable", NULL)) + return pkvm_firmware_rmem_err(rmem, "\"reusable\" property unsupported"); + + if (!PAGE_ALIGNED(rmem->base)) + return pkvm_firmware_rmem_err(rmem, "base is not page-aligned"); + + if (!PAGE_ALIGNED(rmem->size)) + return pkvm_firmware_rmem_err(rmem, "size is not page-aligned"); + + *pvmfw_size = rmem->size; + *pvmfw_base = rmem->base; + pkvm_firmware_mem = rmem; + return 0; +} +RESERVEDMEM_OF_DECLARE(pkvm_firmware, "linux,pkvm-guest-firmware-memory", + pkvm_firmware_rmem_init); + +static int __init pkvm_firmware_rmem_clear(void) +{ + void *addr; + phys_addr_t size; + + if (likely(!pkvm_firmware_mem) || is_protected_kvm_enabled()) + return 0; + + kvm_info("Clearing unused pKVM firmware memory\n"); + size = pkvm_firmware_mem->size; + addr = memremap(pkvm_firmware_mem->base, size, MEMREMAP_WB); + if (!addr) + return -EINVAL; + + memset(addr, 0, size); + dcache_clean_poc((unsigned long)addr, (unsigned long)addr + size); + memunmap(addr); + return 0; +} +device_initcall_sync(pkvm_firmware_rmem_clear); + +static int pkvm_vm_ioctl_set_fw_ipa(struct kvm *kvm, u64 ipa) +{ + int ret = 0; + + if (!pkvm_firmware_mem) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (kvm->arch.pkvm.handle) { + ret = -EBUSY; + goto out_unlock; + } + + kvm->arch.pkvm.pvmfw_load_addr = ipa; +out_unlock: + mutex_unlock(&kvm->lock); + return ret; +} + +static int pkvm_vm_ioctl_info(struct kvm *kvm, + struct kvm_protected_vm_info __user *info) +{ + struct kvm_protected_vm_info kinfo = { + .firmware_size = pkvm_firmware_mem ? + pkvm_firmware_mem->size : + 0, + }; + + return copy_to_user(info, &kinfo, sizeof(kinfo)) ? 
-EFAULT : 0; +} + +int pkvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + if (!kvm_vm_is_protected(kvm)) + return -EINVAL; + + if (cap->args[1] || cap->args[2] || cap->args[3]) + return -EINVAL; + + switch (cap->flags) { + case KVM_CAP_ARM_PROTECTED_VM_FLAGS_SET_FW_IPA: + return pkvm_vm_ioctl_set_fw_ipa(kvm, cap->args[0]); + case KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO: + return pkvm_vm_ioctl_info(kvm, (void __force __user *)cap->args[0]); + default: + return -EINVAL; + } + + return 0; +} + +#ifdef CONFIG_MODULES +static int __init early_pkvm_enable_modules(char *arg) +{ + extern unsigned long kvm_nvhe_sym(pkvm_priv_hcall_limit); + + /* + * Move the limit to allow module loading HVCs. It will be moved back to + * its original position in __pkvm_close_module_registration(). + */ + kvm_nvhe_sym(pkvm_priv_hcall_limit) = __KVM_HOST_SMCCC_FUNC___pkvm_alloc_module_va; + + return 0; +} +early_param("kvm-arm.protected_modules", early_pkvm_enable_modules); + +struct pkvm_mod_sec_mapping { + struct pkvm_module_section *sec; + enum kvm_pgtable_prot prot; +}; + +static void pkvm_unmap_module_pages(void *kern_va, void *hyp_va, size_t size) +{ + size_t offset; + u64 pfn; + + for (offset = 0; offset < size; offset += PAGE_SIZE) { + pfn = vmalloc_to_pfn(kern_va + offset); + kvm_call_hyp_nvhe(__pkvm_unmap_module_page, pfn, + hyp_va + offset); + } +} + +static void pkvm_unmap_module_sections(struct pkvm_mod_sec_mapping *secs_map, void *hyp_va_base, int nr_secs) +{ + size_t offset, size; + void *start; + int i; + + for (i = 0; i < nr_secs; i++) { + start = secs_map[i].sec->start; + size = secs_map[i].sec->end - start; + offset = start - secs_map[0].sec->start; + pkvm_unmap_module_pages(start, hyp_va_base + offset, size); + } +} + +static int pkvm_map_module_section(struct pkvm_mod_sec_mapping *sec_map, void *hyp_va) +{ + size_t offset, size = sec_map->sec->end - sec_map->sec->start; + int ret; + u64 pfn; + + for (offset = 0; offset < size; offset += PAGE_SIZE) { + pfn = vmalloc_to_pfn(sec_map->sec->start + offset); + ret = kvm_call_hyp_nvhe(__pkvm_map_module_page, pfn, + hyp_va + offset, sec_map->prot); + if (ret) { + pkvm_unmap_module_pages(sec_map->sec->start, hyp_va, offset); + return ret; + } + } + + return 0; +} + +static int pkvm_map_module_sections(struct pkvm_mod_sec_mapping *secs_map, void *hyp_va_base, int nr_secs) +{ + size_t offset; + int i, ret; + + for (i = 0; i < nr_secs; i++) { + offset = secs_map[i].sec->start - secs_map[0].sec->start; + ret = pkvm_map_module_section(&secs_map[i], hyp_va_base + offset); + if (ret) { + pkvm_unmap_module_sections(secs_map, hyp_va_base, i); + return ret; + } + } + + return 0; +} + +static int __pkvm_cmp_mod_sec(const void *p1, const void *p2) +{ + struct pkvm_mod_sec_mapping const *s1 = p1; + struct pkvm_mod_sec_mapping const *s2 = p2; + + return s1->sec->start < s2->sec->start ? 
-1 : s1->sec->start > s2->sec->start; +} + +int __pkvm_load_el2_module(struct module *this, unsigned long *token) +{ + struct pkvm_el2_module *mod = &this->arch.hyp; + struct pkvm_mod_sec_mapping secs_map[] = { + { &mod->text, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X }, + { &mod->bss, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W }, + { &mod->rodata, KVM_PGTABLE_PROT_R }, + { &mod->data, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W }, + }; + void *start, *end, *hyp_va; + kvm_nvhe_reloc_t *endrel; + size_t offset, size; + int ret, i; + + if (!is_protected_kvm_enabled()) + return -EOPNOTSUPP; + + for (i = 0; i < ARRAY_SIZE(secs_map); i++) { + if (!PAGE_ALIGNED(secs_map[i].sec->start)) { + kvm_err("EL2 sections are not page-aligned\n"); + return -EINVAL; + } + } + + if (!try_module_get(this)) { + kvm_err("Kernel module has been unloaded\n"); + return -ENODEV; + } + + sort(secs_map, ARRAY_SIZE(secs_map), sizeof(secs_map[0]), __pkvm_cmp_mod_sec, NULL); + start = secs_map[0].sec->start; + end = secs_map[ARRAY_SIZE(secs_map) - 1].sec->end; + size = end - start; + + hyp_va = (void *)kvm_call_hyp_nvhe(__pkvm_alloc_module_va, size >> PAGE_SHIFT); + if (!hyp_va) { + kvm_err("Failed to allocate hypervisor VA space for EL2 module\n"); + module_put(this); + return -ENOMEM; + } + + /* + * The token can be used for other calls related to this module. + * Conveniently the only information needed is this addr so let's use it + * as an identifier. + */ + if (token) + *token = (unsigned long)hyp_va; + + endrel = (void *)mod->relocs + mod->nr_relocs * sizeof(*endrel); + kvm_apply_hyp_module_relocations(start, hyp_va, mod->relocs, endrel); + + ret = pkvm_map_module_sections(secs_map, hyp_va, ARRAY_SIZE(secs_map)); + if (ret) { + kvm_err("Failed to map EL2 module page: %d\n", ret); + module_put(this); + return ret; + } + + offset = (size_t)((void *)mod->init - start); + ret = kvm_call_hyp_nvhe(__pkvm_init_module, hyp_va + offset); + if (ret) { + kvm_err("Failed to init EL2 module: %d\n", ret); + pkvm_unmap_module_sections(secs_map, hyp_va, ARRAY_SIZE(secs_map)); + module_put(this); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(__pkvm_load_el2_module); + +int __pkvm_register_el2_call(unsigned long hfn_hyp_va) +{ + return kvm_call_hyp_nvhe(__pkvm_register_hcall, hfn_hyp_va); +} +EXPORT_SYMBOL_GPL(__pkvm_register_el2_call); +#endif /* CONFIG_MODULES */ diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 2af3c37445e0..0003c7d37533 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,11 @@ #include #include +DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + +static LIST_HEAD(arm_pmus); +static DEFINE_MUTEX(arm_pmus_lock); + static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); @@ -22,15 +28,20 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); static u32 kvm_pmu_event_mask(struct kvm *kvm) { - switch (kvm->arch.pmuver) { - case ID_AA64DFR0_PMUVER_8_0: + unsigned int pmuver; + + pmuver = kvm->arch.arm_pmu->pmuver; + + switch (pmuver) { + case ID_AA64DFR0_EL1_PMUVer_IMP: return GENMASK(9, 0); - case ID_AA64DFR0_PMUVER_8_1: - case ID_AA64DFR0_PMUVER_8_4: - case ID_AA64DFR0_PMUVER_8_5: + case ID_AA64DFR0_EL1_PMUVer_V3P1: + case ID_AA64DFR0_EL1_PMUVer_V3P4: + case 
ID_AA64DFR0_EL1_PMUVer_V3P5: + case ID_AA64DFR0_EL1_PMUVer_V3P7: return GENMASK(15, 0); default: /* Shouldn't be here, just for sanity */ - WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver); + WARN_ONCE(1, "Unknown PMU version %d\n", pmuver); return 0; } } @@ -166,6 +177,9 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_pmc *pmc = &pmu->pmc[select_idx]; + if (!kvm_vcpu_has_pmu(vcpu)) + return 0; + counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); if (kvm_pmu_pmc_is_chained(pmc) && @@ -187,6 +201,9 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { u64 reg; + if (!kvm_vcpu_has_pmu(vcpu)) + return; + reg = (select_idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); @@ -311,6 +328,9 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_pmc *pmc; + if (!kvm_vcpu_has_pmu(vcpu)) + return; + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) return; @@ -346,7 +366,7 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_pmc *pmc; - if (!val) + if (!kvm_vcpu_has_pmu(vcpu) || !val) return; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { @@ -516,6 +536,9 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) struct kvm_pmu *pmu = &vcpu->arch.pmu; int i; + if (!kvm_vcpu_has_pmu(vcpu)) + return; + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) return; @@ -565,6 +588,9 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { int i; + if (!kvm_vcpu_has_pmu(vcpu)) + return; + if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); @@ -597,6 +623,7 @@ static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) */ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) { + struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_pmc *pmc; struct perf_event *event; @@ -633,7 +660,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) return; memset(&attr, 0, sizeof(struct perf_event_attr)); - attr.type = PERF_TYPE_RAW; + attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx); @@ -727,6 +754,9 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, { u64 reg, mask; + if (!kvm_vcpu_has_pmu(vcpu)) + return; + mask = ARMV8_PMU_EVTYPE_MASK; mask &= ~ARMV8_PMU_EVTYPE_EVENT; mask |= kvm_pmu_event_mask(vcpu->kvm); @@ -742,17 +772,32 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, void kvm_host_pmu_init(struct arm_pmu *pmu) { - if (pmu->pmuver != 0 && pmu->pmuver != ID_AA64DFR0_PMUVER_IMP_DEF && - !kvm_arm_support_pmu_v3() && !is_protected_kvm_enabled()) + struct arm_pmu_entry *entry; + + if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return; + + mutex_lock(&arm_pmus_lock); + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_unlock; + + entry->arm_pmu = pmu; + list_add_tail(&entry->entry, &arm_pmus); + + if (list_is_singular(&arm_pmus)) static_branch_enable(&kvm_arm_pmu_available); + +out_unlock: + mutex_unlock(&arm_pmus_lock); } -static int kvm_pmu_probe_pmuver(void) +static struct arm_pmu 
*kvm_pmu_probe_armpmu(void) { struct perf_event_attr attr = { }; struct perf_event *event; - struct arm_pmu *pmu; - int pmuver = ID_AA64DFR0_PMUVER_IMP_DEF; + struct arm_pmu *pmu = NULL; /* * Create a dummy event that only counts user cycles. As we'll never @@ -777,19 +822,20 @@ static int kvm_pmu_probe_pmuver(void) if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", PTR_ERR(event)); - return ID_AA64DFR0_PMUVER_IMP_DEF; + return NULL; } if (event->pmu) { pmu = to_arm_pmu(event->pmu); - if (pmu->pmuver) - pmuver = pmu->pmuver; + if (pmu->pmuver == 0 || + pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + pmu = NULL; } perf_event_disable(event); perf_event_release_kernel(event); - return pmuver; + return pmu; } u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) @@ -798,6 +844,9 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) u64 val, mask = 0; int base, i, nr_events; + if (!kvm_vcpu_has_pmu(vcpu)) + return 0; + if (!pmceid1) { val = read_sysreg(pmceid0_el0); base = 0; @@ -807,7 +856,7 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) * Don't advertise STALL_SLOT, as PMMIR_EL0 is handled * as RAZ */ - if (vcpu->kvm->arch.pmuver >= ID_AA64DFR0_PMUVER_8_4) + if (vcpu->kvm->arch.arm_pmu->pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4) val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32); base = 32; } @@ -900,7 +949,7 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) */ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -919,26 +968,64 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) return true; } +static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) +{ + struct kvm *kvm = vcpu->kvm; + struct arm_pmu_entry *entry; + struct arm_pmu *arm_pmu; + int ret = -ENXIO; + + mutex_lock(&kvm->lock); + mutex_lock(&arm_pmus_lock); + + list_for_each_entry(entry, &arm_pmus, entry) { + arm_pmu = entry->arm_pmu; + if (arm_pmu->pmu.type == pmu_id) { + if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) || + (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) { + ret = -EBUSY; + break; + } + + kvm->arch.arm_pmu = arm_pmu; + cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); + ret = 0; + break; + } + } + + mutex_unlock(&arm_pmus_lock); + mutex_unlock(&kvm->lock); + return ret; +} + int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { + struct kvm *kvm = vcpu->kvm; + if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (vcpu->arch.pmu.created) return -EBUSY; - if (!vcpu->kvm->arch.pmuver) - vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver(); - - if (vcpu->kvm->arch.pmuver == ID_AA64DFR0_PMUVER_IMP_DEF) - return -ENODEV; + mutex_lock(&kvm->lock); + if (!kvm->arch.arm_pmu) { + /* No PMU set, get the default one */ + kvm->arch.arm_pmu = kvm_pmu_probe_armpmu(); + if (!kvm->arch.arm_pmu) { + mutex_unlock(&kvm->lock); + return -ENODEV; + } + } + mutex_unlock(&kvm->lock); switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; int irq; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!irqchip_in_kernel(kvm)) return -EINVAL; if (get_user(irq, uaddr)) @@ -948,7 +1035,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!(irq_is_ppi(irq) || irq_is_spi(irq))) return -EINVAL; - if (!pmu_irq_is_valid(vcpu->kvm, irq)) + if (!pmu_irq_is_valid(kvm, irq)) return -EINVAL; if (kvm_arm_pmu_irq_initialized(vcpu)) @@ -963,7 +1050,7 @@ int 
kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) struct kvm_pmu_event_filter filter; int nr_events; - nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; + nr_events = kvm_pmu_event_mask(kvm) + 1; uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; @@ -975,12 +1062,17 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) filter.action != KVM_PMU_EVENT_DENY)) return -EINVAL; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&kvm->lock); - if (!vcpu->kvm->arch.pmu_filter) { - vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL); - if (!vcpu->kvm->arch.pmu_filter) { - mutex_unlock(&vcpu->kvm->lock); + if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags)) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + if (!kvm->arch.pmu_filter) { + kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); + if (!kvm->arch.pmu_filter) { + mutex_unlock(&kvm->lock); return -ENOMEM; } @@ -991,20 +1083,29 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) * events, the default is to allow. */ if (filter.action == KVM_PMU_EVENT_ALLOW) - bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events); + bitmap_zero(kvm->arch.pmu_filter, nr_events); else - bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events); + bitmap_fill(kvm->arch.pmu_filter, nr_events); } if (filter.action == KVM_PMU_EVENT_ALLOW) - bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents); + bitmap_set(kvm->arch.pmu_filter, filter.base_event, filter.nevents); else - bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents); + bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&kvm->lock); return 0; } + case KVM_ARM_VCPU_PMU_V3_SET_PMU: { + int __user *uaddr = (int __user *)(long)attr->addr; + int pmu_id; + + if (get_user(pmu_id, uaddr)) + return -EFAULT; + + return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id); + } case KVM_ARM_VCPU_PMU_V3_INIT: return kvm_arm_pmu_v3_init(vcpu); } @@ -1042,6 +1143,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) case KVM_ARM_VCPU_PMU_V3_IRQ: case KVM_ARM_VCPU_PMU_V3_INIT: case KVM_ARM_VCPU_PMU_V3_FILTER: + case KVM_ARM_VCPU_PMU_V3_SET_PMU: if (kvm_vcpu_has_pmu(vcpu)) return 0; } diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 03a6c1f4a09a..7887133d15f0 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -5,7 +5,8 @@ */ #include #include -#include + +static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); /* * Given the perf event attributes and system type, determine @@ -25,21 +26,26 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr) return (attr->exclude_host != attr->exclude_guest); } +struct kvm_pmu_events *kvm_get_pmu_events(void) +{ + return this_cpu_ptr(&kvm_pmu_events); +} + /* * Add events to track that we may want to switch at guest entry/exit * time. 
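 *
 * An illustrative sketch (not part of this patch; BIT(5) is a made-up
 * counter index): a host perf event created with exclude_guest set only
 * lands in the events_host mask, so only that set is reprogrammed around
 * guest entry/exit:
 *
 *	struct perf_event_attr attr = { .exclude_guest = 1 };
 *
 *	kvm_set_pmu_events(BIT(5), &attr);	// events_host |= BIT(5) only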
*/ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { - struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); + struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr)) + if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) - ctx->pmu_events.events_host |= set; + pmu->events_host |= set; if (!attr->exclude_guest) - ctx->pmu_events.events_guest |= set; + pmu->events_guest |= set; } /* @@ -47,13 +53,13 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) */ void kvm_clr_pmu_events(u32 clr) { - struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); + struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !ctx) + if (!kvm_arm_support_pmu_v3() || !pmu) return; - ctx->pmu_events.events_host &= ~clr; - ctx->pmu_events.events_guest &= ~clr; + pmu->events_host &= ~clr; + pmu->events_guest &= ~clr; } #define PMEVTYPER_READ_CASE(idx) \ @@ -169,16 +175,16 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) */ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { - struct kvm_host_data *host; + struct kvm_pmu_events *pmu; u32 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; preempt_disable(); - host = this_cpu_ptr_hyp_sym(kvm_host_data); - events_guest = host->pmu_events.events_guest; - events_host = host->pmu_events.events_host; + pmu = kvm_get_pmu_events(); + events_guest = pmu->events_guest; + events_host = pmu->events_host; kvm_vcpu_pmu_enable_el0(events_guest); kvm_vcpu_pmu_disable_el0(events_host); @@ -190,15 +196,15 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) */ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) { - struct kvm_host_data *host; + struct kvm_pmu_events *pmu; u32 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; - host = this_cpu_ptr_hyp_sym(kvm_host_data); - events_guest = host->pmu_events.events_guest; - events_host = host->pmu_events.events_host; + pmu = kvm_get_pmu_events(); + events_guest = pmu->events_guest; + events_host = pmu->events_host; kvm_vcpu_pmu_enable_el0(events_host); kvm_vcpu_pmu_disable_el0(events_guest); diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 44efe12dfc06..b934df9039e1 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -21,16 +21,6 @@ * as described in ARM document number ARM DEN 0022A. */ -#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) - -static unsigned long psci_affinity_mask(unsigned long affinity_level) -{ - if (affinity_level <= 3) - return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); - - return 0; -} - static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) { /* @@ -46,25 +36,11 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) * specification (ARM DEN 0022A). This means all suspend states * for KVM will preserve the register state. 
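 *
 * For illustration only (a hypothetical PSCI-0.2+ guest using SMC as its
 * conduit; power_state, entry_point and context_id are placeholders), any
 * requested suspend state funnels into the same WFI-style wait:
 *
 *	struct arm_smccc_res res;
 *
 *	arm_smccc_1_1_smc(PSCI_0_2_FN64_CPU_SUSPEND, power_state,
 *			  entry_point, context_id, &res);
 *	// res.a0 == PSCI_RET_SUCCESS once a wakeup event is pending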
*/ - kvm_vcpu_block(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); + kvm_vcpu_wfi(vcpu); return PSCI_RET_SUCCESS; } -static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) -{ - vcpu->arch.power_off = true; - kvm_make_request(KVM_REQ_SLEEP, vcpu); - kvm_vcpu_kick(vcpu); -} - -static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, - unsigned long affinity) -{ - return !(affinity & ~MPIDR_HWID_BITMASK); -} - static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct vcpu_reset_state *reset_state; @@ -84,8 +60,8 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ if (!vcpu) return PSCI_RET_INVALID_PARAMS; - if (!vcpu->arch.power_off) { - if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1) + if (!kvm_arm_vcpu_stopped(vcpu)) { + if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) return PSCI_RET_ALREADY_ON; else return PSCI_RET_INVALID_PARAMS; @@ -108,12 +84,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); /* - * Make sure the reset request is observed if the change to - * power_state is observed. + * Make sure the reset request is observed if the RUNNABLE mp_state is + * observed. */ smp_wmb(); - vcpu->arch.power_off = false; + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; kvm_vcpu_wake_up(vcpu); return PSCI_RET_SUCCESS; @@ -121,8 +97,8 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) { - int i, matching_cpus = 0; - unsigned long mpidr; + int matching_cpus = 0; + unsigned long i, mpidr; unsigned long target_affinity; unsigned long target_affinity_mask; unsigned long lowest_affinity_level; @@ -151,7 +127,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) mpidr = kvm_vcpu_get_mpidr_aff(tmp); if ((mpidr & target_affinity_mask) == target_affinity) { matching_cpus++; - if (!tmp->arch.power_off) + if (!kvm_arm_vcpu_stopped(tmp)) return PSCI_0_2_AFFINITY_LEVEL_ON; } } @@ -162,9 +138,9 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) return PSCI_0_2_AFFINITY_LEVEL_OFF; } -static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) +static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags) { - int i; + unsigned long i; struct kvm_vcpu *tmp; /* @@ -177,47 +153,47 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) * re-initialized. */ kvm_for_each_vcpu(i, tmp, vcpu->kvm) - tmp->arch.power_off = true; + tmp->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); vcpu->run->system_event.type = type; + vcpu->run->system_event.flags = flags; vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; } static void kvm_psci_system_off(struct kvm_vcpu *vcpu) { - kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN); + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0); } static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) { - kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0); } -static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) +static void kvm_psci_system_reset2(struct kvm_vcpu *vcpu) { - int i; + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, + KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2); +} - /* - * Zero the input registers' upper 32 bits. 
They will be fully - * zeroed on exit, so we're fine changing them in place. - */ - for (i = 1; i < 4; i++) - vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); +static void kvm_psci_system_suspend(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + memset(&run->system_event, 0, sizeof(vcpu->run->system_event)); + run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND; + run->exit_reason = KVM_EXIT_SYSTEM_EVENT; } static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn) { - switch(fn) { - case PSCI_0_2_FN64_CPU_SUSPEND: - case PSCI_0_2_FN64_CPU_ON: - case PSCI_0_2_FN64_AFFINITY_INFO: - /* Disallow these functions for 32bit guests */ - if (vcpu_mode_is_32bit(vcpu)) - return PSCI_RET_NOT_SUPPORTED; - break; - } + /* + * Prevent 32 bit guests from calling 64 bit PSCI functions. + */ + if ((fn & PSCI_0_2_64BIT) && vcpu_mode_is_32bit(vcpu)) + return PSCI_RET_NOT_SUPPORTED; return 0; } @@ -229,10 +205,6 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) unsigned long val; int ret = 1; - val = kvm_psci_check_allowed_function(vcpu, psci_fn); - if (val) - goto out; - switch (psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: /* @@ -246,7 +218,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) val = kvm_psci_vcpu_suspend(vcpu); break; case PSCI_0_2_FN_CPU_OFF: - kvm_psci_vcpu_off(vcpu); + kvm_arm_vcpu_power_off(vcpu); val = PSCI_RET_SUCCESS; break; case PSCI_0_2_FN_CPU_ON: @@ -300,29 +272,31 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) break; } -out: smccc_set_retval(vcpu, val, 0, 0, 0); return ret; } -static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) +static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) { + unsigned long val = PSCI_RET_NOT_SUPPORTED; u32 psci_fn = smccc_get_function(vcpu); - u32 feature; - unsigned long val; + struct kvm *kvm = vcpu->kvm; + u32 arg; int ret = 1; switch(psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: - val = KVM_ARM_PSCI_1_0; + val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1; break; case PSCI_1_0_FN_PSCI_FEATURES: - feature = smccc_get_arg1(vcpu); - val = kvm_psci_check_allowed_function(vcpu, feature); + arg = smccc_get_arg1(vcpu); + val = kvm_psci_check_allowed_function(vcpu, arg); if (val) break; - switch(feature) { + val = PSCI_RET_NOT_SUPPORTED; + + switch(arg) { case PSCI_0_2_FN_PSCI_VERSION: case PSCI_0_2_FN_CPU_SUSPEND: case PSCI_0_2_FN64_CPU_SUSPEND: @@ -338,8 +312,47 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) case ARM_SMCCC_VERSION_FUNC_ID: val = 0; break; - default: - val = PSCI_RET_NOT_SUPPORTED; + case PSCI_1_0_FN_SYSTEM_SUSPEND: + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) + val = 0; + break; + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + if (minor >= 1) + val = 0; + break; + } + break; + case PSCI_1_0_FN_SYSTEM_SUSPEND: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + /* + * Return directly to userspace without changing the vCPU's + * registers. Userspace depends on reading the SMCCC parameters + * to implement SYSTEM_SUSPEND. 
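+ *
+ * A minimal VMM-side sketch of what that dependency looks like
+ * (hypothetical handler, not part of this patch):
+ *
+ *	case KVM_EXIT_SYSTEM_EVENT:
+ *		if (run->system_event.type == KVM_SYSTEM_EVENT_SUSPEND)
+ *			// read x0..x3 with KVM_GET_ONE_REG, then emulate
+ *			// SYSTEM_SUSPEND and resume at the requested entry
+ *			handle_system_suspend(vcpu_fd, run);
+ *		break;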
+ */ + if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) { + kvm_psci_system_suspend(vcpu); + return 0; + } + break; + case PSCI_1_1_FN_SYSTEM_RESET2: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_1_FN64_SYSTEM_RESET2: + if (minor >= 1) { + arg = smccc_get_arg1(vcpu); + + if (arg <= PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET || + arg >= PSCI_1_1_RESET_TYPE_VENDOR_START) { + kvm_psci_system_reset2(vcpu); + vcpu_set_reg(vcpu, 0, PSCI_RET_INTERNAL_FAILURE); + return 0; + } + + val = PSCI_RET_INVALID_PARAMS; break; } break; @@ -359,7 +372,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) switch (psci_fn) { case KVM_PSCI_FN_CPU_OFF: - kvm_psci_vcpu_off(vcpu); + kvm_arm_vcpu_power_off(vcpu); val = PSCI_RET_SUCCESS; break; case KVM_PSCI_FN_CPU_ON: @@ -392,196 +405,25 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) */ int kvm_psci_call(struct kvm_vcpu *vcpu) { - switch (kvm_psci_version(vcpu, vcpu->kvm)) { + u32 psci_fn = smccc_get_function(vcpu); + unsigned long val; + + val = kvm_psci_check_allowed_function(vcpu, psci_fn); + if (val) { + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; + } + + switch (kvm_psci_version(vcpu)) { + case KVM_ARM_PSCI_1_1: + return kvm_psci_1_x_call(vcpu, 1); case KVM_ARM_PSCI_1_0: - return kvm_psci_1_0_call(vcpu); + return kvm_psci_1_x_call(vcpu, 0); case KVM_ARM_PSCI_0_2: return kvm_psci_0_2_call(vcpu); case KVM_ARM_PSCI_0_1: return kvm_psci_0_1_call(vcpu); default: return -EINVAL; - }; -} - -int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) -{ - return 4; /* PSCI version and three workaround registers */ -} - -int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++)) - return -EFAULT; - - if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++)) - return -EFAULT; - - if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++)) - return -EFAULT; - - if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3, uindices++)) - return -EFAULT; - - return 0; -} - -#define KVM_REG_FEATURE_LEVEL_WIDTH 4 -#define KVM_REG_FEATURE_LEVEL_MASK (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1) - -/* - * Convert the workaround level into an easy-to-compare number, where higher - * values mean better protection. - */ -static int get_kernel_wa_level(u64 regid) -{ - switch (regid) { - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - switch (arm64_get_spectre_v2_state()) { - case SPECTRE_VULNERABLE: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; - case SPECTRE_MITIGATED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; - case SPECTRE_UNAFFECTED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; - } - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - switch (arm64_get_spectre_v4_state()) { - case SPECTRE_MITIGATED: - /* - * As for the hypercall discovery, we pretend we - * don't have any FW mitigation if SSBS is there at - * all times. 
- */ - if (cpus_have_final_cap(ARM64_SSBS)) - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; - fallthrough; - case SPECTRE_UNAFFECTED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; - case SPECTRE_VULNERABLE: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; - } - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - switch (arm64_get_spectre_bhb_state()) { - case SPECTRE_VULNERABLE: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; - case SPECTRE_MITIGATED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL; - case SPECTRE_UNAFFECTED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED; - } - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; } - - return -EINVAL; -} - -int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - - switch (reg->id) { - case KVM_REG_ARM_PSCI_VERSION: - val = kvm_psci_version(vcpu, vcpu->kvm); - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; - break; - default: - return -ENOENT; - } - - if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - return 0; -} - -int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - int wa_level; - - if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - switch (reg->id) { - case KVM_REG_ARM_PSCI_VERSION: - { - bool wants_02; - - wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); - - switch (val) { - case KVM_ARM_PSCI_0_1: - if (wants_02) - return -EINVAL; - vcpu->kvm->arch.psci_version = val; - return 0; - case KVM_ARM_PSCI_0_2: - case KVM_ARM_PSCI_1_0: - if (!wants_02) - return -EINVAL; - vcpu->kvm->arch.psci_version = val; - return 0; - } - break; - } - - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - if (val & ~KVM_REG_FEATURE_LEVEL_MASK) - return -EINVAL; - - if (get_kernel_wa_level(reg->id) < val) - return -EINVAL; - - return 0; - - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - if (val & ~(KVM_REG_FEATURE_LEVEL_MASK | - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) - return -EINVAL; - - /* The enabled bit must not be set unless the level is AVAIL. */ - if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) && - (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL) - return -EINVAL; - - /* - * Map all the possible incoming states to the only two we - * really want to deal with. - */ - switch (val & KVM_REG_FEATURE_LEVEL_MASK) { - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: - wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: - wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; - break; - default: - return -EINVAL; - } - - /* - * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the - * other way around. 
- */ - if (get_kernel_wa_level(reg->id) < wa_level) - return -EINVAL; - - return 0; - default: - return -ENOENT; - } - - return -EINVAL; } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5ce36b0a3343..29a185472f8b 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -32,36 +32,27 @@ /* Maximum phys_shift supported for any VM on this host */ static u32 kvm_ipa_limit; -/* - * ARMv8 Reset Values - */ -#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ - PSR_F_BIT | PSR_D_BIT) - -#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ - PSR_AA32_I_BIT | PSR_AA32_F_BIT) - unsigned int kvm_sve_max_vl; int kvm_arm_init_sve(void) { if (system_supports_sve()) { - kvm_sve_max_vl = sve_max_virtualisable_vl; + kvm_sve_max_vl = sve_max_virtualisable_vl(); /* * The get_sve_reg()/set_sve_reg() ioctl interface will need * to be extended with multiple register slice support in * order to support vector lengths greater than - * SVE_VL_ARCH_MAX: + * VL_ARCH_MAX: */ - if (WARN_ON(kvm_sve_max_vl > SVE_VL_ARCH_MAX)) - kvm_sve_max_vl = SVE_VL_ARCH_MAX; + if (WARN_ON(kvm_sve_max_vl > VL_ARCH_MAX)) + kvm_sve_max_vl = VL_ARCH_MAX; /* * Don't even try to make use of vector lengths that * aren't available on all CPUs, for now: */ - if (kvm_sve_max_vl < sve_max_vl) + if (kvm_sve_max_vl < sve_max_vl()) pr_warn("KVM: SVE vector length for guests limited to %u bytes\n", kvm_sve_max_vl); } @@ -81,7 +72,7 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until * kvm_arm_vcpu_finalize(), which freezes the configuration. */ - vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE; + vcpu_set_flag(vcpu, GUEST_HAS_SVE); return 0; } @@ -94,24 +85,33 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) { void *buf; unsigned int vl; + size_t reg_sz; + int ret; vl = vcpu->arch.sve_max_vl; /* * Responsibility for these properties is shared between - * kvm_arm_init_arch_resources(), kvm_vcpu_enable_sve() and + * kvm_arm_init_sve(), kvm_vcpu_enable_sve() and * set_sve_vls(). 
Double-check here just to be sure: */ - if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl || - vl > SVE_VL_ARCH_MAX)) + if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl() || + vl > VL_ARCH_MAX)) return -EIO; - buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL); + reg_sz = vcpu_sve_state_size(vcpu); + buf = kzalloc(reg_sz, GFP_KERNEL_ACCOUNT); if (!buf) return -ENOMEM; + ret = kvm_share_hyp(buf, buf + reg_sz); + if (ret) { + kfree(buf); + return ret; + } + vcpu->arch.sve_state = buf; - vcpu->arch.flags |= KVM_ARM64_VCPU_SVE_FINALIZED; + vcpu_set_flag(vcpu, VCPU_SVE_FINALIZED); return 0; } @@ -141,7 +141,13 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { - kfree(vcpu->arch.sve_state); + void *sve_state = vcpu->arch.sve_state; + + kvm_vcpu_unshare_task_fp(vcpu); + kvm_unshare_hyp(vcpu, vcpu + 1); + if (sve_state) + kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); + kfree(sve_state); } static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) @@ -150,53 +156,60 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } -static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) +/** + * kvm_set_vm_width() - set the register width for the guest + * @vcpu: Pointer to the vcpu being configured + * + * Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED + * in the VM flags based on the vcpu's requested register width, the HW + * capabilities and other options (such as MTE). + * When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be + * consistent with the value of the FLAG_EL1_32BIT bit in the flags. + * + * Return: 0 on success, negative error code on failure. + */ +static int kvm_set_vm_width(struct kvm_vcpu *vcpu) { - /* - * For now make sure that both address/generic pointer authentication - * features are requested by the userspace together and the system - * supports these capabilities. - */ - if (!test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || - !test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features) || - !system_has_full_ptr_auth()) - return -EINVAL; - - vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_PTRAUTH; - return 0; -} - -static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu *tmp; + struct kvm *kvm = vcpu->kvm; bool is32bit; - int i; is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) - return false; - /* MTE is incompatible with AArch32 */ - if (kvm_has_mte(vcpu->kvm) && is32bit) - return false; + lockdep_assert_held(&kvm->lock); - /* Check that the vcpus are either all 32bit or all 64bit */ - kvm_for_each_vcpu(i, tmp, vcpu->kvm) { - if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit) - return false; + if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) { + /* + * The guest's register width is already configured. + * Make sure that the vcpu is consistent with it. 
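+ *
+ * Illustrative consequence (VMM-side sketch, not kernel code): once a
+ * 64-bit register width is recorded for the VM, initializing another
+ * vCPU with KVM_ARM_VCPU_EL1_32BIT fails:
+ *
+ *	struct kvm_vcpu_init init;
+ *
+ *	ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init);
+ *	init.features[0] |= 1U << KVM_ARM_VCPU_EL1_32BIT;
+ *	ioctl(vcpu1_fd, KVM_ARM_VCPU_INIT, &init);	// fails with EINVAL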
+ */ + if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags)) + return 0; + + return -EINVAL; } - return true; + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) + return -EINVAL; + + /* MTE is incompatible with AArch32 */ + if (kvm_has_mte(kvm) && is32bit) + return -EINVAL; + + if (is32bit) + set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); + + set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags); + + return 0; } /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer * - * This function finds the right table above and sets the registers on - * the virtual CPU struct to their architecturally defined reset - * values, except for registers whose reset is deferred until - * kvm_arm_vcpu_finalize(). + * This function sets the registers on the virtual CPU struct to their + * architecturally defined reset values, except for registers whose reset is + * deferred until kvm_arm_vcpu_finalize(). * * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT * ioctl or as part of handling a request issued by another VCPU in the PSCI @@ -213,13 +226,18 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) struct vcpu_reset_state reset_state; int ret; bool loaded; - u32 pstate; mutex_lock(&vcpu->kvm->lock); - reset_state = vcpu->arch.reset_state; - WRITE_ONCE(vcpu->arch.reset_state.reset, false); + ret = kvm_set_vm_width(vcpu); + if (!ret) { + reset_state = vcpu->arch.reset_state; + WRITE_ONCE(vcpu->arch.reset_state.reset, false); + } mutex_unlock(&vcpu->kvm->lock); + if (ret) + return ret; + /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); @@ -246,34 +264,13 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } } - if (!vcpu_allowed_register_width(vcpu)) { + if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) { ret = -EINVAL; goto out; } - switch (vcpu->arch.target) { - default: - if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { - pstate = VCPU_RESET_PSTATE_SVC; - } else { - pstate = VCPU_RESET_PSTATE_EL1; - } - - if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) { - ret = -EINVAL; - goto out; - } - break; - } - /* Reset core registers */ - memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); - memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); - vcpu->arch.ctxt.spsr_abt = 0; - vcpu->arch.ctxt.spsr_und = 0; - vcpu->arch.ctxt.spsr_irq = 0; - vcpu->arch.ctxt.spsr_fiq = 0; - vcpu_gp_regs(vcpu)->pstate = pstate; + kvm_reset_vcpu_core(vcpu); /* Reset system registers */ kvm_reset_sys_regs(vcpu); @@ -282,22 +279,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) * Additional reset state handling that PSCI may have imposed on us. * Must be done after all the sys_reg reset. 
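 *
 * (The PSCI-imposed handling - the Thumb entry fixup, caller endianness
 * and the r0 argument - is now factored into kvm_reset_vcpu_psci(),
 * which replaces the open-coded block removed below.)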
*/ - if (reset_state.reset) { - unsigned long target_pc = reset_state.pc; - - /* Gracefully handle Thumb2 entry point */ - if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { - target_pc &= ~1UL; - vcpu_set_thumb(vcpu); - } - - /* Propagate caller endianness */ - if (reset_state.be) - kvm_vcpu_set_be(vcpu); - - *vcpu_pc(vcpu) = target_pc; - vcpu_set_reg(vcpu, 0, reset_state.r0); - } + if (reset_state.reset) + kvm_reset_vcpu_psci(vcpu, &reset_state); /* Reset timer */ ret = kvm_timer_vcpu_reset(vcpu); @@ -320,7 +303,7 @@ int kvm_set_ipa_limit(void) mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); parange = cpuid_feature_extract_unsigned_field(mmfr0, - ID_AA64MMFR0_PARANGE_SHIFT); + ID_AA64MMFR0_EL1_PARANGE_SHIFT); /* * IPA size beyond 48 bits could not be supported * on either 4K or 16K page size. Hence let's cap @@ -328,20 +311,20 @@ int kvm_set_ipa_limit(void) * on the system. */ if (PAGE_SIZE != SZ_64K) - parange = min(parange, (unsigned int)ID_AA64MMFR0_PARANGE_48); + parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48); /* * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at * Stage-2. If not, things will stop very quickly. */ - switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_TGRAN_2_SHIFT)) { - case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE: + switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN_2_SHIFT)) { + case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; - case ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT: + case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT: kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); break; - case ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN ... ID_AA64MMFR0_TGRAN_2_SUPPORTED_MAX: + case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN ... ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX: kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); break; default: @@ -356,32 +339,3 @@ int kvm_set_ipa_limit(void) return 0; } - -int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) -{ - u64 mmfr0, mmfr1; - u32 phys_shift; - - if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) - return -EINVAL; - - phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); - if (phys_shift) { - if (phys_shift > kvm_ipa_limit || - phys_shift < ARM64_MIN_PARANGE_BITS) - return -EINVAL; - } else { - phys_shift = KVM_PHYS_SHIFT; - if (phys_shift > kvm_ipa_limit) { - pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", - current->comm); - return -EINVAL; - } - } - - mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); - - return 0; -} diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c new file mode 100644 index 000000000000..949d19d603fb --- /dev/null +++ b/arch/arm64/kvm/stacktrace.c @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. 
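+ *
+ * In both cases the host-side entry point is kvm_nvhe_dump_backtrace().
+ * For case 1), HYP stack addresses are translated to kernel VAs with a
+ * plain offset (a sketch of kvm_nvhe_stack_kern_va() below):
+ *
+ *	kern_va = kern_stack_base + (hyp_addr - hyp_stack_base);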
+ * + * Copyright (C) 2022 Google LLC + */ + +#include +#include + +#include + +/* + * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs + * + * The nVHE hypervisor stack is mapped in the flexible 'private' VA range, to + * allow for guard pages below the stack. Consequently, the fixed offset address + * translation macros won't work here. + * + * The kernel VA is calculated as an offset from the kernel VA of the hypervisor + * stack base. + * + * Returns true on success and updates @addr to its corresponding kernel VA; + * otherwise returns false. + */ +static bool kvm_nvhe_stack_kern_va(unsigned long *addr, + enum stack_type type) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + unsigned long hyp_base, kern_base, hyp_offset; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + switch (type) { + case STACK_TYPE_HYP: + kern_base = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); + hyp_base = (unsigned long)stacktrace_info->stack_base; + break; + case STACK_TYPE_OVERFLOW: + kern_base = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack); + hyp_base = (unsigned long)stacktrace_info->overflow_stack_base; + break; + default: + return false; + } + + hyp_offset = *addr - hyp_base; + + *addr = kern_base + hyp_offset; + + return true; +} + +static bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base; + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->stack_base; + unsigned long high = low + PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); +} + +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + return (on_overflow_stack(sp, size, info) || + on_hyp_stack(sp, size, info)); +} + +static int unwind_next(struct unwind_state *state) +{ + struct stack_info info; + + return unwind_next_common(state, &info, on_accessible_stack, + kvm_nvhe_stack_kern_va); +} + +static void unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + +/* + * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry + * + * @arg : the hypervisor offset, used for address translation + * @where : the program counter corresponding to the stack frame + */ +static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long hyp_offset = (unsigned long)arg; + + /* Mask tags and convert to kern addr */ + where = (where & va_mask) + hyp_offset; + kvm_err(" [<%016lx>] %pB\n", where, (void *)(where + kaslr_offset())); + + return true; +} + +static void kvm_nvhe_dump_backtrace_start(void) +{ + kvm_err("nVHE call trace:\n"); +} + +static void kvm_nvhe_dump_backtrace_end(void) +{ + kvm_err("---[ end nVHE call trace ]---\n"); +} + +/* + 
* hyp_dump_backtrace - Dump the non-protected nVHE backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * The host can directly access HYP stack pages in non-protected + * mode, so the unwinding is done directly from EL1. This removes + * the need for shared buffers between host and hypervisor for + * the stacktrace. + */ +static void hyp_dump_backtrace(unsigned long hyp_offset) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + struct unwind_state state; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + kvm_nvhe_unwind_init(&state, stacktrace_info->fp, stacktrace_info->pc); + + kvm_nvhe_dump_backtrace_start(); + unwind(&state, kvm_nvhe_dump_backtrace_entry, (void *)hyp_offset); + kvm_nvhe_dump_backtrace_end(); +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], + pkvm_stacktrace); + +/* + * pkvm_dump_backtrace - Dump the protected nVHE HYP backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * Dumping of the pKVM HYP backtrace is done by reading the + * stack addresses from the shared stacktrace buffer, since the + * host cannot directly access hypervisor memory in protected + * mode. + */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + unsigned long *stacktrace + = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace); + int i; + + kvm_nvhe_dump_backtrace_start(); + /* The saved stacktrace is terminated by a null entry */ + for (i = 0; + i < ARRAY_SIZE(kvm_nvhe_sym(pkvm_stacktrace)) && stacktrace[i]; + i++) + kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); + kvm_nvhe_dump_backtrace_end(); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n"); +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + */ +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset) +{ + if (is_protected_kvm_enabled()) + pkvm_dump_backtrace(hyp_offset); + else + hyp_dump_backtrace(hyp_offset); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c11612db4a37..946571763a64 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -34,16 +34,13 @@ #include "trace.h" /* - * All of this file is extremely similar to the ARM coproc.c, but the - * types are different. My gut feeling is that it should be pretty - * easy to merge, but that would be an ABI breakage -- again. VFP - * would also need to be abstracted. - * * For AArch32, we only take care of what is being trapped. Anything * that has to do with init and userspace access has to go via the * 64bit interface. 
*/ +static u64 sys_reg_to_index(const struct sys_reg_desc *reg); + static bool read_from_write_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) @@ -64,26 +61,6 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, return false; } -u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) -{ - u64 val = 0x8badf00d8badf00d; - - if (vcpu->arch.sysregs_loaded_on_cpu && - __vcpu_read_sys_reg_from_cpu(reg, &val)) - return val; - - return __vcpu_sys_reg(vcpu, reg); -} - -void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) -{ - if (vcpu->arch.sysregs_loaded_on_cpu && - __vcpu_write_sys_reg_to_cpu(val, reg)) - return; - - __vcpu_sys_reg(vcpu, reg) = val; -} - /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; @@ -276,7 +253,7 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, u64 val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); u32 sr = reg_to_encoding(r); - if (!(val & (0xfUL << ID_AA64MMFR1_LOR_SHIFT))) { + if (!(val & (0xfUL << ID_AA64MMFR1_EL1_LO_SHIFT))) { kvm_inject_undefined(vcpu); return false; } @@ -287,16 +264,47 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, return trap_raz_wi(vcpu, p, r); } +static bool trap_oslar_el1(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 oslsr; + + if (!p->is_write) + return read_from_write_only(vcpu, p, r); + + /* Forward the OSLK bit to OSLSR */ + oslsr = __vcpu_sys_reg(vcpu, OSLSR_EL1) & ~SYS_OSLSR_OSLK; + if (p->regval & SYS_OSLAR_OSLK) + oslsr |= SYS_OSLSR_OSLK; + + __vcpu_sys_reg(vcpu, OSLSR_EL1) = oslsr; + return true; +} + static bool trap_oslsr_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (p->is_write) { - return ignore_write(vcpu, p); - } else { - p->regval = (1 << 3); - return true; - } + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = __vcpu_sys_reg(vcpu, r->reg); + return true; +} + +static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) +{ + /* + * The only modifiable bit is the OSLK bit. Refuse the write if + * userspace attempts to change any other bit in the register. + */ + if ((val ^ rd->val) & ~SYS_OSLSR_OSLK) + return -EINVAL; + + __vcpu_sys_reg(vcpu, rd->reg) = val; + return 0; } static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, @@ -344,7 +352,7 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, { if (p->is_write) { vcpu_write_sys_reg(vcpu, p->regval, r->reg); - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); } else { p->regval = vcpu_read_sys_reg(vcpu, r->reg); } @@ -360,8 +368,8 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits * - * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the - * hyp.S code switches between host and guest values in future. + * All writes will set the DEBUG_DIRTY flag to ensure the hyp code + * switches between host and guest values in future. 
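+ *
+ * For example (illustrative values, shift == 0 and a 32-bit mask as in
+ * reg_to_dbg() below), a 32-bit write merges into the 64-bit register:
+ *
+ *	val = *dbg_reg & ~GENMASK_ULL(31, 0);	// top bits left alone
+ *	val |= p->regval & GENMASK_ULL(31, 0);	// new bottom bits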
*/ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -377,7 +385,7 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); } static void dbg_to_reg(struct kvm_vcpu *vcpu, @@ -408,22 +416,16 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, } static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val; return 0; } static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; return 0; } @@ -450,23 +452,16 @@ static bool trap_bcr(struct kvm_vcpu *vcpu, } static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; - + vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val; return 0; } static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; return 0; } @@ -494,22 +489,16 @@ static bool trap_wvr(struct kvm_vcpu *vcpu, } static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val; return 0; } static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; return 0; } @@ -536,22 +525,16 @@ static bool trap_wcr(struct kvm_vcpu *vcpu, } static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val; return 0; } static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; return 0; } @@ -575,19 +558,7 @@ static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 
mpidr; - - /* - * Map the vcpu_id into the first three affinity level fields of - * the MPIDR. We limit the number of VCPUs in level 0 due to a - * limitation to 16 CPUs in that level in the ICC_SGIxR registers - * of the GICv3 to be able to address each CPU directly when - * sending IPIs. - */ - mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); - mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); - mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); - vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1); + vcpu_write_sys_reg(vcpu, calculate_mpidr(vcpu), MPIDR_EL1); } static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, @@ -1060,49 +1031,60 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, } /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(const struct kvm_vcpu *vcpu, - struct sys_reg_desc const *r, bool raz) +static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { u32 id = reg_to_encoding(r); - u64 val = raz ? 0 : read_sanitised_ftr_reg(id); + u64 val; + + if (sysreg_visible_as_raz(vcpu, r)) + return 0; + + val = read_sanitised_ftr_reg(id); switch (id) { case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_AMU); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + if (kvm_vgic_global_state.type == VGIC_V3) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); + } break; case SYS_ID_AA64PFR1_EL1: - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); - if (kvm_has_mte(vcpu->kvm)) { - u64 pfr, mte; + if (!kvm_has_mte(vcpu->kvm)) + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); - mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte); - } + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI)); + val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); + break; + case SYS_ID_AA64ISAR2_EL1: + if (!vcpu_has_ptrauth(vcpu)) + val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); + if (!cpus_have_final_cap(ARM64_HAS_WFXT)) + val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; case SYS_ID_AA64DFR0_EL1: /* Limit debug to ARMv8.0 */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), 6); + val &= 
~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); /* Limit guests to PMUv3 for ARMv8.4 */ val = cpuid_feature_cap_perfmon_field(val, - ID_AA64DFR0_PMUVER_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0); + ID_AA64DFR0_EL1_PMUVer_SHIFT, + kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_EL1_PMUVer_V3P4 : 0); /* Hide SPE from guests */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); break; case SYS_ID_DFR0_EL1: /* Limit guests to PMUv3 for ARMv8.4 */ @@ -1130,40 +1112,39 @@ static unsigned int id_visibility(const struct kvm_vcpu *vcpu, return 0; } -/* cpufeature ID register access trap handlers */ - -static bool __access_id_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r, - bool raz) +static unsigned int aa32_id_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { - if (p->is_write) - return write_to_read_only(vcpu, p, r); + /* + * AArch32 ID registers are UNKNOWN if AArch32 isn't implemented at any + * EL. Promote to RAZ/WI in order to guarantee consistency between + * systems. + */ + if (!kvm_supports_32bit_el0()) + return REG_RAZ | REG_USER_WI; - p->regval = read_id_reg(vcpu, r, raz); - return true; + return id_visibility(vcpu, r); } +static unsigned int raz_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return REG_RAZ; +} + +/* cpufeature ID register access trap handlers */ + static bool access_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - bool raz = sysreg_visible_as_raz(vcpu, r); + if (p->is_write) + return write_to_read_only(vcpu, p, r); - return __access_id_reg(vcpu, p, r, raz); + p->regval = read_id_reg(vcpu, r); + return true; } -static bool access_raz_id_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - return __access_id_reg(vcpu, p, r, true); -} - -static int reg_from_user(u64 *val, const void __user *uaddr, u64 id); -static int reg_to_user(void __user *uaddr, const u64 *val, u64 id); -static u64 sys_reg_to_index(const struct sys_reg_desc *reg); - /* Visibility overrides for SVE-specific control registers */ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) @@ -1176,42 +1157,35 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - const u64 id = sys_reg_to_index(rd); u8 csv2, csv3; - int err; - u64 val; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; /* * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as * it doesn't promise more than what is actually provided (the * guest could otherwise be covered in ectoplasmic residue). 
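The read_id_reg() rework above funnels every ID-register read through a single helper that clears a feature field and re-inserts the value KVM wants the guest to see. A standalone C sketch of that clear-then-insert pattern; GENMASK_ULL() is reimplemented locally, and the field position mirrors CSV2's only for illustration:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's GENMASK_ULL() */
#define GENMASK_ULL(h, l) ((~0ULL >> (63 - (h))) & (~0ULL << (l)))
#define CSV2_SHIFT 56 /* assumed field position, illustrative only */
#define CSV2_MASK GENMASK_ULL(CSV2_SHIFT + 3, CSV2_SHIFT)

/* Clear a 4-bit ID field, then insert the value KVM wants to expose. */
static uint64_t expose_field(uint64_t reg, uint64_t mask, unsigned int shift,
			     uint64_t val)
{
	reg &= ~mask;
	reg |= (val << shift) & mask;
	return reg;
}

int main(void)
{
	uint64_t id = ~0ULL; /* pretend the CPU reported all features */

	id = expose_field(id, CSV2_MASK, CSV2_SHIFT, 1);
	printf("CSV2 field now reads %llu\n",
	       (unsigned long long)((id & CSV2_MASK) >> CSV2_SHIFT));
	return 0;
}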
*/ - csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_CSV2_SHIFT); + csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT); if (csv2 > 1 || (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED)) return -EINVAL; /* Same thing for CSV3 */ - csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_CSV3_SHIFT); + csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT); if (csv3 > 1 || (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED)) return -EINVAL; /* We can only differ with CSV[23], and anything else is an error */ - val ^= read_id_reg(vcpu, rd, false); - val &= ~((0xFUL << ID_AA64PFR0_CSV2_SHIFT) | - (0xFUL << ID_AA64PFR0_CSV3_SHIFT)); + val ^= read_id_reg(vcpu, rd); + val &= ~((0xFUL << ID_AA64PFR0_EL1_CSV2_SHIFT) | + (0xFUL << ID_AA64PFR0_EL1_CSV3_SHIFT)); if (val) return -EINVAL; vcpu->kvm->arch.pfr0_csv2 = csv2; - vcpu->kvm->arch.pfr0_csv3 = csv3 ; + vcpu->kvm->arch.pfr0_csv3 = csv3; return 0; } @@ -1223,74 +1197,33 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, * are stored, and for set_id_reg() we don't allow the effective value * to be changed. */ -static int __get_id_reg(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, void __user *uaddr, - bool raz) +static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) { - const u64 id = sys_reg_to_index(rd); - const u64 val = read_id_reg(vcpu, rd, raz); - - return reg_to_user(uaddr, &val, id); + *val = read_id_reg(vcpu, rd); + return 0; } -static int __set_id_reg(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, void __user *uaddr, - bool raz) +static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) { - const u64 id = sys_reg_to_index(rd); - int err; - u64 val; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; - /* This is what we mean by invariant: you can't change it. 
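set_id_reg(), whose body completes just below, pins down what "invariant" means operationally: userspace may only write back the value that read_id_reg() already reports. A hedged sketch of that callback shape, with the single user-space copy modeled as a plain assignment and all names illustrative:

#include <stdint.h>
#include <stdio.h>

static uint64_t read_id(void) { return 0x1234; }

/* Invariant semantics: a write may only restore the readable value. */
static int set_id(uint64_t val)
{
	return val == read_id() ? 0 : -22; /* -EINVAL */
}

int main(void)
{
	uint64_t val = read_id(); /* the caller performs the one user copy */

	printf("restore same value: %d, restore other value: %d\n",
	       set_id(val), set_id(val + 1));
	return 0;
}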
*/ - if (val != read_id_reg(vcpu, rd, raz)) + if (val != read_id_reg(vcpu, rd)) return -EINVAL; return 0; } -static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) +static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) { - bool raz = sysreg_visible_as_raz(vcpu, rd); - - return __get_id_reg(vcpu, rd, uaddr, raz); -} - -static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) -{ - bool raz = sysreg_visible_as_raz(vcpu, rd); - - return __set_id_reg(vcpu, rd, uaddr, raz); -} - -static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) -{ - return __get_id_reg(vcpu, rd, uaddr, true); -} - -static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) -{ - return __set_id_reg(vcpu, rd, uaddr, true); + *val = 0; + return 0; } static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - int err; - u64 val; - - /* Perform the access even if we are going to ignore the value */ - err = reg_from_user(&val, uaddr, sys_reg_to_index(rd)); - if (err) - return err; - return 0; } @@ -1380,6 +1313,15 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .visibility = id_visibility, \ } +/* sys_reg_desc initialiser for known cpufeature ID registers */ +#define AA32_ID_SANITISED(name) { \ + SYS_DESC(SYS_##name), \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = aa32_id_visibility, \ +} + /* * sys_reg_desc initialiser for architecturally unallocated cpufeature ID * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 @@ -1387,9 +1329,10 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, */ #define ID_UNALLOCATED(crm, op2) { \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ - .access = access_raz_id_reg, \ - .get_user = get_raz_id_reg, \ - .set_user = set_raz_id_reg, \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = raz_visibility \ } /* @@ -1399,9 +1342,10 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, */ #define ID_HIDDEN(name) { \ SYS_DESC(SYS_##name), \ - .access = access_raz_id_reg, \ - .get_user = get_raz_id_reg, \ - .set_user = set_raz_id_reg, \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = raz_visibility, \ } /* @@ -1411,9 +1355,9 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, * Debug handling: We do trap most, if not all debug related system * registers. The implementation is good enough to ensure that a guest * can use these with minimal performance degradation. The drawback is - * that we don't implement any of the external debug, none of the - * OSlock protocol. This should be revisited if we ever encounter a - * more demanding guest... + * that we don't implement any of the external debug architecture. + * This should be revisited if we ever encounter a more demanding + * guest... 
*/ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_DC_ISW), access_dcsw }, @@ -1440,8 +1384,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { DBG_BCR_BVR_WCR_WVR_EL1(15), { SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi }, - { SYS_DESC(SYS_OSLAR_EL1), trap_raz_wi }, - { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1 }, + { SYS_DESC(SYS_OSLAR_EL1), trap_oslar_el1 }, + { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1, reset_val, OSLSR_EL1, + SYS_OSLSR_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, }, { SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi }, @@ -1464,33 +1409,33 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ - ID_SANITISED(ID_PFR0_EL1), - ID_SANITISED(ID_PFR1_EL1), - ID_SANITISED(ID_DFR0_EL1), + AA32_ID_SANITISED(ID_PFR0_EL1), + AA32_ID_SANITISED(ID_PFR1_EL1), + AA32_ID_SANITISED(ID_DFR0_EL1), ID_HIDDEN(ID_AFR0_EL1), - ID_SANITISED(ID_MMFR0_EL1), - ID_SANITISED(ID_MMFR1_EL1), - ID_SANITISED(ID_MMFR2_EL1), - ID_SANITISED(ID_MMFR3_EL1), + AA32_ID_SANITISED(ID_MMFR0_EL1), + AA32_ID_SANITISED(ID_MMFR1_EL1), + AA32_ID_SANITISED(ID_MMFR2_EL1), + AA32_ID_SANITISED(ID_MMFR3_EL1), /* CRm=2 */ - ID_SANITISED(ID_ISAR0_EL1), - ID_SANITISED(ID_ISAR1_EL1), - ID_SANITISED(ID_ISAR2_EL1), - ID_SANITISED(ID_ISAR3_EL1), - ID_SANITISED(ID_ISAR4_EL1), - ID_SANITISED(ID_ISAR5_EL1), - ID_SANITISED(ID_MMFR4_EL1), - ID_SANITISED(ID_ISAR6_EL1), + AA32_ID_SANITISED(ID_ISAR0_EL1), + AA32_ID_SANITISED(ID_ISAR1_EL1), + AA32_ID_SANITISED(ID_ISAR2_EL1), + AA32_ID_SANITISED(ID_ISAR3_EL1), + AA32_ID_SANITISED(ID_ISAR4_EL1), + AA32_ID_SANITISED(ID_ISAR5_EL1), + AA32_ID_SANITISED(ID_MMFR4_EL1), + AA32_ID_SANITISED(ID_ISAR6_EL1), /* CRm=3 */ - ID_SANITISED(MVFR0_EL1), - ID_SANITISED(MVFR1_EL1), - ID_SANITISED(MVFR2_EL1), + AA32_ID_SANITISED(MVFR0_EL1), + AA32_ID_SANITISED(MVFR1_EL1), + AA32_ID_SANITISED(MVFR2_EL1), ID_UNALLOCATED(3,3), - ID_SANITISED(ID_PFR2_EL1), + AA32_ID_SANITISED(ID_PFR2_EL1), ID_HIDDEN(ID_DFR1_EL1), - ID_SANITISED(ID_MMFR5_EL1), + AA32_ID_SANITISED(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ @@ -1501,7 +1446,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), ID_SANITISED(ID_AA64ZFR0_EL1), - ID_UNALLOCATED(4,5), + ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), ID_UNALLOCATED(4,7), @@ -1544,6 +1489,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, { SYS_DESC(SYS_TRFCR_EL1), undef_access }, + { SYS_DESC(SYS_SMPRI_EL1), undef_access }, + { SYS_DESC(SYS_SMCR_EL1), undef_access }, { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, @@ -1626,8 +1573,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, { SYS_DESC(SYS_CLIDR_EL1), access_clidr }, + { SYS_DESC(SYS_SMIDR_EL1), undef_access }, { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, { SYS_DESC(SYS_CTR_EL0), access_ctr }, + { SYS_DESC(SYS_SVCR), undef_access }, { PMU_SYS_REG(SYS_PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, .reg = PMCR_EL0 }, @@ -1642,7 +1591,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { * previously (and pointlessly) advertised in the past... 
*/ { PMU_SYS_REG(SYS_PMSWINC_EL0), - .get_user = get_raz_id_reg, .set_user = set_wi_reg, + .get_user = get_raz_reg, .set_user = set_wi_reg, .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(SYS_PMSELR_EL0), .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, @@ -1667,6 +1616,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, + { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, { SYS_DESC(SYS_SCXTNUM_EL0), undef_access }, @@ -1832,11 +1782,11 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, } else { u64 dfr = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); u64 pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - u32 el3 = !!cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR0_EL3_SHIFT); + u32 el3 = !!cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR0_EL1_EL3_SHIFT); - p->regval = ((((dfr >> ID_AA64DFR0_WRPS_SHIFT) & 0xf) << 28) | - (((dfr >> ID_AA64DFR0_BRPS_SHIFT) & 0xf) << 24) | - (((dfr >> ID_AA64DFR0_CTX_CMPS_SHIFT) & 0xf) << 20) + p->regval = ((((dfr >> ID_AA64DFR0_EL1_WRPs_SHIFT) & 0xf) << 28) | + (((dfr >> ID_AA64DFR0_EL1_BRPs_SHIFT) & 0xf) << 24) | + (((dfr >> ID_AA64DFR0_EL1_CTX_CMPs_SHIFT) & 0xf) << 20) | (6 << 16) | (1 << 15) | (el3 << 14) | (el3 << 12)); return true; } @@ -1913,10 +1863,10 @@ static const struct sys_reg_desc cp14_regs[] = { DBGBXVR(0), /* DBGOSLAR */ - { Op1( 0), CRn( 1), CRm( 0), Op2( 4), trap_raz_wi }, + { Op1( 0), CRn( 1), CRm( 0), Op2( 4), trap_oslar_el1 }, DBGBXVR(1), /* DBGOSLSR */ - { Op1( 0), CRn( 1), CRm( 1), Op2( 4), trap_oslsr_el1 }, + { Op1( 0), CRn( 1), CRm( 1), Op2( 4), trap_oslsr_el1, NULL, OSLSR_EL1 }, DBGBXVR(2), DBGBXVR(3), /* DBGOSDLR */ @@ -1962,20 +1912,22 @@ static const struct sys_reg_desc cp14_64_regs[] = { { Op1( 0), CRm( 2), .access = trap_raz_wi }, }; +#define CP15_PMU_SYS_REG(_map, _Op1, _CRn, _CRm, _Op2) \ + AA32(_map), \ + Op1(_Op1), CRn(_CRn), CRm(_CRm), Op2(_Op2), \ + .visibility = pmu_visibility + /* Macro to expand the PMEVCNTRn register */ #define PMU_PMEVCNTR(n) \ - /* PMEVCNTRn */ \ - { Op1(0), CRn(0b1110), \ - CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ - access_pmu_evcntr } + { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ + (0b1000 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ + .access = access_pmu_evcntr } /* Macro to expand the PMEVTYPERn register */ #define PMU_PMEVTYPER(n) \ - /* PMEVTYPERn */ \ - { Op1(0), CRn(0b1110), \ - CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ - access_pmu_evtyper } - + { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ + (0b1100 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ + .access = access_pmu_evtyper } /* * Trapped cp15 registers. 
TTBR0/TTBR1 get a double encoding, * depending on the way they are accessed (as a 32bit or a 64bit @@ -2015,25 +1967,25 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw }, /* PMU */ - { Op1( 0), CRn( 9), CRm(12), Op2( 0), access_pmcr }, - { Op1( 0), CRn( 9), CRm(12), Op2( 1), access_pmcnten }, - { Op1( 0), CRn( 9), CRm(12), Op2( 2), access_pmcnten }, - { Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmovs }, - { Op1( 0), CRn( 9), CRm(12), Op2( 4), access_pmswinc }, - { Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmselr }, - { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid }, - { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid }, - { Op1( 0), CRn( 9), CRm(13), Op2( 0), access_pmu_evcntr }, - { Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_evtyper }, - { Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_evcntr }, - { Op1( 0), CRn( 9), CRm(14), Op2( 0), access_pmuserenr }, - { Op1( 0), CRn( 9), CRm(14), Op2( 1), access_pminten }, - { Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pminten }, - { Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmovs }, - { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 4), access_pmceid }, - { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 5), access_pmceid }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 0), .access = access_pmcr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 1), .access = access_pmcnten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 2), .access = access_pmcnten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 3), .access = access_pmovs }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 4), .access = access_pmswinc }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 5), .access = access_pmselr }, + { CP15_PMU_SYS_REG(LO, 0, 9, 12, 6), .access = access_pmceid }, + { CP15_PMU_SYS_REG(LO, 0, 9, 12, 7), .access = access_pmceid }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 0), .access = access_pmu_evcntr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 1), .access = access_pmu_evtyper }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 2), .access = access_pmu_evcntr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 0), .access = access_pmuserenr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 1), .access = access_pminten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 2), .access = access_pminten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 3), .access = access_pmovs }, + { CP15_PMU_SYS_REG(HI, 0, 9, 14, 4), .access = access_pmceid }, + { CP15_PMU_SYS_REG(HI, 0, 9, 14, 5), .access = access_pmceid }, /* PMMIR */ - { Op1( 0), CRn( 9), CRm(14), Op2( 6), trap_raz_wi }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 6), .access = trap_raz_wi }, /* PRRR/MAIR0 */ { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 }, @@ -2118,7 +2070,7 @@ static const struct sys_reg_desc cp15_regs[] = { PMU_PMEVTYPER(29), PMU_PMEVTYPER(30), /* PMCCFILTR */ - { Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper }, + { CP15_PMU_SYS_REG(DIRECT, 0, 14, 15, 7), .access = access_pmu_evtyper }, { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, @@ -2127,7 +2079,7 @@ static const struct sys_reg_desc cp15_regs[] = { static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, - { Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr }, { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), 
Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ @@ -2135,25 +2087,24 @@ static const struct sys_reg_desc cp15_64_regs[] = { { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, }; -static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, - bool is_32) +static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, + bool is_32) { unsigned int i; for (i = 0; i < n; i++) { if (!is_32 && table[i].reg && !table[i].reset) { - kvm_err("sys_reg table %p entry %d has lacks reset\n", - table, i); - return 1; + kvm_err("sys_reg table %pS entry %d lacks reset\n", &table[i], i); + return false; } if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { - kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1); - return 1; + kvm_err("sys_reg table %pS entry %d out of order\n", &table[i - 1], i - 1); + return false; } } - return 0; + return true; } int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) @@ -2194,27 +2145,27 @@ static void perform_access(struct kvm_vcpu *vcpu, * @table: array of trap descriptors * @num: size of the trap descriptor array * - * Return 0 if the access has been handled, and -1 if not. + * Return true if the access has been handled, false if not. */ -static int emulate_cp(struct kvm_vcpu *vcpu, - struct sys_reg_params *params, - const struct sys_reg_desc *table, - size_t num) +static bool emulate_cp(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *table, + size_t num) { const struct sys_reg_desc *r; if (!table) - return -1; /* Not handled */ + return false; /* Not handled */ r = find_reg(params, table, num); if (r) { perform_access(vcpu, params, r); - return 0; + return true; } /* Not handled */ - return -1; + return false; } static void unhandled_cp_access(struct kvm_vcpu *vcpu, @@ -2252,7 +2203,7 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, size_t nr_global) { struct sys_reg_params params; - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); int Rt2 = (esr >> 10) & 0x1f; @@ -2278,7 +2229,7 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, * potential register operation in the case of a read and return * with success. */ - if (!emulate_cp(vcpu, &params, global, nr_global)) { + if (emulate_cp(vcpu, &params, global, nr_global)) { /* Split up the value between registers for the read side */ if (!params.is_write) { vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); @@ -2292,34 +2243,144 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, return 1; } +static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params); + +/* + * The CP10 ID registers are architecturally mapped to AArch64 feature + * registers. Abuse that fact so we can rely on the AArch64 handler for accesses + * from AArch32. + */ +static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params) +{ + u8 reg_id = (esr >> 10) & 0xf; + bool valid; + + params->is_write = ((esr & 1) == 0); + params->Op0 = 3; + params->Op1 = 0; + params->CRn = 0; + params->CRm = 3; + + /* CP10 ID registers are read-only */ + valid = !params->is_write; + + switch (reg_id) { + /* MVFR0 */ + case 0b0111: + params->Op2 = 0; + break; + /* MVFR1 */ + case 0b0110: + params->Op2 = 1; + break; + /* MVFR2 */ + case 0b0101: + params->Op2 = 2; + break; + default: + valid = false; + } + + if (valid) + return true; + + kvm_pr_unimpl("Unhandled cp10 register %s: %u\n", + params->is_write ? "write" : "read", reg_id); + return false; +}
+ +/** + * kvm_handle_cp10_id() - Handles a VMRS trap on guest access to a 'Media and + * VFP Register' from AArch32. + * @vcpu: The vCPU pointer + * + * MVFR{0-2} are architecturally mapped to the AArch64 MVFR{0-2}_EL1 registers. + * Work out the correct AArch64 system register encoding and reroute to the + * AArch64 system register emulation. + */ +int kvm_handle_cp10_id(struct kvm_vcpu *vcpu) +{ + int Rt = kvm_vcpu_sys_get_rt(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); + struct sys_reg_params params; + + /* UNDEF on any unhandled register access */ + if (!kvm_esr_cp10_id_to_sys64(esr, &params)) { + kvm_inject_undefined(vcpu); + return 1; + } + + if (emulate_sys_reg(vcpu, &params)) + vcpu_set_reg(vcpu, Rt, params.regval); + + return 1; +} + +/** + * kvm_emulate_cp15_id_reg() - Handles an MRC trap on a guest CP15 access where + * CRn=0, which corresponds to the AArch32 feature + * registers. + * @vcpu: the vCPU pointer + * @params: the system register access parameters. + * + * Our cp15 system register tables do not enumerate the AArch32 feature + * registers. Conveniently, our AArch64 table does, and the AArch32 system + * register encoding can be trivially remapped into the AArch64 for the feature + * registers: Append op0=3, leaving op1, CRn, CRm, and op2 the same. + * + * According to DDI0487G.b G7.3.1, paragraph "Behavior of VMSAv8-32 32-bit + * System registers with (coproc=0b1111, CRn==c0)", read accesses from this + * range are either UNKNOWN or RES0. Rerouting remains architectural as we + * treat undefined registers in this range as RAZ. + */ +static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *params) +{ + int Rt = kvm_vcpu_sys_get_rt(vcpu); + + /* Treat impossible writes to RO registers as UNDEFINED */ + if (params->is_write) { + unhandled_cp_access(vcpu, params); + return 1; + } + + params->Op0 = 3; + + /* + * All registers where CRm > 3 are known to be UNKNOWN/RAZ from AArch32. + * Avoid conflicting with future expansion of AArch64 feature registers + * and simply treat them as RAZ here. + */ + if (params->CRm > 3) + params->regval = 0; + else if (!emulate_sys_reg(vcpu, params)) + return 1; + + vcpu_set_reg(vcpu, Rt, params->regval); + return 1; +}
+ /** * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer * @run: The kvm_run struct */ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, const struct sys_reg_desc *global, size_t nr_global) { - struct sys_reg_params params; - u32 esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); - params.CRm = (esr >> 1) & 0xf; - params.regval = vcpu_get_reg(vcpu, Rt); - params.is_write = ((esr & 1) == 0); - params.CRn = (esr >> 10) & 0xf; - params.Op0 = 0; - params.Op1 = (esr >> 14) & 0x7; - params.Op2 = (esr >> 17) & 0x7; + params->regval = vcpu_get_reg(vcpu, Rt); - if (!emulate_cp(vcpu, &params, global, nr_global)) { - if (!params.is_write) - vcpu_set_reg(vcpu, Rt, params.regval); + if (emulate_cp(vcpu, params, global, nr_global)) { + if (!params->is_write) + vcpu_set_reg(vcpu, Rt, params->regval); return 1; } - unhandled_cp_access(vcpu, &params); + unhandled_cp_access(vcpu, params); return 1; } @@ -2330,7 +2391,20 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu) int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) { - return kvm_handle_cp_32(vcpu, cp15_regs, ARRAY_SIZE(cp15_regs)); + struct sys_reg_params params; + + params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); + + /* + * Certain AArch32 ID registers are handled by rerouting to the AArch64 + * system register table. Registers in the ID range where CRm=0 are + * excluded from this scheme as they do not trivially map into AArch64 + * system register encodings. + */ + if (params.Op1 == 0 && params.CRn == 0 && params.CRm) + return kvm_emulate_cp15_id_reg(vcpu, &params); + + return kvm_handle_cp_32(vcpu, &params, cp15_regs, ARRAY_SIZE(cp15_regs)); } int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) @@ -2340,7 +2414,11 @@ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) { - return kvm_handle_cp_32(vcpu, cp14_regs, ARRAY_SIZE(cp14_regs)); + struct sys_reg_params params; + + params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); + + return kvm_handle_cp_32(vcpu, &params, cp14_regs, ARRAY_SIZE(cp14_regs)); } static bool is_imp_def_sys_reg(struct sys_reg_params *params)
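Both reroutes above share one idea: translate the AArch32 trap into an AArch64 encoding and let the common sys_reg machinery finish the job. A standalone sketch combining the MVFR switch with the Op0=3 and CRm>3 rules; the 0xdead value stands in for the real table lookup and is purely illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sys_reg_params { uint8_t Op0, Op1, CRn, CRm, Op2; uint64_t regval; };

/* CP10 VMRS: map reg_id onto MVFR{0,1,2}_EL1 (Op0=3, Op1=0, CRn=0, CRm=3) */
static bool cp10_id_to_sys64(uint8_t reg_id, struct sys_reg_params *p)
{
	p->Op0 = 3; p->Op1 = 0; p->CRn = 0; p->CRm = 3;
	switch (reg_id) {
	case 0b0111: p->Op2 = 0; return true; /* MVFR0 */
	case 0b0110: p->Op2 = 1; return true; /* MVFR1 */
	case 0b0101: p->Op2 = 2; return true; /* MVFR2 */
	default: return false; /* anything else UNDEFs */
	}
}

/* CP15 feature register: keep Op1/CRn/CRm/Op2, append Op0=3; CRm>3 is RAZ */
static void cp15_id_reroute(struct sys_reg_params *p)
{
	p->Op0 = 3;
	if (p->CRm > 3)
		p->regval = 0;      /* UNKNOWN/RAZ range, skip the table */
	else
		p->regval = 0xdead; /* stand-in for the AArch64 table lookup */
}

int main(void)
{
	struct sys_reg_params a = { 0 }, b = { .CRm = 1 }, c = { .CRm = 7 };

	printf("VMRS 0b0111 valid: %d (Op2=%u)\n",
	       cp10_id_to_sys64(0b0111, &a), a.Op2);
	cp15_id_reroute(&b);
	cp15_id_reroute(&c);
	printf("CRm=1 -> %#llx, CRm=7 -> %#llx\n",
	       (unsigned long long)b.regval, (unsigned long long)c.regval);
	return 0;
}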
@@ -2349,7 +2427,14 @@ +/** + * emulate_sys_reg - Emulate a guest access to an AArch64 system register + * @vcpu: The VCPU pointer + * @params: Decoded system register parameters + * + * Return: true if the system register access was successful, false otherwise. + */ +static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { const struct sys_reg_desc *r; @@ -2358,7 +2443,10 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, if (likely(r)) { perform_access(vcpu, params, r); - } else if (is_imp_def_sys_reg(params)) { + return true; + } + + if (is_imp_def_sys_reg(params)) { kvm_inject_undefined(vcpu); } else { print_sys_reg_msg(params, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); kvm_inject_undefined(vcpu); } - return 1; + return false; } /** @@ -2394,18 +2482,18 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); - int ret; trace_kvm_handle_sys_reg(esr); params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); - ret = emulate_sys_reg(vcpu, &params); + if (!emulate_sys_reg(vcpu, &params)) + return 1; if (!params.is_write) vcpu_set_reg(vcpu, Rt, params.regval); - return ret; + return 1; } /****************************************************************************** @@ -2441,35 +2529,34 @@ static bool index_to_params(u64 id, struct sys_reg_params *params) } } -const struct sys_reg_desc *find_reg_by_id(u64 id, - struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num) +const struct sys_reg_desc *get_reg_by_id(u64 id, + const struct sys_reg_desc table[], + unsigned int num) { - if (!index_to_params(id, params)) + struct sys_reg_params params; + + if (!index_to_params(id, &params)) return NULL; - return find_reg(params, table, num); + return find_reg(&params, table, num); }
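get_reg_by_id() depends on the table ordering that check_sysreg_table() enforces so find_reg() can binary-search. A minimal userspace model of the same lookup built on libc bsearch(); collapsing the Op0/Op1/CRn/CRm/Op2 tuple into one integer key is a simplification for illustration:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct sys_reg_desc { uint32_t encoding; const char *name; };

static int cmp_desc(const void *key, const void *elt)
{
	uint32_t k = *(const uint32_t *)key;
	const struct sys_reg_desc *d = elt;

	return (k > d->encoding) - (k < d->encoding);
}

static const struct sys_reg_desc *
get_reg_by_id(uint32_t id, const struct sys_reg_desc *table, size_t num)
{
	return bsearch(&id, table, num, sizeof(*table), cmp_desc);
}

int main(void)
{
	/* must stay sorted by encoding, as check_sysreg_table() verifies */
	static const struct sys_reg_desc regs[] = {
		{ 0x1000, "REG_A" }, { 0x2000, "REG_B" }, { 0x3000, "REG_C" },
	};
	const struct sys_reg_desc *r = get_reg_by_id(0x2000, regs, 3);

	printf("found: %s\n", r ? r->name : "(none)");
	return 0;
}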
/* Decode an index value, and find the sys_reg_desc entry. */ -static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, - u64 id) +static const struct sys_reg_desc * +id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, + const struct sys_reg_desc table[], unsigned int num) + { const struct sys_reg_desc *r; - struct sys_reg_params params; /* We only do sys_reg for now. */ if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) return NULL; - if (!index_to_params(id, &params)) - return NULL; - - r = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + r = get_reg_by_id(id, table, num); /* Not saved in the sys_reg array and not otherwise accessible? */ - if (r && !(r->reg || r->get_user)) + if (r && (!(r->reg || r->get_user) || sysreg_hidden(vcpu, r))) r = NULL; return r; } @@ -2509,48 +2596,30 @@ static struct sys_reg_desc invariant_sys_regs[] = { { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, }; -static int reg_from_user(u64 *val, const void __user *uaddr, u64 id) +static int get_invariant_sys_reg(u64 id, u64 __user *uaddr) { - if (copy_from_user(val, uaddr, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int reg_to_user(void __user *uaddr, const u64 *val, u64 id) -{ - if (copy_to_user(uaddr, val, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int get_invariant_sys_reg(u64 id, void __user *uaddr) -{ - struct sys_reg_params params; const struct sys_reg_desc *r; - r = find_reg_by_id(id, &params, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); + r = get_reg_by_id(id, invariant_sys_regs, + ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; - return reg_to_user(uaddr, &r->val, id); + return put_user(r->val, uaddr); } -static int set_invariant_sys_reg(u64 id, void __user *uaddr) +static int set_invariant_sys_reg(u64 id, u64 __user *uaddr) { - struct sys_reg_params params; const struct sys_reg_desc *r; - int err; - u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ + u64 val; - r = find_reg_by_id(id, &params, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); + r = get_reg_by_id(id, invariant_sys_regs, + ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; - err = reg_from_user(&val, uaddr, id); - if (err) - return err; + if (get_user(val, uaddr)) + return -EFAULT; /* This is what we mean by invariant: you can't change it. */ if (r->val != val) @@ -2641,54 +2710,89 @@ static int demux_c15_set(u64 id, void __user *uaddr) } } +int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; + const struct sys_reg_desc *r; + u64 val; + int ret; + + r = id_to_sys_reg_desc(vcpu, reg->id, table, num); + if (!r) + return -ENOENT; + + if (r->get_user) { + ret = (r->get_user)(vcpu, r, &val); + } else { + val = __vcpu_sys_reg(vcpu, r->reg); + ret = 0; + } + + if (!ret) + ret = put_user(val, uaddr); + + return ret; +} + int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { - const struct sys_reg_desc *r; void __user *uaddr = (void __user *)(unsigned long)reg->addr; + int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(reg->id, uaddr); - if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) - return -ENOENT; + err = get_invariant_sys_reg(reg->id, uaddr); + if (err != -ENOENT) + return err; - r = index_to_sys_reg_desc(vcpu, reg->id); + return kvm_sys_reg_get_user(vcpu, reg, + sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); +} + +int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; + const struct sys_reg_desc *r; + u64 val; + int ret; + + if (get_user(val, uaddr)) + return -EFAULT; + + r = id_to_sys_reg_desc(vcpu, reg->id, table, num); if (!r) - return get_invariant_sys_reg(reg->id, uaddr); - - /* Check for regs disabled by runtime config */ - if (sysreg_hidden(vcpu, r)) return -ENOENT; - if (r->get_user) - return (r->get_user)(vcpu, r, reg, uaddr); + if (sysreg_user_write_ignore(vcpu, r)) + return 0; - return reg_to_user(uaddr, &__vcpu_sys_reg(vcpu, r->reg),
reg->id); + if (r->set_user) { + ret = (r->set_user)(vcpu, r, val); + } else { + __vcpu_sys_reg(vcpu, r->reg) = val; + ret = 0; + } + + return ret; } int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { - const struct sys_reg_desc *r; void __user *uaddr = (void __user *)(unsigned long)reg->addr; + int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(reg->id, uaddr); - if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) - return -ENOENT; + err = set_invariant_sys_reg(reg->id, uaddr); + if (err != -ENOENT) + return err; - r = index_to_sys_reg_desc(vcpu, reg->id); - if (!r) - return set_invariant_sys_reg(reg->id, uaddr); - - /* Check for regs disabled by runtime config */ - if (sysreg_hidden(vcpu, r)) - return -ENOENT; - - if (r->set_user) - return (r->set_user)(vcpu, r, reg, uaddr); - - return reg_from_user(&__vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); + return kvm_sys_reg_set_user(vcpu, reg, + sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } static unsigned int num_demux_regs(void) @@ -2808,18 +2912,22 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -void kvm_sys_reg_table_init(void) +int kvm_sys_reg_table_init(void) { + bool valid = true; unsigned int i; struct sys_reg_desc clidr; /* Make sure tables are unique and in order. */ - BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false)); - BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true)); - BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true)); - BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true)); - BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true)); - BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false)); + valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false); + valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true); + valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true); + valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true); + valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true); + valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false); + + if (!valid) + return -EINVAL; /* We abuse the reset function to overwrite the table itself. */ for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) @@ -2842,4 +2950,6 @@ void kvm_sys_reg_table_init(void) break; /* Clear all higher bits. 
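The CLIDR_EL1 handling in kvm_sys_reg_table_init() above reads one 3-bit cache-type field per level and masks off everything past the first absent level. A standalone sketch of that bit arithmetic; the sample value is made up:

#include <stdint.h>
#include <stdio.h>

/* CLIDR keeps one 3-bit cache-type field per level (CType n at bits
 * [3n+2:3n]); stop at the first zero field and clear all higher bits. */
static uint32_t trim_cache_levels(uint32_t clidr)
{
	unsigned int i;

	for (i = 0; i < 7; i++)
		if (((clidr >> (i * 3)) & 7) == 0)
			break;
	return clidr & ((1u << (i * 3)) - 1);
}

int main(void)
{
	/* L1 = 0b011 (split), L2 = 0b100 (unified), nothing above */
	uint32_t clidr = (0b100 << 3) | 0b011;

	printf("trimmed CLIDR: %#x\n", trim_cache_levels(clidr));
	return 0;
}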
*/ cache_levels &= (1 << (i*3))-1; + + return 0; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index cc0cc95a0280..c10239980ce4 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -35,12 +35,19 @@ struct sys_reg_params { .Op2 = ((esr) >> 17) & 0x7, \ .is_write = !((esr) & 1) }) +#define esr_cp1x_32_to_params(esr) \ + ((struct sys_reg_params){ .Op1 = ((esr) >> 14) & 0x7, \ + .CRn = ((esr) >> 10) & 0xf, \ + .CRm = ((esr) >> 1) & 0xf, \ + .Op2 = ((esr) >> 17) & 0x7, \ + .is_write = !((esr) & 1) }) + struct sys_reg_desc { /* Sysreg string for debug */ const char *name; enum { - AA32_ZEROHIGH, + AA32_DIRECT, AA32_LO, AA32_HI, } aarch32_map; @@ -68,9 +75,9 @@ struct sys_reg_desc { /* Custom get/set_user functions, fallback to generic if NULL */ int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr); + u64 *val); int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr); + u64 val); /* Return mask of REG_* runtime visibility overrides */ unsigned int (*visibility)(const struct kvm_vcpu *vcpu, @@ -79,6 +86,7 @@ struct sys_reg_desc { #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ #define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ +#define REG_USER_WI (1 << 2) /* WI from userspace only */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -129,22 +137,31 @@ static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r __vcpu_sys_reg(vcpu, r->reg) = r->val; } +static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + if (likely(!r->visibility)) + return 0; + + return r->visibility(vcpu, r); +} + static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - if (likely(!r->visibility)) - return false; - - return r->visibility(vcpu, r) & REG_HIDDEN; + return sysreg_visibility(vcpu, r) & REG_HIDDEN; } static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - if (likely(!r->visibility)) - return false; + return sysreg_visibility(vcpu, r) & REG_RAZ; +} - return r->visibility(vcpu, r) & REG_RAZ; +static inline bool sysreg_user_write_ignore(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return sysreg_visibility(vcpu, r) & REG_USER_WI; } static inline int cmp_sys_reg(const struct sys_reg_desc *i1, @@ -183,10 +200,35 @@ find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } -const struct sys_reg_desc *find_reg_by_id(u64 id, - struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num); +static inline u64 calculate_mpidr(const struct kvm_vcpu *vcpu) +{ + u64 mpidr; + + /* + * Map the vcpu_id into the first three affinity level fields of + * the MPIDR. We limit the number of VCPUs in level 0 due to a + * limitation to 16 CPUs in that level in the ICC_SGIxR registers + * of the GICv3 to be able to address each CPU directly when + * sending IPIs. 
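The comment above, and the computation that follows it, pack vcpu_id into the MPIDR affinity fields. A standalone sketch of the arithmetic, assuming MPIDR_LEVEL_SHIFT(n) expands to 8*n and taking bit 31 as the register's RES1 bit:

#include <stdint.h>
#include <stdio.h>

#define MPIDR_LEVEL_SHIFT(n) ((n) * 8) /* assumed: 8 bits per affinity level */

static uint64_t calc_mpidr(uint32_t vcpu_id)
{
	uint64_t mpidr;

	/* 4 bits at Aff0: the GICv3 ICC_SGIxR registers can only target
	 * 16 CPUs per level-0 group, hence the limit described above. */
	mpidr  = (uint64_t)(vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
	mpidr |= (uint64_t)((vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
	mpidr |= (uint64_t)((vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
	mpidr |= 1ULL << 31; /* assumed RES1 bit of MPIDR_EL1 */

	return mpidr;
}

int main(void)
{
	/* vcpu 21 (0b10101): Aff0 = 5, Aff1 = 1 -> 0x80000105 */
	printf("vcpu 21 -> MPIDR %#llx\n", (unsigned long long)calc_mpidr(21));
	return 0;
}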
+ */ + mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); + mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); + mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); + mpidr |= (1ULL << 31); + + return mpidr; +} + +const struct sys_reg_desc *get_reg_by_id(u64 id, + const struct sys_reg_desc table[], + unsigned int num); + +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num); +int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num); #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index acdb7b3cc97d..8c676f1df597 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * The LSB of the HYP VA tag @@ -109,6 +110,29 @@ __init void kvm_apply_hyp_relocations(void) } } +void kvm_apply_hyp_module_relocations(void *mod_start, void *hyp_va, + kvm_nvhe_reloc_t *begin, + kvm_nvhe_reloc_t *end) +{ + kvm_nvhe_reloc_t *rel; + + for (rel = begin; rel < end; ++rel) { + u32 **ptr, *va; + + /* + * Each entry contains a 32-bit relative offset from itself + * to a VA position in the module area. + */ + ptr = (u32 **)((char *)rel + *rel); + + /* Read the module VA value at the relocation address. */ + va = *ptr; + + /* Convert the module VA of the reloc to a hyp VA */ + WARN_ON(aarch64_addr_write(ptr, (u64)(((void *)va - mod_start) + hyp_va))); + } +} + static u32 compute_instruction(int n, u32 rd, u32 rn) { u32 insn = AARCH64_BREAK_FAULT; diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 07d5271e9f05..9e7c486b48c2 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -10,293 +10,357 @@ #include "vgic/vgic.h" #include "sys_regs.h" -static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { u32 host_pri_bits, host_id_bits, host_seis, host_a3v, seis, a3v; struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + + /* + * Disallow restoring VM state if not supported by this + * hardware. + */ + host_pri_bits = FIELD_GET(ICC_CTLR_EL1_PRI_BITS_MASK, val) + 1; + if (host_pri_bits > vgic_v3_cpu->num_pri_bits) + return -EINVAL; + + vgic_v3_cpu->num_pri_bits = host_pri_bits; + + host_id_bits = FIELD_GET(ICC_CTLR_EL1_ID_BITS_MASK, val); + if (host_id_bits > vgic_v3_cpu->num_id_bits) + return -EINVAL; + + vgic_v3_cpu->num_id_bits = host_id_bits; + + host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2); + seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val); + if (host_seis != seis) + return -EINVAL; + + host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2); + a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val); + if (host_a3v != a3v) + return -EINVAL; + + /* + * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. + * The vgic_set_vmcr() will convert to ICH_VMCR layout. 
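The vgic accessor rewrite above and below swaps open-coded mask/shift pairs for FIELD_GET()/FIELD_PREP(). A standalone sketch of why that works: the shift is derived from the mask's lowest set bit, so no _SHIFT constant is repeated. The mask value here is illustrative, not the real ICC_CTLR_EL1 layout:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel macros */
#define FIELD_SHIFT(mask)     __builtin_ctzll(mask)
#define FIELD_GET(mask, reg)  (((reg) & (mask)) >> FIELD_SHIFT(mask))
#define FIELD_PREP(mask, val) (((uint64_t)(val) << FIELD_SHIFT(mask)) & (mask))

#define PRI_BITS_MASK 0x700ULL /* a 3-bit field at [10:8], illustrative */

int main(void)
{
	uint64_t ctlr = FIELD_PREP(PRI_BITS_MASK, 5); /* pack the field */

	printf("ctlr=%#llx pri_bits=%llu\n", (unsigned long long)ctlr,
	       (unsigned long long)FIELD_GET(PRI_BITS_MASK, ctlr));
	return 0;
}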
+ */ + vmcr.cbpr = FIELD_GET(ICC_CTLR_EL1_CBPR_MASK, val); + vmcr.eoim = FIELD_GET(ICC_CTLR_EL1_EOImode_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *valp) +{ + struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; + struct vgic_vmcr vmcr; u64 val; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - val = p->regval; + val = 0; + val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1); + val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits); + val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, + FIELD_GET(ICH_VTR_SEIS_MASK, + kvm_vgic_global_state.ich_vtr_el2)); + val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, + FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2)); + /* + * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. + * Extract it directly using ICC_CTLR_EL1 reg definitions. + */ + val |= FIELD_PREP(ICC_CTLR_EL1_CBPR_MASK, vmcr.cbpr); + val |= FIELD_PREP(ICC_CTLR_EL1_EOImode_MASK, vmcr.eoim); - /* - * Disallow restoring VM state if not supported by this - * hardware. - */ - host_pri_bits = ((val & ICC_CTLR_EL1_PRI_BITS_MASK) >> - ICC_CTLR_EL1_PRI_BITS_SHIFT) + 1; - if (host_pri_bits > vgic_v3_cpu->num_pri_bits) - return false; + *valp = val; - vgic_v3_cpu->num_pri_bits = host_pri_bits; - - host_id_bits = (val & ICC_CTLR_EL1_ID_BITS_MASK) >> - ICC_CTLR_EL1_ID_BITS_SHIFT; - if (host_id_bits > vgic_v3_cpu->num_id_bits) - return false; - - vgic_v3_cpu->num_id_bits = host_id_bits; - - host_seis = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT); - seis = (val & ICC_CTLR_EL1_SEIS_MASK) >> - ICC_CTLR_EL1_SEIS_SHIFT; - if (host_seis != seis) - return false; - - host_a3v = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT); - a3v = (val & ICC_CTLR_EL1_A3V_MASK) >> ICC_CTLR_EL1_A3V_SHIFT; - if (host_a3v != a3v) - return false; - - /* - * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. - * The vgic_set_vmcr() will convert to ICH_VMCR layout. - */ - vmcr.cbpr = (val & ICC_CTLR_EL1_CBPR_MASK) >> ICC_CTLR_EL1_CBPR_SHIFT; - vmcr.eoim = (val & ICC_CTLR_EL1_EOImode_MASK) >> ICC_CTLR_EL1_EOImode_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - val = 0; - val |= (vgic_v3_cpu->num_pri_bits - 1) << - ICC_CTLR_EL1_PRI_BITS_SHIFT; - val |= vgic_v3_cpu->num_id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT; - val |= ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT) << - ICC_CTLR_EL1_SEIS_SHIFT; - val |= ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT) << - ICC_CTLR_EL1_A3V_SHIFT; - /* - * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. - * Extract it directly using ICC_CTLR_EL1 reg definitions. 
- */ - val |= (vmcr.cbpr << ICC_CTLR_EL1_CBPR_SHIFT) & ICC_CTLR_EL1_CBPR_MASK; - val |= (vmcr.eoim << ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK; - - p->regval = val; - } - - return true; + return 0; } -static bool access_gic_pmr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.pmr = (p->regval & ICC_PMR_EL1_MASK) >> ICC_PMR_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.pmr << ICC_PMR_EL1_SHIFT) & ICC_PMR_EL1_MASK; - } + vmcr.pmr = FIELD_GET(ICC_PMR_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); - return true; + return 0; } -static bool access_gic_bpr0(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.bpr = (p->regval & ICC_BPR0_EL1_MASK) >> - ICC_BPR0_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.bpr << ICC_BPR0_EL1_SHIFT) & - ICC_BPR0_EL1_MASK; - } + *val = FIELD_PREP(ICC_PMR_EL1_MASK, vmcr.pmr); - return true; + return 0; } -static bool access_gic_bpr1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; - if (!p->is_write) - p->regval = 0; + vgic_get_vmcr(vcpu, &vmcr); + vmcr.bpr = FIELD_GET(ICC_BPR0_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_PREP(ICC_BPR0_EL1_MASK, vmcr.bpr); + + return 0; +} + +static int set_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); if (!vmcr.cbpr) { - if (p->is_write) { - vmcr.abpr = (p->regval & ICC_BPR1_EL1_MASK) >> - ICC_BPR1_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.abpr << ICC_BPR1_EL1_SHIFT) & - ICC_BPR1_EL1_MASK; - } - } else { - if (!p->is_write) - p->regval = min((vmcr.bpr + 1), 7U); + vmcr.abpr = FIELD_GET(ICC_BPR1_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); } - return true; + return 0; } -static bool access_gic_grpen0(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.grpen0 = (p->regval & ICC_IGRPEN0_EL1_MASK) >> - ICC_IGRPEN0_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.grpen0 << ICC_IGRPEN0_EL1_SHIFT) & - ICC_IGRPEN0_EL1_MASK; - } + if (!vmcr.cbpr) + *val = FIELD_PREP(ICC_BPR1_EL1_MASK, vmcr.abpr); + else + *val = min((vmcr.bpr + 1), 7U); - return true; + + return 0; } -static bool access_gic_grpen1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.grpen1 = (p->regval & ICC_IGRPEN1_EL1_MASK) >> - ICC_IGRPEN1_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.grpen1 << ICC_IGRPEN1_EL1_SHIFT) & - 
ICC_IGRPEN1_EL1_MASK; - } + vmcr.grpen0 = FIELD_GET(ICC_IGRPEN0_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); - return true; + return 0; } -static void vgic_v3_access_apr_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, u8 apr, u8 idx) +static int get_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_PREP(ICC_IGRPEN0_EL1_MASK, vmcr.grpen0); + + return 0; +} + +static int set_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + vmcr.grpen1 = FIELD_GET(ICC_IGRPEN1_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_GET(ICC_IGRPEN1_EL1_MASK, vmcr.grpen1); + + return 0; +} + +static void set_apr_reg(struct kvm_vcpu *vcpu, u64 val, u8 apr, u8 idx) { struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - uint32_t *ap_reg; if (apr) - ap_reg = &vgicv3->vgic_ap1r[idx]; + vgicv3->vgic_ap1r[idx] = val; else - ap_reg = &vgicv3->vgic_ap0r[idx]; - - if (p->is_write) - *ap_reg = p->regval; - else - p->regval = *ap_reg; + vgicv3->vgic_ap0r[idx] = val; } -static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r, u8 apr) +static u64 get_apr_reg(struct kvm_vcpu *vcpu, u8 apr, u8 idx) +{ + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (apr) + return vgicv3->vgic_ap1r[idx]; + else + return vgicv3->vgic_ap0r[idx]; +} + +static int set_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) + { u8 idx = r->Op2 & 3; if (idx > vgic_v3_max_apr_idx(vcpu)) - goto err; + return -EINVAL; - vgic_v3_access_apr_reg(vcpu, p, apr, idx); - return true; -err: - if (!p->is_write) - p->regval = 0; - - return false; + set_apr_reg(vcpu, val, 0, idx); + return 0; } -static bool access_gic_ap0r(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + *val = get_apr_reg(vcpu, 0, idx); + + return 0; +} + +static int set_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { - return access_gic_aprn(vcpu, p, r, 0); + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + set_apr_reg(vcpu, val, 1, idx); + return 0; } -static bool access_gic_ap1r(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { - return access_gic_aprn(vcpu, p, r, 1); + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + *val = get_apr_reg(vcpu, 1, idx); + + return 0; } -static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + /* Validate SRE bit */ + if (!(val & ICC_SRE_EL1_SRE)) + return -EINVAL; + + return 0; +} + +static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - /* Validate SRE bit */ - if (p->is_write) { - if (!(p->regval & ICC_SRE_EL1_SRE)) - return false; - 
} else { - p->regval = vgicv3->vgic_sre; - } + *val = vgicv3->vgic_sre; - return true; + return 0; } + static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { - { SYS_DESC(SYS_ICC_PMR_EL1), access_gic_pmr }, - { SYS_DESC(SYS_ICC_BPR0_EL1), access_gic_bpr0 }, - { SYS_DESC(SYS_ICC_AP0R0_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R1_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R2_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R3_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP1R0_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R1_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R2_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R3_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_BPR1_EL1), access_gic_bpr1 }, - { SYS_DESC(SYS_ICC_CTLR_EL1), access_gic_ctlr }, - { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, - { SYS_DESC(SYS_ICC_IGRPEN0_EL1), access_gic_grpen0 }, - { SYS_DESC(SYS_ICC_IGRPEN1_EL1), access_gic_grpen1 }, + { SYS_DESC(SYS_ICC_PMR_EL1), + .set_user = set_gic_pmr, .get_user = get_gic_pmr, }, + { SYS_DESC(SYS_ICC_BPR0_EL1), + .set_user = set_gic_bpr0, .get_user = get_gic_bpr0, }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_BPR1_EL1), + .set_user = set_gic_bpr1, .get_user = get_gic_bpr1, }, + { SYS_DESC(SYS_ICC_CTLR_EL1), + .set_user = set_gic_ctlr, .get_user = get_gic_ctlr, }, + { SYS_DESC(SYS_ICC_SRE_EL1), + .set_user = set_gic_sre, .get_user = get_gic_sre, }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), + .set_user = set_gic_grpen0, .get_user = get_gic_grpen0, }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), + .set_user = set_gic_grpen1, .get_user = get_gic_grpen1, }, }; -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg) +static u64 attr_to_id(u64 attr) { - struct sys_reg_params params; - u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; + return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP1_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRN_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRM_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP2_MASK, attr)); +} - params.regval = *reg; - params.is_write = is_write; - - if (find_reg_by_id(sysreg, &params, gic_v3_icc_reg_descs, - ARRAY_SIZE(gic_v3_icc_reg_descs))) +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + if (get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs))) return 0; return -ENXIO; } -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg) +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr, + bool is_write) { - struct sys_reg_params params; - const struct sys_reg_desc *r; - u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; + struct kvm_one_reg reg = { + .id =
attr_to_id(attr->attr), + .addr = attr->addr, + }; if (is_write) - params.regval = *reg; - params.is_write = is_write; - - r = find_reg_by_id(sysreg, &params, gic_v3_icc_reg_descs, - ARRAY_SIZE(gic_v3_icc_reg_descs)); - if (!r) - return -ENXIO; - - if (!r->access(vcpu, &params, r)) - return -EINVAL; - - if (!is_write) - *reg = params.regval; - - return 0; + return kvm_sys_reg_set_user(vcpu, &reg, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); + else + return kvm_sys_reg_get_user(vcpu, &reg, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); } diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index f38c40a76251..78cde687383c 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -82,7 +82,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter) static void *vgic_debug_start(struct seq_file *s, loff_t *pos) { - struct kvm *kvm = (struct kvm *)s->private; + struct kvm *kvm = s->private; struct vgic_state_iter *iter; mutex_lock(&kvm->lock); @@ -110,7 +110,7 @@ out: static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) { - struct kvm *kvm = (struct kvm *)s->private; + struct kvm *kvm = s->private; struct vgic_state_iter *iter = kvm->arch.vgic.iter; ++*pos; @@ -122,7 +122,7 @@ static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) static void vgic_debug_stop(struct seq_file *s, void *v) { - struct kvm *kvm = (struct kvm *)s->private; + struct kvm *kvm = s->private; struct vgic_state_iter *iter; /* @@ -229,8 +229,8 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, static int vgic_debug_show(struct seq_file *s, void *v) { - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter = (struct vgic_state_iter *)v; + struct kvm *kvm = s->private; + struct vgic_state_iter *iter = v; struct vgic_irq *irq; struct kvm_vcpu *vcpu = NULL; unsigned long flags; diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 340c51d87677..f84e04f334c6 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -70,8 +70,9 @@ void kvm_vgic_early_init(struct kvm *kvm) */ int kvm_vgic_create(struct kvm *kvm, u32 type) { - int i, ret; struct kvm_vcpu *vcpu; + unsigned long i; + int ret; if (irqchip_in_kernel(kvm)) return -EEXIST; @@ -91,7 +92,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) return ret; kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->arch.has_run_once) + if (vcpu_has_run_once(vcpu)) goto out_unlock; } ret = 0; @@ -134,7 +135,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; - dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!dist->spis) return -ENOMEM; @@ -255,7 +256,8 @@ int vgic_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int ret = 0, i, idx; + int ret = 0, i; + unsigned long idx; if (vgic_initialized(kvm)) return 0; @@ -308,7 +310,7 @@ int vgic_init(struct kvm *kvm) goto out; } - kvm_for_each_vcpu(i, vcpu, kvm) + kvm_for_each_vcpu(idx, vcpu, kvm) kvm_vgic_vcpu_enable(vcpu); ret = kvm_vgic_setup_default_irq_routing(kvm); @@ -317,7 +319,12 @@ int vgic_init(struct kvm *kvm) vgic_debug_init(kvm); - dist->implementation_rev = 2; + /* + * If userspace didn't set the GIC implementation revision, + * default to the latest and greatest. You know you want it.
+ */ + if (!dist->implementation_rev) + dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST; dist->initialized = true; out: @@ -370,7 +377,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) static void __kvm_vgic_destroy(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; vgic_debug_destroy(kvm); diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 79f8899b234c..475059bacedf 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -139,7 +139,7 @@ int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) u32 nr = dist->nr_spis; int i, ret; - entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); + entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL_ACCOUNT); if (!entries) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 1d534283378a..733b53055f97 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -48,7 +48,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (irq) return irq; - irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); + irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!irq) return ERR_PTR(-ENOMEM); @@ -332,7 +332,7 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) * we must be careful not to overrun the array. */ irq_count = READ_ONCE(dist->lpi_list_count); - intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); + intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT); if (!intids) return -ENOMEM; @@ -406,7 +406,7 @@ static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, struct its_ite *ite; for_each_lpi_its(device, ite, its) { - if (!ite->collection || coll != ite->collection) + if (ite->collection != coll) continue; update_affinity_ite(kvm, ite); @@ -683,7 +683,7 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, if (!vcpu) return E_ITS_INT_UNMAPPED_INTERRUPT; - if (!vcpu->arch.vgic_cpu.lpis_enabled) + if (!vgic_lpis_enabled(vcpu)) return -EBUSY; vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq); @@ -894,6 +894,18 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, return update_affinity(ite->irq, vcpu); } +static bool __is_visible_gfn_locked(struct vgic_its *its, gpa_t gpa) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int idx; + bool ret; + + idx = srcu_read_lock(&its->dev->kvm->srcu); + ret = kvm_is_visible_gfn(its->dev->kvm, gfn); + srcu_read_unlock(&its->dev->kvm->srcu, idx); + return ret; +} + /* * Check whether an ID can be stored into the corresponding guest table. 
* For a direct table this is pretty easy, but gets a bit nasty for @@ -908,9 +920,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, u64 indirect_ptr, type = GITS_BASER_TYPE(baser); phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); int esz = GITS_BASER_ENTRY_SIZE(baser); - int index, idx; - gfn_t gfn; - bool ret; + int index; switch (type) { case GITS_BASER_TYPE_DEVICE: @@ -933,12 +943,11 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, return false; addr = base + id * esz; - gfn = addr >> PAGE_SHIFT; if (eaddr) *eaddr = addr; - goto out; + return __is_visible_gfn_locked(its, addr); } /* calculate and check the index into the 1st level */ @@ -964,28 +973,43 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, /* Find the address of the actual entry */ index = id % (SZ_64K / esz); indirect_ptr += index * esz; - gfn = indirect_ptr >> PAGE_SHIFT; if (eaddr) *eaddr = indirect_ptr; -out: - idx = srcu_read_lock(&its->dev->kvm->srcu); - ret = kvm_is_visible_gfn(its->dev->kvm, gfn); - srcu_read_unlock(&its->dev->kvm->srcu, idx); - return ret; + return __is_visible_gfn_locked(its, indirect_ptr); } +/* + * Check whether an event ID can be stored in the corresponding Interrupt + * Translation Table, which starts at device->itt_addr. + */ +static bool vgic_its_check_event_id(struct vgic_its *its, struct its_device *device, + u32 event_id) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int ite_esz = abi->ite_esz; + gpa_t gpa; + + /* max table size is: BIT_ULL(device->num_eventid_bits) * ite_esz */ + if (event_id >= BIT_ULL(device->num_eventid_bits)) + return false; + + gpa = device->itt_addr + event_id * ite_esz; + return __is_visible_gfn_locked(its, gpa); +} + +/* + * Add a new collection into the ITS collection table. + * Returns 0 on success, and a negative error value for generic errors. 
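+ * Note that the coll_id range check against the collection table BASER
+ * (yielding E_ITS_MAPC_COLLECTION_OOR) is now the callers' job, done
+ * before this helper is invoked.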
+ */ static int vgic_its_alloc_collection(struct vgic_its *its, struct its_collection **colp, u32 coll_id) { struct its_collection *collection; - if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) - return E_ITS_MAPC_COLLECTION_OOR; - - collection = kzalloc(sizeof(*collection), GFP_KERNEL); + collection = kzalloc(sizeof(*collection), GFP_KERNEL_ACCOUNT); if (!collection) return -ENOMEM; @@ -1029,7 +1053,7 @@ static struct its_ite *vgic_its_alloc_ite(struct its_device *device, { struct its_ite *ite; - ite = kzalloc(sizeof(*ite), GFP_KERNEL); + ite = kzalloc(sizeof(*ite), GFP_KERNEL_ACCOUNT); if (!ite) return ERR_PTR(-ENOMEM); @@ -1061,7 +1085,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, if (!device) return E_ITS_MAPTI_UNMAPPED_DEVICE; - if (event_id >= BIT_ULL(device->num_eventid_bits)) + if (!vgic_its_check_event_id(its, device, event_id)) return E_ITS_MAPTI_ID_OOR; if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) @@ -1078,7 +1102,12 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, collection = find_collection(its, coll_id); if (!collection) { - int ret = vgic_its_alloc_collection(its, &collection, coll_id); + int ret; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) + return E_ITS_MAPC_COLLECTION_OOR; + + ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) return ret; new_coll = collection; @@ -1150,7 +1179,7 @@ static struct its_device *vgic_its_alloc_device(struct vgic_its *its, { struct its_device *device; - device = kzalloc(sizeof(*device), GFP_KERNEL); + device = kzalloc(sizeof(*device), GFP_KERNEL_ACCOUNT); if (!device) return ERR_PTR(-ENOMEM); @@ -1233,6 +1262,10 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, if (!collection) { int ret; + if (!vgic_its_check_id(its, its->baser_coll_table, + coll_id, NULL)) + return E_ITS_MAPC_COLLECTION_OOR; + ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) @@ -1272,6 +1305,11 @@ static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, return 0; } +int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq) +{ + return update_lpi_config(kvm, irq, NULL, true); +} + /* * The INV command syncs the configuration bits from the memory table. * Must be called with the its_lock mutex held. @@ -1288,7 +1326,41 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, if (!ite) return E_ITS_INV_UNMAPPED_INTERRUPT; - return update_lpi_config(kvm, ite->irq, NULL, true); + return vgic_its_inv_lpi(kvm, ite->irq); +} + +/** + * vgic_its_invall - invalidate all LPIs targeting a given vcpu + * @vcpu: the vcpu for which the RD is targeted by an invalidation + * + * Contrary to the INVALL command, this targets a RD instead of a + * collection, and we don't need to hold the its_lock, since no ITS is + * involved here. 
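+ *
+ * This is used both when emulating the INVALL ITS command (on the
+ * collection's target vcpu) and when the guest writes to the GICv4.1
+ * GICR_INVALLR redistributor register.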
+ */ +int vgic_its_invall(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int irq_count, i = 0; + u32 *intids; + + irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); + if (irq_count < 0) + return irq_count; + + for (i = 0; i < irq_count; i++) { + struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intids[i]); + if (!irq) + continue; + update_lpi_config(kvm, irq, vcpu, false); + vgic_put_irq(kvm, irq); + } + + kfree(intids); + + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) + its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); + + return 0; } /* @@ -1305,32 +1377,13 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, u32 coll_id = its_cmd_get_collection(its_cmd); struct its_collection *collection; struct kvm_vcpu *vcpu; - struct vgic_irq *irq; - u32 *intids; - int irq_count, i; collection = find_collection(its, coll_id); if (!its_is_collection_mapped(collection)) return E_ITS_INVALL_UNMAPPED_COLLECTION; vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); - if (!irq) - continue; - update_lpi_config(kvm, irq, vcpu, false); - vgic_put_irq(kvm, irq); - } - - kfree(intids); - - if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) - its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); + vgic_its_invall(vcpu); return 0; } @@ -1847,7 +1900,7 @@ void vgic_lpi_translation_cache_init(struct kvm *kvm) struct vgic_translation_cache_entry *cte; /* An allocation failure is not fatal */ - cte = kzalloc(sizeof(*cte), GFP_KERNEL); + cte = kzalloc(sizeof(*cte), GFP_KERNEL_ACCOUNT); if (WARN_ON(!cte)) break; @@ -1888,7 +1941,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) return -ENODEV; - its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL_ACCOUNT); if (!its) return -ENOMEM; @@ -2146,7 +2199,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, void *ptr, void *opaque) { - struct its_device *dev = (struct its_device *)opaque; + struct its_device *dev = opaque; struct its_collection *collection; struct kvm *kvm = its->dev->kvm; struct kvm_vcpu *vcpu = NULL; @@ -2178,6 +2231,9 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, if (!collection) return -EINVAL; + if (!vgic_its_check_event_id(its, dev, event_id)) + return -EINVAL; + ite = vgic_its_alloc_ite(dev, collection, event_id); if (IS_ERR(ite)) return PTR_ERR(ite); @@ -2186,8 +2242,10 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, vcpu = kvm_get_vcpu(kvm, collection->target_addr); irq = vgic_add_lpi(kvm, lpi_id, vcpu); - if (IS_ERR(irq)) + if (IS_ERR(irq)) { + its_free_ite(kvm, ite); return PTR_ERR(irq); + } ite->irq = irq; return offset; @@ -2299,6 +2357,7 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id, void *ptr, void *opaque) { struct its_device *dev; + u64 baser = its->baser_device_table; gpa_t itt_addr; u8 num_eventid_bits; u64 entry = *(u64 *)ptr; @@ -2319,6 +2378,9 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id, /* dte entry is valid */ offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; + if (!vgic_its_check_id(its, baser, id, NULL)) + return -EINVAL; + dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits); if (IS_ERR(dev)) return 
PTR_ERR(dev); @@ -2448,6 +2510,9 @@ static int vgic_its_restore_device_tables(struct vgic_its *its) if (ret > 0) ret = 0; + if (ret < 0) + vgic_its_free_device_list(its->dev->kvm, its); + return ret; } @@ -2464,6 +2529,11 @@ static int vgic_its_save_cte(struct vgic_its *its, return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); } +/* + * Restore a collection entry into the ITS collection table. + * Return +1 on success, 0 if the entry was invalid (which should be + * interpreted as end-of-table), and a negative error value for generic errors. + */ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) { struct its_collection *collection; @@ -2490,6 +2560,10 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) collection = find_collection(its, coll_id); if (collection) return -EEXIST; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) + return -EINVAL; + ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) return ret; @@ -2569,6 +2643,9 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) if (ret > 0) return 0; + if (ret < 0) + vgic_its_free_collection_list(its->dev->kvm, its); + return ret; } @@ -2600,7 +2677,10 @@ static int vgic_its_restore_tables_v0(struct vgic_its *its) if (ret) return ret; - return vgic_its_restore_device_tables(its); + ret = vgic_its_restore_device_tables(its); + if (ret) + vgic_its_free_collection_list(its->dev->kvm, its); + return ret; } static int vgic_its_commit_v0(struct vgic_its *its) @@ -2713,8 +2793,8 @@ static int vgic_its_set_attr(struct kvm_device *dev, if (copy_from_user(&addr, uaddr, sizeof(addr))) return -EFAULT; - ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, - addr, SZ_64K); + ret = vgic_check_iorange(dev->kvm, its->vgic_its_base, + addr, SZ_64K, KVM_VGIC_V3_ITS_SIZE); if (ret) return ret; diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 7740995de982..edeac2380591 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -14,17 +14,21 @@ /* common helpers */ -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) +int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, + phys_addr_t addr, phys_addr_t alignment, + phys_addr_t size) { - if (addr & ~kvm_phys_mask(kvm)) - return -E2BIG; + if (!IS_VGIC_ADDR_UNDEF(ioaddr)) + return -EEXIST; - if (!IS_ALIGNED(addr, alignment)) + if (!IS_ALIGNED(addr, alignment) || !IS_ALIGNED(size, alignment)) return -EINVAL; - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; + if (addr + size < addr) + return -EINVAL; + + if (addr & ~kvm_phys_mask(kvm) || addr + size > kvm_phys_size(kvm)) + return -E2BIG; return 0; } @@ -37,11 +41,42 @@ static int vgic_check_type(struct kvm *kvm, int type_needed) return 0; } +int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) +{ + struct vgic_dist *vgic = &kvm->arch.vgic; + int r; + + mutex_lock(&kvm->lock); + switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + if (!r) + r = vgic_check_iorange(kvm, vgic->vgic_dist_base, dev_addr->addr, + SZ_4K, KVM_VGIC_V2_DIST_SIZE); + if (!r) + vgic->vgic_dist_base = dev_addr->addr; + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + if (!r) + r = vgic_check_iorange(kvm, vgic->vgic_cpu_base, dev_addr->addr, + SZ_4K, 
KVM_VGIC_V2_CPU_SIZE); + if (!r) + vgic->vgic_cpu_base = dev_addr->addr; + break; + default: + r = -ENODEV; + } + + mutex_unlock(&kvm->lock); + + return r; +} + /** * kvm_vgic_addr - set or get vgic VM base addresses * @kvm: pointer to the vm struct - * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX - * @addr: pointer to address value + * @attr: pointer to the attribute being retrieved/updated * @write: if true set the address in the VM address space, if false read the * address * @@ -53,29 +88,39 @@ static int vgic_check_type(struct kvm *kvm, int type_needed) * overlapping regions in case of a virtual GICv3 here, since we don't know * the number of VCPUs yet, so we defer this check to map_resources(). */ -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) +static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool write) { - int r = 0; + u64 __user *uaddr = (u64 __user *)attr->addr; struct vgic_dist *vgic = &kvm->arch.vgic; - phys_addr_t *addr_ptr, alignment; + phys_addr_t *addr_ptr, alignment, size; u64 undef_value = VGIC_ADDR_UNDEF; + u64 addr; + int r; + + /* Reading a redistributor region addr implies getting the index */ + if (write || attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION) + if (get_user(addr, uaddr)) + return -EFAULT; mutex_lock(&kvm->lock); - switch (type) { + switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_4K; + size = KVM_VGIC_V2_DIST_SIZE; break; case KVM_VGIC_V2_ADDR_TYPE_CPU: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_cpu_base; alignment = SZ_4K; + size = KVM_VGIC_V2_CPU_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_64K; + size = KVM_VGIC_V3_DIST_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_REDIST: { struct vgic_redist_region *rdreg; @@ -84,7 +129,7 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) if (r) break; if (write) { - r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); + r = vgic_v3_set_redist_base(kvm, 0, addr, 0); goto out; } rdreg = list_first_entry_or_null(&vgic->rd_regions, @@ -104,14 +149,12 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) if (r) break; - index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK; + index = addr & KVM_VGIC_V3_RDIST_INDEX_MASK; if (write) { - gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK; - u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK) - >> KVM_VGIC_V3_RDIST_COUNT_SHIFT; - u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK) - >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT; + gpa_t base = addr & KVM_VGIC_V3_RDIST_BASE_MASK; + u32 count = FIELD_GET(KVM_VGIC_V3_RDIST_COUNT_MASK, addr); + u8 flags = FIELD_GET(KVM_VGIC_V3_RDIST_FLAGS_MASK, addr); if (!count || flags) r = -EINVAL; @@ -127,9 +170,9 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) goto out; } - *addr = index; - *addr |= rdreg->base; - *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; + addr = index; + addr |= rdreg->base; + addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; goto out; } default: @@ -140,15 +183,19 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) goto out; if (write) { - r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); + r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size); if (!r) - *addr_ptr = *addr; + *addr_ptr = 
addr; } else { - *addr = *addr_ptr; + addr = *addr_ptr; } out: mutex_unlock(&kvm->lock); + + if (!r && !write) + r = put_user(addr, uaddr); + return r; } @@ -158,17 +205,9 @@ static int vgic_set_common_attr(struct kvm_device *dev, int r; switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, true); + case KVM_DEV_ARM_VGIC_GRP_ADDR: + r = kvm_vgic_addr(dev->kvm, attr, true); return (r == -ENODEV) ? -ENXIO : r; - } case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; u32 val; @@ -207,6 +246,24 @@ static int vgic_set_common_attr(struct kvm_device *dev, r = vgic_init(dev->kvm); mutex_unlock(&dev->kvm->lock); return r; + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + /* + * OK, this one isn't common at all, but we + * want to handle all control group attributes + * in a single place. + */ + if (vgic_check_type(dev->kvm, KVM_DEV_TYPE_ARM_VGIC_V3)) + return -ENXIO; + mutex_lock(&dev->kvm->lock); + + if (!lock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + r = vgic_v3_save_pending_tables(dev->kvm); + unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return r; } break; } @@ -221,22 +278,9 @@ static int vgic_get_common_attr(struct kvm_device *dev, int r = -ENXIO; switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, false); - if (r) - return (r == -ENODEV) ? -ENXIO : r; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } + case KVM_DEV_ARM_VGIC_GRP_ADDR: + r = kvm_vgic_addr(dev->kvm, attr, false); + return (r == -ENODEV) ? 
-ENXIO : r; case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; @@ -318,7 +362,7 @@ void unlock_all_vcpus(struct kvm *kvm) bool lock_all_vcpus(struct kvm *kvm) { struct kvm_vcpu *tmp_vcpu; - int c; + unsigned long c; /* * Any time a vcpu is run, vcpu_load is called which tries to grab the @@ -341,17 +385,18 @@ bool lock_all_vcpus(struct kvm *kvm) * * @dev: kvm device handle * @attr: kvm device attribute - * @reg: address the value is read or written * @is_write: true if userspace is writing a register */ static int vgic_v2_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - u32 *reg, bool is_write) + bool is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; int ret; + u32 val; ret = vgic_v2_parse_attr(dev, attr, &reg_attr); if (ret) @@ -360,6 +405,10 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, vcpu = reg_attr.vcpu; addr = reg_attr.addr; + if (is_write) + if (get_user(val, uaddr)) + return -EFAULT; + mutex_lock(&dev->kvm->lock); ret = vgic_init(dev->kvm); @@ -373,10 +422,10 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); + ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); + ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, &val); break; default: ret = -EINVAL; @@ -386,57 +435,35 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, unlock_all_vcpus(dev->kvm); out: mutex_unlock(&dev->kvm->lock); + + if (!ret && !is_write) + ret = put_user(val, uaddr); + return ret; } static int vgic_v2_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v2_attr_regs_access(dev, attr, &reg, true); + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_attr_regs_access(dev, attr, true); + default: + return vgic_set_common_attr(dev, attr); } - } - - return -ENXIO; } static int vgic_v2_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg = 0; - - ret = vgic_v2_attr_regs_access(dev, attr, &reg, false); - if (ret) - return ret; - return put_user(reg, uaddr); + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_attr_regs_access(dev, attr, false); + default: + return vgic_get_common_attr(dev, attr); } - } - - return -ENXIO; } static int vgic_v2_has_attr(struct kvm_device *dev, @@ -505,18 +532,18 @@ int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, * * @dev: kvm device handle * @attr: kvm device attribute - * @reg: address the value is read or written * @is_write: true if userspace is writing a register */ static int vgic_v3_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - u64 *reg, bool is_write) + bool is_write) { struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; 
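+ /*
+  * uaccess == true means this function itself moves the u32 value
+  * between val and attr->addr; the CPU sysreg group instead performs
+  * its own user accesses from the sysreg handling code.
+  */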
+ bool uaccess; + u32 val; int ret; - u32 tmp32; ret = vgic_v3_parse_attr(dev, attr, &reg_attr); if (ret) @@ -525,6 +552,21 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, vcpu = reg_attr.vcpu; addr = reg_attr.addr; + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + /* Sysregs uaccess is performed by the sysreg handling code */ + uaccess = false; + break; + default: + uaccess = true; + } + + if (uaccess && is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + if (get_user(val, uaddr)) + return -EFAULT; + } + mutex_lock(&dev->kvm->lock); if (unlikely(!vgic_initialized(dev->kvm))) { @@ -539,29 +581,14 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; + ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; + ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &val); break; - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 regid; - - regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write, - regid, reg); + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + ret = vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write); break; - } case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { unsigned int info, intid; @@ -571,7 +598,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, intid = attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, - intid, reg); + intid, &val); } else { ret = -EINVAL; } @@ -585,117 +612,41 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, unlock_all_vcpus(dev->kvm); out: mutex_unlock(&dev->kvm->lock); + + if (!ret && uaccess && !is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + ret = put_user(val, uaddr); + } + return ret; } static int vgic_v3_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 tmp32; - u64 reg; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, &reg, true); + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + return vgic_v3_attr_regs_access(dev, attr, true); + default: + return vgic_set_common_attr(dev, attr); } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v3_attr_regs_access(dev, attr, &reg, true); - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, &reg, true); - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - int ret; - - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: - mutex_lock(&dev->kvm->lock); - - if (!lock_all_vcpus(dev->kvm)) { - mutex_unlock(&dev->kvm->lock); - return -EBUSY; - } - ret = 
vgic_v3_save_pending_tables(dev->kvm); - unlock_all_vcpus(dev->kvm); - mutex_unlock(&dev->kvm->lock); - return ret; - } - break; - } - } - return -ENXIO; } static int vgic_v3_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, &reg, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + return vgic_v3_attr_regs_access(dev, attr, false); + default: + return vgic_get_common_attr(dev, attr); } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - ret = vgic_v3_attr_regs_access(dev, attr, &reg, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, &reg, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } - } - return -ENXIO; } static int vgic_v3_has_attr(struct kvm_device *dev, diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index 508aee9f8853..e070cda86e12 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -73,9 +73,13 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u32 reg; + switch (addr & 0x0c) { case GIC_DIST_IIDR: - if (val != vgic_mmio_read_v2_misc(vcpu, addr, len)) + reg = vgic_mmio_read_v2_misc(vcpu, addr, len); + if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK) return -EINVAL; /* @@ -87,8 +91,16 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, * migration from old kernels to new kernels with legacy * userspace. 
*/ - vcpu->kvm->arch.vgic.v2_groups_user_writable = true; - return 0; + reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg); + switch (reg) { + case KVM_VGIC_IMP_REV_2: + case KVM_VGIC_IMP_REV_3: + vcpu->kvm->arch.vgic.v2_groups_user_writable = true; + dist->implementation_rev = reg; + return 0; + default: + return -EINVAL; + } } vgic_mmio_write_v2_misc(vcpu, addr, len, val); @@ -113,9 +125,8 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, int intid = val & 0xf; int targets = (val >> 16) & 0xff; int mode = (val >> 24) & 0x03; - int c; struct kvm_vcpu *vcpu; - unsigned long flags; + unsigned long flags, c; switch (mode) { case 0x0: /* as specified by targets */ diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index a09cdc0b953c..91201f743033 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -155,13 +155,27 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, unsigned long val) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u32 reg; switch (addr & 0x0c) { case GICD_TYPER2: - case GICD_IIDR: if (val != vgic_mmio_read_v3_misc(vcpu, addr, len)) return -EINVAL; return 0; + case GICD_IIDR: + reg = vgic_mmio_read_v3_misc(vcpu, addr, len); + if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK) + return -EINVAL; + + reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg); + switch (reg) { + case KVM_VGIC_IMP_REV_2: + case KVM_VGIC_IMP_REV_3: + dist->implementation_rev = reg; + return 0; + default: + return -EINVAL; + } case GICD_CTLR: /* Not a GICv4.1? No HW SGIs */ if (!kvm_vgic_global_state.has_gicv4_1) @@ -221,34 +235,58 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, vgic_put_irq(vcpu->kvm, irq); } +bool vgic_lpis_enabled(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + return atomic_read(&vgic_cpu->ctlr) == GICR_CTLR_ENABLE_LPIS; +} + static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + unsigned long val; - return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0; + val = atomic_read(&vgic_cpu->ctlr); + if (vgic_get_implementation_rev(vcpu) >= KVM_VGIC_IMP_REV_3) + val |= GICR_CTLR_IR | GICR_CTLR_CES; + + return val; } - static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - bool was_enabled = vgic_cpu->lpis_enabled; + u32 ctlr; if (!vgic_has_its(vcpu->kvm)) return; - vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; + if (!(val & GICR_CTLR_ENABLE_LPIS)) { + /* + * Don't disable if RWP is set, as there is already an + * ongoing disable. Funky guest... 
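+ *
+ * The ctlr atomic serialises the whole sequence; as a sketch of
+ * the intended state machine:
+ *
+ *	0 ---(guest enables LPIs)---> GICR_CTLR_ENABLE_LPIS
+ *	GICR_CTLR_ENABLE_LPIS ---(guest disables)---> GICR_CTLR_RWP
+ *	GICR_CTLR_RWP ---(flush and cache invalidate done)---> 0
+ *
+ * so a racing disable observes RWP rather than ENABLE_LPIS and
+ * backs off.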
+ */ + ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, + GICR_CTLR_ENABLE_LPIS, + GICR_CTLR_RWP); + if (ctlr != GICR_CTLR_ENABLE_LPIS) + return; - if (was_enabled && !vgic_cpu->lpis_enabled) { vgic_flush_pending_lpis(vcpu); vgic_its_invalidate_cache(vcpu->kvm); - } + atomic_set_release(&vgic_cpu->ctlr, 0); + } else { + ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0, + GICR_CTLR_ENABLE_LPIS); + if (ctlr != 0) + return; - if (!was_enabled && vgic_cpu->lpis_enabled) vgic_enable_lpis(vcpu); + } } static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu) @@ -315,42 +353,6 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, return 0; } -static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* - * pending state of interrupt is latched in pending_latch variable. - * Userspace will save and restore pending state and line_level - * separately. - * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst - * for handling of ISPENDR and ICPENDR. - */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - bool state = irq->pending_latch; - - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - int err; - - err = irq_get_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - &state); - WARN_ON(err); - } - - if (state) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) @@ -478,11 +480,10 @@ static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, unsigned long val) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; u64 old_propbaser, propbaser; /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) + if (vgic_lpis_enabled(vcpu)) return; do { @@ -513,7 +514,7 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, u64 old_pendbaser, pendbaser; /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) + if (vgic_lpis_enabled(vcpu)) return; do { @@ -525,6 +526,63 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, pendbaser) != old_pendbaser); } +static unsigned long vgic_mmio_read_sync(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return !!atomic_read(&vcpu->arch.vgic_cpu.syncr_busy); +} + +static void vgic_set_rdist_busy(struct kvm_vcpu *vcpu, bool busy) +{ + if (busy) { + atomic_inc(&vcpu->arch.vgic_cpu.syncr_busy); + smp_mb__after_atomic(); + } else { + smp_mb__before_atomic(); + atomic_dec(&vcpu->arch.vgic_cpu.syncr_busy); + } +} + +static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_irq *irq; + + /* + * If the guest wrote only to the upper 32bit part of the + * register, drop the write on the floor, as it is only for + * vPEs (which we don't support for obvious reasons). + * + * Also discard the access if LPIs are not enabled. 
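+ *
+ * The INTID to invalidate is held in the lower 32 bits of the
+ * register (the upper half is only meaningful for vPEs), and the
+ * busy window this opens is made visible to the guest through
+ * GICR_SYNCR below.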
+ */ + if ((addr & 4) || !vgic_lpis_enabled(vcpu)) + return; + + vgic_set_rdist_busy(vcpu, true); + + irq = vgic_get_irq(vcpu->kvm, NULL, lower_32_bits(val)); + if (irq) { + vgic_its_inv_lpi(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); + } + + vgic_set_rdist_busy(vcpu, false); +} + +static void vgic_mmio_write_invall(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + /* See vgic_mmio_write_invlpi() for the early return rationale */ + if ((addr & 4) || !vgic_lpis_enabled(vcpu)) + return; + + vgic_set_rdist_busy(vcpu, true); + vgic_its_invall(vcpu); + vgic_set_rdist_busy(vcpu, false); +} + /* * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the * redistributors, while SPIs are covered by registers in the distributor @@ -572,7 +630,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, vgic_mmio_read_pending, vgic_mmio_write_spending, - vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, + vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, vgic_mmio_read_pending, vgic_mmio_write_cpending, @@ -630,6 +688,15 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = { REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_INVLPIR, + vgic_mmio_read_raz, vgic_mmio_write_invlpi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_INVALLR, + vgic_mmio_read_raz, vgic_mmio_write_invall, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_SYNCR, + vgic_mmio_read_sync, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, VGIC_ACCESS_32bit), @@ -647,7 +714,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0, vgic_mmio_read_pending, vgic_mmio_write_spending, - vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, + vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0, vgic_mmio_read_pending, vgic_mmio_write_cpending, @@ -754,7 +821,8 @@ static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) static int vgic_register_all_redist_iodevs(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int c, ret = 0; + unsigned long c; + int ret = 0; kvm_for_each_vcpu(c, vcpu, kvm) { ret = vgic_register_redist_iodev(vcpu); @@ -763,10 +831,12 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm) } if (ret) { - /* The current c failed, so we start with the previous one. */ + /* The current c failed, so iterate over the previous ones. */ + int i; + mutex_lock(&kvm->slots_lock); - for (c--; c >= 0; c--) { - vcpu = kvm_get_vcpu(kvm, c); + for (i = 0; i < c; i++) { + vcpu = kvm_get_vcpu(kvm, i); vgic_unregister_redist_iodev(vcpu); } mutex_unlock(&kvm->slots_lock); @@ -796,7 +866,9 @@ static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; struct list_head *rd_regions = &d->rd_regions; - size_t size = count * KVM_VGIC_V3_REDIST_SIZE; + int nr_vcpus = atomic_read(&kvm->online_vcpus); + size_t size = count ? 
count * KVM_VGIC_V3_REDIST_SIZE + : nr_vcpus * KVM_VGIC_V3_REDIST_SIZE; int ret; /* cross the end of memory ? */ @@ -834,13 +906,13 @@ static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, if (vgic_v3_rdist_overlap(kvm, base, size)) return -EINVAL; - rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL); + rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL_ACCOUNT); if (!rdreg) return -ENOMEM; rdreg->base = VGIC_ADDR_UNDEF; - ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K); + ret = vgic_check_iorange(kvm, rdreg->base, base, SZ_64K, size); if (ret) goto free; @@ -914,12 +986,8 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) iodev.base_addr = 0; break; } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 reg, id; - - id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, ®); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return vgic_v3_has_cpu_sysregs_attr(vcpu, attr); default: return -ENXIO; } @@ -993,10 +1061,10 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) struct kvm_vcpu *c_vcpu; u16 target_cpus; u64 mpidr; - int sgi, c; + int sgi; int vcpu_id = vcpu->vcpu_id; bool broadcast; - unsigned long flags; + unsigned long c, flags; sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); @@ -1086,7 +1154,7 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, } int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val) + u32 intid, u32 *val) { if (intid % 32) return -EINVAL; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index 55630ca2c325..b32d434c1d4a 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -240,6 +240,15 @@ static unsigned long __read_pending(struct kvm_vcpu *vcpu, unsigned long flags; bool val; + /* + * When used from userspace with a GICv3 model: + * + * Pending state of interrupt is latched in pending_latch + * variable. Userspace will save and restore pending state + * and line_level separately. + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst + * for handling of ISPENDR and ICPENDR. 
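+ *
+ * In other words, a GICv3 userspace read returns pending_latch
+ * alone, while a guest (or GICv2 userspace) read also folds in the
+ * line level via irq_is_pending().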
+ */ raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { int err; @@ -252,7 +261,17 @@ static unsigned long __read_pending(struct kvm_vcpu *vcpu, } else if (!is_user && vgic_irq_is_mapped_level(irq)) { val = vgic_get_phys_line_level(irq); } else { - val = irq_is_pending(irq); + switch (vcpu->kvm->arch.vgic.vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + if (is_user) { + val = irq->pending_latch; + break; + } + fallthrough; + default: + val = irq_is_pending(irq); + break; + } } value |= ((u32)val << i); @@ -756,10 +775,10 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, } } -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) +u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) { int i; - u64 val = 0; + u32 val = 0; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; for (i = 0; i < 32; i++) { @@ -779,7 +798,7 @@ u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) } void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val) + const u32 val) { int i; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; @@ -1065,7 +1084,7 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, return 0; } -struct kvm_io_device_ops kvm_io_gic_ops = { +const struct kvm_io_device_ops kvm_io_gic_ops = { .read = dispatch_mmio_read, .write = dispatch_mmio_write, }; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h index dcea44015985..5b490a4dfa5e 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.h +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -34,7 +34,7 @@ struct vgic_register_region { }; }; -extern struct kvm_io_device_ops kvm_io_gic_ops; +extern const struct kvm_io_device_ops kvm_io_gic_ops; #define VGIC_ACCESS_8bit 1 #define VGIC_ACCESS_32bit 2 @@ -207,10 +207,10 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, bool is_write, int offset, u32 *val); -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); +u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val); + const u32 val); unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 95a18cec14a3..4e8bb90bd96f 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -293,12 +293,12 @@ int vgic_v2_map_resources(struct kvm *kvm) if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { - kvm_err("Need to set vgic cpu and dist addresses first\n"); + kvm_debug("Need to set vgic cpu and dist addresses first\n"); return -ENXIO; } if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { - kvm_err("VGIC CPU and dist frames overlap\n"); + kvm_debug("VGIC CPU and dist frames overlap\n"); return -EINVAL; } @@ -345,6 +345,11 @@ int vgic_v2_probe(const struct gic_kvm_info *info) int ret; u32 vtr; + if (is_protected_kvm_enabled()) { + kvm_err("GICv2 not supported in protected mode\n"); + return -ENXIO; + } + if (!info->vctrl.start) { kvm_err("GICH not present in the firmware table\n"); return -ENXIO; @@ -465,17 +470,10 @@ void vgic_v2_load(struct kvm_vcpu *vcpu) kvm_vgic_global_state.vctrl_base + GICH_APR); } -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) +void vgic_v2_put(struct kvm_vcpu *vcpu, bool blocking) { struct vgic_v2_cpu_if *cpu_if 
= &vcpu->arch.vgic_cpu.vgic_v2; cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); -} - -void vgic_v2_put(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - vgic_v2_vmcr_sync(vcpu); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 8eb70451323b..8469155e08ff 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -15,6 +15,7 @@ static bool group0_trap; static bool group1_trap; static bool common_trap; +static bool dir_trap; static bool gicv4_enable; void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) @@ -296,6 +297,8 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_hcr |= ICH_HCR_TALL1; if (common_trap) vgic_v3->vgic_hcr |= ICH_HCR_TC; + if (dir_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -480,8 +483,10 @@ bool vgic_v3_check_base(struct kvm *kvm) return false; list_for_each_entry(rdreg, &d->rd_regions, list) { - if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) < - rdreg->base) + size_t sz = vgic_v3_rd_region_size(kvm, rdreg); + + if (vgic_check_iorange(kvm, VGIC_ADDR_UNDEF, + rdreg->base, SZ_64K, sz)) return false; } @@ -534,24 +539,24 @@ int vgic_v3_map_resources(struct kvm *kvm) struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; int ret = 0; - int c; + unsigned long c; kvm_for_each_vcpu(c, vcpu, kvm) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { - kvm_debug("vcpu %d redistributor base not set\n", c); + kvm_debug("vcpu %ld redistributor base not set\n", c); return -ENXIO; } } if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { - kvm_err("Need to set vgic distributor addresses first\n"); + kvm_debug("Need to set vgic distributor addresses first\n"); return -ENXIO; } if (!vgic_v3_check_base(kvm)) { - kvm_err("VGIC redist and dist frames overlap\n"); + kvm_debug("VGIC redist and dist frames overlap\n"); return -EINVAL; } @@ -601,6 +606,22 @@ static int __init early_gicv4_enable(char *buf) } early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); +static const struct midr_range broken_seis[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + {}, +}; + +static bool vgic_v3_broken_seis(void) +{ + return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) && + is_midr_in_range_list(read_cpuid_id(), broken_seis)); +} + /** * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller * @info: pointer to the GIC description @@ -643,7 +664,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info) } else if (!PAGE_ALIGNED(info->vcpu.start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); - } else { + } else if (kvm_get_mode() != KVM_MODE_PROTECTED) { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); @@ -668,11 +689,24 @@ int vgic_v3_probe(const struct gic_kvm_info *info) group1_trap = true; } - if (group0_trap || group1_trap || common_trap) { - kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n", + 
if (vgic_v3_broken_seis()) { + kvm_info("GICv3 with broken locally generated SEI\n"); + + kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK; + group0_trap = true; + group1_trap = true; + if (ich_vtr_el2 & ICH_VTR_TDS_MASK) + dir_trap = true; + else + common_trap = true; + } + + if (group0_trap || group1_trap || common_trap || dir_trap) { + kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", group0_trap ? "G0" : "", group1_trap ? "G1" : "", - common_trap ? "C" : ""); + common_trap ? "C" : "", + dir_trap ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } @@ -687,15 +721,8 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - /* - * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen - * is dependent on ICC_SRE_EL1.SRE, and we have to perform the - * VMCR_EL2 save/restore in the world switch. - */ - if (likely(cpu_if->vgic_sre)) - kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - - kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -703,23 +730,14 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) WARN_ON(vgic_v4_load(vcpu)); } -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) +void vgic_v3_put(struct kvm_vcpu *vcpu, bool blocking) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); -} + WARN_ON(vgic_v4_put(vcpu, blocking)); -void vgic_v3_put(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - WARN_ON(vgic_v4_put(vcpu, false)); - - vgic_v3_vmcr_sync(vcpu); - - kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index f507e3fcffce..a413718be92b 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -189,7 +189,7 @@ void vgic_v4_configure_vsgis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_arm_halt_guest(kvm); @@ -240,7 +240,8 @@ int vgic_v4_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int i, nr_vcpus, ret; + int nr_vcpus, ret; + unsigned long i; if (!kvm_vgic_global_state.has_gicv4) return 0; /* Nothing to see here... move along. */ @@ -251,7 +252,7 @@ int vgic_v4_init(struct kvm *kvm) nr_vcpus = atomic_read(&kvm->online_vcpus); dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!dist->its_vm.vpes) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 5dad4996cfb2..6189ad969675 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -37,7 +37,7 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * If you need to take multiple locks, always take the upper lock first, * then the lower ones, e.g. first take the its_lock, then the irq_lock. * If you are already holding a lock and need to take a higher one, you - * have to drop the lower ranking lock first and re-aquire it after having + * have to drop the lower ranking lock first and re-acquire it after having * taken the upper one. 
* * When taking more than one ap_list_lock at the same time, always take the @@ -931,26 +931,15 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) vgic_v3_load(vcpu); } -void kvm_vgic_put(struct kvm_vcpu *vcpu) +void kvm_vgic_put(struct kvm_vcpu *vcpu, bool blocking) { if (unlikely(!vgic_initialized(vcpu->kvm))) return; if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_put(vcpu); + vgic_v2_put(vcpu, blocking); else - vgic_v3_put(vcpu); -} - -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) -{ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_vmcr_sync(vcpu); - else - vgic_v3_vmcr_sync(vcpu); + vgic_v3_put(vcpu, blocking); } int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) @@ -990,7 +979,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) void vgic_kick_vcpus(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int c; + unsigned long c; /* * We've injected an interrupt, time to find out who deserves diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 36021c31a706..d9f54ac08042 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -98,6 +98,11 @@ #define DEBUG_SPINLOCK_BUG_ON(p) #endif +static inline u32 vgic_get_implementation_rev(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.vgic.implementation_rev; +} + /* Requires the irq_lock to be held by the caller. */ static inline bool irq_is_pending(struct vgic_irq *irq) { @@ -172,8 +177,9 @@ void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment); +int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, + phys_addr_t addr, phys_addr_t alignment, + phys_addr_t size); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); @@ -195,8 +201,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); -void vgic_v2_put(struct kvm_vcpu *vcpu); -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); +void vgic_v2_put(struct kvm_vcpu *vcpu, bool blocking); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); @@ -226,8 +231,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); -void vgic_v3_put(struct kvm_vcpu *vcpu); -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); +void vgic_v3_put(struct kvm_vcpu *vcpu, bool blocking); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); @@ -239,12 +243,11 @@ int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u64 id, u64 *val); -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg); +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr, bool is_write); +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val); + u32 intid, u32 *val); int kvm_register_vgic_device(unsigned long type); void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr 
*vmcr); void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); @@ -307,6 +310,7 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); } +bool vgic_lpis_enabled(struct kvm_vcpu *vcpu); int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr); int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq **irq); @@ -316,6 +320,10 @@ void vgic_lpi_translation_cache_init(struct kvm *kvm); void vgic_lpi_translation_cache_destroy(struct kvm *kvm); void vgic_its_invalidate_cache(struct kvm *kvm); +/* GICv4.1 MMIO interface */ +int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq); +int vgic_its_invall(struct kvm_vcpu *vcpu); + bool vgic_supports_direct_msis(struct kvm *kvm); int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c new file mode 100644 index 000000000000..d78ae63d7c15 --- /dev/null +++ b/arch/arm64/kvm/vmid.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * VMID allocator. + * + * Based on the arm64 ASID allocator algorithm. + * Please refer to arch/arm64/mm/context.c for detailed + * comments on the algorithm. + * + * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved. + * Copyright (C) 2012 ARM Ltd. + */ + +#include <linux/bitfield.h> +#include <linux/bitops.h> + +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> + +unsigned int kvm_arm_vmid_bits; static DEFINE_RAW_SPINLOCK(cpu_vmid_lock); +static atomic64_t vmid_generation; +static unsigned long *vmid_map; + +static DEFINE_PER_CPU(atomic64_t, active_vmids); +static DEFINE_PER_CPU(u64, reserved_vmids); + +#define VMID_MASK (~GENMASK(kvm_arm_vmid_bits - 1, 0)) +#define VMID_FIRST_VERSION (1UL << kvm_arm_vmid_bits) + +#define NUM_USER_VMIDS VMID_FIRST_VERSION +#define vmid2idx(vmid) ((vmid) & ~VMID_MASK) +#define idx2vmid(idx) vmid2idx(idx) + +/* + * As vmid #0 is always reserved, we will never allocate one + * as below, so it can be treated as invalid. This is used to + * set the active_vmids on vCPU schedule out. + */ +#define VMID_ACTIVE_INVALID VMID_FIRST_VERSION + +#define vmid_gen_match(vmid) \ + (!(((vmid) ^ atomic64_read(&vmid_generation)) >> kvm_arm_vmid_bits)) + +static void flush_context(void) +{ + int cpu; + u64 vmid; + + bitmap_clear(vmid_map, 0, NUM_USER_VMIDS); + + for_each_possible_cpu(cpu) { + vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0); + + /* Preserve reserved VMID */ + if (vmid == 0) + vmid = per_cpu(reserved_vmids, cpu); + __set_bit(vmid2idx(vmid), vmid_map); + per_cpu(reserved_vmids, cpu) = vmid; + } + + /* + * Unlike the ASID allocator, we expect less frequent rollover in + * the case of VMIDs. Hence, instead of marking the CPU as + * flush_pending and issuing a local context invalidation on + * the next context-switch, we broadcast TLB flush + I-cache + * invalidation over the inner shareable domain on rollover. + */ + kvm_call_hyp(__kvm_flush_vm_context); +} + +static bool check_update_reserved_vmid(u64 vmid, u64 newvmid) +{ + int cpu; + bool hit = false; + + /* + * Iterate over the set of reserved VMIDs looking for a match + * and update to use newvmid (i.e. the same VMID in the current + * generation). 
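+ *
+ * As a worked example with 16-bit VMIDs: a vCPU holding vmid 0x10005
+ * (generation 1, index 5) is carried over into generation 2 as
+ * newvmid = 0x20005 (same index, new generation), so it keeps its
+ * VMID across the rollover.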
+static u64 new_vmid(struct kvm_vmid *kvm_vmid) +{ + static u32 cur_idx = 1; + u64 vmid = atomic64_read(&kvm_vmid->id); + u64 generation = atomic64_read(&vmid_generation); + + if (vmid != 0) { + u64 newvmid = generation | (vmid & ~VMID_MASK); + + if (check_update_reserved_vmid(vmid, newvmid)) { + atomic64_set(&kvm_vmid->id, newvmid); + return newvmid; + } + + if (!__test_and_set_bit(vmid2idx(vmid), vmid_map)) { + atomic64_set(&kvm_vmid->id, newvmid); + return newvmid; + } + } + + vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, cur_idx); + if (vmid != NUM_USER_VMIDS) + goto set_vmid; + + /* We're out of VMIDs, so increment the global generation count */ + generation = atomic64_add_return_relaxed(VMID_FIRST_VERSION, + &vmid_generation); + flush_context(); + + /* We have more VMIDs than CPUs, so this will always succeed */ + vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, 1); + +set_vmid: + __set_bit(vmid, vmid_map); + cur_idx = vmid; + vmid = idx2vmid(vmid) | generation; + atomic64_set(&kvm_vmid->id, vmid); + return vmid; +} + +/* Called from vCPU sched out with preemption disabled */ +void kvm_arm_vmid_clear_active(void) +{ + atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID); +} + +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) +{ + unsigned long flags; + u64 vmid, old_active_vmid; + + vmid = atomic64_read(&kvm_vmid->id); + + /* + * Please refer to the comments in check_and_switch_context() in + * arch/arm64/mm/context.c. + * + * Unlike the ASID allocator, we set the active_vmids to + * VMID_ACTIVE_INVALID on vCPU schedule out to avoid + * reserving the VMID space needlessly on rollover. + * Hence the explicit "!= 0" check here to + * handle the sync with a concurrent rollover. + */ + old_active_vmid = atomic64_read(this_cpu_ptr(&active_vmids)); + if (old_active_vmid != 0 && vmid_gen_match(vmid) && + 0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids), + old_active_vmid, vmid)) + return; + + raw_spin_lock_irqsave(&cpu_vmid_lock, flags); + + /* Check that our VMID belongs to the current generation. */ + vmid = atomic64_read(&kvm_vmid->id); + if (!vmid_gen_match(vmid)) + vmid = new_vmid(kvm_vmid); + + atomic64_set(this_cpu_ptr(&active_vmids), vmid); + raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags); +} + +/* + * Initialize the VMID allocator + */ +int kvm_arm_vmid_alloc_init(void) +{ + kvm_arm_vmid_bits = kvm_get_vmid_bits(); + + /* + * Expect allocation after rollover to fail if we don't have + * at least one more VMID than CPUs. VMID #0 is always reserved. + */ + WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus()); + atomic64_set(&vmid_generation, VMID_FIRST_VERSION); + vmid_map = kcalloc(BITS_TO_LONGS(NUM_USER_VMIDS), + sizeof(*vmid_map), GFP_KERNEL); + if (!vmid_map) + return -ENOMEM; + + return 0; +} + +void kvm_arm_vmid_alloc_free(void) +{ + kfree(vmid_map); +} diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c index 1688af0a4c97..5b7890139bc2 100644 --- a/arch/arm64/lib/delay.c +++ b/arch/arm64/lib/delay.c @@ -27,7 +27,17 @@ void __delay(unsigned long cycles) { cycles_t start = get_cycles(); - if (arch_timer_evtstrm_available()) { + if (cpus_have_const_cap(ARM64_HAS_WFXT)) { + u64 end = start + cycles; + + /* + * Start with WFIT. If an interrupt makes us resume + * early, use a WFET loop to complete the delay.
+ */ + wfit(end); + while ((get_cycles() - start) < cycles) + wfet(end); + } else if (arch_timer_evtstrm_available()) { const cycles_t timer_evt_period = USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US); diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index f531dcb95174..d96125a0ed1d 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -18,7 +18,7 @@ */ .macro multitag_transfer_size, reg, tmp mrs_s \reg, SYS_GMID_EL1 - ubfx \reg, \reg, #SYS_GMID_EL1_BS_SHIFT, #SYS_GMID_EL1_BS_SIZE + ubfx \reg, \reg, #GMID_EL1_BS_SHIFT, #GMID_EL1_BS_SIZE mov \tmp, #4 lsl \reg, \tmp, \reg .endm @@ -93,7 +93,7 @@ SYM_FUNC_START(mte_copy_tags_from_user) mov x3, x1 cbz x2, 2f 1: - user_ldst 2f, ldtrb, w4, x1, 0 +USER(2f, ldtrb w4, [x1]) lsl x4, x4, #MTE_TAG_SHIFT stg x4, [x0], #MTE_GRANULE_SIZE add x1, x1, #1 @@ -120,7 +120,7 @@ SYM_FUNC_START(mte_copy_tags_to_user) 1: ldg x4, [x1] ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE - user_ldst 2f, sttrb, w4, x0, 0 +USER(2f, sttrb w4, [x0]) add x0, x0, #1 add x1, x1, #MTE_GRANULE_SIZE subs x2, x2, #1 diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index f188c9092696..d45f88351f9a 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ - ioremap.o mmap.o pgd.o mmu.o \ + ioremap.o mem_encrypt.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o +obj-$(CONFIG_MEMORY_RELINQUISH) += mem_relinquish.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index cd72576ae2b7..d16155985cf8 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -43,17 +43,17 @@ static u32 get_cpu_asid_bits(void) { u32 asid; int fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64MMFR0_EL1), - ID_AA64MMFR0_ASID_SHIFT); + ID_AA64MMFR0_EL1_ASIDBITS_SHIFT); switch (fld) { default: pr_warn("CPU%d: Unknown ASID size (%d); assuming 8-bit\n", smp_processor_id(), fld); fallthrough; - case 0: + case ID_AA64MMFR0_EL1_ASIDBITS_8: asid = 8; break; - case 2: + case ID_AA64MMFR0_EL1_ASIDBITS_16: asid = 16; } diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 0dea80bf6de4..24913271e898 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -23,15 +23,6 @@ void copy_highpage(struct page *to, struct page *from) if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) { set_bit(PG_mte_tagged, &to->flags); - page_kasan_tag_reset(to); - /* - * We need smp_wmb() in between setting the flags and clearing the - * tags because if another thread reads page->flags and builds a - * tagged address out of it, there is an actual dependency to the - * memory access, but on the current thread we do not guarantee that - * the new page->flags are visible before the tags were updated. 
- */ - smp_wmb(); mte_copy_page_tags(kto, kfrom); } } diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6719f9efea09..443475dc4ddc 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -49,8 +50,10 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, ARCH_DMA_MINALIGN, cls); dev->dma_coherent = coherent; - if (iommu) + if (iommu) { iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1); + trace_android_rvh_iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1); + } #ifdef CONFIG_XEN if (xen_swiotlb_detect()) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 97a93ee756a2..0f78302f1392 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,9 @@ #include #include #include +#include + +#include struct fault_info { int (*fn)(unsigned long far, unsigned long esr, @@ -257,6 +261,15 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr return false; } +static bool is_pkvm_stage2_abort(unsigned int esr) +{ + /* + * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor + * injected a stage-2 abort -- see host_inject_abort(). + */ + return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW); +} + static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) @@ -268,6 +281,9 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT) return false; + if (is_pkvm_stage2_abort(esr)) + return false; + local_irq_save(flags); asm volatile("at s1e1r, %0" :: "r" (addr)); isb(); @@ -297,6 +313,7 @@ static void die_kernel_fault(const char *msg, unsigned long addr, pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg, addr); + trace_android_rvh_die_kernel_fault(msg, addr, esr, regs); mem_abort_decode(esr); show_pte(addr); @@ -386,6 +403,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, msg = "read from unreadable memory"; } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; + } else if (is_pkvm_stage2_abort(esr)) { + msg = "access to hypervisor-protected memory"; } else { if (is_translation_fault(esr) && kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) @@ -528,6 +547,10 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, unsigned long vm_flags; unsigned int mm_flags = FAULT_FLAG_DEFAULT; unsigned long addr = untagged_addr(far); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + struct vm_area_struct *vma; + unsigned long seq; +#endif if (kprobe_page_fault(regs, esr)) return 0; @@ -576,8 +599,68 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, addr, esr, regs); } + if (is_pkvm_stage2_abort(esr)) { + if (!user_mode(regs)) + goto no_context; + arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault"); + return 0; + } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + /* + * No need to try speculative faults for kernel or + * single threaded user space. 
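An aside on the handshake this speculative path relies on before the hunk continues: mmap_seq_read_start() samples a sequence count that writers bump to an odd value while they mutate the VMA tree under mmap_lock, and mmap_seq_read_check() revalidates it after the speculative walk. A minimal userspace analogue of that protocol (the kernel helper names come from the SPF series; this sketch is illustrative only, with orderings simplified to seq_cst):

#include <stdatomic.h>
#include <stdbool.h>

struct mm_seq { atomic_ulong seq; };

/* Reader: refuse to start if a writer is mid-update (odd count). */
static bool read_begin(struct mm_seq *mm, unsigned long *snap)
{
	*snap = atomic_load(&mm->seq);
	return !(*snap & 1);
}

/* Reader: the speculative walk only counts if nothing changed meanwhile. */
static bool read_validate(struct mm_seq *mm, unsigned long snap)
{
	return atomic_load(&mm->seq) == snap;
}

/* Writer: odd while the VMA tree is changing, even again afterwards. */
static void write_begin(struct mm_seq *mm) { atomic_fetch_add(&mm->seq, 1); }
static void write_end(struct mm_seq *mm) { atomic_fetch_add(&mm->seq, 1); }

Any failed check simply aborts the speculation and falls back to the classic mmap_read_lock() path; the fault.c hunk resumes below.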
+ */ + if (!(mm_flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) + goto no_spf; + + count_vm_event(SPF_ATTEMPT); + seq = mmap_seq_read_start(mm); + if (seq & 1) { + count_vm_spf_event(SPF_ABORT_ODD); + goto spf_abort; + } + vma = get_vma(mm, addr); + if (!vma) { + count_vm_spf_event(SPF_ABORT_UNMAPPED); + goto spf_abort; + } + if (!vma_can_speculate(vma, mm_flags)) { + put_vma(vma); + count_vm_spf_event(SPF_ABORT_NO_SPECULATE); + goto spf_abort; + } + + if (!mmap_seq_read_check(mm, seq, SPF_ABORT_VMA_COPY)) { + put_vma(vma); + goto spf_abort; + } + if (!(vma->vm_flags & vm_flags)) { + put_vma(vma); + count_vm_spf_event(SPF_ABORT_ACCESS_ERROR); + goto spf_abort; + } + fault = do_handle_mm_fault(vma, addr & PAGE_MASK, + mm_flags | FAULT_FLAG_SPECULATIVE, seq, regs); + put_vma(vma); + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + goto no_context; + return 0; + } + if (!(fault & VM_FAULT_RETRY)) + goto done; + +spf_abort: + count_vm_event(SPF_ABORT); +no_spf: + +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, @@ -618,6 +701,9 @@ retry: } } mmap_read_unlock(mm); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +done: +#endif /* * Handle the "normal" (no error) case first. @@ -698,7 +784,11 @@ static int do_alignment_fault(unsigned long far, unsigned long esr, static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs) { - return 1; /* "fault" */ + unsigned long addr = untagged_addr(far); + int ret = 1; + + trace_android_vh_handle_tlb_conf(addr, esr, &ret); + return ret; } static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs) @@ -726,6 +816,7 @@ static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs) */ siaddr = untagged_addr(far); } + trace_android_rvh_do_sea(siaddr, esr, regs); arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr); return 0; @@ -821,6 +912,7 @@ void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs) if (!user_mode(regs)) { pr_alert("Unhandled fault at 0x%016lx\n", addr); + trace_android_rvh_do_mem_abort(addr, esr, regs); mem_abort_decode(esr); show_pte(addr); } @@ -836,6 +928,8 @@ NOKPROBE_SYMBOL(do_mem_abort); void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs) { + trace_android_rvh_do_sp_pc_abort(addr, esr, regs); + arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN, addr, esr); } @@ -919,7 +1013,7 @@ NOKPROBE_SYMBOL(do_debug_exception); struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, unsigned long vaddr) { - gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO; + gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO | __GFP_CMA; /* * If the page is mapped with PROT_MTE, initialise the tags at the @@ -935,6 +1029,5 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, void tag_clear_highpage(struct page *page) { mte_zero_clear_page_tags(page_address(page)); - page_kasan_tag_reset(page); set_bit(PG_mte_tagged, &page->flags); } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 3b269c756798..b96e61706a61 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -90,6 +90,12 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit; phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1; #endif +/* + * Provide a run-time means of disabling ZONE_DMA32 if it is enabled via +
* CONFIG_ZONE_DMA32. + */ +static bool disable_dma32 __ro_after_init; + #ifdef CONFIG_KEXEC_CORE /* * reserve_crashkernel() - reserves memory for crash kernel @@ -175,7 +181,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : PFN_DOWN(dma32_phys_limit); if (!arm64_dma_phys_limit) arm64_dma_phys_limit = dma32_phys_limit; #endif @@ -184,6 +190,18 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) free_area_init(max_zone_pfns); } +static int __init early_disable_dma32(char *buf) +{ + if (!buf) + return -EINVAL; + + if (!strcmp(buf, "on")) + disable_dma32 = true; + + return 0; +} +early_param("disable_dma32", early_disable_dma32); + int pfn_is_map_memory(unsigned long pfn) { phys_addr_t addr = PFN_PHYS(pfn); @@ -310,7 +328,7 @@ void __init arm64_memblock_init(void) extern u16 memstart_offset_seed; u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); int parange = cpuid_feature_extract_unsigned_field( - mmfr0, ID_AA64MMFR0_PARANGE_SHIFT); + mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); s64 range = linear_region_size - BIT(id_aa64mmfr0_parange_to_phys_shift(parange)); diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index b21f91cd830d..7073f86699f2 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -9,13 +9,220 @@ * Copyright (C) 2012 ARM Ltd. */ +#define pr_fmt(fmt) "ioremap: " fmt + #include #include #include +#include #include +#include #include #include +#include + +#ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO 5 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO) +#endif /* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO */ + +#ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL 6 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL) +#endif /* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL */ + +#ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP 7 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP) +#endif /* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP */ + +#ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP 8 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP) +#endif /* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP */ + +struct ioremap_guard_ref { + refcount_t count; +}; + +static DEFINE_STATIC_KEY_FALSE(ioremap_guard_key); +static DEFINE_XARRAY(ioremap_guard_array); +static DEFINE_MUTEX(ioremap_guard_lock); + +static size_t guard_granule; + +static bool ioremap_guard; +static int __init ioremap_guard_setup(char *str) +{ + ioremap_guard = true; + + return 0; +} +early_param("ioremap_guard", ioremap_guard_setup); + +void kvm_init_ioremap_services(void) +{ + struct arm_smccc_res res; + size_t granule; + + if (!ioremap_guard) + return; + + /* We need 
all the functions to be implemented */ + if (!kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO) || + !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL) || + !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP) || + !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP)) + return; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID, + 0, 0, 0, &res); + granule = res.a0; + if (granule > PAGE_SIZE || !granule || (granule & (granule - 1))) { + pr_warn("KVM MMIO guard initialization failed: " + "guard granule (%lu), page size (%lu)\n", + granule, PAGE_SIZE); + return; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID, + &res); + if (res.a0 == SMCCC_RET_SUCCESS) { + guard_granule = granule; + static_branch_enable(&ioremap_guard_key); + pr_info("Using KVM MMIO guard for ioremap\n"); + } else { + pr_warn("KVM MMIO guard registration failed (%ld)\n", res.a0); + } +} + +void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot) +{ + int guard_shift; + + if (!static_branch_unlikely(&ioremap_guard_key)) + return; + + guard_shift = __builtin_ctzl(guard_granule); + + mutex_lock(&ioremap_guard_lock); + + while (size) { + u64 guard_fn = phys_addr >> guard_shift; + struct ioremap_guard_ref *ref; + struct arm_smccc_res res; + + if (pfn_valid(__phys_to_pfn(phys_addr))) + goto next; + + ref = xa_load(&ioremap_guard_array, guard_fn); + if (ref) { + refcount_inc(&ref->count); + goto next; + } + + /* + * It is acceptable for the allocation to fail, especially + * if trying to ioremap something very early on, like with + * earlycon, which happens long before kmem_cache_init. + * This page will be permanently accessible, similar to a + * saturated refcount.
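An aside on the bookkeeping above: the xarray keys one refcount per hypervisor guard frame, where guard_fn is simply the physical address divided by the advertised granule. A worked example of that arithmetic, assuming a 4KiB granule (so guard_shift == 12); the addresses are made up:

#include <stdio.h>

int main(void)
{
	unsigned long granule = 0x1000;			/* assumed 4KiB granule */
	int guard_shift = __builtin_ctzl(granule);	/* 12 */
	unsigned long long phys = 0x90000000ULL;
	unsigned long long size = 0x4000;		/* a 16KiB mapping */

	while (size) {
		/* one xarray slot / refcount per guard frame */
		printf("guard_fn = %#llx\n", phys >> guard_shift);
		phys += granule;
		size -= granule;
	}
	return 0;	/* prints 0x90000 through 0x90003 */
}

Re-mapping any address inside an already-guarded frame only bumps the refcount, so the hypervisor MAP/UNMAP calls are issued once per frame rather than once per ioremap(). The ioremap.c hunk continues below.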
+ */ + if (slab_is_available()) + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (ref) { + refcount_set(&ref->count, 1); + if (xa_err(xa_store(&ioremap_guard_array, guard_fn, ref, + GFP_KERNEL))) { + kfree(ref); + ref = NULL; + } + } + + arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID, + phys_addr, prot, &res); + if (res.a0 != SMCCC_RET_SUCCESS) { + pr_warn_ratelimited("Failed to register %llx\n", + phys_addr); + xa_erase(&ioremap_guard_array, guard_fn); + kfree(ref); + goto out; + } + + next: + size -= guard_granule; + phys_addr += guard_granule; + } +out: + mutex_unlock(&ioremap_guard_lock); +} + +void iounmap_phys_range_hook(phys_addr_t phys_addr, size_t size) +{ + int guard_shift; + + if (!static_branch_unlikely(&ioremap_guard_key)) + return; + + VM_BUG_ON(phys_addr & ~PAGE_MASK || size & ~PAGE_MASK); + guard_shift = __builtin_ctzl(guard_granule); + + mutex_lock(&ioremap_guard_lock); + + while (size) { + u64 guard_fn = phys_addr >> guard_shift; + struct ioremap_guard_ref *ref; + struct arm_smccc_res res; + + ref = xa_load(&ioremap_guard_array, guard_fn); + if (!ref) { + pr_warn_ratelimited("%llx not tracked, left mapped\n", + phys_addr); + goto next; + } + + if (!refcount_dec_and_test(&ref->count)) + goto next; + + xa_erase(&ioremap_guard_array, guard_fn); + kfree(ref); + + arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID, + phys_addr, &res); + if (res.a0 != SMCCC_RET_SUCCESS) { + pr_warn_ratelimited("Failed to unregister %llx\n", + phys_addr); + goto out; + } + + next: + size -= guard_granule; + phys_addr += guard_granule; + } +out: + mutex_unlock(&ioremap_guard_lock); +} static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size, pgprot_t prot, void *caller) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 61b52a92b8b6..c12cd700598f 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -36,7 +36,7 @@ static phys_addr_t __init kasan_alloc_zeroed_page(int node) { void *p = memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_KASAN, node); + MEMBLOCK_ALLOC_NOLEAKTRACE, node); if (!p) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%llx\n", __func__, PAGE_SIZE, PAGE_SIZE, node, @@ -49,7 +49,8 @@ static phys_addr_t __init kasan_alloc_raw_page(int node) { void *p = memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_KASAN, node); + MEMBLOCK_ALLOC_NOLEAKTRACE, + node); if (!p) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%llx\n", __func__, PAGE_SIZE, PAGE_SIZE, node, @@ -287,13 +288,29 @@ static void __init kasan_init_depth(void) init_task.kasan_depth = 0; } +#ifdef CONFIG_KASAN_VMALLOC +void __init kasan_populate_early_vm_area_shadow(void *start, unsigned long size) +{ + unsigned long shadow_start, shadow_end; + + if (!is_vmalloc_or_module_addr(start)) + return; + + shadow_start = (unsigned long)kasan_mem_to_shadow(start); + shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); + shadow_end = (unsigned long)kasan_mem_to_shadow(start + size); + shadow_end = ALIGN(shadow_end, PAGE_SIZE); + kasan_map_populate(shadow_start, shadow_end, NUMA_NO_NODE); +} +#endif + void __init kasan_init(void) { kasan_init_shadow(); kasan_init_depth(); #if defined(CONFIG_KASAN_GENERIC) /* CONFIG_KASAN_SW_TAGS also requires kasan_init_sw_tags(). 
*/ - pr_info("KernelAddressSanitizer initialized\n"); + pr_info("KernelAddressSanitizer initialized (generic)\n"); #endif } diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c new file mode 100644 index 000000000000..849b0e628e02 --- /dev/null +++ b/arch/arm64/mm/mem_encrypt.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Implementation of the memory encryption/decryption API. + * + * Amusingly, no crypto is actually performed. Rather, we call into the + * hypervisor component of KVM to expose pages selectively to the host + * for virtio "DMA" operations. In other words, "encrypted" pages are + * not accessible to the host, whereas "decrypted" pages are. + * + * Author: Will Deacon + */ +#include +#include +#include +#include +#include +#include + +#include + +#ifndef ARM_SMCCC_KVM_FUNC_HYP_MEMINFO +#define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 + +#define ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO) +#endif /* ARM_SMCCC_KVM_FUNC_HYP_MEMINFO */ + +#ifndef ARM_SMCCC_KVM_FUNC_MEM_SHARE +#define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_SHARE) +#endif /* ARM_SMCCC_KVM_FUNC_MEM_SHARE */ + +#ifndef ARM_SMCCC_KVM_FUNC_MEM_UNSHARE +#define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) +#endif /* ARM_SMCCC_KVM_FUNC_MEM_UNSHARE */ + +static unsigned long memshare_granule_sz; + +bool mem_encrypt_active(void) +{ + return memshare_granule_sz; +} +EXPORT_SYMBOL(mem_encrypt_active); + +void kvm_init_memshare_services(void) +{ + int i; + struct arm_smccc_res res; + const u32 funcs[] = { + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO, + ARM_SMCCC_KVM_FUNC_MEM_SHARE, + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE, + }; + + for (i = 0; i < ARRAY_SIZE(funcs); ++i) { + if (!kvm_arm_hyp_service_available(funcs[i])) + return; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID, + 0, 0, 0, &res); + if (res.a0 > PAGE_SIZE) /* Includes error codes */ + return; + + memshare_granule_sz = res.a0; +} + +static int arm_smccc_share_unshare_page(u32 func_id, phys_addr_t phys) +{ + phys_addr_t end = phys + PAGE_SIZE; + + while (phys < end) { + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(func_id, phys, 0, 0, &res); + if (res.a0 != SMCCC_RET_SUCCESS) + return -EPERM; + + phys += memshare_granule_sz; + } + + return 0; +} + +static int set_memory_xcrypted(u32 func_id, unsigned long start, int numpages) +{ + void *addr = (void *)start, *end = addr + numpages * PAGE_SIZE; + + while (addr < end) { + int err; + + err = arm_smccc_share_unshare_page(func_id, virt_to_phys(addr)); + if (err) + return err; + + addr += PAGE_SIZE; + } + + return 0; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (!memshare_granule_sz || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return set_memory_xcrypted(ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID, + addr, numpages); +} + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (!memshare_granule_sz || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return set_memory_xcrypted(ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID, + addr, numpages); +} diff --git 
a/arch/arm64/mm/mem_relinquish.c b/arch/arm64/mm/mem_relinquish.c new file mode 100644 index 000000000000..7948098288e3 --- /dev/null +++ b/arch/arm64/mm/mem_relinquish.c @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Google LLC + * Author: Keir Fraser + */ + +#include +#include +#include +#include +#include + +#include + +static unsigned long memshare_granule_sz; + +void kvm_init_memrelinquish_services(void) +{ + int i; + struct arm_smccc_res res; + const u32 funcs[] = { + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO, + ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH, + }; + + for (i = 0; i < ARRAY_SIZE(funcs); ++i) { + if (!kvm_arm_hyp_service_available(funcs[i])) + return; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID, + 0, 0, 0, &res); + if (res.a0 > PAGE_SIZE) /* Includes error codes */ + return; + + memshare_granule_sz = res.a0; +} + +bool kvm_has_memrelinquish_services(void) +{ + return !!memshare_granule_sz; +} +EXPORT_SYMBOL_GPL(kvm_has_memrelinquish_services); + +void page_relinquish(struct page *page) +{ + phys_addr_t phys, end; + u32 func_id = ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID; + + if (!memshare_granule_sz) + return; + + phys = page_to_phys(page); + end = phys + PAGE_SIZE; + + while (phys < end) { + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(func_id, phys, 0, 0, &res); + BUG_ON(res.a0 != SMCCC_RET_SUCCESS); + + phys += memshare_granule_sz; + } +} +EXPORT_SYMBOL_GPL(page_relinquish); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6680689242df..d916966e724f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -97,7 +97,8 @@ static phys_addr_t __init early_pgtable_alloc(int shift) phys_addr_t phys; void *ptr; - phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); + phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_NOLEAKTRACE); if (!phys) panic("Failed to allocate page table page\n"); @@ -700,7 +701,7 @@ static bool arm64_early_this_cpu_has_bti(void) pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1); return cpuid_feature_extract_unsigned_field(pfr1, - ID_AA64PFR1_BT_SHIFT); + ID_AA64PFR1_EL1_BT_SHIFT); } /* @@ -1331,6 +1332,21 @@ void __set_fixmap(enum fixed_addresses idx, } } +pte_t *__get_fixmap_pte(enum fixed_addresses idx) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *ptep; + + BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); + + ptep = fixmap_pte(addr); + + if (!pte_valid(*ptep)) + return NULL; + + return ptep; +} + void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) { const u64 dt_virt_base = __fix_to_virt(FIX_FDT); diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index fd6cabc6d033..b3105c609758 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -53,15 +53,6 @@ bool mte_restore_tags(swp_entry_t entry, struct page *page) if (!tags) return false; - page_kasan_tag_reset(page); - /* - * We need smp_wmb() in between setting the flags and clearing the - * tags because if another thread reads page->flags and builds a - * tagged address out of it, there is an actual dependency to the - * memory access, but on the current thread we do not guarantee that - * the new page->flags are visible before the tags were updated. - */ - smp_wmb(); /* * Test PG_mte_tagged again in case it was racing with another * set_pte_at(). 
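The mem_encrypt.c and mem_relinquish.c hunks above share one pattern: walk a page in memshare_granule_sz steps and issue one SMCCC call per granule, since the hypervisor granule may be smaller than PAGE_SIZE. A hypothetical caller of the helpers added above, sketched only to show the intended calling convention (the function names and error handling here are invented for illustration):

/* Share one page with the host before using it for virtio "DMA",
 * and pull it back before freeing.  Sketch only.
 */
static void *share_page_with_host(void)
{
	unsigned long va = __get_free_page(GFP_KERNEL);

	if (!va)
		return NULL;
	if (set_memory_decrypted(va, 1)) {	/* 1 page, now host-visible */
		free_page(va);
		return NULL;
	}
	return (void *)va;
}

static void unshare_page_with_host(void *p)
{
	unsigned long va = (unsigned long)p;

	if (!set_memory_encrypted(va, 1))	/* hide from the host again */
		free_page(va);
}

Note that, per the hunk above, set_memory_encrypted()/set_memory_decrypted() return 0 when memshare_granule_sz is unset, so callers on non-pKVM systems fall through harmlessly.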
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index a3bacd79507a..64e985eaa52d 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -85,7 +85,7 @@ static int change_memory_common(unsigned long addr, int numpages, */ area = find_vm_area((void *)addr); if (!area || - end > (unsigned long)area->addr + area->size || + end > (unsigned long)kasan_reset_tag(area->addr) + area->size || !(area->flags & VM_ALLOC)) return -EINVAL; diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 1a9684b11474..b6291b877713 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -66,7 +66,8 @@ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ - MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED)) + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_iNC_oWB, MT_NORMAL_iNC_oWB)) #ifdef CONFIG_CPU_PM /** diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 4895b4d7e150..8efe6e7e1057 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1155,7 +1155,8 @@ u64 bpf_jit_alloc_exec_limit(void) void *bpf_jit_alloc_exec(unsigned long size) { - return vmalloc(size); + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(vmalloc(size)); } void bpf_jit_free_exec(void *addr) diff --git a/arch/arm64/tools/.gitignore b/arch/arm64/tools/.gitignore new file mode 100644 index 000000000000..1ddedddfffbe --- /dev/null +++ b/arch/arm64/tools/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +gen-hyprel diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile index 932b4fe5c768..3ae620d01ae3 100644 --- a/arch/arm64/tools/Makefile +++ b/arch/arm64/tools/Makefile @@ -20,3 +20,7 @@ quiet_cmd_gen_cpucaps = GEN $@ $(kapi)/cpucaps.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE $(call if_changed,gen_cpucaps) + +HOST_EXTRACFLAGS += -I$(objtree)/include +hostprogs += gen-hyprel +gen-hyprel: $(obj)/gen-hyprel diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 6b1e70aee8cf..a076d28b9126 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -7,7 +7,8 @@ BTI HAS_32BIT_EL0_DO_NOT_USE HAS_32BIT_EL1 HAS_ADDRESS_AUTH -HAS_ADDRESS_AUTH_ARCH +HAS_ADDRESS_AUTH_ARCH_QARMA3 +HAS_ADDRESS_AUTH_ARCH_QARMA5 HAS_ADDRESS_AUTH_IMP_DEF HAS_AMU_EXTN HAS_ARMv8_4_TTL @@ -18,9 +19,11 @@ HAS_CRC32 HAS_DCPODP HAS_DCPOP HAS_E0PD +HAS_ECV HAS_EPAN HAS_GENERIC_AUTH -HAS_GENERIC_AUTH_ARCH +HAS_GENERIC_AUTH_ARCH_QARMA3 +HAS_GENERIC_AUTH_ARCH_QARMA5 HAS_GENERIC_AUTH_IMP_DEF HAS_IRQ_PRIO_MASKING HAS_LDAPR @@ -35,10 +38,14 @@ HAS_STAGE2_FWB HAS_SYSREG_GIC_CPUIF HAS_TLB_RANGE HAS_VIRT_HOST_EXTN +HAS_WFXT HW_DBM KVM_PROTECTED_MODE MISMATCHED_CACHE_TYPE MTE +MTE_ASYMM +SME +SME_FA64 SPECTRE_V2 SPECTRE_V3A SPECTRE_V4 @@ -56,6 +63,10 @@ WORKAROUND_1508412 WORKAROUND_1542419 WORKAROUND_1742098 WORKAROUND_2457168 +WORKAROUND_2658417 +WORKAROUND_TRBE_OVERWRITE_FILL_MODE +WORKAROUND_TSB_FLUSH_FAILURE +WORKAROUND_TRBE_WRITE_OUT_OF_RANGE WORKAROUND_CAVIUM_23154 WORKAROUND_CAVIUM_27456 WORKAROUND_CAVIUM_30115 diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/tools/gen-hyprel.c similarity index 100% rename from arch/arm64/kvm/hyp/nvhe/gen-hyprel.c rename to arch/arm64/tools/gen-hyprel.c diff --git a/arch/csky/kernel/perf_callchain.c b/arch/csky/kernel/perf_callchain.c index 75e1f9df5f60..1612f4354087 100644 --- 
a/arch/csky/kernel/perf_callchain.c +++ b/arch/csky/kernel/perf_callchain.c @@ -86,13 +86,8 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; - /* C-SKY does not support virtualization. */ - if (guest_cbs && guest_cbs->is_in_guest()) - return; - fp = regs->regs[4]; perf_callchain_store(entry, regs->pc); @@ -111,15 +106,8 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - /* C-SKY does not support virtualization. */ - if (guest_cbs && guest_cbs->is_in_guest()) { - pr_warn("C-SKY does not support perf in guest mode!"); - return; - } - fr.fp = regs->regs[4]; fr.lr = regs->lr; walk_stackframe(&fr, entry); diff --git a/arch/h8300/boot/compressed/Makefile b/arch/h8300/boot/compressed/Makefile index 5942793f77a0..6ab2fa5ba105 100644 --- a/arch/h8300/boot/compressed/Makefile +++ b/arch/h8300/boot/compressed/Makefile @@ -30,9 +30,11 @@ $(obj)/vmlinux.bin: vmlinux FORCE suffix-$(CONFIG_KERNEL_GZIP) := gzip suffix-$(CONFIG_KERNEL_LZO) := lzo +compress-$(CONFIG_KERNEL_GZIP) := gzip +compress-$(CONFIG_KERNEL_LZO) := lzo_with_size $(obj)/vmlinux.bin.$(suffix-y): $(obj)/vmlinux.bin FORCE - $(call if_changed,$(suffix-y)) + $(call if_changed,$(compress-y)) LDFLAGS_piggy.o := -r --format binary --oformat elf32-h8300-linux -T OBJCOPYFLAGS := -O binary diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 705b9e7f8035..8cec6d70bc15 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -80,12 +80,12 @@ $(obj)/vmlinux.bin: $(KBUILD_IMAGE) FORCE $(call if_changed,objcopy) tool_$(CONFIG_KERNEL_GZIP) = gzip -tool_$(CONFIG_KERNEL_BZIP2) = bzip2 -tool_$(CONFIG_KERNEL_LZ4) = lz4 -tool_$(CONFIG_KERNEL_LZMA) = lzma -tool_$(CONFIG_KERNEL_LZO) = lzo -tool_$(CONFIG_KERNEL_XZ) = xzkern -tool_$(CONFIG_KERNEL_ZSTD) = zstd22 +tool_$(CONFIG_KERNEL_BZIP2) = bzip2_with_size +tool_$(CONFIG_KERNEL_LZ4) = lz4_with_size +tool_$(CONFIG_KERNEL_LZMA) = lzma_with_size +tool_$(CONFIG_KERNEL_LZO) = lzo_with_size +tool_$(CONFIG_KERNEL_XZ) = xzkern_with_size +tool_$(CONFIG_KERNEL_ZSTD) = zstd22_with_size targets += vmlinux.bin.z diff --git a/arch/mips/kvm/loongson_ipi.c b/arch/mips/kvm/loongson_ipi.c index 3681fc8fba38..5d53f32d837c 100644 --- a/arch/mips/kvm/loongson_ipi.c +++ b/arch/mips/kvm/loongson_ipi.c @@ -120,7 +120,7 @@ static int loongson_vipi_write(struct loongson_kvm_ipi *ipi, s->status |= data; irq.cpu = id; irq.irq = 6; - kvm_vcpu_ioctl_interrupt(kvm->vcpus[id], &irq); + kvm_vcpu_ioctl_interrupt(kvm_get_vcpu(kvm, id), &irq); break; case CORE0_CLEAR_OFF: @@ -128,7 +128,7 @@ static int loongson_vipi_write(struct loongson_kvm_ipi *ipi, if (!s->status) { irq.cpu = id; irq.irq = -6; - kvm_vcpu_ioctl_interrupt(kvm->vcpus[id], &irq); + kvm_vcpu_ioctl_interrupt(kvm_get_vcpu(kvm, id), &irq); } break; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 75c6f264c626..43d0fff75bf8 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -171,25 +171,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } -void kvm_mips_free_vcpus(struct kvm *kvm) -{ - unsigned int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, 
kvm) { - kvm_vcpu_destroy(vcpu); - } - - mutex_lock(&kvm->lock); - - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - - mutex_unlock(&kvm->lock); -} - static void kvm_mips_free_gpa_pt(struct kvm *kvm) { /* It should always be safe to remove after flushing the whole range */ @@ -199,7 +180,7 @@ static void kvm_mips_free_gpa_pt(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_mips_free_vcpus(kvm); + kvm_destroy_vcpus(kvm); kvm_mips_free_gpa_pt(kvm); } @@ -498,7 +479,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, if (irq->cpu == -1) dvcpu = vcpu; else - dvcpu = vcpu->kvm->vcpus[irq->cpu]; + dvcpu = kvm_get_vcpu(vcpu->kvm, irq->cpu); if (intr == 2 || intr == 3 || intr == 4 || intr == 6) { kvm_mips_callbacks->queue_io_int(dvcpu, irq); diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c index f38791960781..a78a879e7ef1 100644 --- a/arch/nds32/kernel/perf_event_cpu.c +++ b/arch/nds32/kernel/perf_event_cpu.c @@ -1363,7 +1363,6 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; unsigned long gp = 0; unsigned long lp = 0; @@ -1372,11 +1371,6 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, leaf_fp = 0; - if (guest_cbs && guest_cbs->is_in_guest()) { - /* We don't support guest os callchain now */ - return; - } - perf_callchain_store(entry, regs->ipc); fp = regs->fp; gp = regs->gp; @@ -1480,13 +1474,8 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - if (guest_cbs && guest_cbs->is_in_guest()) { - /* We don't support guest os callchain now */ - return; - } fr.fp = regs->fp; fr.lp = regs->lp; fr.sp = regs->sp; @@ -1495,32 +1484,17 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, unsigned long perf_instruction_pointer(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - /* However, NDS32 does not support virtualization */ - if (guest_cbs && guest_cbs->is_in_guest()) - return guest_cbs->get_guest_ip(); - return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - /* However, NDS32 does not support virtualization */ - if (guest_cbs && guest_cbs->is_in_guest()) { - if (guest_cbs->is_user_mode()) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; return misc; } diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile index 9fe54878167d..d2283f9fa891 100644 --- a/arch/parisc/boot/compressed/Makefile +++ b/arch/parisc/boot/compressed/Makefile @@ -73,15 +73,15 @@ suffix-$(CONFIG_KERNEL_XZ) := xz $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) + $(call if_changed,bzip2_with_size) $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lz4) + $(call if_changed,lz4_with_size) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call 
if_changed,lzma) + $(call if_changed,lzma_with_size) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) + $(call if_changed,lzo_with_size) $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) + $(call if_changed,xzkern_with_size) LDFLAGS_piggy.o := -r --format binary --oformat $(LD_BFD) -T $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index eef0096db5f8..24e7f1fae38b 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -27,7 +27,7 @@ void flush_kernel_icache_range_asm(unsigned long, unsigned long); void flush_user_dcache_range_asm(unsigned long, unsigned long); void flush_kernel_dcache_range_asm(unsigned long, unsigned long); void purge_kernel_dcache_range_asm(unsigned long, unsigned long); -void flush_kernel_dcache_page_asm(void *); +void flush_kernel_dcache_page_asm(const void *addr); void flush_kernel_icache_page(void *); /* Cache flush operations */ @@ -36,7 +36,7 @@ void flush_cache_all_local(void); void flush_cache_all(void); void flush_cache_mm(struct mm_struct *mm); -void flush_kernel_dcache_page_addr(void *addr); +void flush_kernel_dcache_page_addr(const void *addr); #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); @@ -97,7 +97,7 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } #define ARCH_HAS_FLUSH_ON_KUNMAP -static inline void kunmap_flush_on_unmap(void *addr) +static inline void kunmap_flush_on_unmap(const void *addr) { flush_kernel_dcache_page_addr(addr); } diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 394e6e14e5c4..f82ff6681a55 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -448,7 +448,7 @@ extern void purge_kernel_dcache_page_asm(unsigned long); extern void clear_user_page_asm(void *, unsigned long); extern void copy_user_page_asm(void *, void *, unsigned long); -void flush_kernel_dcache_page_addr(void *addr) +void flush_kernel_dcache_page_addr(const void *addr) { unsigned long flags; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 27222b75d2a4..2dea57ee25cb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -151,6 +151,7 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x + select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT if PPC_BOOK3S_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ed91d5b9ffc6..049b1fc4f43f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -49,7 +49,7 @@ obj-y := cputable.o syscalls.o \ udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o firmware.o \ hw_breakpoint_constraints.o interrupt.o \ - kdebugfs.o + kdebugfs.o stacktrace.o obj-y += ptrace/ obj-$(CONFIG_PPC64) += setup_64.o \ paca.o nvram_64.o note.o @@ -118,7 +118,6 @@ obj-$(CONFIG_OPTPROBES) += optprobes.o optprobes_head.o obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o diff --git a/arch/powerpc/kvm/book3s_32_mmu.c 
b/arch/powerpc/kvm/book3s_32_mmu.c index 3fbd570f9c1e..0215f32932a9 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -337,7 +337,7 @@ static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large) { - int i; + unsigned long i; struct kvm_vcpu *v; /* flush this VA on all cpus */ diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index feee40cb2ba1..61290282fd9e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -530,7 +530,7 @@ static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va, bool large) { u64 mask = 0xFFFFFFFFFULL; - long i; + unsigned long i; struct kvm_vcpu *v; dprintk("KVM MMU: tlbie(0x%lx)\n", va); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index eba77096c443..207547d05461 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1992,7 +1992,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, */ if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->arch.vcore != vc) @@ -4785,8 +4785,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int i, r; - unsigned long n; + int r; + unsigned long n, i; unsigned long *buf, *p; struct kvm_vcpu *vcpu; @@ -5868,7 +5868,7 @@ static int kvmhv_svm_off(struct kvm *kvm) int mmu_was_ready; int srcu_idx; int ret = 0; - int i; + unsigned long i; if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) return ret; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 6bc9425acb32..bb0612c49b92 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -428,7 +428,7 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) /************* MMU Notifiers *************/ static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - long i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index ebd5d920de8c..9cc466006e8b 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -942,8 +942,8 @@ static int xics_debug_show(struct seq_file *m, void *private) struct kvmppc_xics *xics = m->private; struct kvm *kvm = xics->kvm; struct kvm_vcpu *vcpu; - int icsid, i; - unsigned long flags; + int icsid; + unsigned long flags, i; unsigned long t_rm_kick_vcpu, t_rm_check_resend; unsigned long t_rm_notify_eoi; unsigned long t_reject, t_check_resend; @@ -1340,7 +1340,7 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) static void kvmppc_xics_release(struct kvm_device *dev) { struct kvmppc_xics *xics = dev->private; - int i; + unsigned long i; struct kvm *kvm = xics->kvm; struct kvm_vcpu *vcpu; diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 6231f76bdd66..8e4c79e2fcd8 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -116,7 +116,7 @@ static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, u32 nr) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num) diff --git a/arch/powerpc/kvm/book3s_xive.c 
b/arch/powerpc/kvm/book3s_xive.c index a18db9e16ea4..e868185668fe 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -368,7 +368,8 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvm_vcpu *vcpu; - int i, rc; + unsigned long i; + int rc; lockdep_assert_held(&xive->lock); @@ -439,7 +440,8 @@ static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio) int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio) { struct kvm_vcpu *vcpu; - int i, rc; + unsigned long i; + int rc; /* Locate target server */ vcpu = kvmppc_xive_find_server(kvm, *server); @@ -1519,7 +1521,8 @@ static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q) static void xive_pre_save_scan(struct kvmppc_xive *xive) { struct kvm_vcpu *vcpu = NULL; - int i, j; + unsigned long i; + int j; /* * See comment in xive_get_source() about how this @@ -1700,7 +1703,7 @@ static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq) { struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -2037,7 +2040,7 @@ static void kvmppc_xive_release(struct kvm_device *dev) struct kvmppc_xive *xive = dev->private; struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; pr_devel("Releasing xive device\n"); @@ -2291,7 +2294,7 @@ static int xive_debug_show(struct seq_file *m, void *private) u64 t_vm_h_cppr = 0; u64 t_vm_h_eoi = 0; u64 t_vm_h_ipi = 0; - unsigned int i; + unsigned long i; if (!kvm) return 0; diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index e6a9651c6f1e..09d0657596c3 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -199,7 +199,7 @@ struct kvmppc_xive_vcpu { static inline struct kvm_vcpu *kvmppc_xive_find_server(struct kvm *kvm, u32 nr) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->arch.xive_vcpu && nr == vcpu->arch.xive_vcpu->server_num) @@ -240,7 +240,7 @@ static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server) static inline bool kvmppc_xive_vp_in_use(struct kvm *kvm, u32 vp_id) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->arch.xive_vcpu && vp_id == vcpu->arch.xive_vcpu->vp_id) diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 99db9ac49901..561a5bfe0468 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -807,7 +807,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive) { struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; - unsigned int i; + unsigned long i; pr_devel("%s\n", __func__); @@ -916,7 +916,7 @@ static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive) { struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; - unsigned int i; + unsigned long i; pr_devel("%s\n", __func__); @@ -1017,7 +1017,7 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) struct kvmppc_xive *xive = dev->private; struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; pr_devel("Releasing xive native device\n"); @@ -1214,7 +1214,7 @@ static int xive_native_debug_show(struct seq_file *m, void *private) struct kvmppc_xive *xive = m->private; struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; - unsigned int i; + unsigned long i; if (!kvm) return 0; 
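The index-type churn in these powerpc hunks (int or unsigned int loop counters becoming unsigned long) is mechanical fallout from the same series: upstream moved the kvm->vcpus bookkeeping into an xarray, whose iterator takes an unsigned long index. Roughly the post-conversion shape of the iterator these loops feed (an assumption based on the upstream xarray conversion; the definition is not part of this diff):

/* virt/kvm: vcpu_array is an xarray, so the cursor must be unsigned long. */
#define kvm_for_each_vcpu(idx, vcpup, kvm)				\
	xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0,		\
			  atomic_read(&kvm->online_vcpus) - 1)

Passing an int cursor would be a type mismatch once the macro expands to xa_for_each_range(), which is why every kvm_for_each_vcpu() caller is converted here, even where the old int could never have overflowed.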
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 64eb833e9f02..051102d50c31 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -65,7 +65,7 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) ulong param = vcpu->arch.regs.gpr[rb]; int prio = dbell2prio(rb); int pir = param & PPC_DBELL_PIR_MASK; - int i; + unsigned long i; struct kvm_vcpu *cvcpu; if (prio < 0) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ee305455bd8d..1d697223db10 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -463,9 +463,6 @@ err_out: void kvm_arch_destroy_vm(struct kvm *kvm) { - unsigned int i; - struct kvm_vcpu *vcpu; - #ifdef CONFIG_KVM_XICS /* * We call kick_all_cpus_sync() to ensure that all @@ -476,14 +473,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kick_all_cpus_sync(); #endif - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_destroy(vcpu); + kvm_destroy_vcpus(kvm); mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); kvmppc_core_destroy_vm(kvm); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 4a15172dfef2..888c12f405c2 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -394,6 +394,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, int is_write = page_fault_is_write(error_code); vm_fault_t fault, major = 0; bool kprobe_fault = kprobe_page_fault(regs, 11); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + unsigned long seq; +#endif if (unlikely(debugger_fault_handler(regs) || kprobe_fault)) return 0; @@ -450,6 +453,64 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + /* + * No need to try speculative faults for kernel or + * single threaded user space. + */ + if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) + goto no_spf; + + count_vm_event(SPF_ATTEMPT); + seq = mmap_seq_read_start(mm); + if (seq & 1) { + count_vm_spf_event(SPF_ABORT_ODD); + goto spf_abort; + } + vma = get_vma(mm, address); + if (!vma) { + count_vm_spf_event(SPF_ABORT_UNMAPPED); + goto spf_abort; + } + if (!vma_can_speculate(vma, flags)) { + put_vma(vma); + count_vm_spf_event(SPF_ABORT_NO_SPECULATE); + goto spf_abort; + } + + if (!mmap_seq_read_check(mm, seq, SPF_ABORT_VMA_COPY)) { + put_vma(vma); + goto spf_abort; + } +#ifdef CONFIG_PPC_MEM_KEYS + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) { + put_vma(vma); + count_vm_spf_event(SPF_ABORT_ACCESS_ERROR); + goto spf_abort; + } +#endif /* CONFIG_PPC_MEM_KEYS */ + if (unlikely(access_error(is_write, is_exec, vma))) { + put_vma(vma); + count_vm_spf_event(SPF_ABORT_ACCESS_ERROR); + goto spf_abort; + } + fault = do_handle_mm_fault(vma, address, + flags | FAULT_FLAG_SPECULATIVE, seq, regs); + put_vma(vma); + major |= fault & VM_FAULT_MAJOR; + + if (fault_signal_pending(fault, regs)) + return user_mode(regs) ? 0 : SIGBUS; + if (!(fault & VM_FAULT_RETRY)) + goto done; + +spf_abort: + count_vm_event(SPF_ABORT); +no_spf: + +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. 
Unfortunately, in the case of an @@ -523,6 +584,9 @@ retry: } mmap_read_unlock(current->mm); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +done: +#endif if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c index 357f985041cb..3348a61de7d9 100644 --- a/arch/riscv/kernel/perf_callchain.c +++ b/arch/riscv/kernel/perf_callchain.c @@ -56,13 +56,8 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; - /* RISC-V does not support perf in guest mode. */ - if (guest_cbs && guest_cbs->is_in_guest()) - return; - fp = regs->s0; perf_callchain_store(entry, regs->epc); @@ -79,13 +74,5 @@ static bool fill_callchain(void *entry, unsigned long pc) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - /* RISC-V does not support perf in guest mode. */ - if (guest_cbs && guest_cbs->is_in_guest()) { - pr_warn("RISC-V does not support perf in guest mode!"); - return; - } - walk_stackframe(NULL, regs, fill_callchain, entry); } diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 811e837a8c4e..b4f741934c3b 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -142,12 +142,8 @@ unsigned long get_wchan(struct task_struct *task) return pc; } -#ifdef CONFIG_STACKTRACE - noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) { walk_stackframe(task, regs, consume_entry, cookie); } - -#endif /* CONFIG_STACKTRACE */ diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile index 3b860061e84d..8ea880b7c3ec 100644 --- a/arch/s390/boot/compressed/Makefile +++ b/arch/s390/boot/compressed/Makefile @@ -71,17 +71,17 @@ suffix-$(CONFIG_KERNEL_ZSTD) := .zst $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) + $(call if_changed,bzip2_with_size) $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lz4) + $(call if_changed,lz4_with_size) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzma) + $(call if_changed,lzma_with_size) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) + $(call if_changed,lzo_with_size) $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) + $(call if_changed,xzkern_with_size) $(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE - $(call if_changed,zstd22) + $(call if_changed,zstd22_with_size) OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed $(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 80f500ffb55c..be8007f367aa 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -40,7 +40,7 @@ obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o text_amode31.o +obj-y += smp.o 
text_amode31.o stacktrace.o extra-y += head64.o vmlinux.lds @@ -55,7 +55,6 @@ compat-obj-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KPROBES) += kprobes_insn_page.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b032e556eeb7..a7aefc278909 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -45,7 +45,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ca7d09f09809..969ff5c047e6 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2670,7 +2670,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; switch (attr->group) { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c61533e1448a..2bfa86e6f19a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -295,7 +295,7 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; unsigned long long *delta = v; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -680,7 +680,7 @@ out: static void icpt_operexc_on_all_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -934,7 +934,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_s390_vcpu_block_all(kvm); @@ -1019,7 +1019,7 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) { - int cx; + unsigned long cx; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(cx, vcpu, kvm) @@ -2219,7 +2219,7 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) struct kvm_vcpu *vcpu; u16 rc, rrc; int ret = 0; - int i; + unsigned long i; /* * We ignore failures and try to destroy as many CPUs as possible. 
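
The int-to-unsigned-long conversions running through these kvm-s390.c hunks all stem from one 5.17-era change: the per-VM vCPU table became an xarray, and kvm_for_each_vcpu() now expands to xa_for_each_range(), which requires an unsigned long index (xa_find() takes an unsigned long *, so an int loop variable no longer even compiles). The same conversion is why later hunks replace direct vcpu->kvm->vcpus[i] loads with kvm_get_vcpu(). Below is a minimal sketch of the caller-side pattern; the macro shape is recalled from include/linux/kvm_host.h of this era, and the helper function itself is hypothetical, merely mirroring broadcast loops such as kvm_s390_sync_request_broadcast():

/*
 * Assumed iterator shape (5.17-era include/linux/kvm_host.h):
 *
 *   #define kvm_for_each_vcpu(idx, vcpup, kvm)                  \
 *           xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0,  \
 *                             atomic_read(&kvm->online_vcpus) - 1)
 */
#include <linux/kvm_host.h>

/* Hypothetical helper; not part of the patch above. */
static void example_broadcast(struct kvm *kvm, int req)
{
	unsigned long i;	/* xa_for_each_range() index: must be unsigned long */
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_request(req, vcpu);
}
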
@@ -2243,7 +2243,8 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) { - int i, r = 0; + unsigned long i; + int r = 0; u16 dummy; struct kvm_vcpu *vcpu; @@ -2834,27 +2835,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)(vcpu->arch.sie_block)); } -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_destroy(vcpu); - - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); -} - void kvm_arch_destroy_vm(struct kvm *kvm) { u16 rc, rrc; - kvm_free_vcpus(kvm); + kvm_destroy_vcpus(kvm); sca_dispose(kvm); kvm_s390_gisa_destroy(kvm); /* @@ -2958,7 +2943,7 @@ static int sca_switch_to_extended(struct kvm *kvm) struct bsca_block *old_sca = kvm->arch.sca; struct esca_block *new_sca; struct kvm_vcpu *vcpu; - unsigned int vcpu_idx; + unsigned long vcpu_idx; u32 scaol, scaoh; if (kvm->arch.use_esca) @@ -3440,7 +3425,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; unsigned long prefix; - int i; + unsigned long i; if (gmap_is_shadow(gmap)) return; @@ -3932,7 +3917,7 @@ static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_t { struct kvm_vcpu *vcpu; union tod_clock clk; - int i; + unsigned long i; preempt_disable(); @@ -4570,7 +4555,7 @@ static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) static void __disable_ibs_on_all_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -4608,7 +4593,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) } for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) + if (!is_vcpu_stopped(kvm_get_vcpu(vcpu->kvm, i))) started_vcpus++; } @@ -4675,9 +4660,11 @@ int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) __disable_ibs_on_vcpu(vcpu); for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) { + struct kvm_vcpu *tmp = kvm_get_vcpu(vcpu->kvm, i); + + if (!is_vcpu_stopped(tmp)) { started_vcpus++; - started_vcpu = vcpu->kvm->vcpus[i]; + started_vcpu = tmp; } } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index a2fde6d69057..5d646d215458 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -347,7 +347,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; WARN_ON(!mutex_is_locked(&kvm->lock)); @@ -357,7 +357,7 @@ static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile index 589d2d8a573d..0525d8abdf45 100644 --- a/arch/sh/boot/compressed/Makefile +++ b/arch/sh/boot/compressed/Makefile @@ -58,13 +58,13 @@ vmlinux.bin.all-y := $(obj)/vmlinux.bin $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) + $(call if_changed,bzip2_with_size) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzma) + $(call if_changed,lzma_with_size) $(obj)/vmlinux.bin.xz: 
$(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) + $(call if_changed,xzkern_with_size) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) + $(call if_changed,lzo_with_size) OBJCOPYFLAGS += -R .empty_zero_page diff --git a/arch/um/Kconfig b/arch/um/Kconfig index c18b45f75d41..16627fcf1ccd 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -11,6 +11,8 @@ config UML select ARCH_HAS_STRNLEN_USER select ARCH_NO_PREEMPT select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_KASAN if X86_64 + select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN select HAVE_ARCH_SECCOMP_FILTER select HAVE_ASM_MODVERSIONS select HAVE_UID16 @@ -220,6 +222,19 @@ config UML_TIME_TRAVEL_SUPPORT It is safe to say Y, but you probably don't need this. +config KASAN_SHADOW_OFFSET + hex + depends on KASAN + default 0x100000000000 + help + This is the offset at which the ~16TB of shadow memory is + mapped and used by KASAN for memory debugging. This can be any + address that has at least KASAN_SHADOW_SIZE (total address space divided + by 8) amount of space so that the KASAN shadow memory does not conflict + with anything. The default is 0x100000000000, which works even if mem is + set to a large value. On low-memory systems, try 0x7fff8000, as it fits + into the immediate of most instructions, improving performance. + endmenu source "arch/um/drivers/Kconfig" diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index eca6c452a41b..fd481ac371de 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -83,6 +83,8 @@ } .init_array : { __init_array_start = .; + *(.kasan_init) + *(.init_array.*) *(.init_array) __init_array_end = .; } diff --git a/arch/um/include/asm/kasan.h b/arch/um/include/asm/kasan.h new file mode 100644 index 000000000000..0d6547f4ec85 --- /dev/null +++ b/arch/um/include/asm/kasan.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_UM_KASAN_H +#define __ASM_UM_KASAN_H + +#include +#include + +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) + +/* used in kasan_mem_to_shadow to divide by 8 */ +#define KASAN_SHADOW_SCALE_SHIFT 3 + +#ifdef CONFIG_X86_64 +#define KASAN_HOST_USER_SPACE_END_ADDR 0x00007fffffffffffUL +/* KASAN_SHADOW_SIZE is the size of total address space divided by 8 */ +#define KASAN_SHADOW_SIZE ((KASAN_HOST_USER_SPACE_END_ADDR + 1) >> \ + KASAN_SHADOW_SCALE_SHIFT) +#else +#error "KASAN_SHADOW_SIZE is not defined for this sub-architecture" +#endif /* CONFIG_X86_64 */ + +#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET) +#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) + +#ifdef CONFIG_KASAN +void kasan_init(void); +void kasan_map_memory(void *start, unsigned long len); +extern int kasan_um_is_ready; + +#ifdef CONFIG_STATIC_LINK +#define kasan_arch_is_ready() (kasan_um_is_ready) +#endif +#else +static inline void kasan_init(void) { } +#endif /* CONFIG_KASAN */ + +#endif /* __ASM_UM_KASAN_H */ diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 2f2a8ce92f1e..2b7fc5b54164 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -109,7 +109,11 @@ SECTIONS be empty, which isn't pretty. */ . 
= ALIGN(32 / 8); .preinit_array : { *(.preinit_array) } - .init_array : { *(.init_array) } + .init_array : { + *(.kasan_init) + *(.init_array.*) + *(.init_array) + } .fini_array : { *(.fini_array) } .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 8e636ce02949..6d1537356c50 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -18,6 +18,25 @@ #include #include #include +#include + +#ifdef CONFIG_KASAN +int kasan_um_is_ready; +void kasan_init(void) +{ + /* + * kasan_map_memory will map all of the required address space and + * the host machine will allocate physical memory as necessary. + */ + kasan_map_memory((void *)KASAN_SHADOW_START, KASAN_SHADOW_SIZE); + init_task.kasan_depth = 0; + kasan_um_is_ready = true; +} + +static void (*kasan_init_ptr)(void) +__section(".kasan_init") __used += kasan_init; +#endif /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ unsigned long *empty_zero_page = NULL; diff --git a/arch/um/kernel/stacktrace.c b/arch/um/kernel/stacktrace.c index 86df52168bd9..fd3b61b3d4d2 100644 --- a/arch/um/kernel/stacktrace.c +++ b/arch/um/kernel/stacktrace.c @@ -27,7 +27,7 @@ void dump_trace(struct task_struct *tsk, frame = (struct stack_frame *)bp; while (((long) sp & (THREAD_SIZE-1)) != 0) { - addr = *sp; + addr = READ_ONCE_NOCHECK(*sp); if (__kernel_text_address(addr)) { reliable = 0; if ((unsigned long) sp == bp + sizeof(long)) { diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index 3c1b77474d2d..8530b2e08604 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -17,6 +17,28 @@ #include #include +/* + * kasan_map_memory - maps memory from @start with a size of @len. + * The allocated memory is filled with zeroes upon success. + * @start: the start address of the memory to be mapped + * @len: the length of the memory to be mapped + * + * This function is used to map shadow memory for KASAN in uml + */ +void kasan_map_memory(void *start, size_t len) +{ + if (mmap(start, + len, + PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE|MAP_NORESERVE, + -1, + 0) == MAP_FAILED) { + os_info("Couldn't allocate shadow memory: %s\n.", + strerror(errno)); + exit(1); + } +} + /* Set by make_tempfile() during early boot. */ static char *tempdir = NULL; diff --git a/arch/um/os-Linux/user_syms.c b/arch/um/os-Linux/user_syms.c index 715594fe5719..cb667c9225ab 100644 --- a/arch/um/os-Linux/user_syms.c +++ b/arch/um/os-Linux/user_syms.c @@ -27,10 +27,10 @@ EXPORT_SYMBOL(strstr); #ifndef __x86_64__ extern void *memcpy(void *, const void *, size_t); EXPORT_SYMBOL(memcpy); -#endif - EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memset); +#endif + EXPORT_SYMBOL(printf); /* Here, instead, I can provide a fake prototype. Yes, someone cares: genksyms. 
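
Taken together, the UML KASAN pieces above work as follows: KASAN_SHADOW_OFFSET chooses where the shadow region sits in the host process, kasan_map_memory() reserves the whole region with one MAP_NORESERVE mmap() so the host only materializes pages on first touch, and every address is translated to its shadow byte by a shift and an add. The translation is the generic one from include/linux/kasan.h; the sketch below restates it with the UML constants from the hunks above so the arithmetic in the Kconfig help can be checked (this is an illustration, not UML source):

/*
 * Generic KASAN address-to-shadow translation, restated with the UML
 * defaults.  One shadow byte covers 8 bytes of memory, so the 2^47-byte
 * host user space needs 2^44 bytes (~16 TiB) of shadow, matching the
 * Kconfig help text.
 */
#define KASAN_SHADOW_SCALE_SHIFT	3
#define KASAN_SHADOW_OFFSET		0x100000000000UL	/* Kconfig default */

static inline void *kasan_mem_to_shadow(const void *addr)
{
	return (void *)(((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
			+ KASAN_SHADOW_OFFSET);
}

/*
 * Worked example: addr 0x700000000000 maps to
 *   0x100000000000 + (0x700000000000 >> 3) = 0x1e0000000000,
 * which lies inside [KASAN_SHADOW_START, KASAN_SHADOW_END), so the first
 * access simply faults in a zeroed host page.
 */

The low alternative offset 0x7fff8000 mentioned in the help text fits in the 32-bit immediate field of x86-64 instructions, which is why the help suggests it for low-memory systems: the compiler can fold the offset add into a single addressing mode.
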
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0f2234cd8453..e933c20ba342 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86_64 select SWIOTLB select ARCH_HAS_ELFCORE_COMPAT select ZONE_DMA32 + select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT config FORCE_DYNAMIC_FTRACE def_bool y @@ -84,6 +85,7 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_COPY_MC if X86_64 select ARCH_HAS_SET_MEMORY diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 9c09bbd390ce..1ac31384f84b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -244,8 +244,11 @@ BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 hdimage isoimage PHONY += bzImage $(BOOT_TARGETS) +# Don't compile Image in mixed build with "all" target +ifndef KBUILD_MIXED_TREE # Default kernel to build all: bzImage +endif # KBUILD_IMAGE specify target image being built KBUILD_IMAGE := $(boot)/bzImage diff --git a/arch/x86/OWNERS b/arch/x86/OWNERS new file mode 100644 index 000000000000..f59fa995b361 --- /dev/null +++ b/arch/x86/OWNERS @@ -0,0 +1,3 @@ +per-file crypto/**=file:/crypto/OWNERS +per-file mm/**=file:/mm/OWNERS +per-file net/**=file:/net/OWNERS diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 34c9dbb6a47d..686a9d75a0e4 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -110,66 +110,78 @@ typedef unsigned int addr_t; static inline u8 rdfs8(addr_t addr) { + u8 *ptr = (u8 *)absolute_pointer(addr); u8 v; - asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr)); return v; } static inline u16 rdfs16(addr_t addr) { + u16 *ptr = (u16 *)absolute_pointer(addr); u16 v; - asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline u32 rdfs32(addr_t addr) { + u32 *ptr = (u32 *)absolute_pointer(addr); u32 v; - asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline void wrfs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); + u8 *ptr = (u8 *)absolute_pointer(addr); + asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v)); } static inline void wrfs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); + u16 *ptr = (u16 *)absolute_pointer(addr); + asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v)); } static inline void wrfs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); + u32 *ptr = (u32 *)absolute_pointer(addr); + asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v)); } static inline u8 rdgs8(addr_t addr) { + u8 *ptr = (u8 *)absolute_pointer(addr); u8 v; - asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr)); return v; } static inline u16 rdgs16(addr_t addr) { + u16 *ptr = (u16 *)absolute_pointer(addr); u16 v; - asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline u32 rdgs32(addr_t addr) { + u32 *ptr = (u32 *)absolute_pointer(addr); u32 v; - asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr)); 
return v; } static inline void wrgs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); + u8 *ptr = (u8 *)absolute_pointer(addr); + asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v)); } static inline void wrgs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); + u16 *ptr = (u16 *)absolute_pointer(addr); + asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v)); } static inline void wrgs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); + u32 *ptr = (u32 *)absolute_pointer(addr); + asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v)); } /* Note: these only return true/false, not a signed return value! */ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 15c5ae62a0e9..c66ea96936de 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -130,17 +130,17 @@ vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) + $(call if_changed,bzip2_with_size) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzma) + $(call if_changed,lzma_with_size) $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) + $(call if_changed,xzkern_with_size) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) + $(call if_changed,lzo_with_size) $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lz4) + $(call if_changed,lz4_with_size) $(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE - $(call if_changed,zstd22) + $(call if_changed,zstd22_with_size) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index e3add857c2c9..c421af5a3cdc 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -33,7 +33,7 @@ static void copy_boot_params(void) u16 cl_offset; }; const struct old_cmdline * const oldcmd = - (const struct old_cmdline *)OLD_CL_ADDRESS; + absolute_pointer(OLD_CL_ADDRESS); BUILD_BUG_ON(sizeof(boot_params) != 4096); memcpy(&boot_params.hdr, &hdr, sizeof(hdr)); diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig new file mode 100644 index 000000000000..b2972345690f --- /dev/null +++ b/arch/x86/configs/gki_defconfig @@ -0,0 +1,668 @@ +CONFIG_UAPI_HEADER_TEST=y +CONFIG_KERNEL_LZ4=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_PREEMPT=y +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_TASKSTATS=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +CONFIG_RCU_EXPERT=y +CONFIG_RCU_FAST_NO_HZ=y +CONFIG_RCU_BOOST=y +CONFIG_RCU_NOCB_CPU=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_IKHEADERS=y +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=20 +CONFIG_CGROUPS=y +CONFIG_MEMCG=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_SCHED=y +CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +# CONFIG_TIME_NS is not set +# CONFIG_PID_NS is not set +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +CONFIG_BOOT_CONFIG=y +# CONFIG_UID16 is not set +# CONFIG_SYSFS_SYSCALL is not set +# CONFIG_FHANDLE is not set +# 
CONFIG_PCSPKR_PLATFORM is not set +CONFIG_KALLSYMS_ALL=y +CONFIG_USERFAULTFD=y +# CONFIG_RSEQ is not set +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB_MERGE_DEFAULT is not set +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_PROFILING=y +CONFIG_SMP=y +CONFIG_X86_X2APIC=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_NR_CPUS=32 +# CONFIG_X86_MCE is not set +# CONFIG_MICROCODE is not set +# CONFIG_X86_5LEVEL is not set +# CONFIG_MTRR_SANITIZER is not set +CONFIG_EFI=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="console=ttynull stack_depot_disable=on cgroup_disable=pressure bootconfig" +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +# CONFIG_ACPI_AC is not set +# CONFIG_ACPI_BATTERY is not set +# CONFIG_ACPI_FAN is not set +# CONFIG_ACPI_THERMAL is not set +# CONFIG_X86_PM_TIMER is not set +CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_TIMES=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_IA32_EMULATION=y +CONFIG_KVM=y +CONFIG_KVM_INTEL=y +CONFIG_KVM_AMD=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +CONFIG_LTO_CLANG_FULL=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SCMVERSION=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SIG_PROTECT=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +CONFIG_GKI_HACKS_TO_FIX=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=32768 +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_CLEANCACHE=y +CONFIG_CMA=y +CONFIG_CMA_DEBUGFS=y +CONFIG_CMA_SYSFS=y +CONFIG_CMA_AREAS=16 +CONFIG_ZSMALLOC=m +# CONFIG_ZONE_DMA is not set +CONFIG_ANON_VMA_NAME=y +CONFIG_LRU_GEN=y +CONFIG_DAMON=y +CONFIG_DAMON_PADDR=y +CONFIG_DAMON_RECLAIM=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_XFRM_INTERFACE=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_NET_KEY=y +CONFIG_XDP_SOCKETS=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_NET_IPIP=y +CONFIG_NET_IPGRE_DEMUX=y +CONFIG_NET_IPGRE=y +CONFIG_NET_IPVTI=y +CONFIG_INET_ESP=y +CONFIG_INET_UDP_DIAG=y +CONFIG_INET_DIAG_DESTROY=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_VTI=y +CONFIG_IPV6_GRE=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_MROUTE=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_DSCP=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_NOTRACK=y +CONFIG_NETFILTER_XT_TARGET_TEE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y 
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_BPF=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_DSCP=y +CONFIG_NETFILTER_XT_MATCH_ESP=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_L2TP=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y +CONFIG_NETFILTER_XT_MATCH_OWNER=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_NETMAP=y +CONFIG_IP_NF_TARGET_REDIRECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MATCH_RPFILTER=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_TIPC=m +CONFIG_L2TP=m +CONFIG_BRIDGE=y +CONFIG_VLAN_8021Q=m +CONFIG_6LOWPAN=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_SCH_PRIO=y +CONFIG_NET_SCH_MULTIQ=y +CONFIG_NET_SCH_SFQ=y +CONFIG_NET_SCH_TBF=y +CONFIG_NET_SCH_NETEM=y +CONFIG_NET_SCH_CODEL=y +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_FQ=y +CONFIG_NET_SCH_INGRESS=y +CONFIG_NET_CLS_BASIC=y +CONFIG_NET_CLS_TCINDEX=y +CONFIG_NET_CLS_FW=y +CONFIG_NET_CLS_U32=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_FLOW=y +CONFIG_NET_CLS_BPF=y +CONFIG_NET_CLS_MATCHALL=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_CMP=y +CONFIG_NET_EMATCH_NBYTE=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_EMATCH_META=y +CONFIG_NET_EMATCH_TEXT=y +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=y +CONFIG_NET_ACT_GACT=y +CONFIG_NET_ACT_MIRRED=y +CONFIG_NET_ACT_SKBEDIT=y +CONFIG_NET_ACT_BPF=y +CONFIG_VSOCKETS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CAN=m +CONFIG_CAN_VCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_BT=m +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_HIDP=m +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_CFG80211=m +CONFIG_NL80211_TESTMODE=y +# CONFIG_CFG80211_DEFAULT_PS is not set +# CONFIG_CFG80211_CRDA_SUPPORT is not set +CONFIG_MAC80211=m +CONFIG_RFKILL=m +CONFIG_NFC=m +CONFIG_PCI=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCIEAER=y +CONFIG_PCI_MSI=y +CONFIG_PCI_IOV=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCI_ENDPOINT=y +CONFIG_FW_LOADER_USER_HELPER=y +# CONFIG_FW_CACHE is not set +CONFIG_GNSS=y +CONFIG_OF=y +CONFIG_ZRAM=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_LOOP_MIN_COUNT=16 +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_SRAM=y +CONFIG_UID_SYS_STATS=y +CONFIG_SCSI=y +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=y +CONFIG_SCSI_LOGGING=y +CONFIG_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_DEFAULT_KEY=y +CONFIG_DM_SNAPSHOT=y 
+CONFIG_DM_INIT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_BOW=y +CONFIG_NETDEVICES=y +CONFIG_DUMMY=y +CONFIG_WIREGUARD=y +CONFIG_IFB=y +CONFIG_MACSEC=y +CONFIG_TUN=y +CONFIG_VETH=y +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_MPPE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_USB_RTL8150=y +CONFIG_USB_RTL8152=y +CONFIG_USB_USBNET=y +CONFIG_USB_NET_CDC_EEM=y +# CONFIG_USB_NET_NET1080 is not set +# CONFIG_USB_NET_CDC_SUBSET is not set +# CONFIG_USB_NET_ZAURUS is not set +CONFIG_USB_NET_AQC111=y +# CONFIG_WLAN_VENDOR_ADMTEK is not set +# CONFIG_WLAN_VENDOR_ATH is not set +# CONFIG_WLAN_VENDOR_ATMEL is not set +# CONFIG_WLAN_VENDOR_BROADCOM is not set +# CONFIG_WLAN_VENDOR_CISCO is not set +# CONFIG_WLAN_VENDOR_INTEL is not set +# CONFIG_WLAN_VENDOR_INTERSIL is not set +# CONFIG_WLAN_VENDOR_MARVELL is not set +# CONFIG_WLAN_VENDOR_MEDIATEK is not set +# CONFIG_WLAN_VENDOR_RALINK is not set +# CONFIG_WLAN_VENDOR_REALTEK is not set +# CONFIG_WLAN_VENDOR_RSI is not set +# CONFIG_WLAN_VENDOR_ST is not set +# CONFIG_WLAN_VENDOR_TI is not set +# CONFIG_WLAN_VENDOR_ZYDAS is not set +# CONFIG_WLAN_VENDOR_QUANTENNA is not set +CONFIG_INPUT_EVDEV=y +CONFIG_KEYBOARD_GPIO=y +# CONFIG_MOUSE_PS2 is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_UINPUT=y +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_RUNTIME_UARTS=0 +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_SAMSUNG=y +CONFIG_SERIAL_SAMSUNG_CONSOLE=y +CONFIG_NULL_TTY=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_VIA is not set +# CONFIG_DEVMEM is not set +# CONFIG_DEVPORT is not set +CONFIG_HPET=y +# CONFIG_I2C_COMPAT is not set +# CONFIG_I2C_HELPER_AUTO is not set +CONFIG_I3C=y +CONFIG_SPI=y +CONFIG_SPI_MEM=y +CONFIG_GPIOLIB=y +CONFIG_GPIO_GENERIC_PLATFORM=y +# CONFIG_HWMON is not set +CONFIG_THERMAL_NETLINK=y +CONFIG_THERMAL_STATISTICS=y +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_CPU_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +CONFIG_THERMAL_EMULATION=y +# CONFIG_X86_PKG_TEMP_THERMAL is not set +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +CONFIG_MFD_SYSCON=y +CONFIG_REGULATOR=y +CONFIG_REGULATOR_FIXED_VOLTAGE=y +CONFIG_RC_CORE=y +# CONFIG_RC_MAP is not set +CONFIG_LIRC=y +CONFIG_BPF_LIRC_MODE2=y +CONFIG_RC_DECODERS=y +CONFIG_RC_DEVICES=y +CONFIG_MEDIA_CEC_RC=y +# CONFIG_MEDIA_ANALOG_TV_SUPPORT is not set +# CONFIG_MEDIA_DIGITAL_TV_SUPPORT is not set +# CONFIG_MEDIA_RADIO_SUPPORT is not set +# CONFIG_MEDIA_SDR_SUPPORT is not set +# CONFIG_MEDIA_TEST_SUPPORT is not set +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_MEDIA_USB_SUPPORT=y +CONFIG_USB_VIDEO_CLASS=y +CONFIG_USB_GSPCA=y +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_DRM=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_SND_HRTIMER=y +# CONFIG_SND_SUPPORT_OLD_API is not set +# CONFIG_SND_VERBOSE_PROCFS is not set +# CONFIG_SND_DRIVERS is not set +CONFIG_SND_USB_AUDIO=y +CONFIG_SND_SOC=y +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_APPLE=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_ELECOM=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NINTENDO=y +CONFIG_HID_PICOLCD=y 
+CONFIG_HID_PLANTRONICS=y +CONFIG_HID_PLAYSTATION=y +CONFIG_PLAYSTATION_FF=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SONY=y +CONFIG_SONY_FF=y +CONFIG_HID_STEAM=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_ACM=m +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_USB_DWC3=y +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_GADGET=y +CONFIG_USB_CONFIGFS=y +CONFIG_USB_CONFIGFS_UEVENT=y +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_ACC=y +CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_TYPEC=y +CONFIG_TYPEC_TCPM=y +CONFIG_TYPEC_TCPCI=y +CONFIG_TYPEC_UCSI=y +CONFIG_MMC=y +# CONFIG_PWRSEQ_EMMC is not set +# CONFIG_PWRSEQ_SIMPLE is not set +CONFIG_MMC_CRYPTO=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_SCSI_UFSHCD=y +CONFIG_SCSI_UFS_BSG=y +CONFIG_SCSI_UFS_CRYPTO=y +CONFIG_SCSI_UFS_HPB=y +CONFIG_SCSI_UFSHCD_PCI=y +CONFIG_SCSI_UFSHCD_PLATFORM=y +CONFIG_SCSI_UFS_DWC_TC_PLATFORM=y +CONFIG_LEDS_CLASS_FLASH=y +CONFIG_LEDS_TRIGGER_TIMER=y +CONFIG_LEDS_TRIGGER_TRANSIENT=y +CONFIG_EDAC=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +CONFIG_DMABUF_HEAPS=y +CONFIG_DMABUF_SYSFS_STATS=y +CONFIG_DMABUF_HEAPS_DEFERRED_FREE=y +CONFIG_DMABUF_HEAPS_PAGE_POOL=y +CONFIG_UIO=y +# CONFIG_VIRTIO_MEM is not set +CONFIG_VHOST_VSOCK=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_DEBUG_KINFO=y +CONFIG_REMOTEPROC=y +CONFIG_REMOTEPROC_CDEV=y +CONFIG_RPMSG_CHAR=y +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_IIO=y +CONFIG_IIO_BUFFER=y +CONFIG_IIO_TRIGGER=y +CONFIG_POWERCAP=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_BINDERFS=y +CONFIG_ANDROID_VENDOR_HOOKS=y +CONFIG_LIBNVDIMM=y +# CONFIG_ND_BLK is not set +CONFIG_INTERCONNECT=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_F2FS_FS=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_FS_COMPRESSION=y +CONFIG_F2FS_UNFAIR_RWSEM=y +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y +CONFIG_FS_VERITY=y +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +# CONFIG_DNOTIFY is not set +CONFIG_QUOTA=y +CONFIG_QFMT_V2=y +CONFIG_FUSE_FS=y +CONFIG_VIRTIO_FS=y +CONFIG_FUSE_BPF=y +CONFIG_OVERLAY_FS=y +CONFIG_INCREMENTAL_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_EXFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_EFIVAR_FS is not set +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_PMSG=y +CONFIG_PSTORE_RAM=y +CONFIG_EROFS_FS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_CODEPAGE_737=y +CONFIG_NLS_CODEPAGE_775=y +CONFIG_NLS_CODEPAGE_850=y +CONFIG_NLS_CODEPAGE_852=y +CONFIG_NLS_CODEPAGE_855=y +CONFIG_NLS_CODEPAGE_857=y +CONFIG_NLS_CODEPAGE_860=y +CONFIG_NLS_CODEPAGE_861=y +CONFIG_NLS_CODEPAGE_862=y +CONFIG_NLS_CODEPAGE_863=y +CONFIG_NLS_CODEPAGE_864=y +CONFIG_NLS_CODEPAGE_865=y +CONFIG_NLS_CODEPAGE_866=y +CONFIG_NLS_CODEPAGE_869=y +CONFIG_NLS_CODEPAGE_936=y +CONFIG_NLS_CODEPAGE_950=y +CONFIG_NLS_CODEPAGE_932=y +CONFIG_NLS_CODEPAGE_949=y +CONFIG_NLS_CODEPAGE_874=y +CONFIG_NLS_ISO8859_8=y +CONFIG_NLS_CODEPAGE_1250=y +CONFIG_NLS_CODEPAGE_1251=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_ISO8859_2=y +CONFIG_NLS_ISO8859_3=y +CONFIG_NLS_ISO8859_4=y +CONFIG_NLS_ISO8859_5=y 
+CONFIG_NLS_ISO8859_6=y +CONFIG_NLS_ISO8859_7=y +CONFIG_NLS_ISO8859_9=y +CONFIG_NLS_ISO8859_13=y +CONFIG_NLS_ISO8859_14=y +CONFIG_NLS_ISO8859_15=y +CONFIG_NLS_KOI8_R=y +CONFIG_NLS_KOI8_U=y +CONFIG_NLS_MAC_ROMAN=y +CONFIG_NLS_MAC_CELTIC=y +CONFIG_NLS_MAC_CENTEURO=y +CONFIG_NLS_MAC_CROATIAN=y +CONFIG_NLS_MAC_CYRILLIC=y +CONFIG_NLS_MAC_GAELIC=y +CONFIG_NLS_MAC_GREEK=y +CONFIG_NLS_MAC_ICELAND=y +CONFIG_NLS_MAC_INUIT=y +CONFIG_NLS_MAC_ROMANIAN=y +CONFIG_NLS_MAC_TURKISH=y +CONFIG_NLS_UTF8=y +CONFIG_UNICODE=y +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_HARDENED_USERCOPY=y +# CONFIG_HARDENED_USERCOPY_FALLBACK is not set +CONFIG_STATIC_USERMODEHELPER=y +CONFIG_STATIC_USERMODEHELPER_PATH="" +CONFIG_SECURITY_SELINUX=y +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +CONFIG_CRYPTO_ECDH=y +CONFIG_CRYPTO_CCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=y +CONFIG_CRYPTO_ADIANTUM=y +CONFIG_CRYPTO_HCTR2=y +CONFIG_CRYPTO_CMAC=y +CONFIG_CRYPTO_XCBC=y +CONFIG_CRYPTO_BLAKE2B=y +CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_SHA256_SSSE3=y +CONFIG_CRYPTO_SHA512_SSSE3=y +CONFIG_CRYPTO_AES_NI_INTEL=y +CONFIG_CRYPTO_DES=y +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_ZSTD=y +CONFIG_CRYPTO_ANSI_CPRNG=y +CONFIG_CRC_CCITT=y +CONFIG_CRC8=y +CONFIG_XZ_DEC=y +CONFIG_DMA_CMA=y +CONFIG_STACK_HASH_ORDER=12 +CONFIG_PRINTK_TIME=y +CONFIG_DYNAMIC_DEBUG_CORE=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_MODULE_ALLOW_BTF_MISMATCH=y +CONFIG_HEADERS_INSTALL=y +# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_UBSAN=y +CONFIG_UBSAN_TRAP=y +CONFIG_UBSAN_LOCAL_BOUNDS=y +# CONFIG_UBSAN_SHIFT is not set +# CONFIG_UBSAN_BOOL is not set +# CONFIG_UBSAN_ENUM is not set +CONFIG_PAGE_OWNER=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_KFENCE=y +CONFIG_KFENCE_SAMPLE_INTERVAL=500 +CONFIG_KFENCE_NUM_OBJECTS=63 +CONFIG_KFENCE_STATIC_KEYS=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_TIMEOUT=-1 +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_WQ_WATCHDOG=y +CONFIG_SCHEDSTATS=y +CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_HIST_TRIGGERS=y +CONFIG_UNWINDER_FRAME_POINTER=y diff --git a/arch/x86/configs/x86_64_gce.fragment b/arch/x86/configs/x86_64_gce.fragment new file mode 100644 index 000000000000..5a1f1b08b57e --- /dev/null +++ b/arch/x86/configs/x86_64_gce.fragment @@ -0,0 +1,2 @@ +# To boot Linux distributions like Debian +CONFIG_DEVTMPFS=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index fce05e9df56d..5e9ddbdb9aac 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -67,6 +67,9 @@ libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o +obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o +polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o + obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index c799838242a6..4b36b453f3ca 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -66,6 +66,11 @@ #define VMOVDQ vmovdqu +/* + * Note: the "x" prefix in these aliases means "this is an xmm register". The + * alias prefixes have no relation to XCTR where the "X" prefix means "XOR + * counter". 
+ */ #define xdata0 %xmm0 #define xdata1 %xmm1 #define xdata2 %xmm2 @@ -74,8 +79,10 @@ #define xdata5 %xmm5 #define xdata6 %xmm6 #define xdata7 %xmm7 -#define xcounter %xmm8 -#define xbyteswap %xmm9 +#define xcounter %xmm8 // CTR mode only +#define xiv %xmm8 // XCTR mode only +#define xbyteswap %xmm9 // CTR mode only +#define xtmp %xmm9 // XCTR mode only #define xkey0 %xmm10 #define xkey4 %xmm11 #define xkey8 %xmm12 @@ -88,7 +95,7 @@ #define p_keys %rdx #define p_out %rcx #define num_bytes %r8 - +#define counter %r9 // XCTR mode only #define tmp %r10 #define DDQ_DATA 0 #define XDATA 1 @@ -145,7 +152,7 @@ ddq_add_8: * do_aes num_in_par load_keys key_len * This increments p_in, but not p_out */ -.macro do_aes b, k, key_len +.macro do_aes b, k, key_len, xctr .set by, \b .set load_keys, \k .set klen, \key_len @@ -154,29 +161,48 @@ ddq_add_8: vmovdqa 0*16(p_keys), xkey0 .endif - vpshufb xbyteswap, xcounter, xdata0 - - .set i, 1 - .rept (by - 1) - club XDATA, i - vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata - vptest ddq_low_msk(%rip), var_xdata - jnz 1f - vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - vpshufb xbyteswap, var_xdata, var_xdata - .set i, (i +1) - .endr + .if \xctr + movq counter, xtmp + .set i, 0 + .rept (by) + club XDATA, i + vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata + .set i, (i +1) + .endr + .set i, 0 + .rept (by) + club XDATA, i + vpxor xiv, var_xdata, var_xdata + .set i, (i +1) + .endr + .else + vpshufb xbyteswap, xcounter, xdata0 + .set i, 1 + .rept (by - 1) + club XDATA, i + vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata + vptest ddq_low_msk(%rip), var_xdata + jnz 1f + vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + vpshufb xbyteswap, var_xdata, var_xdata + .set i, (i +1) + .endr + .endif vmovdqa 1*16(p_keys), xkeyA vpxor xkey0, xdata0, xdata0 - vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter - vptest ddq_low_msk(%rip), xcounter - jnz 1f - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: + .if \xctr + add $by, counter + .else + vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter + vptest ddq_low_msk(%rip), xcounter + jnz 1f + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + .endif .set i, 1 .rept (by - 1) @@ -414,94 +440,99 @@ ddq_add_8: .endr .endm -.macro do_aes_load val, key_len - do_aes \val, 1, \key_len +.macro do_aes_load val, key_len, xctr + do_aes \val, 1, \key_len, \xctr .endm -.macro do_aes_noload val, key_len - do_aes \val, 0, \key_len +.macro do_aes_noload val, key_len, xctr + do_aes \val, 0, \key_len, \xctr .endm /* main body of aes ctr load */ -.macro do_aes_ctrmain key_len +.macro do_aes_ctrmain key_len, xctr cmp $16, num_bytes - jb .Ldo_return2\key_len + jb .Ldo_return2\xctr\key_len - vmovdqa byteswap_const(%rip), xbyteswap - vmovdqu (p_iv), xcounter - vpshufb xbyteswap, xcounter, xcounter + .if \xctr + shr $4, counter + vmovdqu (p_iv), xiv + .else + vmovdqa byteswap_const(%rip), xbyteswap + vmovdqu (p_iv), xcounter + vpshufb xbyteswap, xcounter, xcounter + .endif mov num_bytes, tmp and $(7*16), tmp - jz .Lmult_of_8_blks\key_len + jz .Lmult_of_8_blks\xctr\key_len /* 1 <= tmp <= 7 */ cmp $(4*16), tmp - jg .Lgt4\key_len - je .Leq4\key_len + jg .Lgt4\xctr\key_len + je .Leq4\xctr\key_len -.Llt4\key_len: +.Llt4\xctr\key_len: cmp $(2*16), tmp - jg .Leq3\key_len - je .Leq2\key_len + jg .Leq3\xctr\key_len + je .Leq2\xctr\key_len -.Leq1\key_len: - do_aes_load 1, \key_len 
+.Leq1\xctr\key_len: + do_aes_load 1, \key_len, \xctr add $(1*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq2\key_len: - do_aes_load 2, \key_len +.Leq2\xctr\key_len: + do_aes_load 2, \key_len, \xctr add $(2*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq3\key_len: - do_aes_load 3, \key_len +.Leq3\xctr\key_len: + do_aes_load 3, \key_len, \xctr add $(3*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq4\key_len: - do_aes_load 4, \key_len +.Leq4\xctr\key_len: + do_aes_load 4, \key_len, \xctr add $(4*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Lgt4\key_len: +.Lgt4\xctr\key_len: cmp $(6*16), tmp - jg .Leq7\key_len - je .Leq6\key_len + jg .Leq7\xctr\key_len + je .Leq6\xctr\key_len -.Leq5\key_len: - do_aes_load 5, \key_len +.Leq5\xctr\key_len: + do_aes_load 5, \key_len, \xctr add $(5*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq6\key_len: - do_aes_load 6, \key_len +.Leq6\xctr\key_len: + do_aes_load 6, \key_len, \xctr add $(6*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq7\key_len: - do_aes_load 7, \key_len +.Leq7\xctr\key_len: + do_aes_load 7, \key_len, \xctr add $(7*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Lmult_of_8_blks\key_len: +.Lmult_of_8_blks\xctr\key_len: .if (\key_len != KEY_128) vmovdqa 0*16(p_keys), xkey0 vmovdqa 4*16(p_keys), xkey4 @@ -514,17 +545,19 @@ ddq_add_8: vmovdqa 9*16(p_keys), xkey12 .endif .align 16 -.Lmain_loop2\key_len: +.Lmain_loop2\xctr\key_len: /* num_bytes is a multiple of 8 and >0 */ - do_aes_noload 8, \key_len + do_aes_noload 8, \key_len, \xctr add $(8*16), p_out sub $(8*16), num_bytes - jne .Lmain_loop2\key_len + jne .Lmain_loop2\xctr\key_len -.Ldo_return2\key_len: - /* return updated IV */ - vpshufb xbyteswap, xcounter, xcounter - vmovdqu xcounter, (p_iv) +.Ldo_return2\xctr\key_len: + .if !\xctr + /* return updated IV */ + vpshufb xbyteswap, xcounter, xcounter + vmovdqu xcounter, (p_iv) + .endif RET .endm @@ -537,7 +570,7 @@ ddq_add_8: */ SYM_FUNC_START(aes_ctr_enc_128_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_128 + do_aes_ctrmain KEY_128 0 SYM_FUNC_END(aes_ctr_enc_128_avx_by8) @@ -550,7 +583,7 @@ SYM_FUNC_END(aes_ctr_enc_128_avx_by8) */ SYM_FUNC_START(aes_ctr_enc_192_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_192 + do_aes_ctrmain KEY_192 0 SYM_FUNC_END(aes_ctr_enc_192_avx_by8) @@ -563,6 +596,45 @@ SYM_FUNC_END(aes_ctr_enc_192_avx_by8) */ SYM_FUNC_START(aes_ctr_enc_256_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_256 + do_aes_ctrmain KEY_256 0 SYM_FUNC_END(aes_ctr_enc_256_avx_by8) + +/* + * routine to do AES128 XCTR enc/decrypt "by8" + * XMM registers are clobbered. 
+ * Saving/restoring must be done at a higher level + * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_128_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_128 1 + +SYM_FUNC_END(aes_xctr_enc_128_avx_by8) + +/* + * routine to do AES192 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_192_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_192 1 + +SYM_FUNC_END(aes_xctr_enc_192_avx_by8) + +/* + * routine to do AES256 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_256_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_256 1 + +SYM_FUNC_END(aes_xctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 41901ba9d3a2..a5b0cb3efeba 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -135,6 +135,20 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); + + +asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + +asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + +asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + /* * asmlinkage void aesni_gcm_init_avx_gen2() * gcm_data *my_ctx_data, context data @@ -527,6 +541,59 @@ static int ctr_crypt(struct skcipher_request *req) return err; } +static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv, + unsigned int byte_ctr) +{ + if (ctx->key_length == AES_KEYSIZE_128) + aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); + else if (ctx->key_length == AES_KEYSIZE_192) + aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); + else + aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); +} + +static int xctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + u8 keystream[AES_BLOCK_SIZE]; + struct skcipher_walk walk; + unsigned int nbytes; + unsigned int byte_ctr = 0; + int err; + __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + kernel_fpu_begin(); + if (nbytes & AES_BLOCK_MASK) + aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, + walk.src.virt.addr, nbytes & AES_BLOCK_MASK, + walk.iv, byte_ctr); + nbytes &= ~AES_BLOCK_MASK; + byte_ctr += walk.nbytes - nbytes; + + if (walk.nbytes == walk.total && nbytes > 0) { + memcpy(block, walk.iv, AES_BLOCK_SIZE); + block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); + 
aesni_enc(ctx, keystream, (u8 *)block); + crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - + nbytes, walk.src.virt.addr + walk.nbytes + - nbytes, keystream, nbytes); + byte_ctr += nbytes; + nbytes = 0; + } + kernel_fpu_end(); + err = skcipher_walk_done(&walk, nbytes); + } + return err; +} + static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { @@ -1050,6 +1117,33 @@ static struct skcipher_alg aesni_skciphers[] = { static struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; +#ifdef CONFIG_X86_64 +/* + * XCTR does not have a non-AVX implementation, so it must be enabled + * conditionally. + */ +static struct skcipher_alg aesni_xctr = { + .base = { + .cra_name = "__xctr(aes)", + .cra_driver_name = "__xctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = xctr_crypt, + .decrypt = xctr_crypt, +}; + +static struct simd_skcipher_alg *aesni_simd_xctr; +#endif /* CONFIG_X86_64 */ + #ifdef CONFIG_X86_64 static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, unsigned int key_len) @@ -1163,7 +1257,7 @@ static int __init aesni_init(void) static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); pr_info("AES CTR mode by8 optimization enabled\n"); } -#endif +#endif /* CONFIG_X86_64 */ err = crypto_register_alg(&aesni_cipher_alg); if (err) @@ -1180,8 +1274,22 @@ static int __init aesni_init(void) if (err) goto unregister_skciphers; +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_AVX)) + err = simd_register_skciphers_compat(&aesni_xctr, 1, + &aesni_simd_xctr); + if (err) + goto unregister_aeads; +#endif /* CONFIG_X86_64 */ + return 0; +#ifdef CONFIG_X86_64 +unregister_aeads: + simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); +#endif /* CONFIG_X86_64 */ + unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); @@ -1197,6 +1305,10 @@ static void __exit aesni_exit(void) simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_alg(&aesni_cipher_alg); +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_AVX)) + simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); +#endif /* CONFIG_X86_64 */ } late_initcall(aesni_init); diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S new file mode 100644 index 000000000000..a6ebe4e7dd2b --- /dev/null +++ b/arch/x86/crypto/polyval-clmulni_asm.S @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + */ +/* + * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI + * instructions. It works on 8 blocks at a time, by precomputing the first 8 + * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation + * allows us to split finite field multiplication into two steps. + * + * In the first step, we consider h^i, m_i as normal polynomials of degree less + * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication + * is simply polynomial multiplication. + * + * In the second step, we compute the reduction of p(x) modulo the finite field + * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. 
+ * + * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where + * multiplication is finite field multiplication. The advantage is that the + * two-step process only requires 1 finite field reduction for every 8 + * polynomial multiplications. Further parallelism is gained by interleaving the + * multiplications and polynomial reductions. + */ + +#include +#include + +#define STRIDE_BLOCKS 8 + +#define GSTAR %xmm7 +#define PL %xmm8 +#define PH %xmm9 +#define TMP_XMM %xmm11 +#define LO %xmm12 +#define HI %xmm13 +#define MI %xmm14 +#define SUM %xmm15 + +#define KEY_POWERS %rdi +#define MSG %rsi +#define BLOCKS_LEFT %rdx +#define ACCUMULATOR %rcx +#define TMP %rax + +.section .rodata.cst16.gstar, "aM", @progbits, 16 +.align 16 + +.Lgstar: + .quad 0xc200000000000000, 0xc200000000000000 + +.text + +/* + * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length + * count pointed to by MSG and KEY_POWERS. + */ +.macro schoolbook1 count + .set i, 0 + .rept (\count) + schoolbook1_iteration i 0 + .set i, (i +1) + .endr +.endm + +/* + * Computes the product of two 128-bit polynomials at the memory locations + * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of + * the 256-bit product into LO, MI, HI. + * + * Given: + * X = [X_1 : X_0] + * Y = [Y_1 : Y_0] + * + * We compute: + * LO += X_0 * Y_0 + * MI += X_0 * Y_1 + X_1 * Y_0 + * HI += X_1 * Y_1 + * + * Later, the 256-bit result can be extracted as: + * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] + * This step is done when computing the polynomial reduction for efficiency + * reasons. + * + * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an + * extra multiplication of SUM and h^8. + */ +.macro schoolbook1_iteration i xor_sum + movups (16*\i)(MSG), %xmm0 + .if (\i == 0 && \xor_sum == 1) + pxor SUM, %xmm0 + .endif + vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2 + vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1 + vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3 + vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4 + vpxor %xmm2, MI, MI + vpxor %xmm1, LO, LO + vpxor %xmm4, HI, HI + vpxor %xmm3, MI, MI +.endm + +/* + * Performs the same computation as schoolbook1_iteration, except we expect the + * arguments to already be loaded into xmm0 and xmm1 and we set the result + * registers LO, MI, and HI directly rather than XOR'ing into them. + */ +.macro schoolbook1_noload + vpclmulqdq $0x01, %xmm0, %xmm1, MI + vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x00, %xmm0, %xmm1, LO + vpclmulqdq $0x11, %xmm0, %xmm1, HI + vpxor %xmm2, MI, MI +.endm + +/* + * Computes the 256-bit polynomial represented by LO, HI, MI. Stores + * the result in PL, PH. + * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] + */ +.macro schoolbook2 + vpslldq $8, MI, PL + vpsrldq $8, MI, PH + pxor LO, PL + pxor HI, PH +.endm + +/* + * Computes the 128-bit reduction of PH : PL. Stores the result in dest. + * + * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = + * x^128 + x^127 + x^126 + x^121 + 1. + * + * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the + * product of two 128-bit polynomials in Montgomery form. We need to reduce it + * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor + * of x^128, this product has two extra factors of x^128. To get it back into + * Montgomery form, we need to remove one of these factors by dividing by x^128. 
+ * + * To accomplish both of these goals, we add multiples of g(x) that cancel out + * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low + * bits are zero, the polynomial division by x^128 can be done by right shifting. + * + * Since the only nonzero term in the low 64 bits of g(x) is the constant term, + * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can + * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + + * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to + * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T + * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. + * + * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits + * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 + * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * + * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : + * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). + * + * So our final computation is: + * T = T_1 : T_0 = g*(x) * P_0 + * V = V_1 : V_0 = g*(x) * (P_1 + T_0) + * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 + * + * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 + * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : + * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. + */ +.macro montgomery_reduction dest + vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x) + pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1 + pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1 + pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 + pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)] + vpxor TMP_XMM, PH, \dest +.endm + +/* + * Compute schoolbook multiplication for 8 blocks + * m_0h^8 + ... + m_7h^1 + * + * If reduce is set, also computes the montgomery reduction of the + * previous full_stride call and XORs with the first message block. + * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. + * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. 
+ */ +.macro full_stride reduce + pxor LO, LO + pxor HI, HI + pxor MI, MI + + schoolbook1_iteration 7 0 + .if \reduce + vpclmulqdq $0x00, PL, GSTAR, TMP_XMM + .endif + + schoolbook1_iteration 6 0 + .if \reduce + pshufd $0b01001110, TMP_XMM, TMP_XMM + .endif + + schoolbook1_iteration 5 0 + .if \reduce + pxor PL, TMP_XMM + .endif + + schoolbook1_iteration 4 0 + .if \reduce + pxor TMP_XMM, PH + .endif + + schoolbook1_iteration 3 0 + .if \reduce + pclmulqdq $0x11, GSTAR, TMP_XMM + .endif + + schoolbook1_iteration 2 0 + .if \reduce + vpxor TMP_XMM, PH, SUM + .endif + + schoolbook1_iteration 1 0 + + schoolbook1_iteration 0 1 + + addq $(8*16), MSG + schoolbook2 +.endm + +/* + * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS + */ +.macro partial_stride + mov BLOCKS_LEFT, TMP + shlq $4, TMP + addq $(16*STRIDE_BLOCKS), KEY_POWERS + subq TMP, KEY_POWERS + + movups (MSG), %xmm0 + pxor SUM, %xmm0 + movaps (KEY_POWERS), %xmm1 + schoolbook1_noload + dec BLOCKS_LEFT + addq $16, MSG + addq $16, KEY_POWERS + + test $4, BLOCKS_LEFT + jz .Lpartial4BlocksDone + schoolbook1 4 + addq $(4*16), MSG + addq $(4*16), KEY_POWERS +.Lpartial4BlocksDone: + test $2, BLOCKS_LEFT + jz .Lpartial2BlocksDone + schoolbook1 2 + addq $(2*16), MSG + addq $(2*16), KEY_POWERS +.Lpartial2BlocksDone: + test $1, BLOCKS_LEFT + jz .LpartialDone + schoolbook1 1 +.LpartialDone: + schoolbook2 + montgomery_reduction SUM +.endm + +/* + * Perform montgomery multiplication in GF(2^128) and store result in op1. + * + * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 + * If op1, op2 are in montgomery form, this computes the montgomery + * form of op1*op2. + * + * void clmul_polyval_mul(u8 *op1, const u8 *op2); + */ +SYM_FUNC_START(clmul_polyval_mul) + FRAME_BEGIN + vmovdqa .Lgstar(%rip), GSTAR + movups (%rdi), %xmm0 + movups (%rsi), %xmm1 + schoolbook1_noload + schoolbook2 + montgomery_reduction SUM + movups SUM, (%rdi) + FRAME_END + RET +SYM_FUNC_END(clmul_polyval_mul) + +/* + * Perform polynomial evaluation as specified by POLYVAL. This computes: + * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} + * where n=nblocks, h is the hash key, and m_i are the message blocks. + * + * rdi - pointer to precomputed key powers h^8 ... h^1 + * rsi - pointer to message blocks + * rdx - number of blocks to hash + * rcx - pointer to the accumulator + * + * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, + * const u8 *in, size_t nblocks, u8 *accumulator); + */ +SYM_FUNC_START(clmul_polyval_update) + FRAME_BEGIN + vmovdqa .Lgstar(%rip), GSTAR + movups (ACCUMULATOR), SUM + subq $STRIDE_BLOCKS, BLOCKS_LEFT + js .LstrideLoopExit + full_stride 0 + subq $STRIDE_BLOCKS, BLOCKS_LEFT + js .LstrideLoopExitReduce +.LstrideLoop: + full_stride 1 + subq $STRIDE_BLOCKS, BLOCKS_LEFT + jns .LstrideLoop +.LstrideLoopExitReduce: + montgomery_reduction SUM +.LstrideLoopExit: + add $STRIDE_BLOCKS, BLOCKS_LEFT + jz .LskipPartial + partial_stride +.LskipPartial: + movups SUM, (ACCUMULATOR) + FRAME_END + RET +SYM_FUNC_END(clmul_polyval_update) diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c new file mode 100644 index 000000000000..8fa58b0f3cb3 --- /dev/null +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Glue code for POLYVAL using PCMULQDQ-NI + * + * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen + * Copyright (c) 2009 Intel Corp. 
+ * Author: Huang Ying + * Copyright 2021 Google LLC + */ + +/* + * Glue code based on ghash-clmulni-intel_glue.c. + * + * This implementation of POLYVAL uses montgomery multiplication + * accelerated by PCLMULQDQ-NI to implement the finite field + * operations. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/polyval.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/cpu_device_id.h> +#include <asm/simd.h> + +#define POLYVAL_ALIGN 16 +#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) +#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) +#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA) +#define NUM_KEY_POWERS 8 + +struct polyval_tfm_ctx { + /* + * These powers must be in the order h^8, ..., h^1. + */ + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR; +}; + +struct polyval_desc_ctx { + u8 buffer[POLYVAL_BLOCK_SIZE]; + u32 bytes; +}; + +asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator); +asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); + +static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) +{ + return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN); +} + +static void internal_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator) +{ + if (likely(crypto_simd_usable())) { + kernel_fpu_begin(); + clmul_polyval_update(keys, in, nblocks, accumulator); + kernel_fpu_end(); + } else { + polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, + nblocks, accumulator); + } +} + +static void internal_polyval_mul(u8 *op1, const u8 *op2) +{ + if (likely(crypto_simd_usable())) { + kernel_fpu_begin(); + clmul_polyval_mul(op1, op2); + kernel_fpu_end(); + } else { + polyval_mul_non4k(op1, op2); + } +} + +static int polyval_x86_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm); + int i; + + if (keylen != POLYVAL_BLOCK_SIZE) + return -EINVAL; + + memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); + + for (i = NUM_KEY_POWERS-2; i >= 0; i--) { + memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); + internal_polyval_mul(tctx->key_powers[i], + tctx->key_powers[i+1]); + } + + return 0; +} + +static int polyval_x86_init(struct shash_desc *desc) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int polyval_x86_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); + u8 *pos; + unsigned int nblocks; + unsigned int n; + + if (dctx->bytes) { + n = min(srclen, dctx->bytes); + pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + while (srclen >= POLYVAL_BLOCK_SIZE) { + /* Allow rescheduling every 4K bytes.
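+		 * (Editorial example, not part of the original patch.) This
+		 * bounds each kernel_fpu_begin()/kernel_fpu_end() section to
+		 * 4096 bytes of input: a 10000-byte update is processed as
+		 * 4096 + 4096 + 1808 bytes, with preemption possible between
+		 * the chunks.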
*/ + nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; + internal_polyval_update(tctx, src, nblocks, dctx->buffer); + srclen -= nblocks * POLYVAL_BLOCK_SIZE; + src += nblocks * POLYVAL_BLOCK_SIZE; + } + + if (srclen) { + dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; + pos = dctx->buffer; + while (srclen--) + *pos++ ^= *src++; + } + + return 0; +} + +static int polyval_x86_final(struct shash_desc *desc, u8 *dst) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); + + if (dctx->bytes) { + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg polyval_alg = { + .digestsize = POLYVAL_DIGEST_SIZE, + .init = polyval_x86_init, + .update = polyval_x86_update, + .final = polyval_x86_final, + .setkey = polyval_x86_setkey, + .descsize = sizeof(struct polyval_desc_ctx), + .base = { + .cra_name = "polyval", + .cra_driver_name = "polyval-clmulni", + .cra_priority = 200, + .cra_blocksize = POLYVAL_BLOCK_SIZE, + .cra_ctxsize = POLYVAL_CTX_SIZE, + .cra_module = THIS_MODULE, + }, +}; + +__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = { + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); + +static int __init polyval_clmulni_mod_init(void) +{ + if (!x86_match_cpu(pcmul_cpu_id)) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_AVX)) + return -ENODEV; + + return crypto_register_shash(&polyval_alg); +} + +static void __exit polyval_clmulni_mod_exit(void) +{ + crypto_unregister_shash(&polyval_alg); +} + +module_init(polyval_clmulni_mod_init); +module_exit(polyval_clmulni_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI"); +MODULE_ALIAS_CRYPTO("polyval"); +MODULE_ALIAS_CRYPTO("polyval-clmulni"); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 81d5e0a1f48c..f2bf5a192ee0 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2762,11 +2762,10 @@ static bool perf_hw_regs(struct pt_regs *regs) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct unwind_state state; unsigned long addr; - if (guest_cbs && guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2866,11 +2865,10 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stack_frame frame; const struct stack_frame __user *fp; - if (guest_cbs && guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2947,21 +2945,19 @@ static unsigned long code_segment_base(struct pt_regs *regs) unsigned long perf_instruction_pointer(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->is_in_guest()) - return guest_cbs->get_guest_ip(); + if (perf_guest_state()) + return perf_guest_get_ip(); return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + unsigned int guest_state = perf_guest_state(); int misc = 0; - if 
(guest_cbs && guest_cbs->is_in_guest()) { - if (guest_cbs->is_user_mode()) + if (guest_state) { + if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f36fda2672e0..9bbb3bc88593 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2788,7 +2788,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_guest_info_callbacks *guest_cbs; int bit; int handled = 0; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); @@ -2855,12 +2854,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; - - guest_cbs = perf_get_guest_cbs(); - if (unlikely(guest_cbs && guest_cbs->is_in_guest() && - guest_cbs->handle_intel_pt_intr)) - guest_cbs->handle_intel_pt_intr(); - else + if (!perf_guest_handle_intel_pt_intr()) intel_pt_interrupt(); } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9e800d4d323c..388c10cb1203 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -780,6 +780,7 @@ struct kvm_vcpu_arch { unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ + u8 handling_intr_from_guest; struct kvm_mtrr mtrr_state; u64 pat; @@ -1516,7 +1517,7 @@ struct kvm_x86_init_ops { int (*disabled_by_bios)(void); int (*check_processor_compatibility)(void); int (*hardware_setup)(void); - bool (*intel_pt_intr_in_guest)(void); + unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; }; @@ -1564,6 +1565,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } +#define kvm_arch_pmi_in_guest(vcpu) \ + ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); void kvm_mmu_vendor_module_exit(void); @@ -1897,8 +1901,6 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); -int kvm_is_in_guest(void); - void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 448cd01eb3ec..01a1763123ff 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { - return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; + return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != + (_KERNPG_TABLE & ~_PAGE_ACCESSED); } static inline unsigned long pages_to_mb(unsigned long npg) @@ -1397,10 +1398,10 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } -#define arch_faults_on_old_pte arch_faults_on_old_pte -static inline bool arch_faults_on_old_pte(void) +#define arch_has_hw_pte_young arch_has_hw_pte_young +static inline bool arch_has_hw_pte_young(void) { - return false; + return true; } #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 
2ff3e600f426..6aef9ee28a39 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -84,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_ISA_DMA_API) += i8237.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 06b53ea940bf..4c7420d77d40 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -78,7 +78,7 @@ void *module_alloc(unsigned long size) MODULES_END, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ac69894eab88..699bf786fbce 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select KVM_MMIO select SCHED_INFO select PERF_EVENTS + select GUEST_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 762b43f0d919..acaebb17ba3f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -164,7 +164,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; if (vpidx >= KVM_MAX_VCPUS) return NULL; @@ -1722,7 +1722,8 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( { struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; - int i, bank, sbank = 0; + int bank, sbank = 0; + unsigned long i; memset(vp_bitmap, 0, KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); @@ -1871,7 +1872,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, .vector = vector }; struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 5a69cce4d72d..0b65a764ed3a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,7 +242,7 @@ static void pit_do_work(struct kthread_work *work) struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); struct kvm *kvm = pit->kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; struct kvm_kpit_state *ps = &pit->pit_state; if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 0b80263d46d8..814064d06016 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -50,7 +50,7 @@ static void pic_unlock(struct kvm_pic *s) { bool wakeup = s->wakeup_needed; struct kvm_vcpu *vcpu; - int i; + unsigned long i; s->wakeup_needed = false; @@ -270,7 +270,8 @@ int kvm_pic_read_irq(struct kvm *kvm) static void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, i; + int irq; + unsigned long i; struct kvm_vcpu *vcpu; u8 edge_irr = s->irr & ~s->elcr; bool found = false; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 4e0f52660842..97413fec8fa3 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -149,7 +149,7 @@ void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; if (RTC_GSI >= IOAPIC_NUM_PINS) return; @@ -184,7 +184,7 
@@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index d5b72a08e566..39ad02d6dc63 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -45,9 +45,9 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map) { - int i, r = -1; + int r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) @@ -320,7 +320,8 @@ int kvm_set_routing_entry(struct kvm *kvm, bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { - int i, r = 0; + int r = 0; + unsigned long i; struct kvm_vcpu *vcpu; if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index c7db2df50a7a..b469f45e3fe4 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -33,7 +33,8 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, { struct kvm_arch *kvm_arch = &kvm->arch; struct kvm_vcpu *vcpu; - int ret = 0, i, nr_unique_valid_roots; + int ret = 0, nr_unique_valid_roots; + unsigned long i; hpa_t root; spin_lock(&kvm_arch->hv_root_tdp_lock); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8ea4658f48ef..362fe9e47666 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -186,7 +186,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; - int i; + unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. 
*/ @@ -1171,8 +1171,8 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_lapic *src = NULL; struct kvm_apic_map *map; struct kvm_vcpu *vcpu; - unsigned long bitmap; - int i, vcpu_idx; + unsigned long bitmap, i; + int vcpu_idx; bool ret; rcu_read_lock(); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4724289c8a7f..e6f118f0feaa 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1699,6 +1699,18 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } +static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + kvm_mod_used_mmu_pages(kvm, +1); + kvm_account_pgtable_pages((void *)sp->spt, +1); +} + +static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + kvm_mod_used_mmu_pages(kvm, -1); + kvm_account_pgtable_pages((void *)sp->spt, -1); +} + static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); @@ -1754,7 +1766,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct */ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); + kvm_account_mmu_page(vcpu->kvm, sp); return sp; } @@ -2385,7 +2397,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, list_add(&sp->link, invalid_list); else list_move(&sp->link, invalid_list); - kvm_mod_used_mmu_pages(kvm, -1); + kvm_unaccount_mmu_page(kvm, sp); } else { /* * Remove the active root from the active page list, the root diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6c2bb60ccd88..87a7de5c8cb6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -267,6 +267,16 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, } } +static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + kvm_account_pgtable_pages((void *)sp->spt, +1); +} + +static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + kvm_account_pgtable_pages((void *)sp->spt, -1); +} + /** * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU * @@ -283,6 +293,7 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, if (account_nx) account_huge_nx_page(kvm, sp); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + tdp_account_mmu_page(kvm, sp); } /** @@ -297,6 +308,7 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, bool shared) { + tdp_unaccount_mmu_page(kvm, sp); if (shared) spin_lock(&kvm->arch.tdp_mmu_pages_lock); else diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 62333f9756a3..b904c21b71d1 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -87,7 +87,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, * woken up. So we should wake it, but this is impossible from * NMI context. Do it from irq work instead. 
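	 * (Editorial note.) kvm_handling_nmi_from_guest() below replaces the
	 * old global kvm_is_in_guest() check with the per-vCPU
	 * handling_intr_from_guest field that kvm_before_interrupt() sets.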
*/ - if (!kvm_is_in_guest()) + if (!kvm_handling_nmi_from_guest(pmc->vcpu)) irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else kvm_make_request(KVM_REQ_PMI, pmc->vcpu); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 3d3f8dfb8045..d891223a8ff4 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -293,7 +293,7 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { bool m = kvm_apic_match_dest(vcpu, source, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index eeedcb3d40e8..814b28843cd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -629,7 +629,8 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_vcpu *vcpu; - int i, ret; + unsigned long i; + int ret; if (!sev_es_guest(kvm)) return -ENOTTY; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c1a758038892..5dab8f853ef1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3873,7 +3873,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); kvm_load_host_xsave_state(vcpu); stgi(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0718658268fe..8988f707fdca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6445,7 +6445,9 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry); static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, unsigned long entry) { - kvm_before_interrupt(vcpu); + bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; + + kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); vmx_do_interrupt_nmi_irqoff(entry); kvm_after_interrupt(vcpu); } @@ -7819,6 +7821,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; +static unsigned int vmx_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* '0' on failure so that the !PT case can use a RET0 static call. 
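+	 * (Editorial note.) "Handled" means the PMI was forwarded to the
+	 * guest; the caller in handle_pmi_common() falls back to the host
+	 * handler only on a zero return:
+	 *
+	 *	if (!perf_guest_handle_intel_pt_intr())
+	 *		intel_pt_interrupt();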
*/ + if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) + return 0; + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; +} + static __init void vmx_setup_user_return_msrs(void) { @@ -7845,6 +7861,8 @@ static __init void vmx_setup_user_return_msrs(void) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } +static struct kvm_x86_init_ops vmx_init_ops __initdata; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; @@ -8008,6 +8026,10 @@ static __init int hardware_setup(void) return -EINVAL; if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + if (pt_mode == PT_MODE_HOST_GUEST) + vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + else + vmx_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -8036,7 +8058,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, - .intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest, + .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fcfa3fedf84f..152f1d4ce07c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2814,10 +2814,9 @@ void kvm_make_mclock_inprogress_request(struct kvm *kvm) static void kvm_gen_update_masterclock(struct kvm *kvm) { #ifdef CONFIG_X86_64 - int i; struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; - unsigned long flags; + unsigned long flags, i; kvm_hv_invalidate_tsc_page(kvm); @@ -3043,7 +3042,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) static void kvmclock_update_fn(struct work_struct *work) { - int i; + unsigned long i; struct delayed_work *dwork = to_delayed_work(work); struct kvm_arch *ka = container_of(dwork, struct kvm_arch, kvmclock_update_work); @@ -5676,7 +5675,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) * VM-Exit. 
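	 * (Editorial note.) As throughout this series, the vCPU index is
	 * widened from int to unsigned long to match kvm_for_each_vcpu(),
	 * whose backing store becomes an xarray indexed by unsigned long.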
*/ struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); @@ -5998,7 +5997,8 @@ long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i, ret = 0; + unsigned long i; + int ret = 0; mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -8371,8 +8371,7 @@ static void kvm_hyperv_tsc_notifier(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int cpu; - unsigned long flags; + unsigned long cpu, flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -8406,7 +8405,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i, send_ipi = 0; + int send_ipi = 0; + unsigned long i; /* * We allow guests to temporarily run on slowing clocks, @@ -8531,57 +8531,12 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); - -int kvm_is_in_guest(void) -{ - return __this_cpu_read(current_vcpu) != NULL; -} - -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (__this_cpu_read(current_vcpu)) - user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (__this_cpu_read(current_vcpu)) - ip = kvm_rip_read(__this_cpu_read(current_vcpu)); - - return ip; -} - -static void kvm_handle_intel_pt_intr(void) -{ - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); - - kvm_make_request(KVM_REQ_PMI, vcpu); - __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, - (unsigned long *)&vcpu->arch.pmu.global_status); -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, - .handle_intel_pt_intr = NULL, -}; - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; + unsigned long i; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -9965,7 +9920,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * interrupts on processors that implement an interrupt shadow, the * stat.exits increment will do nicely. 
*/ - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); local_irq_enable(); ++vcpu->stat.exits; local_irq_disable(); @@ -11270,7 +11225,7 @@ int kvm_arch_hardware_enable(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; int ret; u64 local_tsc; u64 max_tsc = 0; @@ -11380,9 +11335,7 @@ int kvm_arch_hardware_setup(void *opaque) memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); kvm_ops_static_call_update(); - if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) - kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr; - perf_register_guest_info_callbacks(&kvm_guest_cbs); + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; @@ -11411,8 +11364,7 @@ int kvm_arch_hardware_setup(void *opaque) void kvm_arch_hardware_unsetup(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - kvm_guest_cbs.handle_intel_pt_intr = NULL; + kvm_unregister_perf_callbacks(); static_call(kvm_x86_hardware_unsetup)(); } @@ -11527,7 +11479,7 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) static void kvm_free_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; /* @@ -11537,15 +11489,8 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); } - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_destroy(vcpu); - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); + kvm_destroy_vcpus(kvm); } void kvm_arch_sync_events(struct kvm *kvm) @@ -11825,7 +11770,7 @@ out_free: void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; /* * memslots->generation has been incremented. @@ -12045,6 +11990,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return vcpu->arch.preempted_in_kernel; } +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return kvm_rip_read(vcpu); +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cd0c93ec72fa..ebe02dfe0bdb 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -394,18 +394,27 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } -DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); +enum kvm_intr_type { + /* Values are arbitrary, but must be non-zero. 
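+	 * Zero is reserved: vcpu->arch.handling_intr_from_guest == 0 means
+	 * "not handling an interrupt that interrupted the guest", which is
+	 * exactly what kvm_arch_pmi_in_guest() and
+	 * kvm_handling_nmi_from_guest() test.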
*/ + KVM_HANDLING_IRQ = 1, + KVM_HANDLING_NMI, +}; -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) +static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, + enum kvm_intr_type intr) { - __this_cpu_write(current_vcpu, vcpu); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr); } static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { - __this_cpu_write(current_vcpu, NULL); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); } +static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI; +} static inline bool kvm_pat_valid(u64 data) { diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index c6506c6a7092..e02b20aafb6c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -60,6 +60,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += string_32.o + lib-y += memmove_32.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index e565d1c9019e..c69541dbbcc2 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -20,190 +20,3 @@ __visible void *memset(void *s, int c, size_t count) return __memset(s, c, count); } EXPORT_SYMBOL(memset); - -__visible void *memmove(void *dest, const void *src, size_t n) -{ - int d0,d1,d2,d3,d4,d5; - char *ret = dest; - - __asm__ __volatile__( - /* Handle more 16 bytes in loop */ - "cmp $0x10, %0\n\t" - "jb 1f\n\t" - - /* Decide forward/backward copy mode */ - "cmp %2, %1\n\t" - "jb 2f\n\t" - - /* - * movs instruction have many startup latency - * so we handle small size by general register. - */ - "cmp $680, %0\n\t" - "jb 3f\n\t" - /* - * movs instruction is only good for aligned case. - */ - "mov %1, %3\n\t" - "xor %2, %3\n\t" - "and $0xff, %3\n\t" - "jz 4f\n\t" - "3:\n\t" - "sub $0x10, %0\n\t" - - /* - * We gobble 16 bytes forward in each loop. - */ - "3:\n\t" - "sub $0x10, %0\n\t" - "mov 0*4(%1), %3\n\t" - "mov 1*4(%1), %4\n\t" - "mov %3, 0*4(%2)\n\t" - "mov %4, 1*4(%2)\n\t" - "mov 2*4(%1), %3\n\t" - "mov 3*4(%1), %4\n\t" - "mov %3, 2*4(%2)\n\t" - "mov %4, 3*4(%2)\n\t" - "lea 0x10(%1), %1\n\t" - "lea 0x10(%2), %2\n\t" - "jae 3b\n\t" - "add $0x10, %0\n\t" - "jmp 1f\n\t" - - /* - * Handle data forward by movs. - */ - ".p2align 4\n\t" - "4:\n\t" - "mov -4(%1, %0), %3\n\t" - "lea -4(%2, %0), %4\n\t" - "shr $2, %0\n\t" - "rep movsl\n\t" - "mov %3, (%4)\n\t" - "jmp 11f\n\t" - /* - * Handle data backward by movs. - */ - ".p2align 4\n\t" - "6:\n\t" - "mov (%1), %3\n\t" - "mov %2, %4\n\t" - "lea -4(%1, %0), %1\n\t" - "lea -4(%2, %0), %2\n\t" - "shr $2, %0\n\t" - "std\n\t" - "rep movsl\n\t" - "mov %3,(%4)\n\t" - "cld\n\t" - "jmp 11f\n\t" - - /* - * Start to prepare for backward copy. - */ - ".p2align 4\n\t" - "2:\n\t" - "cmp $680, %0\n\t" - "jb 5f\n\t" - "mov %1, %3\n\t" - "xor %2, %3\n\t" - "and $0xff, %3\n\t" - "jz 6b\n\t" - - /* - * Calculate copy position to tail. - */ - "5:\n\t" - "add %0, %1\n\t" - "add %0, %2\n\t" - "sub $0x10, %0\n\t" - - /* - * We gobble 16 bytes backward in each loop. - */ - "7:\n\t" - "sub $0x10, %0\n\t" - - "mov -1*4(%1), %3\n\t" - "mov -2*4(%1), %4\n\t" - "mov %3, -1*4(%2)\n\t" - "mov %4, -2*4(%2)\n\t" - "mov -3*4(%1), %3\n\t" - "mov -4*4(%1), %4\n\t" - "mov %3, -3*4(%2)\n\t" - "mov %4, -4*4(%2)\n\t" - "lea -0x10(%1), %1\n\t" - "lea -0x10(%2), %2\n\t" - "jae 7b\n\t" - /* - * Calculate copy position to head. 
- */ - "add $0x10, %0\n\t" - "sub %0, %1\n\t" - "sub %0, %2\n\t" - - /* - * Move data from 8 bytes to 15 bytes. - */ - ".p2align 4\n\t" - "1:\n\t" - "cmp $8, %0\n\t" - "jb 8f\n\t" - "mov 0*4(%1), %3\n\t" - "mov 1*4(%1), %4\n\t" - "mov -2*4(%1, %0), %5\n\t" - "mov -1*4(%1, %0), %1\n\t" - - "mov %3, 0*4(%2)\n\t" - "mov %4, 1*4(%2)\n\t" - "mov %5, -2*4(%2, %0)\n\t" - "mov %1, -1*4(%2, %0)\n\t" - "jmp 11f\n\t" - - /* - * Move data from 4 bytes to 7 bytes. - */ - ".p2align 4\n\t" - "8:\n\t" - "cmp $4, %0\n\t" - "jb 9f\n\t" - "mov 0*4(%1), %3\n\t" - "mov -1*4(%1, %0), %4\n\t" - "mov %3, 0*4(%2)\n\t" - "mov %4, -1*4(%2, %0)\n\t" - "jmp 11f\n\t" - - /* - * Move data from 2 bytes to 3 bytes. - */ - ".p2align 4\n\t" - "9:\n\t" - "cmp $2, %0\n\t" - "jb 10f\n\t" - "movw 0*2(%1), %%dx\n\t" - "movw -1*2(%1, %0), %%bx\n\t" - "movw %%dx, 0*2(%2)\n\t" - "movw %%bx, -1*2(%2, %0)\n\t" - "jmp 11f\n\t" - - /* - * Move data for 1 byte. - */ - ".p2align 4\n\t" - "10:\n\t" - "cmp $1, %0\n\t" - "jb 11f\n\t" - "movb (%1), %%cl\n\t" - "movb %%cl, (%2)\n\t" - ".p2align 4\n\t" - "11:" - : "=&c" (d0), "=&S" (d1), "=&D" (d2), - "=r" (d3),"=r" (d4), "=r"(d5) - :"0" (n), - "1" (src), - "2" (dest) - :"memory"); - - return ret; - -} -EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S new file mode 100644 index 000000000000..0588b2c0fc95 --- /dev/null +++ b/arch/x86/lib/memmove_32.S @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include + +SYM_FUNC_START(memmove) +/* + * void *memmove(void *dest_in, const void *src_in, size_t n) + * -mregparm=3 passes these in registers: + * dest_in: %eax + * src_in: %edx + * n: %ecx + * See also: arch/x86/entry/calling.h for description of the calling convention. + * + * n can remain in %ecx, but for `rep movsl`, we'll need dest in %edi and src + * in %esi. + */ +.set dest_in, %eax +.set dest, %edi +.set src_in, %edx +.set src, %esi +.set n, %ecx +.set tmp0, %edx +.set tmp0w, %dx +.set tmp1, %ebx +.set tmp1w, %bx +.set tmp2, %eax +.set tmp3b, %cl + +/* + * Save all callee-saved registers, because this function is going to clobber + * all of them: + */ + pushl %ebp + movl %esp, %ebp // set standard frame pointer + + pushl %ebx + pushl %edi + pushl %esi + pushl %eax // save 'dest_in' parameter [eax] as the return value + + movl src_in, src + movl dest_in, dest + + /* Handle more 16 bytes in loop */ + cmpl $0x10, n + jb .Lmove_16B + + /* Decide forward/backward copy mode */ + cmpl dest, src + jb .Lbackwards_header + + /* + * movs instruction have many startup latency + * so we handle small size by general register. + */ + cmpl $680, n + jb .Ltoo_small_forwards + /* movs instruction is only good for aligned case. */ + movl src, tmp0 + xorl dest, tmp0 + andl $0xff, tmp0 + jz .Lforward_movs +.Ltoo_small_forwards: + subl $0x10, n + + /* We gobble 16 bytes forward in each loop. */ +.Lmove_16B_forwards_loop: + subl $0x10, n + movl 0*4(src), tmp0 + movl 1*4(src), tmp1 + movl tmp0, 0*4(dest) + movl tmp1, 1*4(dest) + movl 2*4(src), tmp0 + movl 3*4(src), tmp1 + movl tmp0, 2*4(dest) + movl tmp1, 3*4(dest) + leal 0x10(src), src + leal 0x10(dest), dest + jae .Lmove_16B_forwards_loop + addl $0x10, n + jmp .Lmove_16B + + /* Handle data forward by movs. */ +.p2align 4 +.Lforward_movs: + movl -4(src, n), tmp0 + leal -4(dest, n), tmp1 + shrl $2, n + rep movsl + movl tmp0, (tmp1) + jmp .Ldone + + /* Handle data backward by movs. 
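+ * (Editorial note.) std/cld flip the string direction flag around the
+ * backward "rep movsl", which only moves n/4 dwords; the first dword is
+ * therefore saved in tmp0 up front and stored last, covering the n%4
+ * head bytes.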
*/ +.p2align 4 +.Lbackwards_movs: + movl (src), tmp0 + movl dest, tmp1 + leal -4(src, n), src + leal -4(dest, n), dest + shrl $2, n + std + rep movsl + movl tmp0,(tmp1) + cld + jmp .Ldone + + /* Start to prepare for backward copy. */ +.p2align 4 +.Lbackwards_header: + cmpl $680, n + jb .Ltoo_small_backwards + movl src, tmp0 + xorl dest, tmp0 + andl $0xff, tmp0 + jz .Lbackwards_movs + + /* Calculate copy position to tail. */ +.Ltoo_small_backwards: + addl n, src + addl n, dest + subl $0x10, n + + /* We gobble 16 bytes backward in each loop. */ +.Lmove_16B_backwards_loop: + subl $0x10, n + + movl -1*4(src), tmp0 + movl -2*4(src), tmp1 + movl tmp0, -1*4(dest) + movl tmp1, -2*4(dest) + movl -3*4(src), tmp0 + movl -4*4(src), tmp1 + movl tmp0, -3*4(dest) + movl tmp1, -4*4(dest) + leal -0x10(src), src + leal -0x10(dest), dest + jae .Lmove_16B_backwards_loop + /* Calculate copy position to head. */ + addl $0x10, n + subl n, src + subl n, dest + + /* Move data from 8 bytes to 15 bytes. */ +.p2align 4 +.Lmove_16B: + cmpl $8, n + jb .Lmove_8B + movl 0*4(src), tmp0 + movl 1*4(src), tmp1 + movl -2*4(src, n), tmp2 + movl -1*4(src, n), src + + movl tmp0, 0*4(dest) + movl tmp1, 1*4(dest) + movl tmp2, -2*4(dest, n) + movl src, -1*4(dest, n) + jmp .Ldone + + /* Move data from 4 bytes to 7 bytes. */ +.p2align 4 +.Lmove_8B: + cmpl $4, n + jb .Lmove_4B + movl 0*4(src), tmp0 + movl -1*4(src, n), tmp1 + movl tmp0, 0*4(dest) + movl tmp1, -1*4(dest, n) + jmp .Ldone + + /* Move data from 2 bytes to 3 bytes. */ +.p2align 4 +.Lmove_4B: + cmpl $2, n + jb .Lmove_1B + movw 0*2(src), tmp0w + movw -1*2(src, n), tmp1w + movw tmp0w, 0*2(dest) + movw tmp1w, -1*2(dest, n) + jmp .Ldone + + /* Move data for 1 byte. */ +.p2align 4 +.Lmove_1B: + cmpl $1, n + jb .Ldone + movb (src), tmp3b + movb tmp3b, (dest) +.p2align 4 +.Ldone: + popl dest_in // restore 'dest_in' [eax] as the return value + /* Restore all callee-saved registers: */ + popl %esi + popl %edi + popl %ebx + popl %ebp + + RET +SYM_FUNC_END(memmove) +EXPORT_SYMBOL(memmove) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4bfed53e210e..83e07cbaa95a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1226,6 +1226,9 @@ void do_user_addr_fault(struct pt_regs *regs, struct mm_struct *mm; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + unsigned long seq; +#endif tsk = current; mm = tsk->mm; @@ -1323,6 +1326,63 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + /* + * No need to try speculative faults for kernel or + * single threaded user space. 
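+	 * (Editorial note.) With mm_users == 1, no other thread can change
+	 * the address space concurrently, so the regular mmap_lock path is
+	 * uncontended and speculation would only add overhead.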
+ */ + if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) + goto no_spf; + + count_vm_event(SPF_ATTEMPT); + seq = mmap_seq_read_start(mm); + if (seq & 1) { + count_vm_spf_event(SPF_ABORT_ODD); + goto spf_abort; + } + vma = get_vma(mm, address); + if (!vma) { + count_vm_spf_event(SPF_ABORT_UNMAPPED); + goto spf_abort; + } + + if (!vma_can_speculate(vma, flags)) { + put_vma(vma); + count_vm_spf_event(SPF_ABORT_NO_SPECULATE); + goto spf_abort; + } + + if (!mmap_seq_read_check(mm, seq, SPF_ABORT_VMA_COPY)) { + put_vma(vma); + goto spf_abort; + } + if (unlikely(access_error(error_code, vma))) { + put_vma(vma); + count_vm_spf_event(SPF_ABORT_ACCESS_ERROR); + goto spf_abort; + } + fault = do_handle_mm_fault(vma, address, + flags | FAULT_FLAG_SPECULATIVE, seq, regs); + put_vma(vma); + + if (!(fault & VM_FAULT_RETRY)) + goto done; + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); + return; + } + +spf_abort: + count_vm_event(SPF_ABORT); +no_spf: + +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + /* * Kernel-mode access to the user address space should only occur * on well-defined single instructions listed in the exception @@ -1420,6 +1480,9 @@ good_area: } mmap_read_unlock(mm); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +done: +#endif if (likely(!(fault & VM_FAULT_ERROR))) return; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0e3667e529ab..851c7d96f4ae 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped; static bool __initdata can_use_brk_pgt = true; +/* + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via + * CONFIG_ZONE_DMA32. + */ +static bool disable_dma32 __ro_after_init; + /* * Pages returned are already directly mapped. * @@ -1024,7 +1030,7 @@ void __init zone_sizes_init(void) max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 
0 : min(MAX_DMA32_PFN, max_low_pfn); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM @@ -1034,6 +1040,18 @@ void __init zone_sizes_init(void) free_area_init(max_zone_pfns); } +static int __init early_disable_dma32(char *buf) +{ + if (!buf) + return -EINVAL; + + if (!strcmp(buf, "on")) + disable_dma32 = true; + + return 0; +} +early_param("disable_dma32", early_disable_dma32); + __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3481b35cb4ec..a224193d84bf 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, return ret; } +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index a8591ec8ae68..470a01f269e7 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -28,7 +28,8 @@ else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \ + ../lib/memmove_64.o ../lib/memset_64.o subarch-$(CONFIG_PREEMPTION) += ../entry/thunk_64.o endif diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 5ca366e15c76..6fbe97c52c99 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -3,6 +3,9 @@ # Building vDSO images for x86. # +# do not instrument on vdso because KASAN is not compatible with user mode +KASAN_SANITIZE := n + # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 6bcd3d8ca6ac..85246dd9faa1 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,6 +23,7 @@ config XEN_PV select PARAVIRT_XXL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU + select GUEST_PERF_EVENTS help Support running as a Xen PV guest. 
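[Editorial sketch, not part of the patch.] The xen/pmu.c hunk that follows converts Xen's PMU callbacks from the old ->is_in_guest()/->is_user_mode() hooks to the state-based perf_guest_info_callbacks interface used throughout this series. For orientation, a minimal registration under the new interface looks roughly like this (all demo_* names are hypothetical):

	static unsigned int demo_guest_state(void)
	{
		unsigned int state = 0;

		if (demo_vcpu_loaded())		/* hypothetical helper */
			state |= PERF_GUEST_ACTIVE;
		if (demo_vcpu_in_user_mode())	/* hypothetical helper */
			state |= PERF_GUEST_USER;
		return state;
	}

	static struct perf_guest_info_callbacks demo_cbs = {
		.state	= demo_guest_state,
		.get_ip	= demo_guest_get_ip,	/* guest instruction pointer */
	};

	/* registered once at init: */
	perf_register_guest_info_callbacks(&demo_cbs);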
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index d7249f4c90f1..21ecbe754cb2 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -413,34 +413,29 @@ int pmu_apic_update(uint32_t val) } /* perf callbacks */ -static int xen_is_in_guest(void) +static unsigned int xen_guest_state(void) { const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + unsigned int state = 0; if (!xenpmu_data) { pr_warn_once("%s: pmudata not initialized\n", __func__); - return 0; + return state; } if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) - return 0; + return state; - return 1; -} + state |= PERF_GUEST_ACTIVE; -static int xen_is_user_mode(void) -{ - const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); - - if (!xenpmu_data) { - pr_warn_once("%s: pmudata not initialized\n", __func__); - return 0; + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) { + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER) + state |= PERF_GUEST_USER; + } else if (xenpmu_data->pmu.r.regs.cpl & 3) { + state |= PERF_GUEST_USER; } - if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) - return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); - else - return !!(xenpmu_data->pmu.r.regs.cpl & 3); + return state; } static unsigned long xen_get_guest_ip(void) @@ -456,9 +451,8 @@ static unsigned long xen_get_guest_ip(void) } static struct perf_guest_info_callbacks xen_guest_cbs = { - .is_in_guest = xen_is_in_guest, - .is_user_mode = xen_is_user_mode, - .get_guest_ip = xen_get_guest_ip, + .state = xen_guest_state, + .get_ip = xen_get_guest_ip, }; /* Convert registers from Xen's format to Linux' */ diff --git a/block/Makefile b/block/Makefile index 74df168729ec..0b917c8e5e63 100644 --- a/block/Makefile +++ b/block/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ + blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/OWNERS b/block/OWNERS new file mode 100644 index 000000000000..2641e066350b --- /dev/null +++ b/block/OWNERS @@ -0,0 +1,2 @@ +bvanassche@google.com +jaegeuk@google.com diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 85120d7b5cf0..bc7aa87a32f2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -132,6 +132,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" +#include "blk-stat.h" #include "blk-wbt.h" #define BFQ_BFQQ_FNS(name) \ @@ -7050,6 +7051,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); wbt_disable_default(q); + blk_stat_enable_accounting(q); + return 0; out_free: diff --git a/block/bio.c b/block/bio.c index ba9120d4fe49..5bdef489711f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -273,6 +273,9 @@ void bio_init(struct bio *bio, struct bio_vec *table, #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION bio->bi_crypt_context = NULL; +#if IS_ENABLED(CONFIG_DM_DEFAULT_KEY) + bio->bi_skip_dm_default_key = false; +#endif #endif #ifdef CONFIG_BLK_DEV_INTEGRITY bio->bi_integrity = NULL; diff --git a/block/blk-core.c b/block/blk-core.c index ed6271dcc1b1..ee1d9b6a80d8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -49,7 +49,9 @@ #include "blk-mq.h" #include 
"blk-mq-sched.h" #include "blk-pm.h" +#ifndef __GENKSYMS__ #include "blk-rq-qos.h" +#endif struct dentry *blk_debugfs_root; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c322176a1e09..06b6f778504b 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -12,9 +12,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -72,12 +72,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool; static DEFINE_MUTEX(tfms_init_lock); static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; -static struct blk_crypto_keyslot { +static struct blk_crypto_fallback_keyslot { enum blk_crypto_mode_num crypto_mode; struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; } *blk_crypto_keyslots; -static struct blk_keyslot_manager blk_crypto_ksm; +static struct blk_crypto_profile blk_crypto_fallback_profile; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; static struct bio_set crypto_bio_split; @@ -86,11 +86,11 @@ static struct bio_set crypto_bio_split; * This is the key we set when evicting a keyslot. This *should* be the all 0's * key, but AES-XTS rejects that key, so we use some random bytes instead. */ -static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; +static u8 blank_key[BLK_CRYPTO_MAX_STANDARD_KEY_SIZE]; -static void blk_crypto_evict_keyslot(unsigned int slot) +static void blk_crypto_fallback_evict_keyslot(unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; int err; @@ -103,45 +103,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot) slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; } -static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int +blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; const enum blk_crypto_mode_num crypto_mode = key->crypto_cfg.crypto_mode; int err; if (crypto_mode != slotp->crypto_mode && slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, key->size); if (err) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return err; } return 0; } -static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return 0; } -/* - * The crypto API fallback KSM ops - only used for a bio when it specifies a - * blk_crypto_key that was not supported by the device's inline encryption - * hardware. 
- */ -static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = { - .keyslot_program = blk_crypto_keyslot_program, - .keyslot_evict = blk_crypto_keyslot_evict, +static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = { + .keyslot_program = blk_crypto_fallback_keyslot_program, + .keyslot_evict = blk_crypto_fallback_keyslot_evict, }; static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) @@ -159,7 +155,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) bio_endio(src_bio); } -static struct bio *blk_crypto_clone_bio(struct bio *bio_src) +static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) { struct bvec_iter iter; struct bio_vec bv; @@ -183,16 +179,19 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) bio_clone_blkg_association(bio, bio_src); blkcg_bio_issue_init(bio); + bio_clone_skip_dm_default_key(bio, bio_src); + return bio; } -static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, - struct skcipher_request **ciph_req_ret, - struct crypto_wait *wait) +static bool +blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot, + struct skcipher_request **ciph_req_ret, + struct crypto_wait *wait) { struct skcipher_request *ciph_req; - const struct blk_crypto_keyslot *slotp; - int keyslot_idx = blk_ksm_get_slot_idx(slot); + const struct blk_crypto_fallback_keyslot *slotp; + int keyslot_idx = blk_crypto_keyslot_index(slot); slotp = &blk_crypto_keyslots[keyslot_idx]; ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], @@ -209,7 +208,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, return true; } -static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) +static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; unsigned int i = 0; @@ -264,7 +263,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) { struct bio *src_bio, *enc_bio; struct bio_crypt_ctx *bc; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; int data_unit_size; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); @@ -276,7 +275,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) blk_status_t blk_st; /* Split the bio if it's too big for single page bvec */ - if (!blk_crypto_split_bio_if_needed(bio_ptr)) + if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr)) return false; src_bio = *bio_ptr; @@ -284,24 +283,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; /* Allocate bounce bio for encryption */ - enc_bio = blk_crypto_clone_bio(src_bio); + enc_bio = blk_crypto_fallback_clone_bio(src_bio); if (!enc_bio) { src_bio->bi_status = BLK_STS_RESOURCE; return false; } /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. 
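+	 * (Editorial note.) Getting a keyslot may block until one becomes
+	 * idle, which is fine here: encryption runs in the submitter's
+	 * process context, and decryption is deferred to a workqueue.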
*/ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { src_bio->bi_status = blk_st; goto out_put_enc_bio; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { src_bio->bi_status = BLK_STS_RESOURCE; goto out_release_keyslot; } @@ -362,7 +362,7 @@ out_free_bounce_pages: out_free_ciph_req: skcipher_request_free(ciph_req); out_release_keyslot: - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_put_enc_bio: if (enc_bio) bio_put(enc_bio); @@ -380,7 +380,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) container_of(work, struct bio_fallback_crypt_ctx, work); struct bio *bio = f_ctx->bio; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; @@ -393,17 +393,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) blk_status_t blk_st; /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. */ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { bio->bi_status = blk_st; goto out_no_keyslot; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { bio->bi_status = BLK_STS_RESOURCE; goto out; } @@ -434,7 +435,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) out: skcipher_request_free(ciph_req); - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_no_keyslot: mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); bio_endio(bio); @@ -473,9 +474,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio) * @bio_ptr: pointer to the bio to prepare * * If bio is doing a WRITE operation, this splits the bio into two parts if it's - * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio - * for the first part, encrypts it, and update bio_ptr to point to the bounce - * bio. + * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a + * bounce bio for the first part, encrypts it, and updates bio_ptr to point to + * the bounce bio. * * For a READ operation, we mark the bio for decryption by using bi_private and * bi_end_io. 
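 * (Editorial note.) The decryption itself then runs from
 * blk_crypto_fallback_decrypt_bio() on the blk_crypto_wq workqueue, since
 * the bio completion path may execute in atomic context.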
@@ -499,8 +500,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) return false; } - if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, - &bc->bc_key->crypto_cfg)) { + if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile, + &bc->bc_key->crypto_cfg)) { bio->bi_status = BLK_STS_NOTSUPP; return false; } @@ -526,7 +527,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { - return blk_ksm_evict_key(&blk_crypto_ksm, key); + return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key); } static bool blk_crypto_fallback_inited; @@ -534,34 +535,36 @@ static int blk_crypto_fallback_init(void) { int i; int err; + struct blk_crypto_profile *profile = &blk_crypto_fallback_profile; if (blk_crypto_fallback_inited) return 0; - prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + prandom_bytes(blank_key, sizeof(blank_key)); err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) goto out; - err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots); if (err) goto fail_free_bioset; err = -ENOMEM; - blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; - blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + profile->ll_ops = blk_crypto_fallback_ll_ops; + profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_STANDARD; /* All blk-crypto modes have a crypto API fallback. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) - blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; - blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + profile->modes_supported[i] = 0xFFFFFFFF; + profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; blk_crypto_wq = alloc_workqueue("blk_crypto_wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, num_online_cpus()); if (!blk_crypto_wq) - goto fail_free_ksm; + goto fail_destroy_profile; blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, sizeof(blk_crypto_keyslots[0]), @@ -595,8 +598,8 @@ fail_free_keyslots: kfree(blk_crypto_keyslots); fail_free_wq: destroy_workqueue(blk_crypto_wq); -fail_free_ksm: - blk_ksm_destroy(&blk_crypto_ksm); +fail_destroy_profile: + blk_crypto_profile_destroy(profile); fail_free_bioset: bioset_exit(&crypto_bio_split); out: @@ -610,7 +613,7 @@ out: int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; - struct blk_crypto_keyslot *slotp; + struct blk_crypto_fallback_keyslot *slotp; unsigned int i; int err = 0; diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 0d36aae538d7..30e8393484a3 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -11,8 +11,10 @@ /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { + const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ + unsigned int security_strength; /* security strength in bytes */ unsigned int ivsize; /* iv size in bytes */ }; @@ -20,6 +22,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION +int blk_crypto_sysfs_register(struct request_queue *q); + +void blk_crypto_sysfs_unregister(struct request_queue *q); + void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); @@ -60,8 +66,27 @@ static inline bool 
blk_crypto_rq_is_encrypted(struct request *rq)
 	return rq->crypt_ctx;
 }
 
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+				    const struct blk_crypto_key *key,
+				    struct blk_crypto_keyslot **slot_ptr);
+
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
+
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+			   const struct blk_crypto_key *key);
+
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+				const struct blk_crypto_config *cfg);
+
 #else /* CONFIG_BLK_INLINE_ENCRYPTION */
+static inline int blk_crypto_sysfs_register(struct request_queue *q)
+{
+	return 0;
+}
+
+static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
+
 static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
 					       struct bio *bio)
 {
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
new file mode 100644
index 000000000000..d778a996827f
--- /dev/null
+++ b/block/blk-crypto-profile.c
@@ -0,0 +1,607 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/**
+ * DOC: blk-crypto profiles
+ *
+ * 'struct blk_crypto_profile' contains all generic inline encryption-related
+ * state for a particular inline encryption device.  blk_crypto_profile serves
+ * as the way that drivers for inline encryption hardware expose their crypto
+ * capabilities and certain functions (e.g., functions to program and evict
+ * keys) to upper layers.  Device drivers that want to support inline encryption
+ * construct a crypto profile, then associate it with the disk's request_queue.
+ *
+ * If the device has keyslots, then its blk_crypto_profile also handles managing
+ * these keyslots in a device-independent way, using the driver-provided
+ * functions to program and evict keys as needed.  This includes keeping track
+ * of which key and how many I/O requests are using each keyslot, getting
+ * keyslots for I/O requests, and handling key eviction requests.
+ *
+ * For more information, see Documentation/block/inline-encryption.rst.
+ */
+
+#define pr_fmt(fmt) "blk-crypto: " fmt
+
+#include <linux/blk-crypto-profile.h>
+#include <linux/device.h>
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/pm_runtime.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include "blk-crypto-internal.h"
+
+struct blk_crypto_keyslot {
+	atomic_t slot_refs;
+	struct list_head idle_slot_node;
+	struct hlist_node hash_node;
+	const struct blk_crypto_key *key;
+	struct blk_crypto_profile *profile;
+};
+
+static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
+{
+	/*
+	 * Calling into the driver requires profile->lock held and the device
+	 * resumed.  But we must resume the device first, since that can acquire
+	 * and release profile->lock via blk_crypto_reprogram_all_keys().
+	 */
+	if (profile->dev)
+		pm_runtime_get_sync(profile->dev);
+	down_write(&profile->lock);
+}
+
+static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile)
+{
+	up_write(&profile->lock);
+	if (profile->dev)
+		pm_runtime_put_sync(profile->dev);
+}
+
+/**
+ * blk_crypto_profile_init() - Initialize a blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Storage drivers must call this when starting to set up a blk_crypto_profile,
+ * before filling in additional fields.
+ *
+ * Return: 0 on success, or else a negative error code.
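+ *
+ * Purely illustrative sketch (not part of this patch): a driver for hardware
+ * with 32 keyslots might set up its profile roughly like this, where my_dev
+ * and my_ll_ops are hypothetical driver-defined names:
+ *
+ *	err = blk_crypto_profile_init(&my_dev->crypto_profile, 32);
+ *	if (err)
+ *		return err;
+ *	my_dev->crypto_profile.ll_ops = my_ll_ops;
+ *	my_dev->crypto_profile.dev = my_dev->dev;
+ *	my_dev->crypto_profile.max_dun_bytes_supported = 8;
+ *	my_dev->crypto_profile.key_types_supported = BLK_CRYPTO_KEY_TYPE_STANDARD;
+ *	my_dev->crypto_profile.modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] =
+ *		4096;	(a bitmask of supported data unit sizes)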
+ */ +int blk_crypto_profile_init(struct blk_crypto_profile *profile, + unsigned int num_slots) +{ + unsigned int slot; + unsigned int i; + unsigned int slot_hashtable_size; + + memset(profile, 0, sizeof(*profile)); + init_rwsem(&profile->lock); + + if (num_slots == 0) + return 0; + + /* Initialize keyslot management data. */ + + profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]), + GFP_KERNEL); + if (!profile->slots) + return -ENOMEM; + + profile->num_slots = num_slots; + + init_waitqueue_head(&profile->idle_slots_wait_queue); + INIT_LIST_HEAD(&profile->idle_slots); + + for (slot = 0; slot < num_slots; slot++) { + profile->slots[slot].profile = profile; + list_add_tail(&profile->slots[slot].idle_slot_node, + &profile->idle_slots); + } + + spin_lock_init(&profile->idle_slots_lock); + + slot_hashtable_size = roundup_pow_of_two(num_slots); + /* + * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2 + * buckets. This only makes a difference when there is only 1 keyslot. + */ + if (slot_hashtable_size < 2) + slot_hashtable_size = 2; + + profile->log_slot_ht_size = ilog2(slot_hashtable_size); + profile->slot_hashtable = + kvmalloc_array(slot_hashtable_size, + sizeof(profile->slot_hashtable[0]), GFP_KERNEL); + if (!profile->slot_hashtable) + goto err_destroy; + for (i = 0; i < slot_hashtable_size; i++) + INIT_HLIST_HEAD(&profile->slot_hashtable[i]); + + return 0; + +err_destroy: + blk_crypto_profile_destroy(profile); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_crypto_profile_init); + +static void blk_crypto_profile_destroy_callback(void *profile) +{ + blk_crypto_profile_destroy(profile); +} + +/** + * devm_blk_crypto_profile_init() - Resource-managed blk_crypto_profile_init() + * @dev: the device which owns the blk_crypto_profile + * @profile: the blk_crypto_profile to initialize + * @num_slots: the number of keyslots + * + * Like blk_crypto_profile_init(), but causes blk_crypto_profile_destroy() to be + * called automatically on driver detach. + * + * Return: 0 on success, or else a negative error code. 
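+ *
+ * Minimal sketch of the devm variant (hypothetical probe context; my_dev is
+ * assumed to be a driver-private structure):
+ *
+ *	err = devm_blk_crypto_profile_init(dev, &my_dev->crypto_profile, 32);
+ *	if (err)
+ *		return err;
+ *
+ * No explicit blk_crypto_profile_destroy() call is then needed on detach.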
+ */ +int devm_blk_crypto_profile_init(struct device *dev, + struct blk_crypto_profile *profile, + unsigned int num_slots) +{ + int err = blk_crypto_profile_init(profile, num_slots); + + if (err) + return err; + + return devm_add_action_or_reset(dev, + blk_crypto_profile_destroy_callback, + profile); +} +EXPORT_SYMBOL_GPL(devm_blk_crypto_profile_init); + +static inline struct hlist_head * +blk_crypto_hash_bucket_for_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + return &profile->slot_hashtable[ + hash_ptr(key, profile->log_slot_ht_size)]; +} + +static void +blk_crypto_remove_slot_from_lru_list(struct blk_crypto_keyslot *slot) +{ + struct blk_crypto_profile *profile = slot->profile; + unsigned long flags; + + spin_lock_irqsave(&profile->idle_slots_lock, flags); + list_del(&slot->idle_slot_node); + spin_unlock_irqrestore(&profile->idle_slots_lock, flags); +} + +static struct blk_crypto_keyslot * +blk_crypto_find_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + const struct hlist_head *head = + blk_crypto_hash_bucket_for_key(profile, key); + struct blk_crypto_keyslot *slotp; + + hlist_for_each_entry(slotp, head, hash_node) { + if (slotp->key == key) + return slotp; + } + return NULL; +} + +static struct blk_crypto_keyslot * +blk_crypto_find_and_grab_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + struct blk_crypto_keyslot *slot; + + slot = blk_crypto_find_keyslot(profile, key); + if (!slot) + return NULL; + if (atomic_inc_return(&slot->slot_refs) == 1) { + /* Took first reference to this slot; remove it from LRU list */ + blk_crypto_remove_slot_from_lru_list(slot); + } + return slot; +} + +/** + * blk_crypto_keyslot_index() - Get the index of a keyslot + * @slot: a keyslot that blk_crypto_get_keyslot() returned + * + * Return: the 0-based index of the keyslot within the device's keyslots. + */ +unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot) +{ + return slot - slot->profile->slots; +} +EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index); + +/** + * blk_crypto_get_keyslot() - Get a keyslot for a key, if needed. + * @profile: the crypto profile of the device the key will be used on + * @key: the key that will be used + * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct + * will be stored here; otherwise NULL will be stored here. + * + * If the device has keyslots, this gets a keyslot that's been programmed with + * the specified key. If the key is already in a slot, this reuses it; + * otherwise this waits for a slot to become idle and programs the key into it. + * + * This must be paired with a call to blk_crypto_put_keyslot(). + * + * Context: Process context. Takes and releases profile->lock. + * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or + * one wasn't needed; or a blk_status_t error on failure. + */ +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr) +{ + struct blk_crypto_keyslot *slot; + int slot_idx; + int err; + + *slot_ptr = NULL; + + /* + * If the device has no concept of "keyslots", then there is no need to + * get one. 
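+	 *
+	 * (A profile initialized with num_slots == 0 is the "passthrough"
+	 * case, used e.g. for layered devices that pass keys down to the
+	 * devices below them instead of programming keyslots of their own.)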
+ */ + if (profile->num_slots == 0) + return BLK_STS_OK; + + down_read(&profile->lock); + slot = blk_crypto_find_and_grab_keyslot(profile, key); + up_read(&profile->lock); + if (slot) + goto success; + + for (;;) { + blk_crypto_hw_enter(profile); + slot = blk_crypto_find_and_grab_keyslot(profile, key); + if (slot) { + blk_crypto_hw_exit(profile); + goto success; + } + + /* + * If we're here, that means there wasn't a slot that was + * already programmed with the key. So try to program it. + */ + if (!list_empty(&profile->idle_slots)) + break; + + blk_crypto_hw_exit(profile); + wait_event(profile->idle_slots_wait_queue, + !list_empty(&profile->idle_slots)); + } + + slot = list_first_entry(&profile->idle_slots, struct blk_crypto_keyslot, + idle_slot_node); + slot_idx = blk_crypto_keyslot_index(slot); + + err = profile->ll_ops.keyslot_program(profile, key, slot_idx); + if (err) { + wake_up(&profile->idle_slots_wait_queue); + blk_crypto_hw_exit(profile); + return errno_to_blk_status(err); + } + + /* Move this slot to the hash list for the new key. */ + if (slot->key) + hlist_del(&slot->hash_node); + slot->key = key; + hlist_add_head(&slot->hash_node, + blk_crypto_hash_bucket_for_key(profile, key)); + + atomic_set(&slot->slot_refs, 1); + + blk_crypto_remove_slot_from_lru_list(slot); + + blk_crypto_hw_exit(profile); +success: + *slot_ptr = slot; + return BLK_STS_OK; +} + +/** + * blk_crypto_put_keyslot() - Release a reference to a keyslot + * @slot: The keyslot to release the reference of (may be NULL). + * + * Context: Any context. + */ +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) +{ + struct blk_crypto_profile *profile; + unsigned long flags; + + if (!slot) + return; + + profile = slot->profile; + + if (atomic_dec_and_lock_irqsave(&slot->slot_refs, + &profile->idle_slots_lock, flags)) { + list_add_tail(&slot->idle_slot_node, &profile->idle_slots); + spin_unlock_irqrestore(&profile->idle_slots_lock, flags); + wake_up(&profile->idle_slots_wait_queue); + } +} + +/** + * __blk_crypto_cfg_supported() - Check whether the given crypto profile + * supports the given crypto configuration. + * @profile: the crypto profile to check + * @cfg: the crypto configuration to check for + * + * Return: %true if @profile supports the given @cfg. + */ +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg) +{ + if (!profile) + return false; + if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size)) + return false; + if (profile->max_dun_bytes_supported < cfg->dun_bytes) + return false; + if (!(profile->key_types_supported & cfg->key_type)) + return false; + return true; +} + +/** + * __blk_crypto_evict_key() - Evict a key from a device. + * @profile: the crypto profile of the device + * @key: the key to evict. It must not still be used in any I/O. + * + * If the device has keyslots, this finds the keyslot (if any) that contains the + * specified key and calls the driver's keyslot_evict function to evict it. + * + * Otherwise, this just calls the driver's keyslot_evict function if it is + * implemented, passing just the key (without any particular keyslot). This + * allows layered devices to evict the key from their underlying devices. + * + * Context: Process context. Takes and releases profile->lock. + * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY + * if the keyslot is still in use, or another -errno value on other + * error. 
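+ *
+ * For example, blk-crypto-fallback implements its key eviction as a direct
+ * call into this helper:
+ *
+ *	int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
+ *	{
+ *		return __blk_crypto_evict_key(&blk_crypto_fallback_profile,
+ *					      key);
+ *	}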
+ */ +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + struct blk_crypto_keyslot *slot; + int err = 0; + + if (profile->num_slots == 0) { + if (profile->ll_ops.keyslot_evict) { + blk_crypto_hw_enter(profile); + err = profile->ll_ops.keyslot_evict(profile, key, -1); + blk_crypto_hw_exit(profile); + return err; + } + return 0; + } + + blk_crypto_hw_enter(profile); + slot = blk_crypto_find_keyslot(profile, key); + if (!slot) + goto out_unlock; + + if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + err = -EBUSY; + goto out_unlock; + } + err = profile->ll_ops.keyslot_evict(profile, key, + blk_crypto_keyslot_index(slot)); + if (err) + goto out_unlock; + + hlist_del(&slot->hash_node); + slot->key = NULL; + err = 0; +out_unlock: + blk_crypto_hw_exit(profile); + return err; +} + +/** + * blk_crypto_reprogram_all_keys() - Re-program all keyslots. + * @profile: The crypto profile + * + * Re-program all keyslots that are supposed to have a key programmed. This is + * intended only for use by drivers for hardware that loses its keys on reset. + * + * Context: Process context. Takes and releases profile->lock. + */ +void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile) +{ + unsigned int slot; + + if (profile->num_slots == 0) + return; + + /* This is for device initialization, so don't resume the device */ + down_write(&profile->lock); + for (slot = 0; slot < profile->num_slots; slot++) { + const struct blk_crypto_key *key = profile->slots[slot].key; + int err; + + if (!key) + continue; + + err = profile->ll_ops.keyslot_program(profile, key, slot); + WARN_ON(err); + } + up_write(&profile->lock); +} +EXPORT_SYMBOL_GPL(blk_crypto_reprogram_all_keys); + +void blk_crypto_profile_destroy(struct blk_crypto_profile *profile) +{ + if (!profile) + return; + kvfree(profile->slot_hashtable); + kvfree_sensitive(profile->slots, + sizeof(profile->slots[0]) * profile->num_slots); + memzero_explicit(profile, sizeof(*profile)); +} +EXPORT_SYMBOL_GPL(blk_crypto_profile_destroy); + +bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q) +{ + if (blk_integrity_queue_supports_integrity(q)) { + pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); + return false; + } + q->crypto_profile = profile; + return true; +} +EXPORT_SYMBOL_GPL(blk_crypto_register); + +/** + * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key + * @bdev: a block device that supports hardware-wrapped keys + * @eph_key: the hardware-wrapped key in ephemerally-wrapped form + * @eph_key_size: size of @eph_key in bytes + * @sw_secret: (output) the software secret + * + * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that + * it is used for I/O), ask the hardware to derive the secret which software can + * use for cryptographic tasks other than inline encryption. This secret is + * guaranteed to be cryptographically isolated from the inline encryption key, + * i.e. derived with a different KDF context. + * + * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support + * hardware-wrapped keys, -EBADMSG if the key isn't a valid + * hardware-wrapped key, or another -errno code. 
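+ *
+ * Sketch of a caller (hypothetical names; eph_key and eph_key_size describe
+ * the ephemerally-wrapped key blob the caller already holds):
+ *
+ *	u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE];
+ *
+ *	err = blk_crypto_derive_sw_secret(bdev, eph_key, eph_key_size,
+ *					  sw_secret);
+ *	if (err)
+ *		return err;
+ *	(use sw_secret for non-inline crypto purposes, then zeroize it)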
+ */
+int blk_crypto_derive_sw_secret(struct block_device *bdev,
+				const u8 *eph_key, size_t eph_key_size,
+				u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+	struct blk_crypto_profile *profile =
+		bdev_get_queue(bdev)->crypto_profile;
+	int err;
+
+	if (!profile)
+		return -EOPNOTSUPP;
+	if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+		return -EOPNOTSUPP;
+	if (!profile->ll_ops.derive_sw_secret)
+		return -EOPNOTSUPP;
+	blk_crypto_hw_enter(profile);
+	err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size,
+					       sw_secret);
+	blk_crypto_hw_exit(profile);
+	return err;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_derive_sw_secret);
+
+/**
+ * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
+ *					 by child device
+ * @parent: the crypto profile for the parent device
+ * @child: the crypto profile for the child device, or NULL
+ *
+ * This clears all crypto capabilities in @parent that aren't set in @child.  If
+ * @child is NULL, then this clears all parent capabilities.
+ *
+ * Only use this when setting up the crypto profile for a layered device, before
+ * it has been exposed.
+ */
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+				       const struct blk_crypto_profile *child)
+{
+	if (child) {
+		unsigned int i;
+
+		parent->max_dun_bytes_supported =
+			min(parent->max_dun_bytes_supported,
+			    child->max_dun_bytes_supported);
+		for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++)
+			parent->modes_supported[i] &= child->modes_supported[i];
+		parent->key_types_supported &= child->key_types_supported;
+	} else {
+		parent->max_dun_bytes_supported = 0;
+		memset(parent->modes_supported, 0,
+		       sizeof(parent->modes_supported));
+		parent->key_types_supported = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities);
+
+/**
+ * blk_crypto_has_capabilities() - Check whether @target supports at least all
+ *				   the crypto capabilities that @reference does.
+ * @target: the target profile
+ * @reference: the reference profile
+ *
+ * Return: %true if @target supports all the crypto capabilities of @reference.
+ */
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+				 const struct blk_crypto_profile *reference)
+{
+	int i;
+
+	if (!reference)
+		return true;
+
+	if (!target)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(target->modes_supported); i++) {
+		if (reference->modes_supported[i] & ~target->modes_supported[i])
+			return false;
+	}
+
+	if (reference->max_dun_bytes_supported >
+	    target->max_dun_bytes_supported)
+		return false;
+
+	if (reference->key_types_supported & ~target->key_types_supported)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities);
+
+/**
+ * blk_crypto_update_capabilities() - Update the capabilities of a crypto
+ *				      profile to match those of another crypto
+ *				      profile.
+ * @dst: The crypto profile whose capabilities to update.
+ * @src: The crypto profile whose capabilities this function will update @dst's
+ *	 capabilities to.
+ *
+ * Blk-crypto requires that crypto capabilities that were
+ * advertised when a bio was created continue to be supported by the
+ * device until that bio is ended.  This in turn means that a device cannot
+ * shrink its advertised crypto capabilities without any explicit
+ * synchronization with upper layers.  So if there's no such explicit
+ * synchronization, @src must support all the crypto capabilities that
+ * @dst does (i.e. we need blk_crypto_has_capabilities(@src, @dst)).
+ *
+ * Note also that as long as the crypto capabilities are being expanded, the
+ * order of updates becoming visible is not important because it's alright
+ * for blk-crypto to see stale values - they only cause blk-crypto to
+ * believe that a crypto capability isn't supported when it actually is (which
+ * might result in blk-crypto-fallback being used if available, or the bio being
+ * failed).
+ */
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+				    const struct blk_crypto_profile *src)
+{
+	memcpy(dst->modes_supported, src->modes_supported,
+	       sizeof(dst->modes_supported));
+
+	dst->max_dun_bytes_supported = src->max_dun_bytes_supported;
+	dst->key_types_supported = src->key_types_supported;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities);
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
new file mode 100644
index 000000000000..fd93bd2f33b7
--- /dev/null
+++ b/block/blk-crypto-sysfs.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Google LLC
+ *
+ * sysfs support for blk-crypto.  This file contains the code which exports the
+ * crypto capabilities of devices via /sys/block/$disk/queue/crypto/.
+ */
+
+#include <linux/blk-crypto-profile.h>
+
+#include "blk-crypto-internal.h"
+
+struct blk_crypto_kobj {
+	struct kobject kobj;
+	struct blk_crypto_profile *profile;
+};
+
+struct blk_crypto_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct blk_crypto_profile *profile,
+			struct blk_crypto_attr *attr, char *page);
+};
+
+static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
+{
+	return container_of(kobj, struct blk_crypto_kobj, kobj)->profile;
+}
+
+static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
+{
+	return container_of(attr, struct blk_crypto_attr, attr);
+}
+
+static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
+				 struct blk_crypto_attr *attr, char *page)
+{
+	return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported);
+}
+
+static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
+				 struct blk_crypto_attr *attr, char *page)
+{
+	return sysfs_emit(page, "%u\n", profile->num_slots);
+}
+
+#define BLK_CRYPTO_RO_ATTR(_name) \
+	static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+
+BLK_CRYPTO_RO_ATTR(max_dun_bits);
+BLK_CRYPTO_RO_ATTR(num_keyslots);
+
+static struct attribute *blk_crypto_attrs[] = {
+	&max_dun_bits_attr.attr,
+	&num_keyslots_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group blk_crypto_attr_group = {
+	.attrs = blk_crypto_attrs,
+};
+
+/*
+ * The encryption mode attributes.  To avoid hard-coding the list of encryption
+ * modes, these are initialized at boot time by blk_crypto_sysfs_init().
+ */ +static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX]; +static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; + +static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + int mode_num = a - __blk_crypto_mode_attrs; + + if (profile->modes_supported[mode_num]) + return 0444; + return 0; +} + +static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + int mode_num = attr - __blk_crypto_mode_attrs; + + return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]); +} + +static const struct attribute_group blk_crypto_modes_attr_group = { + .name = "modes", + .attrs = blk_crypto_mode_attrs, + .is_visible = blk_crypto_mode_is_visible, +}; + +static const struct attribute_group *blk_crypto_attr_groups[] = { + &blk_crypto_attr_group, + &blk_crypto_modes_attr_group, + NULL, +}; + +static ssize_t blk_crypto_attr_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + return a->show(profile, a, page); +} + +static const struct sysfs_ops blk_crypto_attr_ops = { + .show = blk_crypto_attr_show, +}; + +static void blk_crypto_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct blk_crypto_kobj, kobj)); +} + +static struct kobj_type blk_crypto_ktype = { + .default_groups = blk_crypto_attr_groups, + .sysfs_ops = &blk_crypto_attr_ops, + .release = blk_crypto_release, +}; + +/* + * If the request_queue has a blk_crypto_profile, create the "crypto" + * subdirectory in sysfs (/sys/block/$disk/queue/crypto/). 
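+ *
+ * The resulting layout (illustrative, for a disk whose profile advertises
+ * two encryption modes) looks like:
+ *
+ *	/sys/block/$disk/queue/crypto/max_dun_bits
+ *	/sys/block/$disk/queue/crypto/num_keyslots
+ *	/sys/block/$disk/queue/crypto/modes/AES-256-XTS
+ *	/sys/block/$disk/queue/crypto/modes/Adiantum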
+ */
+int blk_crypto_sysfs_register(struct request_queue *q)
+{
+	struct blk_crypto_kobj *obj;
+	int err;
+
+	if (!q->crypto_profile)
+		return 0;
+
+	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+	obj->profile = q->crypto_profile;
+
+	err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
+				   "crypto");
+	if (err) {
+		kobject_put(&obj->kobj);
+		return err;
+	}
+	q->crypto_kobject = &obj->kobj;
+	return 0;
+}
+
+void blk_crypto_sysfs_unregister(struct request_queue *q)
+{
+	kobject_put(q->crypto_kobject);
+}
+
+static int __init blk_crypto_sysfs_init(void)
+{
+	int i;
+
+	BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
+	for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) {
+		struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i];
+
+		attr->attr.name = blk_crypto_modes[i].name;
+		attr->attr.mode = 0444;
+		attr->show = blk_crypto_mode_show;
+		blk_crypto_mode_attrs[i - 1] = &attr->attr;
+	}
+	return 0;
+}
+subsys_initcall(blk_crypto_sysfs_init);
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 103c2e2d50d6..c0bc79f923f1 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -11,7 +11,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 
@@ -19,20 +19,33 @@
 const struct blk_crypto_mode blk_crypto_modes[] = {
 	[BLK_ENCRYPTION_MODE_AES_256_XTS] = {
+		.name = "AES-256-XTS",
 		.cipher_str = "xts(aes)",
 		.keysize = 64,
+		.security_strength = 32,
 		.ivsize = 16,
 	},
 	[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
+		.name = "AES-128-CBC-ESSIV",
 		.cipher_str = "essiv(cbc(aes),sha256)",
 		.keysize = 16,
+		.security_strength = 16,
 		.ivsize = 16,
 	},
 	[BLK_ENCRYPTION_MODE_ADIANTUM] = {
+		.name = "Adiantum",
 		.cipher_str = "adiantum(xchacha12,aes)",
 		.keysize = 32,
+		.security_strength = 32,
 		.ivsize = 32,
 	},
+	[BLK_ENCRYPTION_MODE_SM4_XTS] = {
+		.name = "SM4-XTS",
+		.cipher_str = "xts(sm4)",
+		.keysize = 32,
+		.security_strength = 16,
+		.ivsize = 16,
+	},
 };
 
 /*
@@ -66,9 +79,15 @@ static int __init bio_crypt_ctx_init(void)
 	/* This is assumed in various places. */
 	BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
 
-	/* Sanity check that no algorithm exceeds the defined limits. */
+	/*
+	 * Validate the crypto mode properties.  This ideally would be done with
+	 * static assertions, but boot-time checks are the next best thing.
+ */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { - BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].keysize > + BLK_CRYPTO_MAX_STANDARD_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].security_strength > + blk_crypto_modes[i].keysize); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } @@ -96,6 +115,7 @@ void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, bio->bi_crypt_context = bc; } +EXPORT_SYMBOL_GPL(bio_crypt_set_ctx); void __bio_crypt_free_ctx(struct bio *bio) { @@ -218,8 +238,9 @@ static bool bio_crypt_check_alignment(struct bio *bio) blk_status_t __blk_crypto_init_request(struct request *rq) { - return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, - &rq->crypt_keyslot); + return blk_crypto_get_keyslot(rq->q->crypto_profile, + rq->crypt_ctx->bc_key, + &rq->crypt_keyslot); } /** @@ -233,7 +254,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq) */ void __blk_crypto_free_request(struct request *rq) { - blk_ksm_put_slot(rq->crypt_keyslot); + blk_crypto_put_keyslot(rq->crypt_keyslot); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); blk_crypto_rq_set_defaults(rq); } @@ -280,10 +301,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, - &bc_key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bio->bi_bdev, + &bc_key->crypto_cfg)) return true; - if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: @@ -306,8 +326,9 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. - * @raw_key: Pointer to the raw key. Must be the correct length for the chosen - * @crypto_mode; see blk_crypto_modes[]. + * @raw_key: the raw bytes of the key + * @raw_key_size: size of the raw key in bytes + * @key_type: type of the key -- either standard or hardware-wrapped * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used @@ -316,7 +337,9 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, * Return: 0 on success, -errno on failure. The caller is responsible for * zeroizing both blk_key and raw_key when done with them. 
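+ *
+ * Illustrative call (hypothetical caller; raw_key holds the 64 raw bytes of
+ * a standard AES-256-XTS key, with a 16-byte DUN and 4096-byte data units):
+ *
+ *	err = blk_crypto_init_key(&blk_key, raw_key, 64,
+ *				  BLK_CRYPTO_KEY_TYPE_STANDARD,
+ *				  BLK_ENCRYPTION_MODE_AES_256_XTS,
+ *				  16, 4096);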
*/ -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *raw_key, size_t raw_key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) @@ -329,8 +352,19 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; - if (mode->keysize == 0) + switch (key_type) { + case BLK_CRYPTO_KEY_TYPE_STANDARD: + if (raw_key_size != mode->keysize) + return -EINVAL; + break; + case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED: + if (raw_key_size < mode->security_strength || + raw_key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) + return -EINVAL; + break; + default: return -EINVAL; + } if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; @@ -341,29 +375,40 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; + blk_key->crypto_cfg.key_type = key_type; blk_key->data_unit_size_bits = ilog2(data_unit_size); - blk_key->size = mode->keysize; - memcpy(blk_key->raw, raw_key, mode->keysize); + blk_key->size = raw_key_size; + memcpy(blk_key->raw, raw_key, raw_key_size); return 0; } +EXPORT_SYMBOL_GPL(blk_crypto_init_key); + +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg) +{ + return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); +} /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the - * request queue it's submitted to supports inline crypto, or the + * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). */ -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { - return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_ksm_crypto_cfg_supported(q->ksm, cfg); + if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && + cfg->key_type == BLK_CRYPTO_KEY_TYPE_STANDARD) + return true; + return blk_crypto_config_supported_natively(bdev, cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device + * @bdev: block device to operate on * @key: A key to use on the device - * @q: the request queue for the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms @@ -375,18 +420,23 @@ bool blk_crypto_config_supported(struct request_queue *q, * blk-crypto-fallback is either disabled or the needed algorithm * is disabled in the crypto API; or another -errno code. 
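+ *
+ * A typical upper-layer sequence is therefore (sketch, hypothetical caller):
+ *
+ *	err = blk_crypto_init_key(&key, raw_key, raw_key_size,
+ *				  BLK_CRYPTO_KEY_TYPE_STANDARD, mode,
+ *				  dun_bytes, data_unit_size);
+ *	if (err)
+ *		return err;
+ *	err = blk_crypto_start_using_key(bdev, &key);
+ *	if (err)
+ *		return err;
+ *	bio_crypt_set_ctx(bio, &key, dun, GFP_NOIO);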
 */
-int blk_crypto_start_using_key(const struct blk_crypto_key *key,
-			       struct request_queue *q)
+int blk_crypto_start_using_key(struct block_device *bdev,
+			       const struct blk_crypto_key *key)
 {
-	if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+	if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
 		return 0;
+	if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_STANDARD) {
+		pr_warn_once("tried to use wrapped key, but hardware doesn't support it\n");
+		return -EOPNOTSUPP;
+	}
 	return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
 }
+EXPORT_SYMBOL_GPL(blk_crypto_start_using_key);
 
 /**
  * blk_crypto_evict_key() - Evict a key from any inline encryption hardware
  *			    it may have been programmed into
- * @q: The request queue who's associated inline encryption hardware this key
+ * @bdev: The block_device whose associated inline encryption hardware this key
  *	might have been programmed into
  * @key: The key to evict
  *
@@ -394,18 +444,19 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
  * evicted from any hardware that it might have been programmed into.  The key
  * must not be in use by any in-flight IO when this function is called.
  *
- * Return: 0 on success or if key is not present in the q's ksm, -err on error.
+ * Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
  */
-int blk_crypto_evict_key(struct request_queue *q,
+int blk_crypto_evict_key(struct block_device *bdev,
 			 const struct blk_crypto_key *key)
 {
-	if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
-		return blk_ksm_evict_key(q->ksm, key);
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
+		return __blk_crypto_evict_key(q->crypto_profile, key);
 	/*
-	 * If the request queue's associated inline encryption hardware didn't
-	 * have support for the key, then the key might have been programmed
-	 * into the fallback keyslot manager, so try to evict from there.
+	 * If the block_device didn't support the key, then blk-crypto-fallback
+	 * may have been used, so try to evict the key from blk-crypto-fallback.
	 */
 	return blk_crypto_fallback_evict_key(key);
 }
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 16d5d5338392..f0e956d772f1 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
 	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-	if (disk->queue->ksm) {
+	if (disk->queue->crypto_profile) {
 		pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
-		blk_ksm_unregister(disk->queue);
+		disk->queue->crypto_profile = NULL;
 	}
 #endif
 }
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 8ed55af08427..c0c95ceb454c 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -26,6 +26,8 @@ struct blk_mq_tags {
 	 * request pool
 	 */
 	spinlock_t lock;
+
+	ANDROID_OEM_DATA(1);
 };
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9f53b4caf977..810955f33016 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -41,6 +41,8 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
+#include <trace/hooks/block.h>
+
 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
 
 static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -349,6 +351,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	}
 
 	data->hctx->queued++;
+	trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
 	return rq;
 }
 
@@ -2463,6 +2466,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	 */
 	rq_size = round_up(sizeof(struct request) + set->cmd_size,
 			   cache_line_size());
+	trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
 	left = rq_size * depth;
 
 	for (i = 0; i < depth; ) {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7cdca23b6263..f302e9cb40e4 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -35,6 +35,8 @@ struct blk_mq_ctx {
 	struct request_queue	*queue;
 	struct blk_mq_ctxs	*ctxs;
 	struct kobject		kobj;
+
+	ANDROID_OEM_DATA_ARRAY(1, 2);
 } ____cacheline_aligned_in_smp;
 
 void blk_mq_exit_queue(struct request_queue *q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 00021f012370..56f0f3ef2942 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -889,6 +889,10 @@ int blk_register_queue(struct gendisk *disk)
 		}
 	}
 
+	ret = blk_crypto_sysfs_register(q);
+	if (ret)
+		goto put_dev;
+
 	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 	wbt_enable_default(q);
 	blk_throtl_register_queue(q);
@@ -917,6 +921,16 @@ unlock:
 		percpu_ref_switch_to_percpu(&q->q_usage_counter);
 	}
 
+	return ret;
+
+put_dev:
+	elv_unregister_queue(q);
+	mutex_unlock(&q->sysfs_lock);
+	mutex_unlock(&q->sysfs_dir_lock);
+	kobject_del(&q->kobj);
+	blk_trace_remove_sysfs(dev);
+	kobject_put(&dev->kobj);
+
 	return ret;
 }
 
@@ -954,11 +968,11 @@ void blk_unregister_queue(struct gendisk *disk)
 	 */
 	if (queue_is_mq(q))
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
+	blk_crypto_sysfs_unregister(q);
 	blk_trace_remove_sysfs(disk_to_dev(disk));
 
 	mutex_lock(&q->sysfs_lock);
-	if (q->elevator)
-		elv_unregister_queue(q);
+	elv_unregister_queue(q);
 	mutex_unlock(&q->sysfs_lock);
 
 	/* Now that we've deleted all child objects, we can delete the queue.
 */
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 774ecc598bee..aba9f53853ee 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -57,20 +57,7 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str);
  */
 bool blk_req_needs_zone_write_lock(struct request *rq)
 {
-	if (!rq->q->seq_zones_wlock)
-		return false;
-
-	if (blk_rq_is_passthrough(rq))
-		return false;
-
-	switch (req_op(rq)) {
-	case REQ_OP_WRITE_ZEROES:
-	case REQ_OP_WRITE_SAME:
-	case REQ_OP_WRITE:
-		return blk_rq_zone_is_seq(rq);
-	default:
-		return false;
-	}
+	return rq->q->seq_zones_wlock && blk_rq_is_seq_zone_write(rq);
 }
 EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
@@ -532,7 +519,8 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 		break;
 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
-		if (!args->seq_zones_wlock) {
+		if (!blk_queue_pipeline_zoned_writes(q) &&
+		    !args->seq_zones_wlock) {
 			args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node,
 							args->nr_zones);
 			if (!args->seq_zones_wlock)
diff --git a/block/elevator.c b/block/elevator.c
index 1b5e57f6115f..ca84e2b53575 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -514,9 +514,11 @@ int elv_register_queue(struct request_queue *q, bool uevent)
 
 void elv_unregister_queue(struct request_queue *q)
 {
+	struct elevator_queue *e = q->elevator;
+
 	lockdep_assert_held(&q->sysfs_lock);
 
-	if (q) {
+	if (e && e->registered) {
 		struct elevator_queue *e = q->elevator;
 
 		kobject_uevent(&e->kobj, KOBJ_REMOVE);
@@ -589,9 +591,7 @@ int elevator_switch_mq(struct request_queue *q,
 	lockdep_assert_held(&q->sysfs_lock);
 
 	if (q->elevator) {
-		if (q->elevator->registered)
-			elv_unregister_queue(q);
-
+		elv_unregister_queue(q);
 		ioc_clear_queue(q);
 		elevator_exit(q, q->elevator);
 	}
diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
deleted file mode 100644
index 2c4a55bea6ca..000000000000
--- a/block/keyslot-manager.c
+++ /dev/null
@@ -1,578 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2019 Google LLC
- */
-
-/**
- * DOC: The Keyslot Manager
- *
- * Many devices with inline encryption support have a limited number of "slots"
- * into which encryption contexts may be programmed, and requests can be tagged
- * with a slot number to specify the key to use for en/decryption.
- *
- * As the number of slots is limited, and programming keys is expensive on
- * many inline encryption hardware, we don't want to program the same key into
- * multiple slots - if multiple requests are using the same key, we want to
- * program just one slot with that key and use that slot for all requests.
- *
- * The keyslot manager manages these keyslots appropriately, and also acts as
- * an abstraction between the inline encryption hardware and the upper layers.
- *
- * Lower layer devices will set up a keyslot manager in their request queue
- * and tell it how to perform device specific operations like programming/
- * evicting keys from keyslots.
- *
- * Upper layers will call blk_ksm_get_slot_for_key() to program a
- * key into some slot in the inline encryption hardware.
- */
-
-#define pr_fmt(fmt) "blk-crypto: " fmt
-
-#include <linux/keyslot-manager.h>
-#include <linux/device.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
-#include <linux/pm_runtime.h>
-#include <linux/wait.h>
-#include <linux/blkdev.h>
-
-struct blk_ksm_keyslot {
-	atomic_t slot_refs;
-	struct list_head idle_slot_node;
-	struct hlist_node hash_node;
-	const struct blk_crypto_key *key;
-	struct blk_keyslot_manager *ksm;
-};
-
-static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm)
-{
-	/*
-	 * Calling into the driver requires ksm->lock held and the device
-	 * resumed.
But we must resume the device first, since that can acquire - * and release ksm->lock via blk_ksm_reprogram_all_keys(). - */ - if (ksm->dev) - pm_runtime_get_sync(ksm->dev); - down_write(&ksm->lock); -} - -static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) -{ - up_write(&ksm->lock); - if (ksm->dev) - pm_runtime_put_sync(ksm->dev); -} - -static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm) -{ - return ksm->num_slots == 0; -} - -/** - * blk_ksm_init() - Initialize a keyslot manager - * @ksm: The keyslot_manager to initialize. - * @num_slots: The number of key slots to manage. - * - * Allocate memory for keyslots and initialize a keyslot manager. Called by - * e.g. storage drivers to set up a keyslot manager in their request_queue. - * - * Return: 0 on success, or else a negative error code. - */ -int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots) -{ - unsigned int slot; - unsigned int i; - unsigned int slot_hashtable_size; - - memset(ksm, 0, sizeof(*ksm)); - - if (num_slots == 0) - return -EINVAL; - - ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL); - if (!ksm->slots) - return -ENOMEM; - - ksm->num_slots = num_slots; - - init_rwsem(&ksm->lock); - - init_waitqueue_head(&ksm->idle_slots_wait_queue); - INIT_LIST_HEAD(&ksm->idle_slots); - - for (slot = 0; slot < num_slots; slot++) { - ksm->slots[slot].ksm = ksm; - list_add_tail(&ksm->slots[slot].idle_slot_node, - &ksm->idle_slots); - } - - spin_lock_init(&ksm->idle_slots_lock); - - slot_hashtable_size = roundup_pow_of_two(num_slots); - /* - * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2 - * buckets. This only makes a difference when there is only 1 keyslot. - */ - if (slot_hashtable_size < 2) - slot_hashtable_size = 2; - - ksm->log_slot_ht_size = ilog2(slot_hashtable_size); - ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size, - sizeof(ksm->slot_hashtable[0]), - GFP_KERNEL); - if (!ksm->slot_hashtable) - goto err_destroy_ksm; - for (i = 0; i < slot_hashtable_size; i++) - INIT_HLIST_HEAD(&ksm->slot_hashtable[i]); - - return 0; - -err_destroy_ksm: - blk_ksm_destroy(ksm); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_ksm_init); - -static void blk_ksm_destroy_callback(void *ksm) -{ - blk_ksm_destroy(ksm); -} - -/** - * devm_blk_ksm_init() - Resource-managed blk_ksm_init() - * @dev: The device which owns the blk_keyslot_manager. - * @ksm: The blk_keyslot_manager to initialize. - * @num_slots: The number of key slots to manage. - * - * Like blk_ksm_init(), but causes blk_ksm_destroy() to be called automatically - * on driver detach. - * - * Return: 0 on success, or else a negative error code. 
- */ -int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, - unsigned int num_slots) -{ - int err = blk_ksm_init(ksm, num_slots); - - if (err) - return err; - - return devm_add_action_or_reset(dev, blk_ksm_destroy_callback, ksm); -} -EXPORT_SYMBOL_GPL(devm_blk_ksm_init); - -static inline struct hlist_head * -blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)]; -} - -static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot) -{ - struct blk_keyslot_manager *ksm = slot->ksm; - unsigned long flags; - - spin_lock_irqsave(&ksm->idle_slots_lock, flags); - list_del(&slot->idle_slot_node); - spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); -} - -static struct blk_ksm_keyslot *blk_ksm_find_keyslot( - struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key); - struct blk_ksm_keyslot *slotp; - - hlist_for_each_entry(slotp, head, hash_node) { - if (slotp->key == key) - return slotp; - } - return NULL; -} - -static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot( - struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - struct blk_ksm_keyslot *slot; - - slot = blk_ksm_find_keyslot(ksm, key); - if (!slot) - return NULL; - if (atomic_inc_return(&slot->slot_refs) == 1) { - /* Took first reference to this slot; remove it from LRU list */ - blk_ksm_remove_slot_from_lru_list(slot); - } - return slot; -} - -unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot) -{ - return slot - slot->ksm->slots; -} -EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx); - -/** - * blk_ksm_get_slot_for_key() - Program a key into a keyslot. - * @ksm: The keyslot manager to program the key into. - * @key: Pointer to the key object to program, including the raw key, crypto - * mode, and data unit size. - * @slot_ptr: A pointer to return the pointer of the allocated keyslot. - * - * Get a keyslot that's been programmed with the specified key. If one already - * exists, return it with incremented refcount. Otherwise, wait for a keyslot - * to become idle and program it. - * - * Context: Process context. Takes and releases ksm->lock. - * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the - * allocated keyslot), or some other blk_status_t otherwise (and - * keyslot is set to NULL). - */ -blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - struct blk_ksm_keyslot **slot_ptr) -{ - struct blk_ksm_keyslot *slot; - int slot_idx; - int err; - - *slot_ptr = NULL; - - if (blk_ksm_is_passthrough(ksm)) - return BLK_STS_OK; - - down_read(&ksm->lock); - slot = blk_ksm_find_and_grab_keyslot(ksm, key); - up_read(&ksm->lock); - if (slot) - goto success; - - for (;;) { - blk_ksm_hw_enter(ksm); - slot = blk_ksm_find_and_grab_keyslot(ksm, key); - if (slot) { - blk_ksm_hw_exit(ksm); - goto success; - } - - /* - * If we're here, that means there wasn't a slot that was - * already programmed with the key. So try to program it. 
- */ - if (!list_empty(&ksm->idle_slots)) - break; - - blk_ksm_hw_exit(ksm); - wait_event(ksm->idle_slots_wait_queue, - !list_empty(&ksm->idle_slots)); - } - - slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot, - idle_slot_node); - slot_idx = blk_ksm_get_slot_idx(slot); - - err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx); - if (err) { - wake_up(&ksm->idle_slots_wait_queue); - blk_ksm_hw_exit(ksm); - return errno_to_blk_status(err); - } - - /* Move this slot to the hash list for the new key. */ - if (slot->key) - hlist_del(&slot->hash_node); - slot->key = key; - hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key)); - - atomic_set(&slot->slot_refs, 1); - - blk_ksm_remove_slot_from_lru_list(slot); - - blk_ksm_hw_exit(ksm); -success: - *slot_ptr = slot; - return BLK_STS_OK; -} - -/** - * blk_ksm_put_slot() - Release a reference to a slot - * @slot: The keyslot to release the reference of. - * - * Context: Any context. - */ -void blk_ksm_put_slot(struct blk_ksm_keyslot *slot) -{ - struct blk_keyslot_manager *ksm; - unsigned long flags; - - if (!slot) - return; - - ksm = slot->ksm; - - if (atomic_dec_and_lock_irqsave(&slot->slot_refs, - &ksm->idle_slots_lock, flags)) { - list_add_tail(&slot->idle_slot_node, &ksm->idle_slots); - spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); - wake_up(&ksm->idle_slots_wait_queue); - } -} - -/** - * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is - * supported by a ksm. - * @ksm: The keyslot manager to check - * @cfg: The crypto configuration to check for. - * - * Checks for crypto_mode/data unit size/dun bytes support. - * - * Return: Whether or not this ksm supports the specified crypto config. - */ -bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, - const struct blk_crypto_config *cfg) -{ - if (!ksm) - return false; - if (!(ksm->crypto_modes_supported[cfg->crypto_mode] & - cfg->data_unit_size)) - return false; - if (ksm->max_dun_bytes_supported < cfg->dun_bytes) - return false; - return true; -} - -/** - * blk_ksm_evict_key() - Evict a key from the lower layer device. - * @ksm: The keyslot manager to evict from - * @key: The key to evict - * - * Find the keyslot that the specified key was programmed into, and evict that - * slot from the lower layer device. The slot must not be in use by any - * in-flight IO when this function is called. - * - * Context: Process context. Takes and releases ksm->lock. - * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY - * if the keyslot is still in use, or another -errno value on other - * error. - */ -int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - struct blk_ksm_keyslot *slot; - int err = 0; - - if (blk_ksm_is_passthrough(ksm)) { - if (ksm->ksm_ll_ops.keyslot_evict) { - blk_ksm_hw_enter(ksm); - err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1); - blk_ksm_hw_exit(ksm); - return err; - } - return 0; - } - - blk_ksm_hw_enter(ksm); - slot = blk_ksm_find_keyslot(ksm, key); - if (!slot) - goto out_unlock; - - if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { - err = -EBUSY; - goto out_unlock; - } - err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, - blk_ksm_get_slot_idx(slot)); - if (err) - goto out_unlock; - - hlist_del(&slot->hash_node); - slot->key = NULL; - err = 0; -out_unlock: - blk_ksm_hw_exit(ksm); - return err; -} - -/** - * blk_ksm_reprogram_all_keys() - Re-program all keyslots. 
- * @ksm: The keyslot manager - * - * Re-program all keyslots that are supposed to have a key programmed. This is - * intended only for use by drivers for hardware that loses its keys on reset. - * - * Context: Process context. Takes and releases ksm->lock. - */ -void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) -{ - unsigned int slot; - - if (blk_ksm_is_passthrough(ksm)) - return; - - /* This is for device initialization, so don't resume the device */ - down_write(&ksm->lock); - for (slot = 0; slot < ksm->num_slots; slot++) { - const struct blk_crypto_key *key = ksm->slots[slot].key; - int err; - - if (!key) - continue; - - err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot); - WARN_ON(err); - } - up_write(&ksm->lock); -} -EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys); - -void blk_ksm_destroy(struct blk_keyslot_manager *ksm) -{ - if (!ksm) - return; - kvfree(ksm->slot_hashtable); - kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots); - memzero_explicit(ksm, sizeof(*ksm)); -} -EXPORT_SYMBOL_GPL(blk_ksm_destroy); - -bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q) -{ - if (blk_integrity_queue_supports_integrity(q)) { - pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - return false; - } - q->ksm = ksm; - return true; -} -EXPORT_SYMBOL_GPL(blk_ksm_register); - -void blk_ksm_unregister(struct request_queue *q) -{ - q->ksm = NULL; -} - -/** - * blk_ksm_intersect_modes() - restrict supported modes by child device - * @parent: The keyslot manager for parent device - * @child: The keyslot manager for child device, or NULL - * - * Clear any crypto mode support bits in @parent that aren't set in @child. - * If @child is NULL, then all parent bits are cleared. - * - * Only use this when setting up the keyslot manager for a layered device, - * before it's been exposed yet. - */ -void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, - const struct blk_keyslot_manager *child) -{ - if (child) { - unsigned int i; - - parent->max_dun_bytes_supported = - min(parent->max_dun_bytes_supported, - child->max_dun_bytes_supported); - for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported); - i++) { - parent->crypto_modes_supported[i] &= - child->crypto_modes_supported[i]; - } - } else { - parent->max_dun_bytes_supported = 0; - memset(parent->crypto_modes_supported, 0, - sizeof(parent->crypto_modes_supported)); - } -} -EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes); - -/** - * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes - * and DUN bytes that another KSM supports. Here, - * "superset" refers to the mathematical meaning of the - * word - i.e. if two KSMs have the *same* capabilities, - * they *are* considered supersets of each other. - * @ksm_superset: The KSM that we want to verify is a superset - * @ksm_subset: The KSM that we want to verify is a subset - * - * Return: True if @ksm_superset supports a superset of the crypto modes and DUN - * bytes that @ksm_subset supports. 
- */ -bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, - struct blk_keyslot_manager *ksm_subset) -{ - int i; - - if (!ksm_subset) - return true; - - if (!ksm_superset) - return false; - - for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) { - if (ksm_subset->crypto_modes_supported[i] & - (~ksm_superset->crypto_modes_supported[i])) { - return false; - } - } - - if (ksm_subset->max_dun_bytes_supported > - ksm_superset->max_dun_bytes_supported) { - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(blk_ksm_is_superset); - -/** - * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of - * another KSM - * @target_ksm: The KSM whose restrictions to update. - * @reference_ksm: The KSM to whose restrictions this function will update - * @target_ksm's restrictions to. - * - * Blk-crypto requires that crypto capabilities that were - * advertised when a bio was created continue to be supported by the - * device until that bio is ended. This is turn means that a device cannot - * shrink its advertised crypto capabilities without any explicit - * synchronization with upper layers. So if there's no such explicit - * synchronization, @reference_ksm must support all the crypto capabilities that - * @target_ksm does - * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true). - * - * Note also that as long as the crypto capabilities are being expanded, the - * order of updates becoming visible is not important because it's alright - * for blk-crypto to see stale values - they only cause blk-crypto to - * believe that a crypto capability isn't supported when it actually is (which - * might result in blk-crypto-fallback being used if available, or the bio being - * failed). - */ -void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, - struct blk_keyslot_manager *reference_ksm) -{ - memcpy(target_ksm->crypto_modes_supported, - reference_ksm->crypto_modes_supported, - sizeof(target_ksm->crypto_modes_supported)); - - target_ksm->max_dun_bytes_supported = - reference_ksm->max_dun_bytes_supported; -} -EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities); - -/** - * blk_ksm_init_passthrough() - Init a passthrough keyslot manager - * @ksm: The keyslot manager to init - * - * Initialize a passthrough keyslot manager. - * Called by e.g. storage drivers to set up a keyslot manager in their - * request_queue, when the storage driver wants to manage its keys by itself. - * This is useful for inline encryption hardware that doesn't have the concept - * of keyslots, and for layered devices. 
- */ -void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm) -{ - memset(ksm, 0, sizeof(*ksm)); - init_rwsem(&ksm->lock); -} -EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index aaef5088a3ba..df2a173bdd86 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -350,7 +350,8 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, return NULL; rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q) || + blk_queue_pipeline_zoned_writes(rq->q)) return rq; /* @@ -389,7 +390,8 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, if (!rq) return NULL; - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q) || + blk_queue_pipeline_zoned_writes(rq->q)) return rq; /* @@ -497,8 +499,9 @@ dispatch_find_request: } /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. + * For a zoned block device that requires write serialization, if we + * only have writes queued and none of them can be dispatched, rq will + * be NULL. */ if (!rq) return NULL; @@ -835,7 +838,8 @@ static void dd_finish_request(struct request *rq) if (rq->elv.priv[0]) dd_count(dd, completed, prio); - if (blk_queue_is_zoned(q)) { + if (blk_queue_is_zoned(rq->q) && + !blk_queue_pipeline_zoned_writes(q)) { unsigned long flags; spin_lock_irqsave(&dd->zone_lock, flags); diff --git a/build.config.aarch64 b/build.config.aarch64 new file mode 100644 index 000000000000..9fcbceb10db2 --- /dev/null +++ b/build.config.aarch64 @@ -0,0 +1,16 @@ +ARCH=arm64 +MAKE_GOALS=" +Image +modules +" + +FILES=" +arch/arm64/boot/Image +vmlinux +System.map +vmlinux.symvers +modules.builtin +modules.builtin.modinfo +" + +NDK_TRIPLE=aarch64-linux-android31 diff --git a/build.config.allmodconfig b/build.config.allmodconfig new file mode 100644 index 000000000000..0e85b3f70385 --- /dev/null +++ b/build.config.allmodconfig @@ -0,0 +1,14 @@ +DEFCONFIG=allmodconfig + +POST_DEFCONFIG_CMDS="update_config" +function update_config() { + ${KERNEL_DIR}/scripts/config --file ${OUT_DIR}/.config \ + -d TEST_KMOD \ + -e UNWINDER_FRAME_POINTER \ + -d INFINIBAND_QIB \ + -d SAMPLES \ + -d BPFILTER \ + + (cd ${OUT_DIR} && \ + make O=${OUT_DIR} $archsubarch CROSS_COMPILE=${CROSS_COMPILE} ${TOOL_ARGS} ${MAKE_ARGS} olddefconfig) +} diff --git a/build.config.allmodconfig.aarch64 b/build.config.allmodconfig.aarch64 new file mode 100644 index 000000000000..2fbe380e030a --- /dev/null +++ b/build.config.allmodconfig.aarch64 @@ -0,0 +1,4 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.aarch64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.allmodconfig + diff --git a/build.config.allmodconfig.arm b/build.config.allmodconfig.arm new file mode 100644 index 000000000000..e92744a9b518 --- /dev/null +++ b/build.config.allmodconfig.arm @@ -0,0 +1,4 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.arm +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.allmodconfig + diff --git a/build.config.allmodconfig.x86_64 b/build.config.allmodconfig.x86_64 new file mode 100644 index 000000000000..f06b30c8426f --- /dev/null +++ b/build.config.allmodconfig.x86_64 @@ -0,0 +1,4 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.x86_64 +. 
${ROOT_DIR}/${KERNEL_DIR}/build.config.allmodconfig + diff --git a/build.config.amlogic b/build.config.amlogic new file mode 100644 index 000000000000..e04bd524936e --- /dev/null +++ b/build.config.amlogic @@ -0,0 +1,34 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki.aarch64 + +DEFCONFIG=amlogic_gki_defconfig +FRAGMENT_CONFIG=${KERNEL_DIR}/arch/arm64/configs/amlogic_gki.fragment + +PRE_DEFCONFIG_CMDS="KCONFIG_CONFIG=${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/${DEFCONFIG} ${ROOT_DIR}/${KERNEL_DIR}/scripts/kconfig/merge_config.sh -m -r ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/gki_defconfig ${ROOT_DIR}/${FRAGMENT_CONFIG}" +POST_DEFCONFIG_CMDS="rm ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/${DEFCONFIG}" + +# needed for DT overlay support +DTC_FLAGS="-@" + +MAKE_GOALS="${MAKE_GOALS} +dtbs +" + +FILES="${FILES} +arch/arm64/boot/Image.lz4 +arch/arm64/boot/dts/amlogic/meson-g12a-sei510*.dtb +arch/arm64/boot/dts/amlogic/meson-sm1-sei610*.dtb +arch/arm64/boot/dts/amlogic/meson-sm1-khadas-vim3l*.dtb +arch/arm64/boot/dts/amlogic/meson-g12b-a311d-khadas-vim3*.dtb +" + +# +# NOTE: Using Image.lz4 in MAKE_GOALS does not work because +# kernel build passes legacy option (-l) to lz4 command +# and u-boot fails to decompress. Instead, add custom +# command to lz4 compress same options as kernel, but +# without the -l. +# +EXTRA_CMDS="lz4_compress" +function lz4_compress() { + lz4 -f -12 --favor-decSpeed ${OUT_DIR}/arch/arm64/boot/Image ${OUT_DIR}/arch/arm64/boot/Image.lz4 +} diff --git a/build.config.arm b/build.config.arm new file mode 100644 index 000000000000..c7f114c66937 --- /dev/null +++ b/build.config.arm @@ -0,0 +1,13 @@ +ARCH=arm +MAKE_GOALS=" +zImage +modules +" + +FILES=" +arch/arm/boot/zImage +vmlinux +System.map +" + +NDK_TRIPLE=arm-linux-androideabi31 diff --git a/build.config.common b/build.config.common new file mode 100644 index 000000000000..1fc02b16bd8e --- /dev/null +++ b/build.config.common @@ -0,0 +1,17 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.constants + +KMI_GENERATION=5 + +LLVM=1 +DEPMOD=depmod +CLANG_PREBUILT_BIN=prebuilts/clang/host/linux-x86/clang-${CLANG_VERSION}/bin +BUILDTOOLS_PREBUILT_BIN=build/kernel/build-tools/path/linux-x86 +DTC=${ROOT_DIR}/${BUILDTOOLS_PREBUILT_BIN}/dtc + +KCFLAGS="${KCFLAGS} -D__ANDROID_COMMON_KERNEL__" +EXTRA_CMDS='' +STOP_SHIP_TRACEPRINTK=1 +IN_KERNEL_MODULES=1 +DO_NOT_STRIP_MODULES=1 + +HERMETIC_TOOLCHAIN=${HERMETIC_TOOLCHAIN:-1} diff --git a/build.config.constants b/build.config.constants new file mode 100644 index 000000000000..c763f040d290 --- /dev/null +++ b/build.config.constants @@ -0,0 +1,2 @@ +BRANCH=android14-5.15 +CLANG_VERSION=r475365b \ No newline at end of file diff --git a/build.config.db845c b/build.config.db845c new file mode 100644 index 000000000000..3da01aca6cd5 --- /dev/null +++ b/build.config.db845c @@ -0,0 +1,22 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. 
${ROOT_DIR}/${KERNEL_DIR}/build.config.aarch64 + +BUILD_INITRAMFS=1 +DEFCONFIG=db845c_gki_defconfig +FRAGMENT_CONFIG=${KERNEL_DIR}/arch/arm64/configs/db845c_gki.fragment +PRE_DEFCONFIG_CMDS="KCONFIG_CONFIG=${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/${DEFCONFIG} ${ROOT_DIR}/${KERNEL_DIR}/scripts/kconfig/merge_config.sh -m -r ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/gki_defconfig ${ROOT_DIR}/${FRAGMENT_CONFIG}" +POST_DEFCONFIG_CMDS="rm ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/${DEFCONFIG}" + +MAKE_GOALS=" +modules +qcom/sdm845-db845c.dtb +qcom/qrb5165-rb5.dtb +" + +FILES=" +arch/arm64/boot/dts/qcom/sdm845-db845c.dtb +arch/arm64/boot/dts/qcom/qrb5165-rb5.dtb +" + +KMI_SYMBOL_LIST=android/abi_gki_aarch64_db845c +GKI_MODULES_LIST=android/gki_aarch64_modules diff --git a/build.config.gki b/build.config.gki new file mode 100644 index 000000000000..4b931d9eb333 --- /dev/null +++ b/build.config.gki @@ -0,0 +1,2 @@ +DEFCONFIG=gki_defconfig +POST_DEFCONFIG_CMDS="check_defconfig" diff --git a/build.config.gki-debug.aarch64 b/build.config.gki-debug.aarch64 new file mode 100644 index 000000000000..c1fe2f03a279 --- /dev/null +++ b/build.config.gki-debug.aarch64 @@ -0,0 +1,3 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki.aarch64 +TRIM_NONLISTED_KMI="" +KMI_SYMBOL_LIST_STRICT_MODE="" diff --git a/build.config.gki-debug.x86_64 b/build.config.gki-debug.x86_64 new file mode 100644 index 000000000000..d89b7ad4e804 --- /dev/null +++ b/build.config.gki-debug.x86_64 @@ -0,0 +1,3 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki.x86_64 +TRIM_NONLISTED_KMI="" +KMI_SYMBOL_LIST_STRICT_MODE="" diff --git a/build.config.gki.aarch64 b/build.config.gki.aarch64 new file mode 100644 index 000000000000..b1fa073a529c --- /dev/null +++ b/build.config.gki.aarch64 @@ -0,0 +1,28 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.aarch64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki + +MAKE_GOALS="${MAKE_GOALS} +Image.lz4 +Image.gz +" + +FILES="${FILES} +arch/arm64/boot/Image.lz4 +arch/arm64/boot/Image.gz +" + +BUILD_SYSTEM_DLKM=1 +MODULES_LIST=${ROOT_DIR}/${KERNEL_DIR}/android/gki_system_dlkm_modules +MODULES_ORDER=android/gki_aarch64_modules + +BUILD_GKI_CERTIFICATION_TOOLS=1 + +BUILD_GKI_ARTIFACTS=1 +BUILD_GKI_BOOT_IMG_SIZE=67108864 +BUILD_GKI_BOOT_IMG_GZ_SIZE=47185920 +BUILD_GKI_BOOT_IMG_LZ4_SIZE=53477376 + +if [ -n "${GKI_BUILD_CONFIG_FRAGMENT}" ]; then +source ${GKI_BUILD_CONFIG_FRAGMENT} +fi diff --git a/build.config.gki.aarch64.16k b/build.config.gki.aarch64.16k new file mode 100644 index 000000000000..20be95d54647 --- /dev/null +++ b/build.config.gki.aarch64.16k @@ -0,0 +1,5 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki.aarch64 + +DEFCONFIG=16k_gki_defconfig +PRE_DEFCONFIG_CMDS="mkdir -p \${OUT_DIR}/arch/arm64/configs/ && cat ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/gki_defconfig ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/16k_gki.fragment > \${OUT_DIR}/arch/arm64/configs/${DEFCONFIG};" +POST_DEFCONFIG_CMDS="" diff --git a/build.config.gki.aarch64.fips140 b/build.config.gki.aarch64.fips140 new file mode 100644 index 000000000000..ec493efc20cf --- /dev/null +++ b/build.config.gki.aarch64.fips140 @@ -0,0 +1,27 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.aarch64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki + +FILES=" +crypto/fips140.ko +" + +MAKE_GOALS=" +modules +" + +if [ "${LTO}" = "none" ]; then + echo "The FIPS140 module needs LTO to be enabled." 
+ exit 1 +fi + +MODULES_ORDER=android/gki_aarch64_fips140_modules +KERNEL_DIR=common + +DEFCONFIG=fips140_gki_defconfig +PRE_DEFCONFIG_CMDS="mkdir -p \${OUT_DIR}/arch/arm64/configs/ && KCONFIG_CONFIG=\${OUT_DIR}/arch/arm64/configs/${DEFCONFIG} ${ROOT_DIR}/${KERNEL_DIR}/scripts/kconfig/merge_config.sh -m -r ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/gki_defconfig ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/fips140_gki.fragment" +POST_DEFCONFIG_CMDS="" + +if [ -n "${GKI_BUILD_CONFIG_FRAGMENT}" ]; then +source ${GKI_BUILD_CONFIG_FRAGMENT} +fi diff --git a/build.config.gki.x86_64 b/build.config.gki.x86_64 new file mode 100644 index 000000000000..93f492cabec8 --- /dev/null +++ b/build.config.gki.x86_64 @@ -0,0 +1,15 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.x86_64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki + +BUILD_SYSTEM_DLKM=1 +MODULES_LIST=${ROOT_DIR}/${KERNEL_DIR}/android/gki_system_dlkm_modules + +BUILD_GKI_CERTIFICATION_TOOLS=1 + +BUILD_GKI_ARTIFACTS=1 +BUILD_GKI_BOOT_IMG_SIZE=67108864 + +if [ -n "${GKI_BUILD_CONFIG_FRAGMENT}" ]; then +source ${GKI_BUILD_CONFIG_FRAGMENT} +fi diff --git a/build.config.gki_kasan b/build.config.gki_kasan new file mode 100644 index 000000000000..1aca62c3a13b --- /dev/null +++ b/build.config.gki_kasan @@ -0,0 +1,22 @@ +DEFCONFIG=gki_defconfig +POST_DEFCONFIG_CMDS="check_defconfig && update_kasan_config" +KERNEL_DIR=common +LTO=none + +function update_kasan_config() { + ${KERNEL_DIR}/scripts/config --file ${OUT_DIR}/.config \ + -e CONFIG_KASAN \ + -e CONFIG_KASAN_INLINE \ + -e CONFIG_KCOV \ + -e CONFIG_PANIC_ON_WARN_DEFAULT_ENABLE \ + -d CONFIG_RANDOMIZE_BASE \ + -d CONFIG_KASAN_OUTLINE \ + --set-val CONFIG_FRAME_WARN 0 \ + -d CFI \ + -d CFI_PERMISSIVE \ + -d CFI_CLANG \ + -d SHADOW_CALL_STACK + (cd ${OUT_DIR} && \ + make ${TOOL_ARGS} O=${OUT_DIR} olddefconfig) +} + diff --git a/build.config.gki_kasan.aarch64 b/build.config.gki_kasan.aarch64 new file mode 100644 index 000000000000..9fd2560c45e8 --- /dev/null +++ b/build.config.gki_kasan.aarch64 @@ -0,0 +1,3 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.aarch64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki_kasan diff --git a/build.config.gki_kasan.x86_64 b/build.config.gki_kasan.x86_64 new file mode 100644 index 000000000000..eec645805f39 --- /dev/null +++ b/build.config.gki_kasan.x86_64 @@ -0,0 +1,4 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.x86_64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki_kasan + diff --git a/build.config.gki_kprobes b/build.config.gki_kprobes new file mode 100644 index 000000000000..f1c6cf785766 --- /dev/null +++ b/build.config.gki_kprobes @@ -0,0 +1,20 @@ +DEFCONFIG=gki_defconfig +POST_DEFCONFIG_CMDS="check_defconfig && update_kprobes_config" +function update_kprobes_config() { + ${KERNEL_DIR}/scripts/config --file ${OUT_DIR}/.config \ + -d LTO \ + -d LTO_CLANG_THIN \ + -d CFI \ + -d CFI_PERMISSIVE \ + -d CFI_CLANG \ + -e CONFIG_DYNAMIC_FTRACE \ + -e CONFIG_FUNCTION_TRACER \ + -e CONFIG_IRQSOFF_TRACER \ + -e CONFIG_FUNCTION_PROFILER \ + -e CONFIG_PREEMPT_TRACER \ + -e CONFIG_CHECKPOINT_RESTORE \ + -d CONFIG_RANDOMIZE_BASE + (cd ${OUT_DIR} && \ + make ${TOOL_ARGS} O=${OUT_DIR} olddefconfig) +} + diff --git a/build.config.gki_kprobes.aarch64 b/build.config.gki_kprobes.aarch64 new file mode 100644 index 000000000000..627c2176f971 --- /dev/null +++ b/build.config.gki_kprobes.aarch64 @@ -0,0 +1,4 @@ +. 
${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.aarch64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki_kprobes + diff --git a/build.config.gki_kprobes.x86_64 b/build.config.gki_kprobes.x86_64 new file mode 100644 index 000000000000..a1d3da740b36 --- /dev/null +++ b/build.config.gki_kprobes.x86_64 @@ -0,0 +1,4 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.x86_64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki_kprobes + diff --git a/build.config.khwasan b/build.config.khwasan new file mode 100644 index 000000000000..c8e0c5e24637 --- /dev/null +++ b/build.config.khwasan @@ -0,0 +1,17 @@ +append_cmd POST_DEFCONFIG_CMDS update_khwasan_config + +function update_khwasan_config() { + ${KERNEL_DIR}/scripts/config --file ${OUT_DIR}/.config \ + -e CONFIG_KASAN \ + -d CONFIG_KASAN_HW_TAGS \ + -e CONFIG_KASAN_SW_TAGS \ + -e CONFIG_KASAN_OUTLINE \ + -e CONFIG_KASAN_PANIC_ON_WARN \ + -e CONFIG_KCOV \ + -e CONFIG_PANIC_ON_WARN_DEFAULT_ENABLE \ + -d CONFIG_RANDOMIZE_BASE \ + --set-val CONFIG_FRAME_WARN 0 \ + -d SHADOW_CALL_STACK + (cd ${OUT_DIR} && \ + make O=${OUT_DIR} "${TOOL_ARGS[@]}" ${MAKE_ARGS} olddefconfig) +} diff --git a/build.config.rockpi4 b/build.config.rockpi4 new file mode 100644 index 000000000000..424e568f018c --- /dev/null +++ b/build.config.rockpi4 @@ -0,0 +1,19 @@ +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.aarch64 +TRIM_NONLISTED_KMI="" +KMI_SYMBOL_LIST_STRICT_MODE="" + +BUILD_INITRAMFS=1 +LZ4_RAMDISK=1 +DEFCONFIG=rockpi4_gki_defconfig +FRAGMENT_CONFIG=${KERNEL_DIR}/arch/arm64/configs/rockpi4_gki.fragment +PRE_DEFCONFIG_CMDS="KCONFIG_CONFIG=${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/${DEFCONFIG} ${ROOT_DIR}/${KERNEL_DIR}/scripts/kconfig/merge_config.sh -m -r ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/gki_defconfig ${ROOT_DIR}/${FRAGMENT_CONFIG}" +POST_DEFCONFIG_CMDS="rm ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/${DEFCONFIG}" + +MAKE_GOALS="${MAKE_GOALS} +rockchip/rk3399-rock-pi-4b.dtb +" + +FILES="${FILES} +arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4b.dtb +" diff --git a/build.config.x86_64 b/build.config.x86_64 new file mode 100644 index 000000000000..b5ac82b36bac --- /dev/null +++ b/build.config.x86_64 @@ -0,0 +1,16 @@ +ARCH=x86_64 +MAKE_GOALS=" +bzImage +modules +" + +FILES=" +arch/x86/boot/bzImage +vmlinux +System.map +vmlinux.symvers +modules.builtin +modules.builtin.modinfo +" + +NDK_TRIPLE=x86_64-linux-android31 diff --git a/crypto/Kconfig b/crypto/Kconfig index db260ccfba51..f41ddc91b7b2 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -33,6 +33,31 @@ config CRYPTO_FIPS certification. You should say no unless you know what this is. +config CRYPTO_FIPS140_MOD + tristate "Enable FIPS 140 cryptographic module" + depends on ARM64 && ARM64_MODULE_PLTS + depends on m + help + This option enables building a loadable module fips140.ko, which + contains various crypto algorithms that are also built into vmlinux. + At load time, this module overrides the built-in implementations of + these algorithms with its implementations. It also runs self-tests on + these algorithms and verifies the integrity of its code and data. If + either of these steps fails, the kernel will panic. + + This module is intended to be loaded at early boot time in order to + meet FIPS 140 and NIAP FPT_TST_EXT.1 requirements. It shouldn't be + used if you don't need to meet these requirements. 
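The help text above describes a gating contract: run the self-tests and integrity check at load time, panic on failure, and only then let crypto proceed. The real implementation appears later in this patch (the fips140_tests_done completion in fips140-alg-registration.c); the following is a minimal hedged sketch of that pattern, where every demo_* symbol is a hypothetical stand-in rather than code from the patch:

```c
/*
 * Minimal sketch of the load-time gating described above. All demo_*
 * symbols are hypothetical; the module's real logic lives in
 * fips140-module.c and fips140-alg-registration.c later in this patch.
 */
#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/module.h>

/* Completed only once the self-tests and integrity check have passed. */
static DECLARE_COMPLETION(demo_tests_done);

/* Hypothetical stand-ins for the module's real checks (0 == pass). */
static int demo_run_selftests(void) { return 0; }
static int demo_check_integrity(void) { return 0; }

/* Callers that need crypto block here until the checks are done. */
static void __maybe_unused demo_wait_until_ready(void)
{
	wait_for_completion(&demo_tests_done);
}

static int __init demo_fips_init(void)
{
	/* Per the help text, a failure must panic, not just fail the load. */
	if (demo_run_selftests() || demo_check_integrity())
		panic("FIPS 140 self-tests or integrity check failed");

	complete_all(&demo_tests_done);	/* unblock all waiters */
	return 0;
}
module_init(demo_fips_init);

MODULE_LICENSE("GPL");
```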
+ +config CRYPTO_FIPS140_MOD_EVAL_TESTING + bool "Enable evaluation testing features in FIPS 140 module" + depends on CRYPTO_FIPS140_MOD + help + This option adds some features to the FIPS 140 module which are needed + for lab evaluation testing of the module, e.g. support for injecting + errors and support for a userspace interface to some of the module's + services. This option should not be enabled in production builds. + config CRYPTO_ALGAPI tristate select CRYPTO_ALGAPI2 @@ -453,6 +478,15 @@ config CRYPTO_PCBC PCBC: Propagating Cipher Block Chaining mode This block cipher algorithm is required for RxRPC. +config CRYPTO_XCTR + tristate + select CRYPTO_SKCIPHER + select CRYPTO_MANAGER + help + XCTR: XOR Counter mode. This blockcipher mode is a variant of CTR mode + using XORs and little-endian addition rather than big-endian arithmetic. + XCTR mode is used to implement HCTR2. + config CRYPTO_XTS tristate "XTS support" select CRYPTO_SKCIPHER @@ -516,6 +550,17 @@ config CRYPTO_ADIANTUM If unsure, say N. +config CRYPTO_HCTR2 + tristate "HCTR2 support" + select CRYPTO_XCTR + select CRYPTO_POLYVAL + select CRYPTO_MANAGER + help + HCTR2 is a length-preserving encryption mode for storage encryption that + is efficient on processors with instructions to accelerate AES and + carryless multiplication, e.g. x86 processors with AES-NI and CLMUL, and + ARM processors with the ARMv8 crypto extensions. + config CRYPTO_ESSIV tristate "ESSIV support for block encryption" select CRYPTO_AUTHENC @@ -734,6 +779,23 @@ config CRYPTO_GHASH GHASH is the hash function used in GCM (Galois/Counter Mode). It is not a general-purpose cryptographic hash function. +config CRYPTO_POLYVAL + tristate + select CRYPTO_GF128MUL + select CRYPTO_HASH + help + POLYVAL is the hash function used in HCTR2. It is not a general-purpose + cryptographic hash function. + +config CRYPTO_POLYVAL_CLMUL_NI + tristate "POLYVAL hash function (CLMUL-NI accelerated)" + depends on X86 && 64BIT + select CRYPTO_POLYVAL + help + This is the x86_64 CLMUL-NI accelerated implementation of POLYVAL. It is + used to efficiently implement HCTR2 on x86-64 processors that support + carry-less multiplication instructions. + config CRYPTO_POLY1305 tristate "Poly1305 authenticator algorithm" select CRYPTO_HASH @@ -1094,7 +1156,7 @@ config CRYPTO_AES_NI_INTEL In addition to AES cipher algorithm support, the acceleration for some popular block cipher mode is supported too, including ECB, CBC, LRW, XTS. The 64 bit version has additional - acceleration for CTR. + acceleration for CTR and XCTR. 
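Once CRYPTO_HCTR2 is enabled, HCTR2 is reachable through the normal skcipher API by template name, with the XCTR and POLYVAL pieces above instantiated under the hood. A hedged usage sketch, not part of the patch (demo_try_hctr2 is a hypothetical caller):

```c
/*
 * Hedged usage sketch, not part of the patch: demo_try_hctr2() is a
 * hypothetical caller showing how HCTR2 is reached once enabled.
 */
#include <crypto/skcipher.h>
#include <linux/err.h>

static int demo_try_hctr2(void)
{
	struct crypto_skcipher *tfm;

	/* The template is instantiated on demand from XCTR and POLYVAL. */
	tfm = crypto_alloc_skcipher("hctr2(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* Length-preserving: ciphertext is exactly as long as plaintext. */
	crypto_free_skcipher(tfm);
	return 0;
}
```

Because the mode is length-preserving, it suits storage encryption, where ciphertext must occupy exactly the same space as plaintext.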
config CRYPTO_AES_SPARC64 tristate "AES cipher algorithms (SPARC64)" diff --git a/crypto/Makefile b/crypto/Makefile index 429591ffeb5d..8c09b56133ee 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -92,6 +92,8 @@ obj-$(CONFIG_CRYPTO_CTS) += cts.o obj-$(CONFIG_CRYPTO_LRW) += lrw.o obj-$(CONFIG_CRYPTO_XTS) += xts.o obj-$(CONFIG_CRYPTO_CTR) += ctr.o +obj-$(CONFIG_CRYPTO_XCTR) += xctr.o +obj-$(CONFIG_CRYPTO_HCTR2) += hctr2.o obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o obj-$(CONFIG_CRYPTO_ADIANTUM) += adiantum.o obj-$(CONFIG_CRYPTO_NHPOLY1305) += nhpoly1305.o @@ -165,6 +167,7 @@ UBSAN_SANITIZE_jitterentropy.o = n jitterentropy_rng-y := jitterentropy.o jitterentropy-kcapi.o obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o +obj-$(CONFIG_CRYPTO_POLYVAL) += polyval-generic.o obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o @@ -197,3 +200,55 @@ obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += asymmetric_keys/ obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o crypto_simd-y := simd.o obj-$(CONFIG_CRYPTO_SIMD) += crypto_simd.o + +ifneq ($(CONFIG_CRYPTO_FIPS140_MOD),) + +FIPS140_CFLAGS := -DBUILD_FIPS140_KO -include $(srctree)/crypto/fips140-defs.h + +CFLAGS_jitterentropy-fips.o := -O0 +KASAN_SANITIZE_jitterentropy-fips.o = n +UBSAN_SANITIZE_jitterentropy-fips.o = n + +# Compile an extra copy of various crypto algorithms into the fips140 module. +# +# Note: the module will still work if some files are removed from here. +# However, it may affect FIPS certifiability. Don't remove files from here +# without considering impact on FIPS certifiability. + +crypto-fips-objs := drbg.o ecb.o cbc.o ctr.o cts.o gcm.o xts.o hmac.o cmac.o \ + gf128mul.o aes_generic.o lib-crypto-aes.o \ + jitterentropy.o jitterentropy-kcapi.o \ + sha1_generic.o sha256_generic.o sha512_generic.o \ + lib-memneq.o lib-sha1.o lib-crypto-sha256.o +crypto-fips-objs := $(foreach o,$(crypto-fips-objs),$(o:.o=-fips.o)) + +# get the arch to add its objects to $(crypto-fips-objs) +include $(srctree)/arch/$(ARCH)/crypto/Kbuild.fips140 + +$(obj)/%-fips.o: KBUILD_CFLAGS += $(FIPS140_CFLAGS) +$(obj)/%-fips.o: $(src)/%.c FORCE + $(call if_changed_rule,cc_o_c) +$(obj)/lib-%-fips.o: $(srctree)/lib/%.c FORCE + $(call if_changed_rule,cc_o_c) +$(obj)/lib-crypto-%-fips.o: $(srctree)/lib/crypto/%.c FORCE + $(call if_changed_rule,cc_o_c) + +fips140-objs := \ + fips140-alg-registration.o \ + fips140-module.o \ + fips140-refs.o \ + fips140-selftests.o \ + $(crypto-fips-objs) +fips140-$(CONFIG_CRYPTO_FIPS140_MOD_EVAL_TESTING) += \ + fips140-eval-testing.o +obj-m += fips140.o + +CFLAGS_fips140-alg-registration.o += $(FIPS140_CFLAGS) +CFLAGS_fips140-module.o += $(FIPS140_CFLAGS) +CFLAGS_fips140-selftests.o += $(FIPS140_CFLAGS) +CFLAGS_fips140-eval-testing.o += $(FIPS140_CFLAGS) + +hostprogs-always-y := fips140_gen_hmac +HOSTLDLIBS_fips140_gen_hmac := -lcrypto -lelf + +endif diff --git a/crypto/OWNERS b/crypto/OWNERS new file mode 100644 index 000000000000..4ed35a0f4668 --- /dev/null +++ b/crypto/OWNERS @@ -0,0 +1 @@ +ardb@google.com diff --git a/crypto/algapi.c b/crypto/algapi.c index f3d95af3e428..4277a5009156 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -216,7 +216,87 @@ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, } EXPORT_SYMBOL_GPL(crypto_remove_spawns); -static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) +static void crypto_alg_finish_registration(struct crypto_alg *alg, + 
bool fulfill_requests, + struct list_head *algs_to_put) +{ + struct crypto_alg *q; + + list_for_each_entry(q, &crypto_alg_list, cra_list) { + if (q == alg) + continue; + + if (crypto_is_moribund(q)) + continue; + + if (crypto_is_larval(q)) { + struct crypto_larval *larval = (void *)q; + + /* + * Check to see if either our generic name or + * specific name can satisfy the name requested + * by the larval entry q. + */ + if (strcmp(alg->cra_name, q->cra_name) && + strcmp(alg->cra_driver_name, q->cra_name)) + continue; + + if (larval->adult) + continue; + if ((q->cra_flags ^ alg->cra_flags) & larval->mask) + continue; + + if (fulfill_requests && crypto_mod_get(alg)) + larval->adult = alg; + else + larval->adult = ERR_PTR(-EAGAIN); + + continue; + } + + if (strcmp(alg->cra_name, q->cra_name)) + continue; + + if (strcmp(alg->cra_driver_name, q->cra_driver_name) && + q->cra_priority > alg->cra_priority) + continue; + + crypto_remove_spawns(q, algs_to_put, alg); + } + + crypto_notify(CRYPTO_MSG_ALG_LOADED, alg); +} + +static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) +{ + struct crypto_larval *larval; + + if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) || + IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) || + (alg->cra_flags & CRYPTO_ALG_INTERNAL)) + return NULL; /* No self-test needed */ + + larval = crypto_larval_alloc(alg->cra_name, + alg->cra_flags | CRYPTO_ALG_TESTED, 0); + if (IS_ERR(larval)) + return larval; + + larval->adult = crypto_mod_get(alg); + if (!larval->adult) { + kfree(larval); + return ERR_PTR(-ENOENT); + } + + refcount_set(&larval->alg.cra_refcnt, 1); + memcpy(larval->alg.cra_driver_name, alg->cra_driver_name, + CRYPTO_MAX_ALG_NAME); + larval->alg.cra_priority = alg->cra_priority; + + return larval; +} + +static struct crypto_larval * +__crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put) { struct crypto_alg *q; struct crypto_larval *larval; @@ -227,9 +307,6 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) INIT_LIST_HEAD(&alg->cra_users); - /* No cheating! */ - alg->cra_flags &= ~CRYPTO_ALG_TESTED; - ret = -EEXIST; list_for_each_entry(q, &crypto_alg_list, cra_list) { @@ -250,31 +327,27 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) goto err; } - larval = crypto_larval_alloc(alg->cra_name, - alg->cra_flags | CRYPTO_ALG_TESTED, 0); + larval = crypto_alloc_test_larval(alg); if (IS_ERR(larval)) goto out; - ret = -ENOENT; - larval->adult = crypto_mod_get(alg); - if (!larval->adult) - goto free_larval; - - refcount_set(&larval->alg.cra_refcnt, 1); - memcpy(larval->alg.cra_driver_name, alg->cra_driver_name, - CRYPTO_MAX_ALG_NAME); - larval->alg.cra_priority = alg->cra_priority; - list_add(&alg->cra_list, &crypto_alg_list); - list_add(&larval->alg.cra_list, &crypto_alg_list); crypto_stats_init(alg); + if (larval) { + /* No cheating! */ + alg->cra_flags &= ~CRYPTO_ALG_TESTED; + + list_add(&larval->alg.cra_list, &crypto_alg_list); + } else { + alg->cra_flags |= CRYPTO_ALG_TESTED; + crypto_alg_finish_registration(alg, true, algs_to_put); + } + out: return larval; -free_larval: - kfree(larval); err: larval = ERR_PTR(ret); goto out; @@ -310,7 +383,10 @@ found: alg->cra_flags |= CRYPTO_ALG_TESTED; - /* Only satisfy larval waiters if we are the best. */ + /* + * If a higher-priority implementation of the same algorithm is + * currently being tested, then don't fulfill request larvals. 
+ */ best = true; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (crypto_is_moribund(q) || !crypto_is_larval(q)) @@ -325,47 +401,7 @@ found: } } - list_for_each_entry(q, &crypto_alg_list, cra_list) { - if (q == alg) - continue; - - if (crypto_is_moribund(q)) - continue; - - if (crypto_is_larval(q)) { - struct crypto_larval *larval = (void *)q; - - /* - * Check to see if either our generic name or - * specific name can satisfy the name requested - * by the larval entry q. - */ - if (strcmp(alg->cra_name, q->cra_name) && - strcmp(alg->cra_driver_name, q->cra_name)) - continue; - - if (larval->adult) - continue; - if ((q->cra_flags ^ alg->cra_flags) & larval->mask) - continue; - - if (best && crypto_mod_get(alg)) - larval->adult = alg; - else - larval->adult = ERR_PTR(-EAGAIN); - - continue; - } - - if (strcmp(alg->cra_name, q->cra_name)) - continue; - - if (strcmp(alg->cra_driver_name, q->cra_driver_name) && - q->cra_priority > alg->cra_priority) - continue; - - crypto_remove_spawns(q, &list, alg); - } + crypto_alg_finish_registration(alg, best, &list); complete: complete_all(&test->completion); @@ -389,29 +425,11 @@ void crypto_remove_final(struct list_head *list) } EXPORT_SYMBOL_GPL(crypto_remove_final); -static void crypto_wait_for_test(struct crypto_larval *larval) -{ - int err; - - err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult); - if (err != NOTIFY_STOP) { - if (WARN_ON(err != NOTIFY_DONE)) - goto out; - crypto_alg_tested(larval->alg.cra_driver_name, 0); - } - - err = wait_for_completion_killable(&larval->completion); - WARN_ON(err); - if (!err) - crypto_notify(CRYPTO_MSG_ALG_LOADED, larval); - -out: - crypto_larval_kill(&larval->alg); -} - int crypto_register_alg(struct crypto_alg *alg) { struct crypto_larval *larval; + LIST_HEAD(algs_to_put); + bool test_started = false; int err; alg->cra_flags &= ~CRYPTO_ALG_DEAD; @@ -420,13 +438,18 @@ int crypto_register_alg(struct crypto_alg *alg) return err; down_write(&crypto_alg_sem); - larval = __crypto_register_alg(alg); + larval = __crypto_register_alg(alg, &algs_to_put); + if (!IS_ERR_OR_NULL(larval)) { + test_started = crypto_boot_test_finished(); + larval->test_started = test_started; + } up_write(&crypto_alg_sem); if (IS_ERR(larval)) return PTR_ERR(larval); - - crypto_wait_for_test(larval); + if (test_started) + crypto_wait_for_test(larval); + crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_alg); @@ -602,6 +625,7 @@ int crypto_register_instance(struct crypto_template *tmpl, { struct crypto_larval *larval; struct crypto_spawn *spawn; + LIST_HEAD(algs_to_put); int err; err = crypto_check_alg(&inst->alg); @@ -629,9 +653,11 @@ int crypto_register_instance(struct crypto_template *tmpl, spawn = next; } - larval = __crypto_register_alg(&inst->alg); + larval = __crypto_register_alg(&inst->alg, &algs_to_put); if (IS_ERR(larval)) goto unlock; + else if (larval) + larval->test_started = true; hlist_add_head(&inst->list, &tmpl->instances); inst->tmpl = tmpl; @@ -639,15 +665,12 @@ int crypto_register_instance(struct crypto_template *tmpl, unlock: up_write(&crypto_alg_sem); - err = PTR_ERR(larval); if (IS_ERR(larval)) - goto err; - - crypto_wait_for_test(larval); - err = 0; - -err: - return err; + return PTR_ERR(larval); + if (larval) + crypto_wait_for_test(larval); + crypto_remove_final(&algs_to_put); + return 0; } EXPORT_SYMBOL_GPL(crypto_register_instance); @@ -1261,9 +1284,51 @@ void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, 
EXPORT_SYMBOL_GPL(crypto_stats_skcipher_decrypt); #endif +static void __init crypto_start_tests(void) +{ + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return; + + for (;;) { + struct crypto_larval *larval = NULL; + struct crypto_alg *q; + + down_write(&crypto_alg_sem); + + list_for_each_entry(q, &crypto_alg_list, cra_list) { + struct crypto_larval *l; + + if (!crypto_is_larval(q)) + continue; + + l = (void *)q; + + if (!crypto_is_test_larval(l)) + continue; + + if (l->test_started) + continue; + + l->test_started = true; + larval = l; + break; + } + + up_write(&crypto_alg_sem); + + if (!larval) + break; + + crypto_wait_for_test(larval); + } + + set_crypto_boot_test_finished(); +} + static int __init crypto_algapi_init(void) { crypto_init_proc(); + crypto_start_tests(); return 0; } @@ -1272,7 +1337,11 @@ static void __exit crypto_algapi_exit(void) crypto_exit_proc(); } -module_init(crypto_algapi_init); +/* + * We run this at late_initcall so that all the built-in algorithms + * have had a chance to register themselves first. + */ +late_initcall(crypto_algapi_init); module_exit(crypto_algapi_exit); MODULE_LICENSE("GPL"); diff --git a/crypto/algboss.c b/crypto/algboss.c index 1814d2c5188a..8dbf0f9a36e9 100644 --- a/crypto/algboss.c +++ b/crypto/algboss.c @@ -175,18 +175,10 @@ static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; - int err = 0; - -#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS - goto skiptest; -#endif - - if (type & CRYPTO_ALG_TESTED) - goto skiptest; + int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); -skiptest: crypto_alg_tested(param->driver, err); kfree(param); @@ -197,7 +189,9 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; - u32 type; + + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return NOTIFY_DONE; if (!try_module_get(THIS_MODULE)) goto err; @@ -208,13 +202,7 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg) memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); - type = alg->cra_flags; - - /* Do not test internal algorithms. 
*/ - if (type & CRYPTO_ALG_INTERNAL) - type |= CRYPTO_ALG_TESTED; - - param->type = type; + param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) diff --git a/crypto/api.c b/crypto/api.c index 5ffcd3ab4a75..fa29728a7187 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -30,6 +31,11 @@ EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); +#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); +EXPORT_SYMBOL_GPL(__crypto_boot_test_finished); +#endif + static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); struct crypto_alg *crypto_mod_get(struct crypto_alg *alg) @@ -47,11 +53,6 @@ void crypto_mod_put(struct crypto_alg *alg) } EXPORT_SYMBOL_GPL(crypto_mod_put); -static inline int crypto_is_test_larval(struct crypto_larval *larval) -{ - return larval->alg.cra_driver_name[0]; -} - static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type, u32 mask) { @@ -163,11 +164,49 @@ void crypto_larval_kill(struct crypto_alg *alg) } EXPORT_SYMBOL_GPL(crypto_larval_kill); +void crypto_wait_for_test(struct crypto_larval *larval) +{ + int err; + + err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult); + if (WARN_ON_ONCE(err != NOTIFY_STOP)) + goto out; + + err = wait_for_completion_killable(&larval->completion); + WARN_ON(err); +out: + crypto_larval_kill(&larval->alg); +} +EXPORT_SYMBOL_GPL(crypto_wait_for_test); + +static void crypto_start_test(struct crypto_larval *larval) +{ + if (!crypto_is_test_larval(larval)) + return; + + if (larval->test_started) + return; + + down_write(&crypto_alg_sem); + if (larval->test_started) { + up_write(&crypto_alg_sem); + return; + } + + larval->test_started = true; + up_write(&crypto_alg_sem); + + crypto_wait_for_test(larval); +} + static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; long timeout; + if (!crypto_boot_test_finished()) + crypto_start_test(larval); + timeout = wait_for_completion_killable_timeout( &larval->completion, 60 * HZ); diff --git a/crypto/fips140-alg-registration.c b/crypto/fips140-alg-registration.c new file mode 100644 index 000000000000..03757f88890b --- /dev/null +++ b/crypto/fips140-alg-registration.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Block crypto operations until tests complete + * + * Copyright 2021 Google LLC + * + * This file defines the fips140_crypto_register_*() functions, to which all + * calls to crypto_register_*() in the module are redirected. These functions + * override the tfm initialization function of each algorithm to insert a wait + * for the module having completed its self-tests and integrity check. + * + * The exact field that we override depends on the algorithm type. For + * algorithm types that have a strongly-typed initialization function pointer + * (e.g. skcipher), we must override that, since cra_init isn't guaranteed to be + * called for those despite the field being present in the base struct. For the + * other algorithm types (e.g. "cipher") we must override cra_init. + * + * All of this applies to both normal algorithms and template instances. + * + * The purpose of all of this is to meet a FIPS requirement where the module + * must not produce any output from cryptographic algorithms until it completes + * its tests. 
Technically this is impossible, but this solution meets the + * intent of the requirement, assuming the user makes a supported sequence of + * API calls. Note that we can't simply run the tests before registering the + * algorithms, as the algorithms must be registered in order to run the tests. + * + * It would be much easier to handle this in the kernel's crypto API framework. + * Unfortunately, that was deemed insufficient because the module itself is + * required to do the enforcement. What is *actually* required is still very + * vague, but the approach implemented here should meet the requirement. + */ + +/* + * This file is the one place in fips140.ko that needs to call the kernel's real + * algorithm registration functions, so #undefine all the macros from + * fips140-defs.h so that the "fips140_" prefix doesn't automatically get added. + */ +#undef aead_register_instance +#undef ahash_register_instance +#undef crypto_register_aead +#undef crypto_register_aeads +#undef crypto_register_ahash +#undef crypto_register_ahashes +#undef crypto_register_alg +#undef crypto_register_algs +#undef crypto_register_rng +#undef crypto_register_rngs +#undef crypto_register_shash +#undef crypto_register_shashes +#undef crypto_register_skcipher +#undef crypto_register_skciphers +#undef shash_register_instance +#undef skcipher_register_instance + +#include +#include +#include +#include +#include +#include + +#include "fips140-module.h" + +/* Indicates whether the self-tests and integrity check have completed */ +DECLARE_COMPLETION(fips140_tests_done); + +/* The thread running the self-tests and integrity check */ +struct task_struct *fips140_init_thread; + +/* + * Map from crypto_alg to original initialization function (possibly NULL) + * + * Note: unregistering an algorithm will leak its map entry, as we don't bother + * to remove it. This should be fine since fips140.ko can't be unloaded. The + * proper solution would be to store the original function pointer in a new + * field in 'struct crypto_alg', but that would require kernel support. + */ +static DEFINE_XARRAY(fips140_init_func_map); + +static bool fips140_ready(void) +{ + return completion_done(&fips140_tests_done); +} + +/* + * Wait until crypto operations are allowed to proceed. Return true if the + * tests are done, or false if the caller is the thread running the tests so it + * is allowed to proceed anyway. + */ +static bool fips140_wait_until_ready(struct crypto_alg *alg) +{ + if (fips140_ready()) + return true; + /* + * The thread running the tests must not wait. Since tfms can only be + * allocated in task context, we can reliably determine whether the + * invocation is from that thread or not by checking 'current'. + */ + if (current == fips140_init_thread) + return false; + + pr_info("blocking user of %s until tests complete\n", + alg->cra_driver_name); + wait_for_completion(&fips140_tests_done); + pr_info("tests done, allowing %s to proceed\n", alg->cra_driver_name); + return true; +} + +static int fips140_store_init_function(struct crypto_alg *alg, void *func) +{ + void *ret; + + /* + * The XArray API requires 4-byte aligned values. Although function + * pointers in general aren't guaranteed to be 4-byte aligned, it should + * be the case for the platforms this module is used on. 
+ */ + if (WARN_ON((unsigned long)func & 3)) + return -EINVAL; + + ret = xa_store(&fips140_init_func_map, (unsigned long)alg, func, + GFP_KERNEL); + return xa_err(ret); +} + +/* Get the algorithm's original initialization function (possibly NULL) */ +static void *fips140_load_init_function(struct crypto_alg *alg) +{ + return xa_load(&fips140_init_func_map, (unsigned long)alg); +} + +/* tfm initialization function overrides */ + +static int fips140_alg_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_alg *alg = tfm->__crt_alg; + int (*cra_init)(struct crypto_tfm *tfm) = + fips140_load_init_function(alg); + + if (fips140_wait_until_ready(alg)) + WRITE_ONCE(alg->cra_init, cra_init); + return cra_init ? cra_init(tfm) : 0; +} + +static int fips140_aead_init_tfm(struct crypto_aead *tfm) +{ + struct aead_alg *alg = crypto_aead_alg(tfm); + int (*init)(struct crypto_aead *tfm) = + fips140_load_init_function(&alg->base); + + if (fips140_wait_until_ready(&alg->base)) + WRITE_ONCE(alg->init, init); + return init ? init(tfm) : 0; +} + +static int fips140_ahash_init_tfm(struct crypto_ahash *tfm) +{ + struct hash_alg_common *halg = crypto_hash_alg_common(tfm); + struct ahash_alg *alg = container_of(halg, struct ahash_alg, halg); + int (*init_tfm)(struct crypto_ahash *tfm) = + fips140_load_init_function(&halg->base); + + if (fips140_wait_until_ready(&halg->base)) + WRITE_ONCE(alg->init_tfm, init_tfm); + return init_tfm ? init_tfm(tfm) : 0; +} + +static int fips140_shash_init_tfm(struct crypto_shash *tfm) +{ + struct shash_alg *alg = crypto_shash_alg(tfm); + int (*init_tfm)(struct crypto_shash *tfm) = + fips140_load_init_function(&alg->base); + + if (fips140_wait_until_ready(&alg->base)) + WRITE_ONCE(alg->init_tfm, init_tfm); + return init_tfm ? init_tfm(tfm) : 0; +} + +static int fips140_skcipher_init_tfm(struct crypto_skcipher *tfm) +{ + struct skcipher_alg *alg = crypto_skcipher_alg(tfm); + int (*init)(struct crypto_skcipher *tfm) = + fips140_load_init_function(&alg->base); + + if (fips140_wait_until_ready(&alg->base)) + WRITE_ONCE(alg->init, init); + return init ? init(tfm) : 0; +} + +/* Single algorithm registration */ + +#define prepare_alg(alg, base_alg, field, wrapper_func) \ +({ \ + int err = 0; \ + \ + if (!fips140_ready() && alg->field != wrapper_func) { \ + err = fips140_store_init_function(base_alg, alg->field);\ + if (err == 0) \ + alg->field = wrapper_func; \ + } \ + err; \ +}) + +static int fips140_prepare_alg(struct crypto_alg *alg) +{ + /* + * Override cra_init. This is only for algorithm types like cipher and + * rng that don't have a strongly-typed initialization function. + */ + return prepare_alg(alg, alg, cra_init, fips140_alg_init_tfm); +} + +static int fips140_prepare_aead_alg(struct aead_alg *alg) +{ + return prepare_alg(alg, &alg->base, init, fips140_aead_init_tfm); +} + +static int fips140_prepare_ahash_alg(struct ahash_alg *alg) +{ + return prepare_alg(alg, &alg->halg.base, init_tfm, + fips140_ahash_init_tfm); +} + +static int fips140_prepare_rng_alg(struct rng_alg *alg) +{ + /* + * rng doesn't have a strongly-typed initialization function, so we must + * treat rng algorithms as "generic" algorithms. 
+ */ + return fips140_prepare_alg(&alg->base); +} + +static int fips140_prepare_shash_alg(struct shash_alg *alg) +{ + return prepare_alg(alg, &alg->base, init_tfm, fips140_shash_init_tfm); +} + +static int fips140_prepare_skcipher_alg(struct skcipher_alg *alg) +{ + return prepare_alg(alg, &alg->base, init, fips140_skcipher_init_tfm); +} + +int fips140_crypto_register_alg(struct crypto_alg *alg) +{ + return fips140_prepare_alg(alg) ?: crypto_register_alg(alg); +} + +int fips140_crypto_register_aead(struct aead_alg *alg) +{ + return fips140_prepare_aead_alg(alg) ?: crypto_register_aead(alg); +} + +int fips140_crypto_register_ahash(struct ahash_alg *alg) +{ + return fips140_prepare_ahash_alg(alg) ?: crypto_register_ahash(alg); +} + +int fips140_crypto_register_rng(struct rng_alg *alg) +{ + return fips140_prepare_rng_alg(alg) ?: crypto_register_rng(alg); +} + +int fips140_crypto_register_shash(struct shash_alg *alg) +{ + return fips140_prepare_shash_alg(alg) ?: crypto_register_shash(alg); +} + +int fips140_crypto_register_skcipher(struct skcipher_alg *alg) +{ + return fips140_prepare_skcipher_alg(alg) ?: + crypto_register_skcipher(alg); +} + +/* Instance registration */ + +int fips140_aead_register_instance(struct crypto_template *tmpl, + struct aead_instance *inst) +{ + return fips140_prepare_aead_alg(&inst->alg) ?: + aead_register_instance(tmpl, inst); +} + +int fips140_ahash_register_instance(struct crypto_template *tmpl, + struct ahash_instance *inst) +{ + return fips140_prepare_ahash_alg(&inst->alg) ?: + ahash_register_instance(tmpl, inst); +} + +int fips140_shash_register_instance(struct crypto_template *tmpl, + struct shash_instance *inst) +{ + return fips140_prepare_shash_alg(&inst->alg) ?: + shash_register_instance(tmpl, inst); +} + +int fips140_skcipher_register_instance(struct crypto_template *tmpl, + struct skcipher_instance *inst) +{ + return fips140_prepare_skcipher_alg(&inst->alg) ?: + skcipher_register_instance(tmpl, inst); +} + +/* Bulk algorithm registration */ + +int fips140_crypto_register_algs(struct crypto_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_algs(algs, count); +} + +int fips140_crypto_register_aeads(struct aead_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_aead_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_aeads(algs, count); +} + +int fips140_crypto_register_ahashes(struct ahash_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_ahash_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_ahashes(algs, count); +} + +int fips140_crypto_register_rngs(struct rng_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_rng_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_rngs(algs, count); +} + +int fips140_crypto_register_shashes(struct shash_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_shash_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_shashes(algs, count); +} + +int fips140_crypto_register_skciphers(struct skcipher_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_skcipher_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_skciphers(algs, count); +} diff --git 
a/crypto/fips140-defs.h b/crypto/fips140-defs.h new file mode 100644 index 000000000000..9005f9513308 --- /dev/null +++ b/crypto/fips140-defs.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2021 Google LLC + * + * This file is automatically included by all files built into fips140.ko, via + * the "-include" compiler flag. + */ + +/* + * fips140.ko is built from various unmodified or minimally modified kernel + * source files, many of which are normally meant to be buildable into different + * modules themselves. That results in conflicting instances of module_init() + * and related macros such as MODULE_LICENSE(). + * + * To solve that, we undefine MODULE to trick the kernel headers into thinking + * the code is being compiled as built-in. That causes module_init() and + * related macros to be expanded as they would be for built-in code; e.g., + * module_init() adds the function to the .initcalls section of the binary. + * + * The .c file that contains the real module_init() for fips140.ko is then + * responsible for redefining MODULE, and the real module_init() is responsible + * for executing all the initcalls that were collected into .initcalls. + */ +#undef MODULE + +/* + * Defining KBUILD_MODFILE is also required, since the kernel headers expect it + * to be defined when code that can be a module is compiled as built-in. + */ +#define KBUILD_MODFILE "crypto/fips140" + +/* + * Disable symbol exports by default. fips140.ko includes various files that + * use EXPORT_SYMBOL*(), but it's unwanted to export any symbols from fips140.ko + * except where explicitly needed for FIPS certification reasons. + */ +#define __DISABLE_EXPORTS + +/* + * Redirect all calls to algorithm registration functions to the wrapper + * functions defined within the module. + */ +#define aead_register_instance fips140_aead_register_instance +#define ahash_register_instance fips140_ahash_register_instance +#define crypto_register_aead fips140_crypto_register_aead +#define crypto_register_aeads fips140_crypto_register_aeads +#define crypto_register_ahash fips140_crypto_register_ahash +#define crypto_register_ahashes fips140_crypto_register_ahashes +#define crypto_register_alg fips140_crypto_register_alg +#define crypto_register_algs fips140_crypto_register_algs +#define crypto_register_rng fips140_crypto_register_rng +#define crypto_register_rngs fips140_crypto_register_rngs +#define crypto_register_shash fips140_crypto_register_shash +#define crypto_register_shashes fips140_crypto_register_shashes +#define crypto_register_skcipher fips140_crypto_register_skcipher +#define crypto_register_skciphers fips140_crypto_register_skciphers +#define shash_register_instance fips140_shash_register_instance +#define skcipher_register_instance fips140_skcipher_register_instance diff --git a/crypto/fips140-eval-testing-uapi.h b/crypto/fips140-eval-testing-uapi.h new file mode 100644 index 000000000000..04e6cf633594 --- /dev/null +++ b/crypto/fips140-eval-testing-uapi.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _CRYPTO_FIPS140_EVAL_TESTING_H +#define _CRYPTO_FIPS140_EVAL_TESTING_H + +#include + +/* + * This header defines the ioctls that are available on the fips140 character + * device. These ioctls expose some of the module's services to userspace so + * that they can be tested by the FIPS certification lab; this is a required + * part of getting a FIPS 140 certification. 
These ioctls do not have any other + * purpose, and they do not need to be present in production builds. + */ + +/* + * Call the fips140_is_approved_service() function. The argument must be the + * service name as a NUL-terminated string. The return value will be 1 if + * fips140_is_approved_service() returned true, or 0 if it returned false. + */ +#define FIPS140_IOCTL_IS_APPROVED_SERVICE _IO('F', 0) + +/* + * Call the fips140_module_version() function. The argument must be a pointer + * to a buffer of size >= 256 chars. The NUL-terminated string returned by + * fips140_module_version() will be written to this buffer. + */ +#define FIPS140_IOCTL_MODULE_VERSION _IOR('F', 1, char[256]) + +#endif /* _CRYPTO_FIPS140_EVAL_TESTING_H */ diff --git a/crypto/fips140-eval-testing.c b/crypto/fips140-eval-testing.c new file mode 100644 index 000000000000..ea3cd653983a --- /dev/null +++ b/crypto/fips140-eval-testing.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Google LLC + * + * This file can optionally be built into fips140.ko in order to support certain + * types of testing that the FIPS lab has to do to evaluate the module. It + * should not be included in production builds of the module. + */ + +/* + * We have to redefine inline to mean always_inline, so that _copy_to_user() + * gets inlined. This is needed for it to be placed into the correct section. + * See fips140_copy_to_user(). + * + * We also need to undefine BUILD_FIPS140_KO to allow the use of the code + * patching which copy_to_user() requires. + */ +#undef inline +#define inline inline __attribute__((__always_inline__)) __gnu_inline \ + __inline_maybe_unused notrace +#undef BUILD_FIPS140_KO + +#include +#include +#include +#include + +#include "fips140-module.h" +#include "fips140-eval-testing-uapi.h" + +/* + * This option allows deliberately failing the self-tests for a particular + * algorithm. + */ +static char *fips140_fail_selftest; +module_param_named(fail_selftest, fips140_fail_selftest, charp, 0); + +/* This option allows deliberately failing the integrity check. */ +static bool fips140_fail_integrity_check; +module_param_named(fail_integrity_check, fips140_fail_integrity_check, bool, 0); + +static dev_t fips140_devnum; +static struct cdev fips140_cdev; + +/* Inject a self-test failure (via corrupting the result) if requested. */ +void fips140_inject_selftest_failure(const char *impl, u8 *result) +{ + if (fips140_fail_selftest && strcmp(impl, fips140_fail_selftest) == 0) + result[0] ^= 0xff; +} + +/* Inject an integrity check failure (via corrupting the text) if requested. */ +void fips140_inject_integrity_failure(u8 *textcopy) +{ + if (fips140_fail_integrity_check) + textcopy[0] ^= 0xff; +} + +static long fips140_ioctl_is_approved_service(unsigned long arg) +{ + const char *service_name = strndup_user((const char __user *)arg, 256); + long ret; + + if (IS_ERR(service_name)) + return PTR_ERR(service_name); + + ret = fips140_is_approved_service(service_name); + + kfree(service_name); + return ret; +} + +/* + * Code in fips140.ko is covered by an integrity check by default, and this + * check breaks if copy_to_user() is called. This is because copy_to_user() is + * an inline function that relies on code patching. However, since this is + * "evaluation testing" code which isn't included in the production builds of + * fips140.ko, it's acceptable to just exclude it from the integrity check. 
+ */ +static noinline unsigned long __section("text.._fips140_unchecked") +fips140_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n); +} + +static long fips140_ioctl_module_version(unsigned long arg) +{ + const char *version = fips140_module_version(); + size_t len = strlen(version) + 1; + + if (len > 256) + return -EOVERFLOW; + + if (fips140_copy_to_user((void __user *)arg, version, len)) + return -EFAULT; + + return 0; +} + +static long fips140_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case FIPS140_IOCTL_IS_APPROVED_SERVICE: + return fips140_ioctl_is_approved_service(arg); + case FIPS140_IOCTL_MODULE_VERSION: + return fips140_ioctl_module_version(arg); + default: + return -ENOTTY; + } +} + +static const struct file_operations fips140_fops = { + .unlocked_ioctl = fips140_ioctl, +}; + +bool fips140_eval_testing_init(void) +{ + if (alloc_chrdev_region(&fips140_devnum, 1, 1, "fips140") != 0) { + pr_err("failed to allocate device number\n"); + return false; + } + cdev_init(&fips140_cdev, &fips140_fops); + if (cdev_add(&fips140_cdev, fips140_devnum, 1) != 0) { + pr_err("failed to add fips140 character device\n"); + return false; + } + return true; +} diff --git a/crypto/fips140-generated-testvecs.h b/crypto/fips140-generated-testvecs.h new file mode 100644 index 000000000000..d4ccd77eb97f --- /dev/null +++ b/crypto/fips140-generated-testvecs.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright 2021 Google LLC */ + +/* + * This header was automatically generated by gen_fips140_testvecs.py. + * Don't edit it directly. + */ + +static const u8 fips_message[32] __initconst = + "This is a 32-byte test message."; + +static const u8 fips_aes_key[16] __initconst = "128-bit AES key"; + +static const u8 fips_aes_iv[16] __initconst = "ABCDEFGHIJKLMNOP"; + +static const u8 fips_aes_cbc_ciphertext[32] __initconst = + "\x4c\x3e\xeb\x38\x8d\x1f\x28\xfd\xa2\x3b\xa9\xda\x36\xf2\x99\xe2" + "\x84\x84\x66\x37\x0a\x53\x68\x2f\x17\x95\x8d\x7f\xca\x5a\x68\x4e"; + +static const u8 fips_aes_ecb_ciphertext[32] __initconst = + "\xc1\x9d\xe6\xb8\xb2\x90\xff\xfe\xf2\x77\x18\xb0\x55\xd3\xee\xa9" + "\xe2\x6f\x4a\x32\x67\xfd\xb7\xa5\x2f\x4b\x6e\x1a\x86\x2b\x6e\x3a"; + +static const u8 fips_aes_ctr_ciphertext[32] __initconst = + "\xed\x06\x2c\xd0\xbc\x48\xd1\x2e\x6a\x4e\x13\xe9\xaa\x17\x40\xca" + "\x00\xb4\xaf\x3b\x4f\xee\x73\xd6\x6c\x41\xf6\x4c\x8b\x0d\x6a\x0f"; + +static const u8 fips_aes_gcm_assoc[22] __initconst = "associated data string"; + +static const u8 fips_aes_gcm_ciphertext[48] __initconst = + "\x37\x88\x3e\x1d\x58\x50\xda\x10\x07\xeb\x52\xdf\xea\x0a\x54\xd4" + "\x44\xbf\x88\x2a\xf3\x03\x03\x84\xaf\x8b\x96\xbd\xea\x65\x60\x6f" + "\x82\xfa\x51\xf4\x28\xad\x0c\xf1\xce\x0f\x91\xdd\x1a\x4c\x77\x5f"; + +static const u8 fips_aes_xts_key[32] __initconst = + "This is an AES-128-XTS key."; + +static const u8 fips_aes_xts_ciphertext[32] __initconst = + "\x4f\xf7\x9f\x6c\x00\xa8\x30\xdf\xff\xf3\x25\x9c\xf6\x0b\x1b\xfd" + "\x3b\x34\x5e\x67\x7c\xf8\x8b\x68\x9a\xb9\x5a\x89\x51\x51\xbd\x35"; + +static const u8 fips_aes_cmac_digest[16] __initconst = + "\x0c\x05\xda\x64\x51\x0c\x8e\x6c\x86\x52\x46\xa8\x2d\xb1\xfe\x0f"; + +static const u8 fips_hmac_key[16] __initconst = "128-bit HMAC key"; + +static const u8 fips_sha1_digest[20] __initconst = + "\x1b\x78\xc7\x4b\xd5\xd4\x83\xb1\x58\xc5\x96\x83\x4f\x16\x8d\x15" + "\xb4\xaa\x22\x8c"; + +static const u8 fips_sha256_digest[32] __initconst = + 
"\x4e\x11\x83\x0c\x53\x80\x1e\x5f\x9b\x38\x33\x38\xe8\x74\x43\xb0" + "\xc1\x3a\xbe\xbf\x75\xf0\x12\x0f\x21\x33\xf5\x16\x33\xf1\xb0\x81"; + +static const u8 fips_hmac_sha256_digest[32] __initconst = + "\x63\x0e\xb5\x73\x79\xfc\xaf\x5f\x86\xe3\xaf\xf0\xc8\x36\xef\xd5" + "\x35\x8d\x40\x25\x38\xb3\x65\x72\x98\xf3\x59\xd8\x1e\x54\x4c\xa1"; + +static const u8 fips_sha512_digest[64] __initconst = + "\x32\xe0\x44\x23\xbd\xe3\xec\x28\xbf\xf1\x34\x11\xd5\xae\xbf\xd5" + "\xc0\x8e\xb5\xa1\x04\xef\x2f\x07\x84\xf1\xd9\x83\x0f\x6c\x31\xab" + "\xf7\xe7\x57\xfa\xf7\xae\xf0\x6f\xb2\x16\x08\x32\xcf\xc7\xef\x35" + "\xb3\x3b\x51\xb9\xfd\xe7\xff\x5e\xb2\x8b\xc6\x79\xe6\x14\x04\xb4"; + +/* + * This header was automatically generated by gen_fips140_testvecs.py. + * Don't edit it directly. + */ diff --git a/crypto/fips140-module.c b/crypto/fips140-module.c new file mode 100644 index 000000000000..6412ad6c1234 --- /dev/null +++ b/crypto/fips140-module.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Google LLC + * Author: Ard Biesheuvel + * + * This file is the core of fips140.ko, which contains various crypto algorithms + * that are also built into vmlinux. At load time, this module overrides the + * built-in implementations of these algorithms with its implementations. It + * also runs self-tests on these algorithms and verifies the integrity of its + * code and data. If either of these steps fails, the kernel will panic. + * + * This module is intended to be loaded at early boot time in order to meet + * FIPS 140 and NIAP FPT_TST_EXT.1 requirements. It shouldn't be used if you + * don't need to meet these requirements. + */ + +/* + * Since this .c file is the real entry point of fips140.ko, it needs to be + * compiled normally, so undo the hacks that were done in fips140-defs.h. + */ +#define MODULE +#undef KBUILD_MODFILE +#undef __DISABLE_EXPORTS + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fips140-module.h" +#include "internal.h" + +/* + * FIPS 140-2 prefers the use of HMAC with a public key over a plain hash. + */ +u8 __initdata fips140_integ_hmac_key[] = "The quick brown fox jumps over the lazy dog"; + +/* this is populated by the build tool */ +u8 __initdata fips140_integ_hmac_digest[SHA256_DIGEST_SIZE]; + +const u32 __initcall_start_marker __section(".initcalls._start"); +const u32 __initcall_end_marker __section(".initcalls._end"); + +const u8 __fips140_text_start __section(".text.._start"); +const u8 __fips140_text_end __section(".text.._end"); + +const u8 __fips140_rodata_start __section(".rodata.._start"); +const u8 __fips140_rodata_end __section(".rodata.._end"); + +/* + * We need this little detour to prevent Clang from detecting out of bounds + * accesses to __fips140_text_start and __fips140_rodata_start, which only exist + * to delineate the section, and so their sizes are not relevant to us. + */ +const u32 *__initcall_start = &__initcall_start_marker; + +const u8 *__text_start = &__fips140_text_start; +const u8 *__rodata_start = &__fips140_rodata_start; + +/* + * The list of the crypto API algorithms (by cra_name) that will be unregistered + * by this module, in preparation for the module registering its own + * implementation(s) of them. + * + * All algorithms that will be declared as FIPS-approved in the module + * certification must be listed here, to ensure that the non-FIPS-approved + * implementations of these algorithms in the kernel image aren't used. 
+ * + * For every algorithm in this list, the module should contain all the "same" + * implementations that the kernel image does, including the C implementation as + * well as any architecture-specific implementations. This is needed to avoid + * performance regressions as well as the possibility of an algorithm being + * unavailable on some CPUs. E.g., "xcbc(aes)" isn't in this list, as the + * module doesn't have a C implementation of it (and it won't be FIPS-approved). + * + * Due to a quirk in the FIPS requirements, "gcm(aes)" isn't actually able to be + * FIPS-approved. However, we otherwise treat it the same as the algorithms + * that will be FIPS-approved, and therefore it's included in this list. + * + * When adding a new algorithm here, make sure to consider whether it needs a + * self-test added to fips140_selftests[] as well. + */ +static const struct { + const char *name; + bool approved; +} fips140_algs_to_replace[] = { + {"aes", true}, + + {"cmac(aes)", true}, + {"ecb(aes)", true}, + + {"cbc(aes)", true}, + {"cts(cbc(aes))", true}, + {"ctr(aes)", true}, + {"xts(aes)", true}, + {"gcm(aes)", false}, + + {"hmac(sha1)", true}, + {"hmac(sha224)", true}, + {"hmac(sha256)", true}, + {"hmac(sha384)", true}, + {"hmac(sha512)", true}, + {"sha1", true}, + {"sha224", true}, + {"sha256", true}, + {"sha384", true}, + {"sha512", true}, + + {"stdrng", true}, + {"jitterentropy_rng", false}, +}; + +static bool __init fips140_should_unregister_alg(struct crypto_alg *alg) +{ + int i; + + /* + * All software algorithms are synchronous, hardware algorithms must + * be covered by their own FIPS 140 certification. + */ + if (alg->cra_flags & CRYPTO_ALG_ASYNC) + return false; + + for (i = 0; i < ARRAY_SIZE(fips140_algs_to_replace); i++) { + if (!strcmp(alg->cra_name, fips140_algs_to_replace[i].name)) + return true; + } + return false; +} + +/* + * FIPS 140-3 service indicators. FIPS 140-3 requires that all services + * "provide an indicator when the service utilises an approved cryptographic + * algorithm, security function or process in an approved manner". What this + * means is very debatable, even with the help of the FIPS 140-3 Implementation + * Guidance document. However, it was decided that a function that takes in an + * algorithm name and returns whether that algorithm is approved or not will + * meet this requirement. Note, this relies on some properties of the module: + * + * - The module doesn't distinguish between "services" and "algorithms"; its + * services are simply its algorithms. + * + * - The status of an approved algorithm is never non-approved, since (a) the + * module doesn't support operating in a non-approved mode, such as a mode + * where the self-tests are skipped; (b) there are no cases where the module + * supports non-approved settings for approved algorithms, e.g. + * non-approved key sizes; and (c) this function isn't available to be + * called until the module_init function has completed, so it's guaranteed + * that the self-tests and integrity check have already passed. + * + * - The module does support some non-approved algorithms, so a single static + * indicator ("return true;") would not be acceptable. 
+ */ +bool fips140_is_approved_service(const char *name) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(fips140_algs_to_replace); i++) { + if (!strcmp(name, fips140_algs_to_replace[i].name)) + return fips140_algs_to_replace[i].approved; + } + return false; +} +EXPORT_SYMBOL_GPL(fips140_is_approved_service); + +/* + * FIPS 140-3 requires that modules provide a "service" that outputs "the name + * or module identifier and the versioning information that can be correlated + * with a validation record". This function meets that requirement. + * + * Note: the module also prints this same information to the kernel log when it + * is loaded. That might meet the requirement by itself. However, given the + * vagueness of what counts as a "service", we provide this function too, just + * in case the certification lab or CMVP is happier with an explicit function. + * + * Note: /sys/modules/fips140/scmversion also provides versioning information + * about the module. However that file just shows the bare git commit ID, so it + * probably isn't sufficient to meet the FIPS requirement, which seems to want + * the "official" module name and version number used in the FIPS certificate. + */ +const char *fips140_module_version(void) +{ + return FIPS140_MODULE_NAME " " FIPS140_MODULE_VERSION; +} +EXPORT_SYMBOL_GPL(fips140_module_version); + +static LIST_HEAD(existing_live_algos); + +/* + * Release a list of algorithms which have been removed from crypto_alg_list. + * + * Note that even though the list is a private list, we have to hold + * crypto_alg_sem while iterating through it because crypto_unregister_alg() may + * run concurrently (as we haven't taken a reference to the algorithms on the + * list), and crypto_unregister_alg() will remove the algorithm from whichever + * list it happens to be on, while holding crypto_alg_sem. That's okay, since + * in that case crypto_unregister_alg() will handle the crypto_alg_put(). + */ +static void fips140_remove_final(struct list_head *list) +{ + struct crypto_alg *alg; + struct crypto_alg *n; + + /* + * We need to take crypto_alg_sem to safely traverse the list (see + * comment above), but we have to drop it when doing each + * crypto_alg_put() as that may take crypto_alg_sem again. + */ + down_write(&crypto_alg_sem); + list_for_each_entry_safe(alg, n, list, cra_list) { + list_del_init(&alg->cra_list); + up_write(&crypto_alg_sem); + + crypto_alg_put(alg); + + down_write(&crypto_alg_sem); + } + up_write(&crypto_alg_sem); +} + +static void __init unregister_existing_fips140_algos(void) +{ + struct crypto_alg *alg, *tmp; + LIST_HEAD(remove_list); + LIST_HEAD(spawns); + + down_write(&crypto_alg_sem); + + /* + * Find all registered algorithms that we care about, and move them to a + * private list so that they are no longer exposed via the algo lookup + * API. Subsequently, we will unregister them if they are not in active + * use. If they are, we can't fully unregister them but we can ensure + * that new users won't use them. + */ + list_for_each_entry_safe(alg, tmp, &crypto_alg_list, cra_list) { + if (!fips140_should_unregister_alg(alg)) + continue; + if (refcount_read(&alg->cra_refcnt) == 1) { + /* + * This algorithm is not currently in use, but there may + * be template instances holding references to it via + * spawns. So let's tear it down like + * crypto_unregister_alg() would, but without releasing + * the lock, to prevent races with concurrent TFM + * allocations. 
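+			 *
+			 * (Setting CRYPTO_ALG_DEAD marks the algorithm as
+			 * dying for any code that still sees it, while the
+			 * list_move() below is what hides it from new
+			 * lookups, which walk crypto_alg_list.)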
+ */ + alg->cra_flags |= CRYPTO_ALG_DEAD; + list_move(&alg->cra_list, &remove_list); + crypto_remove_spawns(alg, &spawns, NULL); + } else { + /* + * This algorithm is live, i.e. it has TFMs allocated, + * so we can't fully unregister it. It's not necessary + * to dynamically redirect existing users to the FIPS + * code, given that they can't be relying on FIPS + * certified crypto in the first place. However, we do + * need to ensure that new users will get the FIPS code. + * + * In most cases, setting alg->cra_priority to 0 + * achieves this. However, that isn't enough for + * algorithms like "hmac(sha256)" that need to be + * instantiated from a template, since existing + * algorithms always take priority over a template being + * instantiated. Therefore, we move the algorithm to + * a private list so that algorithm lookups won't find + * it anymore. To further distinguish it from the FIPS + * algorithms, we also append "+orig" to its name. + */ + pr_info("found already-live algorithm '%s' ('%s')\n", + alg->cra_name, alg->cra_driver_name); + alg->cra_priority = 0; + strlcat(alg->cra_name, "+orig", CRYPTO_MAX_ALG_NAME); + strlcat(alg->cra_driver_name, "+orig", + CRYPTO_MAX_ALG_NAME); + list_move(&alg->cra_list, &existing_live_algos); + } + } + up_write(&crypto_alg_sem); + + fips140_remove_final(&remove_list); + fips140_remove_final(&spawns); +} + +static void __init unapply_text_relocations(void *section, int section_size, + const Elf64_Rela *rela, int numrels) +{ + while (numrels--) { + u32 *place = (u32 *)(section + rela->r_offset); + + BUG_ON(rela->r_offset >= section_size); + + switch (ELF64_R_TYPE(rela->r_info)) { +#ifdef CONFIG_ARM64 + case R_AARCH64_ABS32: /* for KCFI */ + *place = 0; + break; + + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + *place &= ~GENMASK(25, 0); + break; + + case R_AARCH64_ADR_PREL_LO21: + case R_AARCH64_ADR_PREL_PG_HI21: + case R_AARCH64_ADR_PREL_PG_HI21_NC: + *place &= ~(GENMASK(30, 29) | GENMASK(23, 5)); + break; + + case R_AARCH64_ADD_ABS_LO12_NC: + case R_AARCH64_LDST8_ABS_LO12_NC: + case R_AARCH64_LDST16_ABS_LO12_NC: + case R_AARCH64_LDST32_ABS_LO12_NC: + case R_AARCH64_LDST64_ABS_LO12_NC: + case R_AARCH64_LDST128_ABS_LO12_NC: + *place &= ~GENMASK(21, 10); + break; + default: + pr_err("unhandled relocation type %llu\n", + ELF64_R_TYPE(rela->r_info)); + BUG(); +#else +#error +#endif + } + rela++; + } +} + +static void __init unapply_rodata_relocations(void *section, int section_size, + const Elf64_Rela *rela, int numrels) +{ + while (numrels--) { + void *place = section + rela->r_offset; + + BUG_ON(rela->r_offset >= section_size); + + switch (ELF64_R_TYPE(rela->r_info)) { +#ifdef CONFIG_ARM64 + case R_AARCH64_ABS64: + *(u64 *)place = 0; + break; + default: + pr_err("unhandled relocation type %llu\n", + ELF64_R_TYPE(rela->r_info)); + BUG(); +#else +#error +#endif + } + rela++; + } +} + +extern struct { + u32 offset; + u32 count; +} fips140_rela_text, fips140_rela_rodata; + +static bool __init check_fips140_module_hmac(void) +{ + struct crypto_shash *tfm = NULL; + SHASH_DESC_ON_STACK(desc, dontcare); + u8 digest[SHA256_DIGEST_SIZE]; + void *textcopy, *rodatacopy; + int textsize, rodatasize; + bool ok = false; + int err; + + textsize = &__fips140_text_end - &__fips140_text_start; + rodatasize = &__fips140_rodata_end - &__fips140_rodata_start; + + pr_info("text size : 0x%x\n", textsize); + pr_info("rodata size: 0x%x\n", rodatasize); + + textcopy = kmalloc(textsize + rodatasize, GFP_KERNEL); + if (!textcopy) { + pr_err("Failed to allocate memory for copy 
of .text\n");
+		goto out;
+	}
+
+	rodatacopy = textcopy + textsize;
+
+	memcpy(textcopy, __text_start, textsize);
+	memcpy(rodatacopy, __rodata_start, rodatasize);
+
+	// apply the relocations in reverse on the copies of .text and .rodata
+	unapply_text_relocations(textcopy, textsize,
+				 offset_to_ptr(&fips140_rela_text.offset),
+				 fips140_rela_text.count);
+
+	unapply_rodata_relocations(rodatacopy, rodatasize,
+				   offset_to_ptr(&fips140_rela_rodata.offset),
+				   fips140_rela_rodata.count);
+
+	fips140_inject_integrity_failure(textcopy);
+
+	tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
+	if (IS_ERR(tfm)) {
+		pr_err("failed to allocate hmac tfm (%ld)\n", PTR_ERR(tfm));
+		tfm = NULL;
+		goto out;
+	}
+	desc->tfm = tfm;
+
+	pr_info("using '%s' for integrity check\n",
+		crypto_shash_driver_name(tfm));
+
+	err = crypto_shash_setkey(tfm, fips140_integ_hmac_key,
+				  strlen(fips140_integ_hmac_key)) ?:
+	      crypto_shash_init(desc) ?:
+	      crypto_shash_update(desc, textcopy, textsize) ?:
+	      crypto_shash_finup(desc, rodatacopy, rodatasize, digest);
+
+	/* Zeroizing this is important; see the comment below. */
+	shash_desc_zero(desc);
+
+	if (err) {
+		pr_err("failed to calculate hmac shash (%d)\n", err);
+		goto out;
+	}
+
+	if (memcmp(digest, fips140_integ_hmac_digest, sizeof(digest))) {
+		pr_err("provided_digest : %*phN\n", (int)sizeof(digest),
+		       fips140_integ_hmac_digest);
+
+		pr_err("calculated digest: %*phN\n", (int)sizeof(digest),
+		       digest);
+		goto out;
+	}
+	ok = true;
+out:
+	/*
+	 * FIPS 140-3 requires that all "temporary value(s) generated during the
+	 * integrity test" be zeroized (ref: FIPS 140-3 IG 9.7.B).  There is no
+	 * technical reason to do this given that these values are public
+	 * information, but this is the requirement so we follow it.
+	 */
+	crypto_free_shash(tfm);
+	memzero_explicit(digest, sizeof(digest));
+	kfree_sensitive(textcopy);
+	return ok;
+}
+
+static void fips140_sha256(void *p, const u8 *data, unsigned int len, u8 *out,
+			   int *hook_inuse)
+{
+	sha256(data, len, out);
+	*hook_inuse = 1;
+}
+
+static void fips140_aes_expandkey(void *p, struct crypto_aes_ctx *ctx,
+				  const u8 *in_key, unsigned int key_len,
+				  int *err)
+{
+	*err = aes_expandkey(ctx, in_key, key_len);
+}
+
+static void fips140_aes_encrypt(void *priv, const struct crypto_aes_ctx *ctx,
+				u8 *out, const u8 *in, int *hook_inuse)
+{
+	aes_encrypt(ctx, out, in);
+	*hook_inuse = 1;
+}
+
+static void fips140_aes_decrypt(void *priv, const struct crypto_aes_ctx *ctx,
+				u8 *out, const u8 *in, int *hook_inuse)
+{
+	aes_decrypt(ctx, out, in);
+	*hook_inuse = 1;
+}
+
+static bool update_fips140_library_routines(void)
+{
+	int ret;
+
+	ret = register_trace_android_vh_sha256(fips140_sha256, NULL) ?:
+	      register_trace_android_vh_aes_expandkey(fips140_aes_expandkey, NULL) ?:
+	      register_trace_android_vh_aes_encrypt(fips140_aes_encrypt, NULL) ?:
+	      register_trace_android_vh_aes_decrypt(fips140_aes_decrypt, NULL);
+
+	return ret == 0;
+}
+
+/*
+ * Initialize the FIPS 140 module.
+ *
+ * Note: this routine iterates over the contents of the initcall section, which
+ * consists of an array of function pointers that was emitted by the linker
+ * rather than the compiler.  This means that these function pointers lack the
+ * usual CFI stubs that the compiler emits when CFI codegen is enabled.  So
+ * let's disable CFI locally when handling the initcall array, to avoid
+ * surprises.
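+ *
+ * Each initcall entry is a 32-bit self-relative offset rather than an
+ * absolute pointer.  offset_to_ptr() from <linux/compiler.h> recovers the
+ * address; its definition is essentially:
+ *
+ *	static inline void *offset_to_ptr(const int *off)
+ *	{
+ *		return (void *)((unsigned long)off + *off);
+ *	}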
+ */ +static int __init __attribute__((__no_sanitize__("cfi"))) +fips140_init(void) +{ + const u32 *initcall; + + pr_info("loading " FIPS140_MODULE_NAME " " FIPS140_MODULE_VERSION "\n"); + fips140_init_thread = current; + + unregister_existing_fips140_algos(); + + /* iterate over all init routines present in this module and call them */ + for (initcall = __initcall_start + 1; + initcall < &__initcall_end_marker; + initcall++) { + int (*init)(void) = offset_to_ptr(initcall); + int err = init(); + + /* + * ENODEV is expected from initcalls that only register + * algorithms that depend on non-present CPU features. Besides + * that, errors aren't expected here. + */ + if (err && err != -ENODEV) { + pr_err("initcall %ps() failed: %d\n", init, err); + goto panic; + } + } + + if (!fips140_run_selftests()) + goto panic; + + /* + * It may seem backward to perform the integrity check last, but this + * is intentional: the check itself uses hmac(sha256) which is one of + * the algorithms that are replaced with versions from this module, and + * the integrity check must use the replacement version. Also, to be + * ready for FIPS 140-3, the integrity check algorithm must have already + * been self-tested. + */ + + if (!check_fips140_module_hmac()) { + pr_crit("integrity check failed -- giving up!\n"); + goto panic; + } + pr_info("integrity check passed\n"); + + complete_all(&fips140_tests_done); + + if (!update_fips140_library_routines()) + goto panic; + + if (!fips140_eval_testing_init()) + goto panic; + + pr_info("module successfully loaded\n"); + return 0; + +panic: + panic("FIPS 140 module load failure"); +} + +module_init(fips140_init); + +MODULE_IMPORT_NS(CRYPTO_INTERNAL); +MODULE_LICENSE("GPL v2"); + +/* + * Crypto-related helper functions, reproduced here so that they will be + * covered by the FIPS 140 integrity check. + * + * Non-cryptographic helper functions such as memcpy() can be excluded from the + * FIPS module, but there is ambiguity about other helper functions like + * __crypto_xor() and crypto_inc() which aren't cryptographic by themselves, + * but are more closely associated with cryptography than e.g. memcpy(). To + * err on the side of caution, we include copies of these in the FIPS module. + */ +void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len) +{ + while (len >= 8) { + *(u64 *)dst = *(u64 *)src1 ^ *(u64 *)src2; + dst += 8; + src1 += 8; + src2 += 8; + len -= 8; + } + + while (len >= 4) { + *(u32 *)dst = *(u32 *)src1 ^ *(u32 *)src2; + dst += 4; + src1 += 4; + src2 += 4; + len -= 4; + } + + while (len >= 2) { + *(u16 *)dst = *(u16 *)src1 ^ *(u16 *)src2; + dst += 2; + src1 += 2; + src2 += 2; + len -= 2; + } + + while (len--) + *dst++ = *src1++ ^ *src2++; +} + +void crypto_inc(u8 *a, unsigned int size) +{ + a += size; + + while (size--) + if (++*--a) + break; +} diff --git a/crypto/fips140-module.h b/crypto/fips140-module.h new file mode 100644 index 000000000000..a2a63194eb64 --- /dev/null +++ b/crypto/fips140-module.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2021 Google LLC + */ + +#ifndef _CRYPTO_FIPS140_MODULE_H +#define _CRYPTO_FIPS140_MODULE_H + +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "fips140: " fmt + +/* + * This is the name and version number of the module that are shown on the FIPS + * certificate. 
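+ *
+ * FIPS140_MODULE_VERSION expands to UTS_RELEASE, so fips140_module_version()
+ * returns a string along the lines of (the release suffix here is made up):
+ *
+ *	"Android Kernel Cryptographic Module 5.15.41-android13-8"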
+ */ +#define FIPS140_MODULE_NAME "Android Kernel Cryptographic Module" +#define FIPS140_MODULE_VERSION UTS_RELEASE + +/* fips140-eval-testing.c */ +#ifdef CONFIG_CRYPTO_FIPS140_MOD_EVAL_TESTING +void fips140_inject_selftest_failure(const char *impl, u8 *result); +void fips140_inject_integrity_failure(u8 *textcopy); +bool fips140_eval_testing_init(void); +#else +static inline void fips140_inject_selftest_failure(const char *impl, u8 *result) +{ +} +static inline void fips140_inject_integrity_failure(u8 *textcopy) +{ +} +static inline bool fips140_eval_testing_init(void) +{ + return true; +} +#endif /* !CONFIG_CRYPTO_FIPS140_MOD_EVAL_TESTING */ + +/* fips140-module.c */ +extern struct completion fips140_tests_done; +extern struct task_struct *fips140_init_thread; +bool fips140_is_approved_service(const char *name); +const char *fips140_module_version(void); + +/* fips140-selftests.c */ +bool __init __must_check fips140_run_selftests(void); + +#endif /* _CRYPTO_FIPS140_MODULE_H */ diff --git a/crypto/fips140-refs.S b/crypto/fips140-refs.S new file mode 100644 index 000000000000..fcbd52776323 --- /dev/null +++ b/crypto/fips140-refs.S @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2021 Google LLC + * Author: Ard Biesheuvel + * + * This file contains the variable definitions that will be used by the FIPS140 + * s/w module to access the RELA sections in the ELF image. These are used to + * apply the relocations applied by the module loader in reverse, so that we + * can reconstruct the image that was used to derive the HMAC used by the + * integrity check. + * + * The first .long of each entry will be populated by the module loader based + * on the actual placement of the respective RELA section in memory. The second + * .long carries the RELA entry count, and is populated by the host tool that + * also generates the HMAC of the contents of .text and .rodata. + */ + +#include +#include + + .section ".init.rodata", "a" + + .align 2 + .globl fips140_rela_text +fips140_rela_text: + .weak __sec_rela_text + .long __sec_rela_text - . + .long 0 + + .globl fips140_rela_rodata +fips140_rela_rodata: + .weak __sec_rela_rodata + .long __sec_rela_rodata - . + .long 0 diff --git a/crypto/fips140-selftests.c b/crypto/fips140-selftests.c new file mode 100644 index 000000000000..9663b1a33bd4 --- /dev/null +++ b/crypto/fips140-selftests.c @@ -0,0 +1,998 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Google LLC + * + * Authors: Elena Petrova , + * Eric Biggers + * + * Self-tests of fips140.ko cryptographic functionality. These are run at + * module load time to fulfill FIPS 140 and NIAP FPT_TST_EXT.1 requirements. + * + * The actual requirements for these self-tests are somewhat vague, but + * section 9 ("Self-Tests") of the FIPS 140-2 Implementation Guidance document + * (https://csrc.nist.gov/csrc/media/projects/cryptographic-module-validation-program/documents/fips140-2/fips1402ig.pdf) + * is somewhat helpful. Basically, all implementations of all FIPS approved + * algorithms (including modes of operation) must be tested. However: + * + * - There are provisions for skipping tests that are already sufficiently + * covered by other tests. E.g., HMAC-SHA256 may cover SHA-256. + * + * - Only one test vector is required per algorithm, and it can be generated + * by any known-good implementation or taken from any official document. + * + * - For ciphers, both encryption and decryption must be tested. + * + * - Only one key size per algorithm needs to be tested. 
+ * + * There is some ambiguity about whether all implementations of each algorithm + * must be tested, or whether it is sufficient to test just the highest priority + * implementation. To be safe we test all implementations, except ones that can + * be excluded by one of the rules above. + * + * See fips140_selftests[] for the list of tests we've selected. Currently, all + * our test vectors except the AES-CBC-CTS and DRBG ones were generated by the + * script tools/crypto/gen_fips140_testvecs.py, using the known-good + * implementations in the Python packages hashlib, pycryptodome, and + * cryptography. + * + * Note that we don't reuse the upstream crypto API's self-tests + * (crypto/testmgr.{c,h}), for several reasons: + * + * - To meet FIPS requirements, the self-tests must be located within the FIPS + * module boundary (fips140.ko). But testmgr is integrated into the crypto + * API framework and can't be extracted into the module. + * + * - testmgr is much more heavyweight than required for FIPS and NIAP; it + * tests more algorithms and does more tests per algorithm, as it's meant to + * do proper testing and not just meet certification requirements. We need + * tests that can run with minimal overhead on every boot-up. + * + * - Despite being more heavyweight in general, testmgr doesn't test the + * SHA-256 and AES library APIs, despite that being needed here. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "fips140-module.h" + +/* Test vector for an AEAD algorithm */ +struct aead_testvec { + const u8 *key; + size_t key_size; + const u8 *iv; + size_t iv_size; + const u8 *assoc; + size_t assoc_size; + const u8 *plaintext; + size_t plaintext_size; + const u8 *ciphertext; + size_t ciphertext_size; +}; + +/* Test vector for a length-preserving encryption algorithm */ +struct skcipher_testvec { + const u8 *key; + size_t key_size; + const u8 *iv; + size_t iv_size; + const u8 *plaintext; + const u8 *ciphertext; + size_t message_size; +}; + +/* Test vector for a hash algorithm */ +struct hash_testvec { + const u8 *key; + size_t key_size; + const u8 *message; + size_t message_size; + const u8 *digest; + size_t digest_size; +}; + +/* Test vector for a DRBG algorithm */ +struct drbg_testvec { + const u8 *entropy; + size_t entropy_size; + const u8 *pers; + size_t pers_size; + const u8 *entpr_a; + const u8 *entpr_b; + size_t entpr_size; + const u8 *add_a; + const u8 *add_b; + size_t add_size; + const u8 *output; + size_t out_size; +}; + +struct fips_test { + /* The name of the algorithm, in crypto API syntax */ + const char *alg; + + /* + * The optional list of implementations to test. @func will be called + * once per implementation, or once with @alg if this list is empty. + * The implementation names must be given in crypto API syntax, or in + * the case of a library implementation should have "-lib" appended. + */ + const char *impls[8]; + + /* + * The test function. It should execute a known-answer test on an + * algorithm implementation, using the below test vector. 
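+	 *
+	 * (For library-API entries such as "aes-lib" and "sha256-lib", @func
+	 * calls the library functions directly rather than going through the
+	 * crypto API; see fips_test_aes_library() and
+	 * fips_test_sha256_library().)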
+ */ + int __must_check (*func)(const struct fips_test *test, + const char *impl); + + /* The test vector, with a format specific to the type of algorithm */ + union { + struct aead_testvec aead; + struct skcipher_testvec skcipher; + struct hash_testvec hash; + struct drbg_testvec drbg; + }; +}; + +/* Maximum IV size (in bytes) among any algorithm tested here */ +#define MAX_IV_SIZE 16 + +static int __init __must_check +fips_check_result(u8 *result, const u8 *expected_result, size_t result_size, + const char *impl, const char *operation) +{ + fips140_inject_selftest_failure(impl, result); + if (memcmp(result, expected_result, result_size) != 0) { + pr_err("wrong result from %s %s\n", impl, operation); + return -EBADMSG; + } + return 0; +} + +/* + * None of the algorithms should be ASYNC, as the FIPS module doesn't register + * any ASYNC algorithms. (The ASYNC flag is only declared by hardware + * algorithms, which would need their own FIPS certification.) + * + * Ideally we would verify alg->cra_module == THIS_MODULE here as well, but that + * doesn't work because the files are compiled as built-in code. + */ +static int __init __must_check +fips_validate_alg(const struct crypto_alg *alg) +{ + if (alg->cra_flags & CRYPTO_ALG_ASYNC) { + pr_err("unexpectedly got async implementation of %s (%s)\n", + alg->cra_name, alg->cra_driver_name); + return -EINVAL; + } + return 0; +} + +static int __init __must_check +fips_handle_alloc_tfm_error(const char *impl, int err) +{ + if (err == -ENOENT) { + /* + * The requested implementation of the algorithm wasn't found. + * This is expected if the CPU lacks a feature the + * implementation needs, such as the ARMv8 Crypto Extensions. + * + * When this happens, the implementation isn't available for + * use, so we can't test it, nor do we need to. So we just skip + * the test. + */ + pr_info("%s is unavailable (no CPU support?), skipping testing it\n", + impl); + return 0; + } + pr_err("failed to allocate %s tfm: %d\n", impl, err); + return err; +} + +static int __init __must_check +fips_test_aes_library(const struct fips_test *test, const char *impl) +{ + const struct skcipher_testvec *vec = &test->skcipher; + struct crypto_aes_ctx ctx; + u8 block[AES_BLOCK_SIZE]; + int err; + + if (WARN_ON(vec->message_size != AES_BLOCK_SIZE)) + return -EINVAL; + + err = aes_expandkey(&ctx, vec->key, vec->key_size); + if (err) { + pr_err("aes_expandkey() failed: %d\n", err); + return err; + } + aes_encrypt(&ctx, block, vec->plaintext); + err = fips_check_result(block, vec->ciphertext, AES_BLOCK_SIZE, + impl, "encryption"); + if (err) + return err; + aes_decrypt(&ctx, block, block); + return fips_check_result(block, vec->plaintext, AES_BLOCK_SIZE, + impl, "decryption"); +} + +/* Test a length-preserving symmetric cipher using the crypto_skcipher API. 
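+ * The test encrypts in place (a single scatterlist serves as both source and
+ * destination), verifies the ciphertext, then decrypts the same buffer and
+ * verifies that the original plaintext is recovered.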
*/ +static int __init __must_check +fips_test_skcipher(const struct fips_test *test, const char *impl) +{ + const struct skcipher_testvec *vec = &test->skcipher; + struct crypto_skcipher *tfm; + struct skcipher_request *req = NULL; + u8 *message = NULL; + struct scatterlist sg; + u8 iv[MAX_IV_SIZE]; + int err; + + if (WARN_ON(vec->iv_size > MAX_IV_SIZE)) + return -EINVAL; + if (WARN_ON(vec->message_size <= 0)) + return -EINVAL; + + tfm = crypto_alloc_skcipher(impl, 0, 0); + if (IS_ERR(tfm)) + return fips_handle_alloc_tfm_error(impl, PTR_ERR(tfm)); + err = fips_validate_alg(&crypto_skcipher_alg(tfm)->base); + if (err) + goto out; + if (crypto_skcipher_ivsize(tfm) != vec->iv_size) { + pr_err("%s has wrong IV size\n", impl); + err = -EINVAL; + goto out; + } + + req = skcipher_request_alloc(tfm, GFP_KERNEL); + message = kmemdup(vec->plaintext, vec->message_size, GFP_KERNEL); + if (!req || !message) { + err = -ENOMEM; + goto out; + } + sg_init_one(&sg, message, vec->message_size); + + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, vec->message_size, iv); + + err = crypto_skcipher_setkey(tfm, vec->key, vec->key_size); + if (err) { + pr_err("failed to set %s key: %d\n", impl, err); + goto out; + } + + /* Encrypt the plaintext, then verify the resulting ciphertext. */ + memcpy(iv, vec->iv, vec->iv_size); + err = crypto_skcipher_encrypt(req); + if (err) { + pr_err("%s encryption failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(message, vec->ciphertext, vec->message_size, + impl, "encryption"); + if (err) + goto out; + + /* Decrypt the ciphertext, then verify the resulting plaintext. */ + memcpy(iv, vec->iv, vec->iv_size); + err = crypto_skcipher_decrypt(req); + if (err) { + pr_err("%s decryption failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(message, vec->plaintext, vec->message_size, + impl, "decryption"); +out: + kfree(message); + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return err; +} + +/* Test an AEAD using the crypto_aead API. 
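+ * The test vectors store the authentication tag appended to the ciphertext,
+ * so the tag length is recovered as ciphertext_size - plaintext_size and is
+ * installed with crypto_aead_setauthsize() before encrypting.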
*/ +static int __init __must_check +fips_test_aead(const struct fips_test *test, const char *impl) +{ + const struct aead_testvec *vec = &test->aead; + const int tag_size = vec->ciphertext_size - vec->plaintext_size; + struct crypto_aead *tfm; + struct aead_request *req = NULL; + u8 *assoc = NULL; + u8 *message = NULL; + struct scatterlist sg[2]; + int sg_idx = 0; + u8 iv[MAX_IV_SIZE]; + int err; + + if (WARN_ON(vec->iv_size > MAX_IV_SIZE)) + return -EINVAL; + if (WARN_ON(vec->ciphertext_size <= vec->plaintext_size)) + return -EINVAL; + + tfm = crypto_alloc_aead(impl, 0, 0); + if (IS_ERR(tfm)) + return fips_handle_alloc_tfm_error(impl, PTR_ERR(tfm)); + err = fips_validate_alg(&crypto_aead_alg(tfm)->base); + if (err) + goto out; + if (crypto_aead_ivsize(tfm) != vec->iv_size) { + pr_err("%s has wrong IV size\n", impl); + err = -EINVAL; + goto out; + } + + req = aead_request_alloc(tfm, GFP_KERNEL); + assoc = kmemdup(vec->assoc, vec->assoc_size, GFP_KERNEL); + message = kzalloc(vec->ciphertext_size, GFP_KERNEL); + if (!req || !assoc || !message) { + err = -ENOMEM; + goto out; + } + memcpy(message, vec->plaintext, vec->plaintext_size); + + sg_init_table(sg, ARRAY_SIZE(sg)); + if (vec->assoc_size) + sg_set_buf(&sg[sg_idx++], assoc, vec->assoc_size); + sg_set_buf(&sg[sg_idx++], message, vec->ciphertext_size); + + aead_request_set_ad(req, vec->assoc_size); + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + + err = crypto_aead_setkey(tfm, vec->key, vec->key_size); + if (err) { + pr_err("failed to set %s key: %d\n", impl, err); + goto out; + } + + err = crypto_aead_setauthsize(tfm, tag_size); + if (err) { + pr_err("failed to set %s authentication tag size: %d\n", + impl, err); + goto out; + } + + /* + * Encrypt the plaintext, then verify the resulting ciphertext (which + * includes the authentication tag). + */ + memcpy(iv, vec->iv, vec->iv_size); + aead_request_set_crypt(req, sg, sg, vec->plaintext_size, iv); + err = crypto_aead_encrypt(req); + if (err) { + pr_err("%s encryption failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(message, vec->ciphertext, vec->ciphertext_size, + impl, "encryption"); + if (err) + goto out; + + /* + * Decrypt the ciphertext (which includes the authentication tag), then + * verify the resulting plaintext. + */ + memcpy(iv, vec->iv, vec->iv_size); + aead_request_set_crypt(req, sg, sg, vec->ciphertext_size, iv); + err = crypto_aead_decrypt(req); + if (err) { + pr_err("%s decryption failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(message, vec->plaintext, vec->plaintext_size, + impl, "decryption"); +out: + kfree(message); + kfree(assoc); + aead_request_free(req); + crypto_free_aead(tfm); + return err; +} + +/* + * Test a hash algorithm using the crypto_shash API. + * + * Note that we don't need to test the crypto_ahash API too, since none of the + * hash algorithms in the FIPS module have the ASYNC flag, and thus there will + * be no hash algorithms that can be accessed only through crypto_ahash. 
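+ *
+ * The digest is computed with the one-shot crypto_shash_tfm_digest() helper,
+ * which is equivalent to an init/update/final sequence on a stack descriptor.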
+ */ +static int __init __must_check +fips_test_hash(const struct fips_test *test, const char *impl) +{ + const struct hash_testvec *vec = &test->hash; + struct crypto_shash *tfm; + u8 digest[HASH_MAX_DIGESTSIZE]; + int err; + + if (WARN_ON(vec->digest_size > HASH_MAX_DIGESTSIZE)) + return -EINVAL; + + tfm = crypto_alloc_shash(impl, 0, 0); + if (IS_ERR(tfm)) + return fips_handle_alloc_tfm_error(impl, PTR_ERR(tfm)); + err = fips_validate_alg(&crypto_shash_alg(tfm)->base); + if (err) + goto out; + if (crypto_shash_digestsize(tfm) != vec->digest_size) { + pr_err("%s has wrong digest size\n", impl); + err = -EINVAL; + goto out; + } + + if (vec->key) { + err = crypto_shash_setkey(tfm, vec->key, vec->key_size); + if (err) { + pr_err("failed to set %s key: %d\n", impl, err); + goto out; + } + } + + err = crypto_shash_tfm_digest(tfm, vec->message, vec->message_size, + digest); + if (err) { + pr_err("%s digest computation failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(digest, vec->digest, vec->digest_size, + impl, "digest"); +out: + crypto_free_shash(tfm); + return err; +} + +static int __init __must_check +fips_test_sha256_library(const struct fips_test *test, const char *impl) +{ + const struct hash_testvec *vec = &test->hash; + u8 digest[SHA256_DIGEST_SIZE]; + + if (WARN_ON(vec->digest_size != SHA256_DIGEST_SIZE)) + return -EINVAL; + + sha256(vec->message, vec->message_size, digest); + return fips_check_result(digest, vec->digest, vec->digest_size, + impl, "digest"); +} + +/* Test a DRBG using the crypto_rng API. */ +static int __init __must_check +fips_test_drbg(const struct fips_test *test, const char *impl) +{ + const struct drbg_testvec *vec = &test->drbg; + struct crypto_rng *rng; + u8 *output = NULL; + struct drbg_test_data test_data; + struct drbg_string addtl, pers, testentropy; + int err; + + rng = crypto_alloc_rng(impl, 0, 0); + if (IS_ERR(rng)) + return fips_handle_alloc_tfm_error(impl, PTR_ERR(rng)); + err = fips_validate_alg(&crypto_rng_alg(rng)->base); + if (err) + goto out; + + output = kzalloc(vec->out_size, GFP_KERNEL); + if (!output) { + err = -ENOMEM; + goto out; + } + + /* + * Initialize the DRBG with the entropy and personalization string given + * in the test vector. + */ + test_data.testentropy = &testentropy; + drbg_string_fill(&testentropy, vec->entropy, vec->entropy_size); + drbg_string_fill(&pers, vec->pers, vec->pers_size); + err = crypto_drbg_reset_test(rng, &pers, &test_data); + if (err) { + pr_err("failed to reset %s\n", impl); + goto out; + } + + /* + * Generate some random bytes using the additional data string provided + * in the test vector. Also use the additional entropy if provided + * (relevant for the prediction-resistant DRBG variants only). + */ + drbg_string_fill(&addtl, vec->add_a, vec->add_size); + if (vec->entpr_size) { + drbg_string_fill(&testentropy, vec->entpr_a, vec->entpr_size); + err = crypto_drbg_get_bytes_addtl_test(rng, output, + vec->out_size, &addtl, + &test_data); + } else { + err = crypto_drbg_get_bytes_addtl(rng, output, vec->out_size, + &addtl); + } + if (err) { + pr_err("failed to get bytes from %s (try 1): %d\n", + impl, err); + goto out; + } + + /* + * Do the same again, using a second additional data string, and (when + * applicable) a second additional entropy string. 
+ */ + drbg_string_fill(&addtl, vec->add_b, vec->add_size); + if (test->drbg.entpr_size) { + drbg_string_fill(&testentropy, vec->entpr_b, vec->entpr_size); + err = crypto_drbg_get_bytes_addtl_test(rng, output, + vec->out_size, &addtl, + &test_data); + } else { + err = crypto_drbg_get_bytes_addtl(rng, output, vec->out_size, + &addtl); + } + if (err) { + pr_err("failed to get bytes from %s (try 2): %d\n", + impl, err); + goto out; + } + + /* Check that the DRBG generated the expected output. */ + err = fips_check_result(output, vec->output, vec->out_size, + impl, "get_bytes"); +out: + kfree(output); + crypto_free_rng(rng); + return err; +} + +/* Include the test vectors generated by the Python script. */ +#include "fips140-generated-testvecs.h" + +/* + * List of all self-tests. Keep this in sync with fips140_algorithms[]. + * + * When possible, we have followed the FIPS 140-2 Implementation Guidance (IG) + * document when creating this list of tests. The result is intended to be a + * list of tests that is near-minimal (and thus minimizes runtime overhead) + * while complying with all requirements. For additional details, see the + * comment at the beginning of this file. + */ +static const struct fips_test fips140_selftests[] __initconst = { + /* + * Test for the AES library API. + * + * Since the AES library API may use its own AES implementation and the + * module provides no support for composing it with a mode of operation + * (it's just plain AES), we must test it directly. + * + * In contrast, we don't need to directly test the "aes" ciphers that + * are accessible through the crypto_cipher API (e.g. "aes-ce"), as they + * are covered indirectly by AES-CMAC and AES-ECB tests. + */ + { + .alg = "aes", + .impls = {"aes-lib"}, + .func = fips_test_aes_library, + .skcipher = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .plaintext = fips_message, + .ciphertext = fips_aes_ecb_ciphertext, + .message_size = 16, + } + }, + /* + * Tests for AES-CMAC, a.k.a. "cmac(aes)" in crypto API syntax. + * + * The IG requires that each underlying AES implementation be tested in + * an authenticated mode, if implemented. Of such modes, this module + * implements AES-GCM and AES-CMAC. However, AES-GCM doesn't "count" + * because this module's implementations of AES-GCM won't actually be + * FIPS-approved, due to a quirk in the FIPS requirements. + * + * Therefore, for us this requirement applies to AES-CMAC, so we must + * test the "cmac" template composed with each "aes" implementation. + * + * Separately from the above, we also must test all standalone + * implementations of "cmac(aes)" such as "cmac-aes-ce", as they don't + * reuse another full AES implementation and thus can't be covered by + * another test. + */ + { + .alg = "cmac(aes)", + .impls = { + /* "cmac" template with all "aes" implementations */ + "cmac(aes-generic)", + "cmac(aes-arm64)", + "cmac(aes-ce)", + /* All standalone implementations of "cmac(aes)" */ + "cmac-aes-neon", + "cmac-aes-ce", + }, + .func = fips_test_hash, + .hash = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_aes_cmac_digest, + .digest_size = sizeof(fips_aes_cmac_digest), + } + }, + /* + * Tests for AES-ECB, a.k.a. "ecb(aes)" in crypto API syntax. + * + * The IG requires that each underlying AES implementation be tested in + * a mode that exercises the encryption direction of AES and in a mode + * that exercises the decryption direction of AES. 
CMAC only covers the + * encryption direction, so we choose ECB to test decryption. Thus, we + * test the "ecb" template composed with each "aes" implementation. + * + * Separately from the above, we also must test all standalone + * implementations of "ecb(aes)" such as "ecb-aes-ce", as they don't + * reuse another full AES implementation and thus can't be covered by + * another test. + */ + { + .alg = "ecb(aes)", + .impls = { + /* "ecb" template with all "aes" implementations */ + "ecb(aes-generic)", + "ecb(aes-arm64)", + "ecb(aes-ce)", + /* All standalone implementations of "ecb(aes)" */ + "ecb-aes-neon", + "ecb-aes-neonbs", + "ecb-aes-ce", + }, + .func = fips_test_skcipher, + .skcipher = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .plaintext = fips_message, + .ciphertext = fips_aes_ecb_ciphertext, + .message_size = sizeof(fips_message) + } + }, + /* + * Tests for AES-CBC, AES-CBC-CTS, AES-CTR, AES-XTS, and AES-GCM. + * + * According to the IG, an AES mode of operation doesn't need to have + * its own test, provided that (a) both the encryption and decryption + * directions of the underlying AES implementation are already tested + * via other mode(s), and (b) in the case of an authenticated mode, at + * least one other authenticated mode is already tested. The tests of + * the "cmac" and "ecb" templates fulfill these conditions; therefore, + * we don't need to test any other AES mode templates. + * + * This does *not* apply to standalone implementations of these modes + * such as "cbc-aes-ce", as such implementations don't reuse another + * full AES implementation and thus can't be covered by another test. + * We must test all such standalone implementations. + * + * The AES-GCM test isn't actually required, as it's expected that this + * module's AES-GCM implementation won't actually be able to be + * FIPS-approved. This is unfortunate; it's caused by the FIPS + * requirements for GCM being incompatible with GCM implementations that + * don't generate their own IVs. We choose to still include the AES-GCM + * test to keep it on par with the other FIPS-approved algorithms, in + * case it turns out that AES-GCM can be approved after all. 
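+	 *
+	 * In short, the coverage argument is: cmac(aes) exercises each AES
+	 * implementation's encryption direction, ecb(aes) exercises each
+	 * implementation's decryption direction, and the remaining modes
+	 * only need tests for their standalone driver implementations.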
+ */ + { + .alg = "cbc(aes)", + .impls = { + /* All standalone implementations of "cbc(aes)" */ + "cbc-aes-neon", + "cbc-aes-neonbs", + "cbc-aes-ce", + }, + .func = fips_test_skcipher, + .skcipher = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .iv = fips_aes_iv, + .iv_size = sizeof(fips_aes_iv), + .plaintext = fips_message, + .ciphertext = fips_aes_cbc_ciphertext, + .message_size = sizeof(fips_message), + } + }, { + .alg = "cts(cbc(aes))", + .impls = { + /* All standalone implementations of "cts(cbc(aes))" */ + "cts-cbc-aes-neon", + "cts-cbc-aes-ce", + }, + .func = fips_test_skcipher, + /* Test vector taken from RFC 3962 */ + .skcipher = { + .key = "\x63\x68\x69\x63\x6b\x65\x6e\x20" + "\x74\x65\x72\x69\x79\x61\x6b\x69", + .key_size = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .iv_size = 16, + .plaintext = "\x49\x20\x77\x6f\x75\x6c\x64\x20" + "\x6c\x69\x6b\x65\x20\x74\x68\x65" + "\x20\x47\x65\x6e\x65\x72\x61\x6c" + "\x20\x47\x61\x75\x27\x73\x20", + .ciphertext = "\xfc\x00\x78\x3e\x0e\xfd\xb2\xc1" + "\xd4\x45\xd4\xc8\xef\xf7\xed\x22" + "\x97\x68\x72\x68\xd6\xec\xcc\xc0" + "\xc0\x7b\x25\xe2\x5e\xcf\xe5", + .message_size = 31, + } + }, { + .alg = "ctr(aes)", + .impls = { + /* All standalone implementations of "ctr(aes)" */ + "ctr-aes-neon", + "ctr-aes-neonbs", + "ctr-aes-ce", + }, + .func = fips_test_skcipher, + .skcipher = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .iv = fips_aes_iv, + .iv_size = sizeof(fips_aes_iv), + .plaintext = fips_message, + .ciphertext = fips_aes_ctr_ciphertext, + .message_size = sizeof(fips_message), + } + }, { + .alg = "xts(aes)", + .impls = { + /* All standalone implementations of "xts(aes)" */ + "xts-aes-neon", + "xts-aes-neonbs", + "xts-aes-ce", + }, + .func = fips_test_skcipher, + .skcipher = { + .key = fips_aes_xts_key, + .key_size = sizeof(fips_aes_xts_key), + .iv = fips_aes_iv, + .iv_size = sizeof(fips_aes_iv), + .plaintext = fips_message, + .ciphertext = fips_aes_xts_ciphertext, + .message_size = sizeof(fips_message), + } + }, { + .alg = "gcm(aes)", + .impls = { + /* All standalone implementations of "gcm(aes)" */ + "gcm-aes-ce", + }, + .func = fips_test_aead, + .aead = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .iv = fips_aes_iv, + /* The GCM implementations assume an IV size of 12. */ + .iv_size = 12, + .assoc = fips_aes_gcm_assoc, + .assoc_size = sizeof(fips_aes_gcm_assoc), + .plaintext = fips_message, + .plaintext_size = sizeof(fips_message), + .ciphertext = fips_aes_gcm_ciphertext, + .ciphertext_size = sizeof(fips_aes_gcm_ciphertext), + } + }, + + /* Tests for SHA-1 */ + { + .alg = "sha1", + .impls = { + /* All implementations of "sha1" */ + "sha1-generic", + "sha1-ce" + }, + .func = fips_test_hash, + .hash = { + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_sha1_digest, + .digest_size = sizeof(fips_sha1_digest) + } + }, + /* + * Tests for all SHA-256 implementations other than the sha256() library + * function. As per the IG, these tests also fulfill the tests for the + * corresponding SHA-224 implementations. + */ + { + .alg = "sha256", + .impls = { + /* All implementations of "sha256" */ + "sha256-generic", + "sha256-arm64", + "sha256-ce", + }, + .func = fips_test_hash, + .hash = { + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_sha256_digest, + .digest_size = sizeof(fips_sha256_digest) + } + }, + /* + * Test for the sha256() library function. 
This must be tested + * separately because it may use its own SHA-256 implementation. + */ + { + .alg = "sha256", + .impls = {"sha256-lib"}, + .func = fips_test_sha256_library, + .hash = { + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_sha256_digest, + .digest_size = sizeof(fips_sha256_digest) + } + }, + /* + * Tests for all SHA-512 implementations. As per the IG, these tests + * also fulfill the tests for the corresponding SHA-384 implementations. + */ + { + .alg = "sha512", + .impls = { + /* All implementations of "sha512" */ + "sha512-generic", + "sha512-arm64", + "sha512-ce", + }, + .func = fips_test_hash, + .hash = { + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_sha512_digest, + .digest_size = sizeof(fips_sha512_digest) + } + }, + /* + * Test for HMAC. As per the IG, only one HMAC test is required, + * provided that the same HMAC code is shared by all HMAC-SHA*. This is + * true in our case. We choose HMAC-SHA256 for the test. + * + * Note that as per the IG, this can fulfill the test for the underlying + * SHA. However, we don't currently rely on this. + */ + { + .alg = "hmac(sha256)", + .func = fips_test_hash, + .hash = { + .key = fips_hmac_key, + .key_size = sizeof(fips_hmac_key), + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_hmac_sha256_digest, + .digest_size = sizeof(fips_hmac_sha256_digest) + } + }, + /* + * Known-answer tests for the SP800-90A DRBG algorithms. + * + * These test vectors were manually extracted from + * https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/drbg/drbgtestvectors.zip. + * + * The selection of these tests follows the FIPS 140-2 IG as well as + * Section 11 of SP800-90A: + * + * - We must test all DRBG types (HMAC, Hash, and CTR) that the module + * implements. However, currently the module only implements + * HMAC_DRBG (since CONFIG_CRYPTO_DRBG_CTR and CONFIG_CRYPTO_DRBG_HASH + * aren't enabled). Therefore, we only need to test HMAC_DRBG. + * + * - We only need to test one HMAC variant. + * + * - We must test all DRBG operations: Instantiate(), Reseed(), and + * Generate(). However, a single test sequence with a single output + * comparison may cover all three operations, and this is what we do. + * Note that Reseed() happens implicitly via the use of the additional + * input and also via the use of prediction resistance when enabled. + * + * - The personalization string, additional input, and prediction + * resistance support must be tested. Therefore we have chosen test + * vectors that have a nonempty personalization string and nonempty + * additional input, and we test the prediction-resistant variant. + * Testing the non-prediction-resistant variant is not required. 
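+	 *
+	 * In crypto API syntax, the algorithm tested below is therefore
+	 * "drbg_pr_hmac_sha256": the prediction-resistant ("pr", as opposed
+	 * to "nopr") HMAC_DRBG instantiated with SHA-256.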
+ */ + { + .alg = "drbg_pr_hmac_sha256", + .func = fips_test_drbg, + .drbg = { + .entropy = + "\xc7\xcc\xbc\x67\x7e\x21\x66\x1e\x27\x2b\x63\xdd" + "\x3a\x78\xdc\xdf\x66\x6d\x3f\x24\xae\xcf\x37\x01" + "\xa9\x0d\x89\x8a\xa7\xdc\x81\x58\xae\xb2\x10\x15" + "\x7e\x18\x44\x6d\x13\xea\xdf\x37\x85\xfe\x81\xfb", + .entropy_size = 48, + .entpr_a = + "\x7b\xa1\x91\x5b\x3c\x04\xc4\x1b\x1d\x19\x2f\x1a" + "\x18\x81\x60\x3c\x6c\x62\x91\xb7\xe9\xf5\xcb\x96" + "\xbb\x81\x6a\xcc\xb5\xae\x55\xb6", + .entpr_b = + "\x99\x2c\xc7\x78\x7e\x3b\x88\x12\xef\xbe\xd3\xd2" + "\x7d\x2a\xa5\x86\xda\x8d\x58\x73\x4a\x0a\xb2\x2e" + "\xbb\x4c\x7e\xe3\x9a\xb6\x81\xc1", + .entpr_size = 32, + .output = + "\x95\x6f\x95\xfc\x3b\xb7\xfe\x3e\xd0\x4e\x1a\x14" + "\x6c\x34\x7f\x7b\x1d\x0d\x63\x5e\x48\x9c\x69\xe6" + "\x46\x07\xd2\x87\xf3\x86\x52\x3d\x98\x27\x5e\xd7" + "\x54\xe7\x75\x50\x4f\xfb\x4d\xfd\xac\x2f\x4b\x77" + "\xcf\x9e\x8e\xcc\x16\xa2\x24\xcd\x53\xde\x3e\xc5" + "\x55\x5d\xd5\x26\x3f\x89\xdf\xca\x8b\x4e\x1e\xb6" + "\x88\x78\x63\x5c\xa2\x63\x98\x4e\x6f\x25\x59\xb1" + "\x5f\x2b\x23\xb0\x4b\xa5\x18\x5d\xc2\x15\x74\x40" + "\x59\x4c\xb4\x1e\xcf\x9a\x36\xfd\x43\xe2\x03\xb8" + "\x59\x91\x30\x89\x2a\xc8\x5a\x43\x23\x7c\x73\x72" + "\xda\x3f\xad\x2b\xba\x00\x6b\xd1", + .out_size = 128, + .add_a = + "\x18\xe8\x17\xff\xef\x39\xc7\x41\x5c\x73\x03\x03" + "\xf6\x3d\xe8\x5f\xc8\xab\xe4\xab\x0f\xad\xe8\xd6" + "\x86\x88\x55\x28\xc1\x69\xdd\x76", + .add_b = + "\xac\x07\xfc\xbe\x87\x0e\xd3\xea\x1f\x7e\xb8\xe7" + "\x9d\xec\xe8\xe7\xbc\xf3\x18\x25\x77\x35\x4a\xaa" + "\x00\x99\x2a\xdd\x0a\x00\x50\x82", + .add_size = 32, + .pers = + "\xbc\x55\xab\x3c\xf6\x52\xb0\x11\x3d\x7b\x90\xb8" + "\x24\xc9\x26\x4e\x5a\x1e\x77\x0d\x3d\x58\x4a\xda" + "\xd1\x81\xe9\xf8\xeb\x30\x8f\x6f", + .pers_size = 32, + } + } +}; + +static int __init __must_check +fips_run_test(const struct fips_test *test) +{ + int i; + int err; + + /* + * If no implementations were specified, then just test the default one. + * Otherwise, test the specified list of implementations. + */ + + if (test->impls[0] == NULL) { + err = test->func(test, test->alg); + if (err) + pr_emerg("self-tests failed for algorithm %s: %d\n", + test->alg, err); + return err; + } + + for (i = 0; i < ARRAY_SIZE(test->impls) && test->impls[i] != NULL; + i++) { + err = test->func(test, test->impls[i]); + if (err) { + pr_emerg("self-tests failed for algorithm %s, implementation %s: %d\n", + test->alg, test->impls[i], err); + return err; + } + } + return 0; +} + +bool __init fips140_run_selftests(void) +{ + int i; + + pr_info("running self-tests\n"); + for (i = 0; i < ARRAY_SIZE(fips140_selftests); i++) { + if (fips_run_test(&fips140_selftests[i]) != 0) { + /* The caller is responsible for calling panic(). */ + return false; + } + } + pr_info("all self-tests passed\n"); + return true; +} diff --git a/crypto/fips140_gen_hmac.c b/crypto/fips140_gen_hmac.c new file mode 100644 index 000000000000..69f754d38a1d --- /dev/null +++ b/crypto/fips140_gen_hmac.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 - Google LLC + * Author: Ard Biesheuvel + * + * This is a host tool that is intended to be used to take the HMAC digest of + * the .text and .rodata sections of the fips140.ko module, and store it inside + * the module. The module will perform an integrity selfcheck at module_init() + * time, by recalculating the digest and comparing it with the value calculated + * here. 
+ * + * Note that the peculiar way an HMAC is being used as a digest with a public + * key rather than as a symmetric key signature is mandated by FIPS 140-2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static Elf64_Ehdr *ehdr; +static Elf64_Shdr *shdr; +static int num_shdr; +static const char *strtab, *shstrtab; +static Elf64_Sym *syms; +static int num_syms; + +static Elf64_Shdr *find_symtab_section(void) +{ + int i; + + for (i = 0; i < num_shdr; i++) + if (shdr[i].sh_type == SHT_SYMTAB) + return &shdr[i]; + return NULL; +} + +static int get_section_idx(const char *name) +{ + int i; + + for (i = 0; i < num_shdr; i++) + if (!strcmp(shstrtab + shdr[i].sh_name, name)) + return i; + return -1; +} + +static int get_sym_idx(const char *sym_name) +{ + int i; + + for (i = 0; i < num_syms; i++) + if (!strcmp(strtab + syms[i].st_name, sym_name)) + return i; + return -1; +} + +static void *get_sym_addr(const char *sym_name) +{ + int i = get_sym_idx(sym_name); + + if (i >= 0) + return (void *)ehdr + shdr[syms[i].st_shndx].sh_offset + + syms[i].st_value; + return NULL; +} + +static int update_rela_ref(const char *name) +{ + /* + * We need to do a couple of things to ensure that the copied RELA data + * is accessible to the module itself at module init time: + * - the associated entry in the symbol table needs to refer to the + * correct section index, and have SECTION type and GLOBAL linkage. + * - the 'count' global variable in the module need to be set to the + * right value based on the size of the RELA section. + */ + unsigned int *size_var; + int sec_idx, sym_idx; + char str[32]; + + sprintf(str, "fips140_rela_%s", name); + size_var = get_sym_addr(str); + if (!size_var) { + printf("variable '%s' not found, disregarding .%s section\n", + str, name); + return 1; + } + + sprintf(str, "__sec_rela_%s", name); + sym_idx = get_sym_idx(str); + + sprintf(str, ".init.rela.%s", name); + sec_idx = get_section_idx(str); + + if (sec_idx < 0 || sym_idx < 0) { + fprintf(stderr, "failed to locate metadata for .%s section in binary\n", + name); + return 0; + } + + syms[sym_idx].st_shndx = sec_idx; + syms[sym_idx].st_info = (STB_GLOBAL << 4) | STT_SECTION; + + size_var[1] = shdr[sec_idx].sh_size / sizeof(Elf64_Rela); + + return 1; +} + +static void hmac_section(HMAC_CTX *hmac, const char *start, const char *end) +{ + void *start_addr = get_sym_addr(start); + void *end_addr = get_sym_addr(end); + + HMAC_Update(hmac, start_addr, end_addr - start_addr); +} + +int main(int argc, char **argv) +{ + Elf64_Shdr *symtab_shdr; + const char *hmac_key; + unsigned char *dg; + unsigned int dglen; + struct stat stat; + HMAC_CTX *hmac; + int fd, ret; + + if (argc < 2) { + fprintf(stderr, "file argument missing\n"); + exit(EXIT_FAILURE); + } + + fd = open(argv[1], O_RDWR); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ret = fstat(fd, &stat); + if (ret < 0) { + fprintf(stderr, "failed to stat() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ehdr = mmap(0, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + fprintf(stderr, "failed to mmap() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + shdr = (void *)ehdr + ehdr->e_shoff; + num_shdr = ehdr->e_shnum; + + symtab_shdr = find_symtab_section(); + + syms = (void *)ehdr + symtab_shdr->sh_offset; + num_syms = symtab_shdr->sh_size / sizeof(Elf64_Sym); + + strtab = (void *)ehdr + shdr[symtab_shdr->sh_link].sh_offset; + shstrtab = (void 
*)ehdr + shdr[ehdr->e_shstrndx].sh_offset; + + if (!update_rela_ref("text") || !update_rela_ref("rodata")) + exit(EXIT_FAILURE); + + hmac_key = get_sym_addr("fips140_integ_hmac_key"); + if (!hmac_key) { + fprintf(stderr, "failed to locate HMAC key in binary\n"); + exit(EXIT_FAILURE); + } + + dg = get_sym_addr("fips140_integ_hmac_digest"); + if (!dg) { + fprintf(stderr, "failed to locate HMAC digest in binary\n"); + exit(EXIT_FAILURE); + } + + hmac = HMAC_CTX_new(); + HMAC_Init_ex(hmac, hmac_key, strlen(hmac_key), EVP_sha256(), NULL); + + hmac_section(hmac, "__fips140_text_start", "__fips140_text_end"); + hmac_section(hmac, "__fips140_rodata_start", "__fips140_rodata_end"); + + HMAC_Final(hmac, dg, &dglen); + + close(fd); + return 0; +} diff --git a/crypto/hctr2.c b/crypto/hctr2.c new file mode 100644 index 000000000000..7d00a3bcb667 --- /dev/null +++ b/crypto/hctr2.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HCTR2 length-preserving encryption mode + * + * Copyright 2021 Google LLC + */ + + +/* + * HCTR2 is a length-preserving encryption mode that is efficient on + * processors with instructions to accelerate AES and carryless + * multiplication, e.g. x86 processors with AES-NI and CLMUL, and ARM + * processors with the ARMv8 crypto extensions. + * + * For more details, see the paper: "Length-preserving encryption with HCTR2" + * (https://eprint.iacr.org/2021/1441.pdf) + */ + +#include +#include +#include +#include +#include +#include + +#define BLOCKCIPHER_BLOCK_SIZE 16 + +/* + * The specification allows variable-length tweaks, but Linux's crypto API + * currently only allows algorithms to support a single length. The "natural" + * tweak length for HCTR2 is 16, since that fits into one POLYVAL block for + * the best performance. But longer tweaks are useful for fscrypt, to avoid + * needing to derive per-file keys. So instead we use two blocks, or 32 bytes. + */ +#define TWEAK_SIZE 32 + +struct hctr2_instance_ctx { + struct crypto_cipher_spawn blockcipher_spawn; + struct crypto_skcipher_spawn xctr_spawn; + struct crypto_shash_spawn polyval_spawn; +}; + +struct hctr2_tfm_ctx { + struct crypto_cipher *blockcipher; + struct crypto_skcipher *xctr; + struct crypto_shash *polyval; + u8 L[BLOCKCIPHER_BLOCK_SIZE]; + int hashed_tweak_offset; + /* + * This struct is allocated with extra space for two exported hash + * states. Since the hash state size is not known at compile-time, we + * can't add these to the struct directly. + * + * hashed_tweaklen_divisible; + * hashed_tweaklen_remainder; + */ +}; + +struct hctr2_request_ctx { + u8 first_block[BLOCKCIPHER_BLOCK_SIZE]; + u8 xctr_iv[BLOCKCIPHER_BLOCK_SIZE]; + struct scatterlist *bulk_part_dst; + struct scatterlist *bulk_part_src; + struct scatterlist sg_src[2]; + struct scatterlist sg_dst[2]; + /* + * Sub-request sizes are unknown at compile-time, so they need to go + * after the members with known sizes. + */ + union { + struct shash_desc hash_desc; + struct skcipher_request xctr_req; + } u; + /* + * This struct is allocated with extra space for one exported hash + * state. Since the hash state size is not known at compile-time, we + * can't add it to the struct directly. 
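+	 *
+	 * (hctr2_init_tfm() places that state at hashed_tweak_offset, which
+	 * is offsetof(struct hctr2_request_ctx, u) plus the size of the
+	 * larger sub-request, i.e. at the very end of the request context.)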
+ *
+ *	hashed_tweak;
+ */
+};
+
+static inline u8 *hctr2_hashed_tweaklen(const struct hctr2_tfm_ctx *tctx,
+					bool has_remainder)
+{
+	u8 *p = (u8 *)tctx + sizeof(*tctx);
+
+	if (has_remainder) /* For messages not a multiple of block length */
+		p += crypto_shash_statesize(tctx->polyval);
+	return p;
+}
+
+static inline u8 *hctr2_hashed_tweak(const struct hctr2_tfm_ctx *tctx,
+				     struct hctr2_request_ctx *rctx)
+{
+	return (u8 *)rctx + tctx->hashed_tweak_offset;
+}
+
+/*
+ * The input data for each HCTR2 hash step begins with a 16-byte block that
+ * contains the tweak length and a flag that indicates whether the input is
+ * evenly divisible into blocks.  Since this implementation only supports one
+ * tweak length, we precompute the two hash states resulting from hashing the
+ * two possible values of this initial block.  This reduces by one block the
+ * amount of data that needs to be hashed for each encryption/decryption.
+ *
+ * These precomputed hashes are stored in hctr2_tfm_ctx.
+ */
+static int hctr2_hash_tweaklen(struct hctr2_tfm_ctx *tctx, bool has_remainder)
+{
+	SHASH_DESC_ON_STACK(shash, tctx->polyval);
+	__le64 tweak_length_block[2];
+	int err;
+
+	shash->tfm = tctx->polyval;
+	memset(tweak_length_block, 0, sizeof(tweak_length_block));
+
+	tweak_length_block[0] = cpu_to_le64(TWEAK_SIZE * 8 * 2 + 2 + has_remainder);
+	err = crypto_shash_init(shash);
+	if (err)
+		return err;
+	err = crypto_shash_update(shash, (u8 *)tweak_length_block,
+				  POLYVAL_BLOCK_SIZE);
+	if (err)
+		return err;
+	return crypto_shash_export(shash, hctr2_hashed_tweaklen(tctx, has_remainder));
+}
+
+static int hctr2_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			unsigned int keylen)
+{
+	struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	u8 hbar[BLOCKCIPHER_BLOCK_SIZE];
+	int err;
+
+	crypto_cipher_clear_flags(tctx->blockcipher, CRYPTO_TFM_REQ_MASK);
+	crypto_cipher_set_flags(tctx->blockcipher,
+				crypto_skcipher_get_flags(tfm) &
+				CRYPTO_TFM_REQ_MASK);
+	err = crypto_cipher_setkey(tctx->blockcipher, key, keylen);
+	if (err)
+		return err;
+
+	crypto_skcipher_clear_flags(tctx->xctr, CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_set_flags(tctx->xctr,
+				  crypto_skcipher_get_flags(tfm) &
+				  CRYPTO_TFM_REQ_MASK);
+	err = crypto_skcipher_setkey(tctx->xctr, key, keylen);
+	if (err)
+		return err;
+
+	memset(hbar, 0, sizeof(hbar));
+	crypto_cipher_encrypt_one(tctx->blockcipher, hbar, hbar);
+
+	memset(tctx->L, 0, sizeof(tctx->L));
+	tctx->L[0] = 0x01;
+	crypto_cipher_encrypt_one(tctx->blockcipher, tctx->L, tctx->L);
+
+	crypto_shash_clear_flags(tctx->polyval, CRYPTO_TFM_REQ_MASK);
+	crypto_shash_set_flags(tctx->polyval, crypto_skcipher_get_flags(tfm) &
+			       CRYPTO_TFM_REQ_MASK);
+	err = crypto_shash_setkey(tctx->polyval, hbar, BLOCKCIPHER_BLOCK_SIZE);
+	if (err)
+		return err;
+	memzero_explicit(hbar, sizeof(hbar));
+
+	return hctr2_hash_tweaklen(tctx, true) ?: hctr2_hash_tweaklen(tctx, false);
+}
+
+static int hctr2_hash_tweak(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct hctr2_request_ctx *rctx = skcipher_request_ctx(req);
+	struct shash_desc *hash_desc = &rctx->u.hash_desc;
+	int err;
+	bool has_remainder = req->cryptlen % POLYVAL_BLOCK_SIZE;
+
+	hash_desc->tfm = tctx->polyval;
+	err = crypto_shash_import(hash_desc, hctr2_hashed_tweaklen(tctx, has_remainder));
+	if (err)
+		return err;
+	err = crypto_shash_update(hash_desc, req->iv, TWEAK_SIZE);
+	if (err)
+		return err;
+
+	// Store the hashed tweak,
since we need it when computing both + // H(T || N) and H(T || V). + return crypto_shash_export(hash_desc, hctr2_hashed_tweak(tctx, rctx)); +} + +static int hctr2_hash_message(struct skcipher_request *req, + struct scatterlist *sgl, + u8 digest[POLYVAL_DIGEST_SIZE]) +{ + static const u8 padding[BLOCKCIPHER_BLOCK_SIZE] = { 0x1 }; + struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); + struct shash_desc *hash_desc = &rctx->u.hash_desc; + const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; + struct sg_mapping_iter miter; + unsigned int remainder = bulk_len % BLOCKCIPHER_BLOCK_SIZE; + int i; + int err = 0; + int n = 0; + + sg_miter_start(&miter, sgl, sg_nents(sgl), + SG_MITER_FROM_SG | SG_MITER_ATOMIC); + for (i = 0; i < bulk_len; i += n) { + sg_miter_next(&miter); + n = min_t(unsigned int, miter.length, bulk_len - i); + err = crypto_shash_update(hash_desc, miter.addr, n); + if (err) + break; + } + sg_miter_stop(&miter); + + if (err) + return err; + + if (remainder) { + err = crypto_shash_update(hash_desc, padding, + BLOCKCIPHER_BLOCK_SIZE - remainder); + if (err) + return err; + } + return crypto_shash_final(hash_desc, digest); +} + +static int hctr2_finish(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); + u8 digest[POLYVAL_DIGEST_SIZE]; + struct shash_desc *hash_desc = &rctx->u.hash_desc; + int err; + + // U = UU ^ H(T || V) + // or M = MM ^ H(T || N) + hash_desc->tfm = tctx->polyval; + err = crypto_shash_import(hash_desc, hctr2_hashed_tweak(tctx, rctx)); + if (err) + return err; + err = hctr2_hash_message(req, rctx->bulk_part_dst, digest); + if (err) + return err; + crypto_xor(rctx->first_block, digest, BLOCKCIPHER_BLOCK_SIZE); + + // Copy U (or M) into dst scatterlist + scatterwalk_map_and_copy(rctx->first_block, req->dst, + 0, BLOCKCIPHER_BLOCK_SIZE, 1); + return 0; +} + +static void hctr2_xctr_done(struct crypto_async_request *areq, + int err) +{ + struct skcipher_request *req = areq->data; + + if (!err) + err = hctr2_finish(req); + + skcipher_request_complete(req, err); +} + +static int hctr2_crypt(struct skcipher_request *req, bool enc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); + u8 digest[POLYVAL_DIGEST_SIZE]; + int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; + int err; + + // Requests must be at least one block + if (req->cryptlen < BLOCKCIPHER_BLOCK_SIZE) + return -EINVAL; + + // Copy M (or U) into a temporary buffer + scatterwalk_map_and_copy(rctx->first_block, req->src, + 0, BLOCKCIPHER_BLOCK_SIZE, 0); + + // Create scatterlists for N and V + rctx->bulk_part_src = scatterwalk_ffwd(rctx->sg_src, req->src, + BLOCKCIPHER_BLOCK_SIZE); + rctx->bulk_part_dst = scatterwalk_ffwd(rctx->sg_dst, req->dst, + BLOCKCIPHER_BLOCK_SIZE); + + // MM = M ^ H(T || N) + // or UU = U ^ H(T || V) + err = hctr2_hash_tweak(req); + if (err) + return err; + err = hctr2_hash_message(req, rctx->bulk_part_src, digest); + if (err) + return err; + crypto_xor(digest, rctx->first_block, BLOCKCIPHER_BLOCK_SIZE); + + // UU = E(MM) + // or MM = D(UU) + if (enc) + crypto_cipher_encrypt_one(tctx->blockcipher, rctx->first_block, + digest); + else + crypto_cipher_decrypt_one(tctx->blockcipher, rctx->first_block, + digest); + + // S = MM ^ UU ^ L + crypto_xor(digest, 
rctx->first_block, BLOCKCIPHER_BLOCK_SIZE); + crypto_xor_cpy(rctx->xctr_iv, digest, tctx->L, BLOCKCIPHER_BLOCK_SIZE); + + // V = XCTR(S, N) + // or N = XCTR(S, V) + skcipher_request_set_tfm(&rctx->u.xctr_req, tctx->xctr); + skcipher_request_set_crypt(&rctx->u.xctr_req, rctx->bulk_part_src, + rctx->bulk_part_dst, bulk_len, + rctx->xctr_iv); + skcipher_request_set_callback(&rctx->u.xctr_req, + req->base.flags, + hctr2_xctr_done, req); + return crypto_skcipher_encrypt(&rctx->u.xctr_req) ?: + hctr2_finish(req); +} + +static int hctr2_encrypt(struct skcipher_request *req) +{ + return hctr2_crypt(req, true); +} + +static int hctr2_decrypt(struct skcipher_request *req) +{ + return hctr2_crypt(req, false); +} + +static int hctr2_init_tfm(struct crypto_skcipher *tfm) +{ + struct skcipher_instance *inst = skcipher_alg_instance(tfm); + struct hctr2_instance_ctx *ictx = skcipher_instance_ctx(inst); + struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *xctr; + struct crypto_cipher *blockcipher; + struct crypto_shash *polyval; + unsigned int subreq_size; + int err; + + xctr = crypto_spawn_skcipher(&ictx->xctr_spawn); + if (IS_ERR(xctr)) + return PTR_ERR(xctr); + + blockcipher = crypto_spawn_cipher(&ictx->blockcipher_spawn); + if (IS_ERR(blockcipher)) { + err = PTR_ERR(blockcipher); + goto err_free_xctr; + } + + polyval = crypto_spawn_shash(&ictx->polyval_spawn); + if (IS_ERR(polyval)) { + err = PTR_ERR(polyval); + goto err_free_blockcipher; + } + + tctx->xctr = xctr; + tctx->blockcipher = blockcipher; + tctx->polyval = polyval; + + BUILD_BUG_ON(offsetofend(struct hctr2_request_ctx, u) != + sizeof(struct hctr2_request_ctx)); + subreq_size = max(sizeof_field(struct hctr2_request_ctx, u.hash_desc) + + crypto_shash_descsize(polyval), + sizeof_field(struct hctr2_request_ctx, u.xctr_req) + + crypto_skcipher_reqsize(xctr)); + + tctx->hashed_tweak_offset = offsetof(struct hctr2_request_ctx, u) + + subreq_size; + crypto_skcipher_set_reqsize(tfm, tctx->hashed_tweak_offset + + crypto_shash_statesize(polyval)); + return 0; + +err_free_blockcipher: + crypto_free_cipher(blockcipher); +err_free_xctr: + crypto_free_skcipher(xctr); + return err; +} + +static void hctr2_exit_tfm(struct crypto_skcipher *tfm) +{ + struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + + crypto_free_cipher(tctx->blockcipher); + crypto_free_skcipher(tctx->xctr); + crypto_free_shash(tctx->polyval); +} + +static void hctr2_free_instance(struct skcipher_instance *inst) +{ + struct hctr2_instance_ctx *ictx = skcipher_instance_ctx(inst); + + crypto_drop_cipher(&ictx->blockcipher_spawn); + crypto_drop_skcipher(&ictx->xctr_spawn); + crypto_drop_shash(&ictx->polyval_spawn); + kfree(inst); +} + +static int hctr2_create_common(struct crypto_template *tmpl, + struct rtattr **tb, + const char *xctr_name, + const char *polyval_name) +{ + u32 mask; + struct skcipher_instance *inst; + struct hctr2_instance_ctx *ictx; + struct skcipher_alg *xctr_alg; + struct crypto_alg *blockcipher_alg; + struct shash_alg *polyval_alg; + char blockcipher_name[CRYPTO_MAX_ALG_NAME]; + int len; + int err; + + err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); + if (err) + return err; + + inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL); + if (!inst) + return -ENOMEM; + ictx = skcipher_instance_ctx(inst); + + /* Stream cipher, xctr(block_cipher) */ + err = crypto_grab_skcipher(&ictx->xctr_spawn, + skcipher_crypto_instance(inst), + xctr_name, 0, mask); + if (err) + goto err_free_inst; + xctr_alg = 
crypto_spawn_skcipher_alg(&ictx->xctr_spawn); + + err = -EINVAL; + if (strncmp(xctr_alg->base.cra_name, "xctr(", 5)) + goto err_free_inst; + len = strscpy(blockcipher_name, xctr_alg->base.cra_name + 5, + sizeof(blockcipher_name)); + if (len < 1) + goto err_free_inst; + if (blockcipher_name[len - 1] != ')') + goto err_free_inst; + blockcipher_name[len - 1] = 0; + + /* Block cipher, e.g. "aes" */ + err = crypto_grab_cipher(&ictx->blockcipher_spawn, + skcipher_crypto_instance(inst), + blockcipher_name, 0, mask); + if (err) + goto err_free_inst; + blockcipher_alg = crypto_spawn_cipher_alg(&ictx->blockcipher_spawn); + + /* Require blocksize of 16 bytes */ + err = -EINVAL; + if (blockcipher_alg->cra_blocksize != BLOCKCIPHER_BLOCK_SIZE) + goto err_free_inst; + + /* Polyval ε-∆U hash function */ + err = crypto_grab_shash(&ictx->polyval_spawn, + skcipher_crypto_instance(inst), + polyval_name, 0, mask); + if (err) + goto err_free_inst; + polyval_alg = crypto_spawn_shash_alg(&ictx->polyval_spawn); + + /* Ensure Polyval is being used */ + err = -EINVAL; + if (strcmp(polyval_alg->base.cra_name, "polyval") != 0) + goto err_free_inst; + + /* Instance fields */ + + err = -ENAMETOOLONG; + if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "hctr2(%s)", + blockcipher_alg->cra_name) >= CRYPTO_MAX_ALG_NAME) + goto err_free_inst; + if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, + "hctr2_base(%s,%s)", + xctr_alg->base.cra_driver_name, + polyval_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) + goto err_free_inst; + + inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE; + inst->alg.base.cra_ctxsize = sizeof(struct hctr2_tfm_ctx) + + polyval_alg->statesize * 2; + inst->alg.base.cra_alignmask = xctr_alg->base.cra_alignmask | + polyval_alg->base.cra_alignmask; + /* + * The hash function is called twice, so it is weighted higher than the + * xctr and blockcipher. 
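+ * + * As a worked example (hypothetical driver priorities, not values taken + * from this patch): xctr at 400, polyval at 200 and the block cipher at + * 300 would give (2*400 + 4*200 + 300) / 7 = 271.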
+ */ + inst->alg.base.cra_priority = (2 * xctr_alg->base.cra_priority + + 4 * polyval_alg->base.cra_priority + + blockcipher_alg->cra_priority) / 7; + + inst->alg.setkey = hctr2_setkey; + inst->alg.encrypt = hctr2_encrypt; + inst->alg.decrypt = hctr2_decrypt; + inst->alg.init = hctr2_init_tfm; + inst->alg.exit = hctr2_exit_tfm; + inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(xctr_alg); + inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(xctr_alg); + inst->alg.ivsize = TWEAK_SIZE; + + inst->free = hctr2_free_instance; + + err = skcipher_register_instance(tmpl, inst); + if (err) { +err_free_inst: + hctr2_free_instance(inst); + } + return err; +} + +static int hctr2_create_base(struct crypto_template *tmpl, struct rtattr **tb) +{ + const char *xctr_name; + const char *polyval_name; + + xctr_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(xctr_name)) + return PTR_ERR(xctr_name); + + polyval_name = crypto_attr_alg_name(tb[2]); + if (IS_ERR(polyval_name)) + return PTR_ERR(polyval_name); + + return hctr2_create_common(tmpl, tb, xctr_name, polyval_name); +} + +static int hctr2_create(struct crypto_template *tmpl, struct rtattr **tb) +{ + const char *blockcipher_name; + char xctr_name[CRYPTO_MAX_ALG_NAME]; + + blockcipher_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(blockcipher_name)) + return PTR_ERR(blockcipher_name); + + if (snprintf(xctr_name, CRYPTO_MAX_ALG_NAME, "xctr(%s)", + blockcipher_name) >= CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + return hctr2_create_common(tmpl, tb, xctr_name, "polyval"); +} + +static struct crypto_template hctr2_tmpls[] = { + { + /* hctr2_base(xctr_name, polyval_name) */ + .name = "hctr2_base", + .create = hctr2_create_base, + .module = THIS_MODULE, + }, { + /* hctr2(blockcipher_name) */ + .name = "hctr2", + .create = hctr2_create, + .module = THIS_MODULE, + } +}; + +static int __init hctr2_module_init(void) +{ + return crypto_register_templates(hctr2_tmpls, ARRAY_SIZE(hctr2_tmpls)); +} + +static void __exit hctr2_module_exit(void) +{ + return crypto_unregister_templates(hctr2_tmpls, + ARRAY_SIZE(hctr2_tmpls)); +} + +subsys_initcall(hctr2_module_init); +module_exit(hctr2_module_exit); + +MODULE_DESCRIPTION("HCTR2 length-preserving encryption mode"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("hctr2"); +MODULE_IMPORT_NS(CRYPTO_INTERNAL); diff --git a/crypto/internal.h b/crypto/internal.h index f00869af689f..932f0aafddc3 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -10,6 +10,7 @@ #include <crypto/algapi.h> #include <linux/completion.h> +#include <linux/jump_label.h> #include <linux/list.h> #include <linux/module.h> #include <linux/notifier.h> @@ -27,6 +28,7 @@ struct crypto_larval { struct crypto_alg *adult; struct completion completion; u32 mask; + bool test_started; }; enum { @@ -45,6 +47,26 @@ extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; extern struct blocking_notifier_head crypto_chain; +#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +static inline bool crypto_boot_test_finished(void) +{ + return true; +} +static inline void set_crypto_boot_test_finished(void) +{ +} +#else +DECLARE_STATIC_KEY_FALSE(__crypto_boot_test_finished); +static inline bool crypto_boot_test_finished(void) +{ + return static_branch_likely(&__crypto_boot_test_finished); +} +static inline void set_crypto_boot_test_finished(void) +{ + static_branch_enable(&__crypto_boot_test_finished); +} +#endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */ + #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); void __exit crypto_exit_proc(void); @@ -70,6 +92,7 @@ struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32
mask); struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask); void crypto_larval_kill(struct crypto_alg *alg); +void crypto_wait_for_test(struct crypto_larval *larval); void crypto_alg_tested(const char *name, int err); void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, @@ -156,5 +179,10 @@ static inline void crypto_yield(u32 flags) cond_resched(); } +static inline int crypto_is_test_larval(struct crypto_larval *larval) +{ + return larval->alg.cra_driver_name[0]; +} + #endif /* _CRYPTO_INTERNAL_H */ diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index f6d3a84e3c21..c6c409142ec0 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -117,6 +117,22 @@ struct rand_data { #define JENT_EHEALTH 9 /* Health test failed during initialization */ #define JENT_ERCT 10 /* RCT failed during initialization */ +/* + * The output n bits can receive more than n bits of min entropy, of course, + * but the fixed output of the conditioning function can only asymptotically + * approach the output size bits of min entropy, not attain that bound. Random + * maps will tend to have output collisions, which reduces the creditable + * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). + * + * The value "64" is justified in Appendix A.4 of the current 90C draft, + * and aligns with NIST's "epsilon" definition in this document, which is + * that a string can be considered "full entropy" if you can bound the min + * entropy in each bit of output to at least 1-epsilon, where epsilon is + * required to be <= 2^(-32). In practice this means jent_gen_entropy() + * below collects (DATA_SIZE_BITS + 64) * osr = 128 * osr time deltas per + * 64-bit output in FIPS mode, instead of 64 * osr. + */ +#define JENT_ENTROPY_SAFETY_FACTOR 64 + +#include <linux/fips.h> #include "jitterentropy.h" /*************************************************************************** @@ -546,7 +562,10 @@ static int jent_measure_jitter(struct rand_data *ec) */ static void jent_gen_entropy(struct rand_data *ec) { - unsigned int k = 0; + unsigned int k = 0, safety_factor = 0; + + if (fips_enabled) + safety_factor = JENT_ENTROPY_SAFETY_FACTOR; /* priming of the ->prev_time value */ jent_measure_jitter(ec); @@ -560,7 +579,7 @@ static void jent_gen_entropy(struct rand_data *ec) * We multiply the loop value with ->osr to obtain the * oversampling rate requested by the caller */ - if (++k >= (DATA_SIZE_BITS * ec->osr)) + if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr)) break; } } diff --git a/crypto/polyval-generic.c b/crypto/polyval-generic.c new file mode 100644 index 000000000000..16bfa6925b31 --- /dev/null +++ b/crypto/polyval-generic.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * POLYVAL: hash function for HCTR2. + * + * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying + * Copyright 2021 Google LLC + */ + +/* + * Code based on crypto/ghash-generic.c + * + * POLYVAL is a keyed hash function similar to GHASH. POLYVAL uses a different + * modulus for finite field multiplication which makes hardware accelerated + * implementations on little-endian machines faster. POLYVAL is used in the + * kernel to implement HCTR2, but was originally specified for AES-GCM-SIV + * (RFC 8452).
+ * + * For more information see: + * Length-preserving encryption with HCTR2: + * https://eprint.iacr.org/2021/1441.pdf + * AES-GCM-SIV: Nonce Misuse-Resistant Authenticated Encryption: + * https://datatracker.ietf.org/doc/html/rfc8452 + * + * Like GHASH, POLYVAL is not a cryptographic hash function and should + * not be used outside of crypto modes explicitly designed to use POLYVAL. + * + * This implementation uses a convenient trick involving the GHASH and POLYVAL + * fields. This trick allows multiplication in the POLYVAL field to be + * implemented by using multiplication in the GHASH field as a subroutine. An + * element of the POLYVAL field can be converted to an element of the GHASH + * field by computing x*REVERSE(a), where REVERSE reverses the byte-ordering of + * a. Similarly, an element of the GHASH field can be converted back to the + * POLYVAL field by computing REVERSE(x^{-1}*a). For more information, see: + * https://datatracker.ietf.org/doc/html/rfc8452#appendix-A + * + * By using this trick, we do not need to implement the POLYVAL field for the + * generic implementation. + * + * Warning: this generic implementation is not intended to be used in practice + * and is not constant time. For practical use, a hardware accelerated + * implementation of POLYVAL should be used instead. + * + */ + +#include <asm/unaligned.h> +#include <crypto/algapi.h> +#include <crypto/gf128mul.h> +#include <crypto/polyval.h> +#include <crypto/internal/hash.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> + +struct polyval_tfm_ctx { + struct gf128mul_4k *gf128; +}; + +struct polyval_desc_ctx { + union { + u8 buffer[POLYVAL_BLOCK_SIZE]; + be128 buffer128; + }; + u32 bytes; +}; + +static void copy_and_reverse(u8 dst[POLYVAL_BLOCK_SIZE], + const u8 src[POLYVAL_BLOCK_SIZE]) +{ + u64 a = get_unaligned((const u64 *)&src[0]); + u64 b = get_unaligned((const u64 *)&src[8]); + + put_unaligned(swab64(a), (u64 *)&dst[8]); + put_unaligned(swab64(b), (u64 *)&dst[0]); +} + +/* + * Performs multiplication in the POLYVAL field using the GHASH field as a + * subroutine. This function is used as a fallback for hardware accelerated + * implementations when simd registers are unavailable. + * + * Note: This function is not used for polyval-generic; instead we use the 4k + * lookup table implementation for finite field multiplication. + */ +void polyval_mul_non4k(u8 *op1, const u8 *op2) +{ + be128 a, b; + + // Assume one argument is in Montgomery form and one is not. + copy_and_reverse((u8 *)&a, op1); + copy_and_reverse((u8 *)&b, op2); + gf128mul_x_lle(&a, &a); + gf128mul_lle(&a, &b); + copy_and_reverse(op1, (u8 *)&a); +} +EXPORT_SYMBOL_GPL(polyval_mul_non4k); + +/* + * Perform a POLYVAL update using non4k multiplication. This function is used + * as a fallback for hardware accelerated implementations when simd registers + * are unavailable. + * + * Note: This function is not used for polyval-generic; instead we use the 4k + * lookup table implementation of finite field multiplication.
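+ * + * Written out (a sketch of the identity these helpers rely on, using the + * conversions described in the header comment above): with phi(a) = + * x*REVERSE(a) mapping POLYVAL into GHASH and phi^-1(A) = REVERSE(x^-1 * A) + * mapping back, a POLYVAL product is + * + * a * b = phi^-1(phi(a) * phi(b)) + * = REVERSE(x^-1 * (x*REVERSE(a)) * (x*REVERSE(b))) + * = REVERSE(x * REVERSE(a) * REVERSE(b)) + * + * which is why polyval_mul_non4k() above applies gf128mul_x_lle() only once + * and never needs an explicit multiplication by x^-1.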
+ */ +void polyval_update_non4k(const u8 *key, const u8 *in, + size_t nblocks, u8 *accumulator) +{ + while (nblocks--) { + crypto_xor(accumulator, in, POLYVAL_BLOCK_SIZE); + polyval_mul_non4k(accumulator, key); + in += POLYVAL_BLOCK_SIZE; + } +} +EXPORT_SYMBOL_GPL(polyval_update_non4k); + +static int polyval_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct polyval_tfm_ctx *ctx = crypto_shash_ctx(tfm); + be128 k; + + if (keylen != POLYVAL_BLOCK_SIZE) + return -EINVAL; + + gf128mul_free_4k(ctx->gf128); + + BUILD_BUG_ON(sizeof(k) != POLYVAL_BLOCK_SIZE); + copy_and_reverse((u8 *)&k, key); + gf128mul_x_lle(&k, &k); + + ctx->gf128 = gf128mul_init_4k_lle(&k); + memzero_explicit(&k, POLYVAL_BLOCK_SIZE); + + if (!ctx->gf128) + return -ENOMEM; + + return 0; +} + +static int polyval_init(struct shash_desc *desc) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int polyval_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *pos; + u8 tmp[POLYVAL_BLOCK_SIZE]; + int n; + + if (dctx->bytes) { + n = min(srclen, dctx->bytes); + pos = dctx->buffer + dctx->bytes - 1; + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos-- ^= *src++; + + if (!dctx->bytes) + gf128mul_4k_lle(&dctx->buffer128, ctx->gf128); + } + + while (srclen >= POLYVAL_BLOCK_SIZE) { + copy_and_reverse(tmp, src); + crypto_xor(dctx->buffer, tmp, POLYVAL_BLOCK_SIZE); + gf128mul_4k_lle(&dctx->buffer128, ctx->gf128); + src += POLYVAL_BLOCK_SIZE; + srclen -= POLYVAL_BLOCK_SIZE; + } + + if (srclen) { + dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; + pos = dctx->buffer + POLYVAL_BLOCK_SIZE - 1; + while (srclen--) + *pos-- ^= *src++; + } + + return 0; +} + +static int polyval_final(struct shash_desc *desc, u8 *dst) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *ctx = crypto_shash_ctx(desc->tfm); + + if (dctx->bytes) + gf128mul_4k_lle(&dctx->buffer128, ctx->gf128); + copy_and_reverse(dst, dctx->buffer); + return 0; +} + +static void polyval_exit_tfm(struct crypto_tfm *tfm) +{ + struct polyval_tfm_ctx *ctx = crypto_tfm_ctx(tfm); + + gf128mul_free_4k(ctx->gf128); +} + +static struct shash_alg polyval_alg = { + .digestsize = POLYVAL_DIGEST_SIZE, + .init = polyval_init, + .update = polyval_update, + .final = polyval_final, + .setkey = polyval_setkey, + .descsize = sizeof(struct polyval_desc_ctx), + .base = { + .cra_name = "polyval", + .cra_driver_name = "polyval-generic", + .cra_priority = 100, + .cra_blocksize = POLYVAL_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct polyval_tfm_ctx), + .cra_module = THIS_MODULE, + .cra_exit = polyval_exit_tfm, + }, +}; + +static int __init polyval_mod_init(void) +{ + return crypto_register_shash(&polyval_alg); +} + +static void __exit polyval_mod_exit(void) +{ + crypto_unregister_shash(&polyval_alg); +} + +subsys_initcall(polyval_mod_init); +module_exit(polyval_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("POLYVAL hash function"); +MODULE_ALIAS_CRYPTO("polyval"); +MODULE_ALIAS_CRYPTO("polyval-generic"); diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 4ada7e749390..e8305c246261 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1741,6 +1741,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret += tcrypt_test("rfc3686(ctr(aes))"); ret += tcrypt_test("ofb(aes)"); ret += 
tcrypt_test("cfb(aes)"); + ret += tcrypt_test("xctr(aes)"); break; case 11: @@ -1910,6 +1911,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret += tcrypt_test("ccm(sm4)"); break; + case 57: + ret += tcrypt_test("polyval"); + break; + case 100: ret += tcrypt_test("hmac(md5)"); break; @@ -2367,6 +2372,11 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) 16, 16, aead_speed_template_19, num_mb); break; + case 226: + test_cipher_speed("hctr2(aes)", ENCRYPT, sec, NULL, + 0, speed_template_32); + break; + case 300: if (alg) { test_hash_speed(alg, sec, generic_hash_speed_template); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 163a1283a866..b6253f76779b 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4979,6 +4979,14 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(ghash_tv_template) } + }, { + .alg = "hctr2(aes)", + .generic_driver = + "hctr2_base(xctr(aes-generic),polyval-generic)", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(aes_hctr2_tv_template) + } }, { .alg = "hmac(md5)", .test = alg_test_hash, @@ -5233,6 +5241,12 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(poly1305_tv_template) } + }, { + .alg = "polyval", + .test = alg_test_hash, + .suite = { + .hash = __VECS(polyval_tv_template) + } }, { .alg = "rfc3686(ctr(aes))", .test = alg_test_skcipher, @@ -5439,6 +5453,12 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .cipher = __VECS(xchacha20_tv_template) }, + }, { + .alg = "xctr(aes)", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(aes_xctr_tv_template) + } }, { .alg = "xts(aes)", .generic_driver = "xts(ecb(aes-generic))", diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 2be20a590a60..652c40d8749f 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -32583,4 +32583,1540 @@ static const struct hash_testvec blake2b_512_tv_template[] = {{ 0xae, 0x15, 0x81, 0x15, 0xd0, 0x88, 0xa0, 0x3c, }, }}; +/* + * Test vectors generated using https://github.com/google/hctr2 + */ +static const struct cipher_testvec aes_xctr_tv_template[] = { + { + .key = "\x9c\x8d\xc4\xbd\x71\x36\xdc\x82" + "\x7c\xa1\xca\xa3\x23\x5a\xdb\xa4", + .iv = "\x8d\xe7\xa5\x6a\x95\x86\x42\xde" + "\xba\xea\x6e\x69\x03\x33\x86\x0f", + .ptext = "\xbd", + .ctext = "\xb9", + .klen = 16, + .len = 1, + }, + { + .key = "\xbc\x1b\x12\x0c\x3f\x18\xcc\x1f" + "\x5a\x1d\xab\x81\xa8\x68\x7c\x63", + .iv = "\x22\xc1\xdd\x25\x0b\x18\xcb\xa5" + "\x4a\xda\x15\x07\x73\xd9\x88\x10", + .ptext = "\x24\x6e\x64\xc6\x15\x26\x9c\xda" + "\x2a\x4b\x57\x12\xff\x7c\xd6\xb5", + .ctext = "\xd6\x47\x8d\x58\x92\xb2\x84\xf9" + "\xb7\xee\x0d\x98\xa1\x39\x4d\x8f", + .klen = 16, + .len = 16, + }, + { + .key = "\x44\x03\xbf\x4c\x30\xf0\xa7\xd6" + "\xbd\x54\xbb\x66\x8e\xa6\x0e\x8a", + .iv = "\xe6\xf7\x26\xdf\x8c\x3c\xaa\x88" + "\xce\xc1\xbd\x43\x3b\x09\x62\xad", + .ptext = "\x3c\xe3\x46\xb9\x8f\x9d\x3f\x8d" + "\xef\xf2\x53\xab\x24\xe2\x29\x08" + "\xf8\x7e\x1d\xa6\x6d\x86\x7d\x60" + "\x97\x63\x93\x29\x71\x94\xb4", + .ctext = "\xd4\xa3\xc6\xb8\xc1\x6f\x70\x1a" + "\x52\x0c\xed\x4c\xaf\x51\x56\x23" + "\x48\x45\x07\x10\x34\xc5\xba\x71" + "\xe5\xf8\x1e\xd8\xcb\xa6\xe7", + .klen = 16, + .len = 31, + }, + { + .key = "\x5b\x17\x30\x94\x19\x31\xa1\xae" + "\x24\x8e\x42\x1e\x82\xe6\xec\xb8", + .iv = "\xd1\x2e\xb9\xb8\xf8\x49\xeb\x68" + "\x06\xeb\x65\x33\x34\xa2\xeb\xf0", + .ptext = "\x19\x75\xec\x59\x60\x1b\x7a\x3e" + "\x62\x46\x87\xf0\xde\xab\x81\x36" + 
"\x63\x53\x11\xa0\x1f\xce\x25\x85" + "\x49\x6b\x28\xfa\x1c\x92\xe5\x18" + "\x38\x14\x00\x79\xf2\x9e\xeb\xfc" + "\x36\xa7\x6b\xe1\xe5\xcf\x04\x48" + "\x44\x6d\xbd\x64\xb3\xcb\x78\x05" + "\x8d\x7f\x9a\xaf\x3c\xcf\x6c\x45" + "\x6c\x7c\x46\x4c\xa8\xc0\x1e\xe4" + "\x33\xa5\x7b\xbb\x26\xd9\xc0\x32" + "\x9d\x8a\xb3\xf3\x3d\x52\xe6\x48" + "\x4c\x9b\x4c\x6e\xa4\xa3\xad\x66" + "\x56\x48\xd5\x98\x3a\x93\xc4\x85" + "\xe9\x89\xca\xa6\xc1\xc8\xe7\xf8" + "\xc3\xe9\xef\xbe\x77\xe6\xd1\x3a" + "\xa6\x99\xc8\x2d\xdf\x40\x0f\x44", + .ctext = "\xc6\x1a\x01\x1a\x00\xba\x04\xff" + "\x10\xd1\x7e\x5d\xad\x91\xde\x8c" + "\x08\x55\x95\xae\xd7\x22\x77\x40" + "\xf0\x33\x1b\x51\xef\xfe\x3d\x67" + "\xdf\xc4\x9f\x39\x47\x67\x93\xab" + "\xaa\x37\x55\xfe\x41\xe0\xba\xcd" + "\x25\x02\x7c\x61\x51\xa1\xcc\x72" + "\x7a\x20\x26\xb9\x06\x68\xbd\x19" + "\xc5\x2e\x1b\x75\x4a\x40\xb2\xd2" + "\xc4\xee\xd8\x5b\xa4\x55\x7d\x25" + "\xfc\x01\x4d\x6f\x0a\xfd\x37\x5d" + "\x3e\x67\xc0\x35\x72\x53\x7b\xe2" + "\xd6\x19\x5b\x92\x6c\x3a\x8c\x2a" + "\xe2\xc2\xa2\x4f\x2a\xf2\xb5\x15" + "\x65\xc5\x8d\x97\xf9\xbf\x8c\x98" + "\xe4\x50\x1a\xf2\x76\x55\x07\x49", + .klen = 16, + .len = 128, + }, + { + .key = "\x17\xa6\x01\x3d\x5d\xd6\xef\x2d" + "\x69\x8f\x4c\x54\x5b\xae\x43\xf0", + .iv = "\xa9\x1b\x47\x60\x26\x82\xf7\x1c" + "\x80\xf8\x88\xdd\xfb\x44\xd9\xda", + .ptext = "\xf7\x67\xcd\xa6\x04\x65\x53\x99" + "\x90\x5c\xa2\x56\x74\xd7\x9d\xf2" + "\x0b\x03\x7f\x4e\xa7\x84\x72\x2b" + "\xf0\xa5\xbf\xe6\x9a\x62\x3a\xfe" + "\x69\x5c\x93\x79\x23\x86\x64\x85" + "\xeb\x13\xb1\x5a\xd5\x48\x39\xa0" + "\x70\xfb\x06\x9a\xd7\x12\x5a\xb9" + "\xbe\xed\x2c\x81\x64\xf7\xcf\x80" + "\xee\xe6\x28\x32\x2d\x37\x4c\x32" + "\xf4\x1f\x23\x21\xe9\xc8\xc9\xbf" + "\x54\xbc\xcf\xb4\xc2\x65\x39\xdf" + "\xa5\xfb\x14\x11\xed\x62\x38\xcf" + "\x9b\x58\x11\xdd\xe9\xbd\x37\x57" + "\x75\x4c\x9e\xd5\x67\x0a\x48\xc6" + "\x0d\x05\x4e\xb1\x06\xd7\xec\x2e" + "\x9e\x59\xde\x4f\xab\x38\xbb\xe5" + "\x87\x04\x5a\x2c\x2a\xa2\x8f\x3c" + "\xe7\xe1\x46\xa9\x49\x9f\x24\xad" + "\x2d\xb0\x55\x40\x64\xd5\xda\x7e" + "\x1e\x77\xb8\x29\x72\x73\xc3\x84" + "\xcd\xf3\x94\x90\x58\x76\xc9\x2c" + "\x2a\xad\x56\xde\x33\x18\xb6\x3b" + "\x10\xe9\xe9\x8d\xf0\xa9\x7f\x05" + "\xf7\xb5\x8c\x13\x7e\x11\x3d\x1e" + "\x02\xbb\x5b\xea\x69\xff\x85\xcf" + "\x6a\x18\x97\x45\xe3\x96\xba\x4d" + "\x2d\x7a\x70\x78\x15\x2c\xe9\xdc" + "\x4e\x09\x92\x57\x04\xd8\x0b\xa6" + "\x20\x71\x76\x47\x76\x96\x89\xa0" + "\xd9\x29\xa2\x5a\x06\xdb\x56\x39" + "\x60\x33\x59\x04\x95\x89\xf6\x18" + "\x1d\x70\x75\x85\x3a\xb7\x6e", + .ctext = "\xe1\xe7\x3f\xd3\x6a\xb9\x2f\x64" + "\x37\xc5\xa4\xe9\xca\x0a\xa1\xd6" + "\xea\x7d\x39\xe5\xe6\xcc\x80\x54" + "\x74\x31\x2a\x04\x33\x79\x8c\x8e" + "\x4d\x47\x84\x28\x27\x9b\x3c\x58" + "\x54\x58\x20\x4f\x70\x01\x52\x5b" + "\xac\x95\x61\x49\x5f\xef\xba\xce" + "\xd7\x74\x56\xe7\xbb\xe0\x3c\xd0" + "\x7f\xa9\x23\x57\x33\x2a\xf6\xcb" + "\xbe\x42\x14\x95\xa8\xf9\x7a\x7e" + "\x12\x53\x3a\xe2\x13\xfe\x2d\x89" + "\xeb\xac\xd7\xa8\xa5\xf8\x27\xf3" + "\x74\x9a\x65\x63\xd1\x98\x3a\x7e" + "\x27\x7b\xc0\x20\x00\x4d\xf4\xe5" + "\x7b\x69\xa6\xa8\x06\x50\x85\xb6" + "\x7f\xac\x7f\xda\x1f\xf5\x37\x56" + "\x9b\x2f\xd3\x86\x6b\x70\xbd\x0e" + "\x55\x9a\x9d\x4b\x08\xb5\x5b\x7b" + "\xd4\x7c\xb4\x71\x49\x92\x4a\x1e" + "\xed\x6d\x11\x09\x47\x72\x32\x6a" + "\x97\x53\x36\xaf\xf3\x06\x06\x2c" + "\x69\xf1\x59\x00\x36\x95\x28\x2a" + "\xb6\xcd\x10\x21\x84\x73\x5c\x96" + "\x86\x14\x2c\x3d\x02\xdb\x53\x9a" + "\x61\xde\xea\x99\x84\x7a\x27\xf6" + "\xf7\xc8\x49\x73\x4b\xb8\xeb\xd3" + "\x41\x33\xdd\x09\x68\xe2\x64\xb8" + "\x5f\x75\x74\x97\x91\x54\xda\xc2" 
+ "\x73\x2c\x1e\x5a\x84\x48\x01\x1a" + "\x0d\x8b\x0a\xdf\x07\x2e\xee\x77" + "\x1d\x17\x41\x7a\xc9\x33\x63\xfa" + "\x9f\xc3\x74\x57\x5f\x03\x4c", + .klen = 16, + .len = 255, + }, + { + .key = "\xe5\xf1\x48\x2e\x88\xdb\xc7\x28" + "\xa2\x55\x5d\x2f\x90\x02\xdc\xd3" + "\xf5\xd3\x9e\x87\xd5\x58\x30\x4a", + .iv = "\xa6\x40\x39\xf9\x63\x6c\x2d\xd4" + "\x1b\x71\x05\xa4\x88\x86\x11\xd3", + .ptext = "\xb6\x06\xae\x15\x11\x96\xc1\x44" + "\x44\xc2\x98\xf9\xa8\x0a\x0b", + .ctext = "\x27\x3b\x68\x40\xa9\x5e\x74\x6b" + "\x74\x67\x18\xf9\x37\xed\xed", + .klen = 24, + .len = 15, + }, + { + .key = "\xc8\xa0\x27\x67\x04\x3f\xed\xa5" + "\xb4\x0c\x51\x91\x2d\x27\x77\x33" + "\xa5\xfc\x2a\x9f\x78\xd8\x1c\x68", + .iv = "\x83\x99\x1a\xe2\x84\xca\xa9\x16" + "\x8d\xc4\x2d\x1b\x67\xc8\x86\x21", + .ptext = "\xd6\x22\x85\xb8\x5d\x7e\x26\x2e" + "\xbe\x04\x9d\x0c\x03\x91\x45\x4a" + "\x36", + .ctext = "\x0f\x44\xa9\x62\x72\xec\x12\x26" + "\x3a\xc6\x83\x26\x62\x5e\xb7\x13" + "\x05", + .klen = 24, + .len = 17, + }, + { + .key = "\xc5\x87\x18\x09\x0a\x4e\x66\x3e" + "\x50\x90\x19\x93\xc0\x33\xcf\x80" + "\x3a\x36\x6b\x6c\x43\xd7\xe4\x93", + .iv = "\xdd\x0b\x75\x1f\xee\x2f\xb4\x52" + "\x10\x82\x1f\x79\x8a\xa4\x9b\x87", + .ptext = "\x56\xf9\x13\xce\x9f\x30\x10\x11" + "\x1b\x59\xfd\x39\x5a\x29\xa3\x44" + "\x78\x97\x8c\xf6\x99\x6d\x26\xf1" + "\x32\x60\x6a\xeb\x04\x47\x29\x4c" + "\x7e\x14\xef\x4d\x55\x29\xfe\x36" + "\x37\xcf\x0b\x6e\xf3\xce\x15\xd2", + .ctext = "\x8f\x98\xe1\x5a\x7f\xfe\xc7\x05" + "\x76\xb0\xd5\xde\x90\x52\x2b\xa8" + "\xf3\x6e\x3c\x77\xa5\x33\x63\xdd" + "\x6f\x62\x12\xb0\x80\x10\xc1\x28" + "\x58\xe5\xd6\x24\x44\x04\x55\xf3" + "\x6d\x94\xcb\x2c\x7e\x7a\x85\x79", + .klen = 24, + .len = 48, + }, + { + .key = "\x84\x9b\xe8\x10\x4c\xb3\xd1\x7a" + "\xb3\xab\x4e\x6f\x90\x12\x07\xf8" + "\xef\xde\x42\x09\xbf\x34\x95\xb2", + .iv = "\x66\x62\xf9\x48\x9d\x17\xf7\xdf" + "\x06\x67\xf4\x6d\xf2\xbc\xa2\xe5", + .ptext = "\x2f\xd6\x16\x6b\xf9\x4b\x44\x14" + "\x90\x93\xe5\xfd\x05\xaa\x00\x26" + "\xbd\xab\x11\xb8\xf0\xcb\x11\x72" + "\xdd\xc5\x15\x4f\x4e\x1b\xf8\xc9" + "\x8f\x4a\xd5\x69\xf8\x9e\xfb\x05" + "\x8a\x37\x46\xfe\xfa\x58\x9b\x0e" + "\x72\x90\x9a\x06\xa5\x42\xf4\x7c" + "\x35\xd5\x64\x70\x72\x67\xfc\x8b" + "\xab\x5a\x2f\x64\x9b\xa1\xec\xe7" + "\xe6\x92\x69\xdb\x62\xa4\xe7\x44" + "\x88\x28\xd4\x52\x64\x19\xa9\xd7" + "\x0c\x00\xe6\xe7\xc1\x28\xc1\xf5" + "\x72\xc5\xfa\x09\x22\x2e\xf4\x82" + "\xa3\xdc\xc1\x68\xf9\x29\x55\x8d" + "\x04\x67\x13\xa6\x52\x04\x3c\x0c" + "\x14\xf2\x87\x23\x61\xab\x82\xcb" + "\x49\x5b\x6b\xd4\x4f\x0d\xd4\x95" + "\x82\xcd\xe3\x69\x47\x1b\x31\x73" + "\x73\x77\xc1\x53\x7d\x43\x5e\x4a" + "\x80\x3a\xca\x9c\xc7\x04\x1a\x31" + "\x8e\xe6\x76\x7f\xe1\xb3\xd0\x57" + "\xa2\xb2\xf6\x09\x51\xc9\x6d\xbc" + "\x79\xed\x57\x50\x36\xd2\x93\xa4" + "\x40\x5d\xac\x3a\x3b\xb6\x2d\x89" + "\x78\xa2\xbd\x23\xec\x35\x06\xf0" + "\xa8\xc8\xc9\xb0\xe3\x28\x2b\xba" + "\x70\xa0\xfe\xed\x13\xc4\xd7\x90" + "\xb1\x6a\xe0\xe1\x30\x71\x15\xd0" + "\xe2\xb3\xa6\x4e\xb0\x01\xf9\xe7" + "\x59\xc6\x1e\xed\x46\x2b\xe3\xa8" + "\x22\xeb\x7f\x1c\xd9\xcd\xe0\xa6" + "\x72\x42\x2c\x06\x75\xbb\xb7\x6b" + "\xca\x49\x5e\xa1\x47\x8d\x9e\xfe" + "\x60\xcc\x34\x95\x8e\xfa\x1e\x3e" + "\x85\x4b\x03\x54\xea\x34\x1c\x41" + "\x90\x45\xa6\xbe\xcf\x58\x4f\xca" + "\x2c\x79\xc0\x3e\x8f\xd7\x3b\xd4" + "\x55\x74\xa8\xe1\x57\x09\xbf\xab" + "\x2c\xf9\xe4\xdd\x17\x99\x57\x60" + "\x4b\x88\x2a\x7f\x43\x86\xb9\x9a" + "\x60\xbf\x4c\xcf\x9b\x41\xb8\x99" + "\x69\x15\x4f\x91\x4d\xeb\xdf\x6f" + "\xcc\x4c\xf9\x6f\xf2\x33\x23\xe7" + "\x02\x44\xaa\xa2\xfa\xb1\x39\xa5" + 
"\xff\x88\xf5\x37\x02\x33\x24\xfc" + "\x79\x11\x4c\x94\xc2\x31\x87\x9c" + "\x53\x19\x99\x32\xe4\xde\x18\xf4" + "\x8f\xe2\xe8\xa3\xfb\x0b\xaa\x7c" + "\xdb\x83\x0f\xf6\xc0\x8a\x9b\xcd" + "\x7b\x16\x05\x5b\xe4\xb4\x34\x03" + "\xe3\x8f\xc9\x4b\x56\x84\x2a\x4c" + "\x36\x72\x3c\x84\x4f\xba\xa2\x7f" + "\xf7\x1b\xba\x4d\x8a\xb8\x5d\x51" + "\x36\xfb\xef\x23\x18\x6f\x33\x2d" + "\xbb\x06\x24\x8e\x33\x98\x6e\xcd" + "\x63\x11\x18\x6b\xcc\x1b\x66\xb9" + "\x38\x8d\x06\x8d\x98\x1a\xef\xaa" + "\x35\x4a\x90\xfa\xb1\xd3\xcc\x11" + "\x50\x4c\x54\x18\x60\x5d\xe4\x11" + "\xfc\x19\xe1\x53\x20\x5c\xe7\xef" + "\x8a\x2b\xa8\x82\x51\x5f\x5d\x43" + "\x34\xe5\xcf\x7b\x1b\x6f\x81\x19" + "\xb7\xdf\xa8\x9e\x81\x89\x5f\x33" + "\x69\xaf\xde\x89\x68\x88\xf0\x71", + .ctext = "\xab\x15\x46\x5b\xed\x4f\xa8\xac" + "\xbf\x31\x30\x84\x55\xa4\xb8\x98" + "\x79\xba\xa0\x15\xa4\x55\x20\xec" + "\xf9\x94\x71\xe6\x6a\x6f\xee\x87" + "\x2e\x3a\xa2\x95\xae\x6e\x56\x09" + "\xe9\xc0\x0f\xe2\xc6\xb7\x30\xa9" + "\x73\x8e\x59\x7c\xfd\xe3\x71\xf7" + "\xae\x8b\x91\xab\x5e\x36\xe9\xa8" + "\xff\x17\xfa\xa2\x94\x93\x11\x42" + "\x67\x96\x99\xc5\xf0\xad\x2a\x57" + "\xf9\xa6\x70\x4a\xdf\x71\xff\xc0" + "\xe2\xaf\x9a\xae\x57\x58\x13\x3b" + "\x2d\xf1\xc7\x8f\xdb\x8a\xcc\xce" + "\x53\x1a\x69\x55\x39\xc8\xbe\xc3" + "\x2d\xb1\x03\xd9\xa3\x99\xf4\x8d" + "\xd9\x2d\x27\xae\xa5\xe7\x77\x7f" + "\xbb\x88\x84\xea\xfa\x19\x3f\x44" + "\x61\x21\x8a\x1f\xbe\xac\x60\xb4" + "\xaf\xe9\x00\xab\xef\x3c\x53\x56" + "\xcd\x4b\x53\xd8\x9b\xfe\x88\x23" + "\x5b\x85\x76\x08\xec\xd1\x6e\x4a" + "\x87\xa4\x7d\x29\x4e\x4f\x3f\xc9" + "\xa4\xab\x63\xea\xdd\xef\x9f\x79" + "\x38\x18\x7d\x90\x90\xf9\x12\x57" + "\x1d\x89\xea\xfe\xd4\x47\x45\x32" + "\x6a\xf6\xe7\xde\x22\x7e\xee\xc1" + "\xbc\x2d\xc3\xbb\xe5\xd4\x13\xac" + "\x63\xff\x5b\xb1\x05\x96\xd5\xf3" + "\x07\x9a\x62\xb6\x30\xea\x7d\x1e" + "\xee\x75\x0a\x1b\xcc\x6e\x4d\xa7" + "\xf7\x4d\x74\xd8\x60\x32\x5e\xd0" + "\x93\xd7\x19\x90\x4e\x26\xdb\xe4" + "\x5e\xd4\xa8\xb9\x76\xba\x56\x91" + "\xc4\x75\x04\x1e\xc2\x77\x24\x6f" + "\xf9\xe8\x4a\xec\x7f\x86\x95\xb3" + "\x5c\x2c\x97\xab\xf0\xf7\x74\x5b" + "\x0b\xc2\xda\x42\x40\x34\x16\xed" + "\x06\xc1\x25\x53\x17\x0d\x81\x4e" + "\xe6\xf2\x0f\x6d\x94\x3c\x90\x7a" + "\xae\x20\xe9\x3f\xf8\x18\x67\x6a" + "\x49\x1e\x41\xb6\x46\xab\xc8\xa7" + "\xcb\x19\x96\xf5\x99\xc0\x66\x3e" + "\x77\xcf\x73\x52\x83\x2a\xe2\x48" + "\x27\x6c\xeb\xe7\xe7\xc4\xd5\x6a" + "\x40\x67\xbc\xbf\x6b\x3c\xf3\xbb" + "\x51\x5e\x31\xac\x03\x81\xab\x61" + "\xfa\xa5\xa6\x7d\x8b\xc3\x8a\x75" + "\x28\x7a\x71\x9c\xac\x8f\x76\xfc" + "\xf9\x6c\x5d\x9b\xd7\xf6\x36\x2d" + "\x61\xd5\x61\xaa\xdd\x01\xfc\x57" + "\x91\x10\xcd\xcd\x6d\x27\x63\x24" + "\x67\x46\x7a\xbb\x61\x56\x39\xb1" + "\xd6\x79\xfe\x77\xca\xd6\x73\x59" + "\x6e\x58\x11\x90\x03\x26\x74\x2a" + "\xfa\x52\x12\x47\xfb\x12\xeb\x3e" + "\x88\xf0\x52\x6c\xc0\x54\x7a\x88" + "\x8c\xe5\xde\x9e\xba\xb9\xf2\xe1" + "\x97\x2e\x5c\xbd\xf4\x13\x7e\xf3" + "\xc4\xe1\x87\xa5\x35\xfa\x7c\x71" + "\x1a\xc9\xf4\xa8\x57\xe2\x5a\x6b" + "\x14\xe0\x73\xaf\x56\x6b\xa0\x00" + "\x9e\x5f\x64\xac\x00\xfb\xc4\x92" + "\xe5\xe2\x8a\xb2\x9e\x75\x49\x85" + "\x25\x66\xa5\x1a\xf9\x7d\x1d\x60", + .klen = 24, + .len = 512, + }, + { + .key = "\x05\x60\x3a\x7e\x60\x90\x46\x18" + "\x6c\x60\xba\xeb\x12\xd7\xbe\xd1" + "\xd3\xf6\x10\x46\x9d\xf1\x0c\xb4" + "\x73\xe3\x93\x27\xa8\x2c\x13\xaa", + .iv = "\xf5\x96\xd1\xb6\xcb\x44\xd8\xd0" + "\x3e\xdb\x92\x80\x08\x94\xcd\xd3", + .ptext = "\x78", + .ctext = "\xc5", + .klen = 32, + .len = 1, + }, + { + .key = "\x35\xca\x38\xf3\xd9\xd6\x34\xef" + "\xcd\xee\xa3\x26\x86\xba\xfb\x45" + 
"\x01\xfa\x52\x67\xff\xc5\x9d\xaa" + "\x64\x9a\x05\xbb\x85\x20\xa7\xf2", + .iv = "\xe3\xda\xf5\xff\x42\x59\x87\x86" + "\xee\x7b\xd6\xb4\x6a\x25\x44\xff", + .ptext = "\x44\x67\x1e\x04\x53\xd2\x4b\xd9" + "\x96\x33\x07\x54\xe4\x8e\x20", + .ctext = "\xcc\x55\x40\x79\x47\x5c\x8b\xa6" + "\xca\x7b\x9f\x50\xe3\x21\xea", + .klen = 32, + .len = 15, + }, + { + .key = "\xaf\xd9\x14\x14\xd5\xdb\xc9\xce" + "\x76\x5c\x5a\xbf\x43\x05\x29\x24" + "\xc4\x13\x68\xcc\xe8\x37\xbd\xb9" + "\x41\x20\xf5\x53\x48\xd0\xa2\xd6", + .iv = "\xa7\xb4\x00\x08\x79\x10\xae\xf5" + "\x02\xbf\x85\xb2\x69\x4c\xc6\x04", + .ptext = "\xac\x6a\xa8\x0c\xb0\x84\xbf\x4c" + "\xae\x94\x20\x58\x7e\x00\x93\x89", + .ctext = "\xd5\xaa\xe2\xe9\x86\x4c\x95\x4e" + "\xde\xb6\x15\xcb\xdc\x1f\x13\x38", + .klen = 32, + .len = 16, + }, + { + .key = "\xed\xe3\x8b\xe7\x1c\x17\xbf\x4a" + "\x02\xe2\xfc\x76\xac\xf5\x3c\x00" + "\x5d\xdc\xfc\x83\xeb\x45\xb4\xcb" + "\x59\x62\x60\xec\x69\x9c\x16\x45", + .iv = "\xe4\x0e\x2b\x90\xd2\xfa\x94\x2e" + "\x10\xe5\x64\x2b\x97\x28\x15\xc7", + .ptext = "\xe6\x53\xff\x60\x0e\xc4\x51\xe4" + "\x93\x4d\xe5\x55\xc5\xd9\xad\x48" + "\x52", + .ctext = "\xba\x25\x28\xf5\xcf\x31\x91\x80" + "\xda\x2b\x95\x5f\x20\xcb\xfb\x9f" + "\xc6", + .klen = 32, + .len = 17, + }, + { + .key = "\x77\x5c\xc0\x73\x9a\x64\x97\x91" + "\x2f\xee\xe0\x20\xc2\x04\x59\x2e" + "\x97\xd2\xa7\x70\xb3\xb0\x21\x6b" + "\x8f\xbf\xb8\x51\xa8\xea\x0f\x62", + .iv = "\x31\x8e\x1f\xcd\xfd\x23\xeb\x7f" + "\x8a\x1f\x1b\x23\x53\x27\x44\xe5", + .ptext = "\xcd\xff\x8c\x9b\x94\x5a\x51\x3f" + "\x40\x93\x56\x93\x66\x39\x63\x1f" + "\xbf\xe6\xa4\xfa\xbe\x79\x93\x03" + "\xf5\x66\x74\x16\xfc\xe4\xce", + .ctext = "\x8b\xd3\xc3\xce\x66\xf8\x66\x4c" + "\xad\xd6\xf5\x0f\xd8\x99\x5a\x75" + "\xa1\x3c\xab\x0b\x21\x36\x57\x72" + "\x88\x29\xe9\xea\x4a\x8d\xe9", + .klen = 32, + .len = 31, + }, + { + .key = "\xa1\x2f\x4d\xde\xfe\xa1\xff\xa8" + "\x73\xdd\xe3\xe2\x95\xfc\xea\x9c" + "\xd0\x80\x42\x0c\xb8\x43\x3e\x99" + "\x39\x38\x0a\x8c\xe8\x45\x3a\x7b", + .iv = "\x32\xc4\x6f\xb1\x14\x43\xd1\x87" + "\xe2\x6f\x5a\x58\x02\x36\x7e\x2a", + .ptext = "\x9e\x5c\x1e\xf1\xd6\x7d\x09\x57" + "\x18\x48\x55\xda\x7d\x44\xf9\x6d" + "\xac\xcd\x59\xbb\x10\xa2\x94\x67" + "\xd1\x6f\xfe\x6b\x4a\x11\xe8\x04" + "\x09\x26\x4f\x8d\x5d\xa1\x7b\x42" + "\xf9\x4b\x66\x76\x38\x12\xfe\xfe", + .ctext = "\x42\xbc\xa7\x64\x15\x9a\x04\x71" + "\x2c\x5f\x94\xba\x89\x3a\xad\xbc" + "\x87\xb3\xf4\x09\x4f\x57\x06\x18" + "\xdc\x84\x20\xf7\x64\x85\xca\x3b" + "\xab\xe6\x33\x56\x34\x60\x5d\x4b" + "\x2e\x16\x13\xd4\x77\xde\x2d\x2b", + .klen = 32, + .len = 48, + }, + { + .key = "\xfb\xf5\xb7\x3d\xa6\x95\x42\xbf" + "\xd2\x94\x6c\x74\x0f\xbc\x5a\x28" + "\x35\x3c\x51\x58\x84\xfb\x7d\x11" + "\x16\x1e\x00\x97\x37\x08\xb7\x16", + .iv = "\x9b\x53\x57\x40\xe6\xd9\xa7\x27" + "\x78\xd4\x9b\xd2\x29\x1d\x24\xa9", + .ptext = "\x8b\x02\x60\x0a\x3e\xb7\x10\x59" + "\xc3\xac\xd5\x2a\x75\x81\xf2\xdb" + "\x55\xca\x65\x86\x44\xfb\xfe\x91" + "\x26\xbb\x45\xb2\x46\x22\x3e\x08" + "\xa2\xbf\x46\xcb\x68\x7d\x45\x7b" + "\xa1\x6a\x3c\x6e\x25\xeb\xed\x31" + "\x7a\x8b\x47\xf9\xde\xec\x3d\x87" + "\x09\x20\x2e\xfa\xba\x8b\x9b\xc5" + "\x6c\x25\x9c\x9d\x2a\xe8\xab\x90" + "\x3f\x86\xee\x61\x13\x21\xd4\xde" + "\xe1\x0c\x95\xfc\x5c\x8a\x6e\x0a" + "\x73\xcf\x08\x69\x44\x4e\xde\x25" + "\xaf\xaa\x56\x04\xc4\xb3\x60\x44" + "\x3b\x8b\x3d\xee\xae\x42\x4b\xd2" + "\x9a\x6c\xa0\x8e\x52\x06\xb2\xd1" + "\x5d\x38\x30\x6d\x27\x9b\x1a\xd8", + .ctext = "\xa3\x78\x33\x78\x95\x95\x97\x07" + "\x53\xa3\xa1\x5b\x18\x32\x27\xf7" + "\x09\x12\x53\x70\x83\xb5\x6a\x9f" + 
"\x26\x6d\x10\x0d\xe0\x1c\xe6\x2b" + "\x70\x00\xdc\xa1\x60\xef\x1b\xee" + "\xc5\xa5\x51\x17\xae\xcc\xf2\xed" + "\xc4\x60\x07\xdf\xd5\x7a\xe9\x90" + "\x3c\x9f\x96\x5d\x72\x65\x5d\xef" + "\xd0\x94\x32\xc4\x85\x90\x78\xa1" + "\x2e\x64\xf6\xee\x8e\x74\x3f\x20" + "\x2f\x12\x3b\x3d\xd5\x39\x8e\x5a" + "\xf9\x8f\xce\x94\x5d\x82\x18\x66" + "\x14\xaf\x4c\xfe\xe0\x91\xc3\x4a" + "\x85\xcf\xe7\xe8\xf7\xcb\xf0\x31" + "\x88\x7d\xc9\x5b\x71\x9d\x5f\xd2" + "\xfa\xed\xa6\x24\xda\xbb\xb1\x84", + .klen = 32, + .len = 128, + }, + { + .key = "\x32\x37\x2b\x8f\x7b\xb1\x23\x79" + "\x05\x52\xde\x05\xf1\x68\x3f\x6c" + "\xa4\xae\xbc\x21\xc2\xc6\xf0\xbd" + "\x0f\x20\xb7\xa4\xc5\x05\x7b\x64", + .iv = "\xff\x26\x4e\x67\x48\xdd\xcf\xfe" + "\x42\x09\x04\x98\x5f\x1e\xfa\x80", + .ptext = "\x99\xdc\x3b\x19\x41\xf9\xff\x6e" + "\x76\xb5\x03\xfa\x61\xed\xf8\x44" + "\x70\xb9\xf0\x83\x80\x6e\x31\x77" + "\x77\xe4\xc7\xb4\x77\x02\xab\x91" + "\x82\xc6\xf8\x7c\x46\x61\x03\x69" + "\x09\xa0\xf7\x12\xb7\x81\x6c\xa9" + "\x10\x5c\xbb\x55\xb3\x44\xed\xb5" + "\xa2\x52\x48\x71\x90\x5d\xda\x40" + "\x0b\x7f\x4a\x11\x6d\xa7\x3d\x8e" + "\x1b\xcd\x9d\x4e\x75\x8b\x7d\x87" + "\xe5\x39\x34\x32\x1e\xe6\x8d\x51" + "\xd4\x1f\xe3\x1d\x50\xa0\x22\x37" + "\x7c\xb0\xd9\xfb\xb6\xb2\x16\xf6" + "\x6d\x26\xa0\x4e\x8c\x6a\xe6\xb6" + "\xbe\x4c\x7c\xe3\x88\x10\x18\x90" + "\x11\x50\x19\x90\xe7\x19\x3f\xd0" + "\x31\x15\x0f\x06\x96\xfe\xa7\x7b" + "\xc3\x32\x88\x69\xa4\x12\xe3\x64" + "\x02\x30\x17\x74\x6c\x88\x7c\x9b" + "\xd6\x6d\x75\xdf\x11\x86\x70\x79" + "\x48\x7d\x34\x3e\x33\x58\x07\x8b" + "\xd2\x50\xac\x35\x15\x45\x05\xb4" + "\x4d\x31\x97\x19\x87\x23\x4b\x87" + "\x53\xdc\xa9\x19\x78\xf1\xbf\x35" + "\x30\x04\x14\xd4\xcf\xb2\x8c\x87" + "\x7d\xdb\x69\xc9\xcd\xfe\x40\x3e" + "\x8d\x66\x5b\x61\xe5\xf0\x2d\x87" + "\x93\x3a\x0c\x2b\x04\x98\x05\xc2" + "\x56\x4d\xc4\x6c\xcd\x7a\x98\x7e" + "\xe2\x2d\x79\x07\x91\x9f\xdf\x2f" + "\x72\xc9\x8f\xcb\x0b\x87\x1b\xb7" + "\x04\x86\xcb\x47\xfa\x5d\x03", + .ctext = "\x0b\x00\xf7\xf2\xc8\x6a\xba\x9a" + "\x0a\x97\x18\x7a\x00\xa0\xdb\xf4" + "\x5e\x8e\x4a\xb7\xe0\x51\xf1\x75" + "\x17\x8b\xb4\xf1\x56\x11\x05\x9f" + "\x2f\x2e\xba\x67\x04\xe1\xb4\xa5" + "\xfc\x7c\x8c\xad\xc6\xb9\xd1\x64" + "\xca\xbd\x5d\xaf\xdb\x65\x48\x4f" + "\x1b\xb3\x94\x5c\x0b\xd0\xee\xcd" + "\xb5\x7f\x43\x8a\xd8\x8b\x66\xde" + "\xd2\x9c\x13\x65\xa4\x47\xa7\x03" + "\xc5\xa1\x46\x8f\x2f\x84\xbc\xef" + "\x48\x9d\x9d\xb5\xbd\x43\xff\xd2" + "\xd2\x7a\x5a\x13\xbf\xb4\xf6\x05" + "\x17\xcd\x01\x12\xf0\x35\x27\x96" + "\xf4\xc1\x65\xf7\x69\xef\x64\x1b" + "\x6e\x4a\xe8\x77\xce\x83\x01\xb7" + "\x60\xe6\x45\x2a\xcd\x41\x4a\xb5" + "\x8e\xcc\x45\x93\xf1\xd6\x64\x5f" + "\x32\x60\xe4\x29\x4a\x82\x6c\x86" + "\x16\xe4\xcc\xdb\x5f\xc8\x11\xa6" + "\xfe\x88\xd6\xc3\xe5\x5c\xbb\x67" + "\xec\xa5\x7b\xf5\xa8\x4f\x77\x25" + "\x5d\x0c\x2a\x99\xf9\xb9\xd1\xae" + "\x3c\x83\x2a\x93\x9b\x66\xec\x68" + "\x2c\x93\x02\x8a\x8a\x1e\x2f\x50" + "\x09\x37\x19\x5c\x2a\x3a\xc2\xcb" + "\xcb\x89\x82\x81\xb7\xbb\xef\x73" + "\x8b\xc9\xae\x42\x96\xef\x70\xc0" + "\x89\xc7\x3e\x6a\x26\xc3\xe4\x39" + "\x53\xa9\xcf\x63\x7d\x05\xf3\xff" + "\x52\x04\xf6\x7f\x23\x96\xe9\xf7" + "\xff\xd6\x50\xa3\x0e\x20\x71", + .klen = 32, + .len = 255, + }, + { + .key = "\x39\x5f\xf4\x9c\x90\x3a\x9a\x25" + "\x15\x11\x79\x39\xed\x26\x5e\xf6" + "\xda\xcf\x33\x4f\x82\x97\xab\x10" + "\xc1\x55\x48\x82\x80\xa8\x02\xb2", + .iv = "\x82\x60\xd9\x06\xeb\x40\x99\x76" + "\x08\xc5\xa4\x83\x45\xb8\x38\x5a", + .ptext = "\xa1\xa8\xac\xac\x08\xaf\x8f\x84" + "\xbf\xcc\x79\x31\x5e\x61\x01\xd1" + "\x4d\x5f\x9b\xcd\x91\x92\x9a\xa1" + 
"\x99\x0d\x49\xb2\xd7\xfd\x25\x93" + "\x51\x96\xbd\x91\x8b\x08\xf1\xc6" + "\x0d\x17\xf6\xef\xfd\xd2\x78\x16" + "\xc8\x08\x27\x7b\xca\x98\xc6\x12" + "\x86\x11\xdb\xd5\x08\x3d\x5a\x2c" + "\xcf\x15\x0e\x9b\x42\x78\xeb\x1f" + "\x52\xbc\xd7\x5a\x8a\x33\x6c\x14" + "\xfc\x61\xad\x2e\x1e\x03\x66\xea" + "\x79\x0e\x88\x88\xde\x93\xe3\x81" + "\xb5\xc4\x1c\xe6\x9c\x08\x18\x8e" + "\xa0\x87\xda\xe6\xf8\xcb\x30\x44" + "\x2d\x4e\xc0\xa3\x60\xf9\x62\x7b" + "\x4b\xd5\x61\x6d\xe2\x67\x95\x54" + "\x10\xd1\xca\x22\xe8\xb6\xb1\x3a" + "\x2d\xd7\x35\x5b\x22\x88\x55\x67" + "\x3d\x83\x8f\x07\x98\xa8\xf2\xcf" + "\x04\xb7\x9e\x52\xca\xe0\x98\x72" + "\x5c\xc1\x00\xd4\x1f\x2c\x61\xf3" + "\xe8\x40\xaf\x4a\xee\x66\x41\xa0" + "\x02\x77\x29\x30\x65\x59\x4b\x20" + "\x7b\x0d\x80\x97\x27\x7f\xd5\x90" + "\xbb\x9d\x76\x90\xe5\x43\x43\x72" + "\xd0\xd4\x14\x75\x66\xb3\xb6\xaf" + "\x09\xe4\x23\xb0\x62\xad\x17\x28" + "\x39\x26\xab\xf5\xf7\x5c\xb6\x33" + "\xbd\x27\x09\x5b\x29\xe4\x40\x0b" + "\xc1\x26\x32\xdb\x9a\xdf\xf9\x5a" + "\xae\x03\x2c\xa4\x40\x84\x9a\xb7" + "\x4e\x47\xa8\x0f\x23\xc7\xbb\xcf" + "\x2b\xf2\x32\x6c\x35\x6a\x91\xba" + "\x0e\xea\xa2\x8b\x2f\xbd\xb5\xea" + "\x6e\xbc\xb5\x4b\x03\xb3\x86\xe0" + "\x86\xcf\xba\xcb\x38\x2c\x32\xa6" + "\x6d\xe5\x28\xa6\xad\xd2\x7f\x73" + "\x43\x14\xf8\xb1\x99\x12\x2d\x2b" + "\xdf\xcd\xf2\x81\x43\x94\xdf\xb1" + "\x17\xc9\x33\xa6\x3d\xef\x96\xb8" + "\xd6\x0d\x00\xec\x49\x66\x85\x5d" + "\x44\x62\x12\x04\x55\x5c\x48\xd3" + "\xbd\x73\xac\x54\x8f\xbf\x97\x8e" + "\x85\xfd\xc2\xa1\x25\x32\x38\x6a" + "\x1f\xac\x57\x3c\x4f\x56\x73\xf2" + "\x1d\xb6\x48\x68\xc7\x0c\xe7\x60" + "\xd2\x8e\x4d\xfb\xc7\x20\x7b\xb7" + "\x45\x28\x12\xc6\x26\xae\xea\x7c" + "\x5d\xe2\x46\xb5\xae\xe1\xc3\x98" + "\x6f\x72\xd5\xa2\xfd\xed\x40\xfd" + "\xf9\xdf\x61\xec\x45\x2c\x15\xe0" + "\x1e\xbb\xde\x71\x37\x5f\x73\xc2" + "\x11\xcc\x6e\x6d\xe1\xb5\x1b\xd2" + "\x2a\xdd\x19\x8a\xc2\xe1\xa0\xa4" + "\x26\xeb\xb2\x2c\x4f\x77\x52\xf1" + "\x42\x72\x6c\xad\xd7\x78\x5d\x72" + "\xc9\x16\x26\x25\x1b\x4c\xe6\x58" + "\x79\x57\xb5\x06\x15\x4f\xe5\xba" + "\xa2\x7f\x2d\x5b\x87\x8a\x44\x70" + "\xec\xc7\xef\x84\xae\x60\xa2\x61" + "\x86\xe9\x18\xcd\x28\xc4\xa4\xf5" + "\xbc\x84\xb8\x86\xa0\xba\xf1\xf1" + "\x08\x3b\x32\x75\x35\x22\x7a\x65" + "\xca\x48\xe8\xef\x6e\xe2\x8e\x00", + .ctext = "\x2f\xae\xd8\x67\xeb\x15\xde\x75" + "\x53\xa3\x0e\x5a\xcf\x1c\xbe\xea" + "\xde\xf9\xcf\xc2\x9f\xfd\x0f\x44" + "\xc0\xe0\x7a\x76\x1d\xcb\x4a\xf8" + "\x35\xd6\xe3\x95\x98\x6b\x3f\x89" + "\xc4\xe6\xb6\x6f\xe1\x8b\x39\x4b" + "\x1c\x6c\x77\xe4\xe1\x8a\xbc\x61" + "\x00\x6a\xb1\x37\x2f\x45\xe6\x04" + "\x52\x0b\xfc\x1e\x32\xc1\xd8\x9d" + "\xfa\xdd\x67\x5c\xe0\x75\x83\xd0" + "\x21\x9e\x02\xea\xc0\x7f\xc0\x29" + "\xb3\x6c\xa5\x97\xb3\x29\x82\x1a" + "\x94\xa5\xb4\xb6\x49\xe5\xa5\xad" + "\x95\x40\x52\x7c\x84\x88\xa4\xa8" + "\x26\xe4\xd9\x5d\x41\xf2\x93\x7b" + "\xa4\x48\x1b\x66\x91\xb9\x7c\xc2" + "\x99\x29\xdf\xd8\x30\xac\xd4\x47" + "\x42\xa0\x14\x87\x67\xb8\xfd\x0b" + "\x1e\xcb\x5e\x5c\x9a\xc2\x04\x8b" + "\x17\x29\x9d\x99\x7f\x86\x4c\xe2" + "\x5c\x96\xa6\x0f\xb6\x47\x33\x5c" + "\xe4\x50\x49\xd5\x4f\x92\x0b\x9a" + "\xbc\x52\x4c\x41\xf5\xc9\x3e\x76" + "\x55\x55\xd4\xdc\x71\x14\x23\xfc" + "\x5f\xd5\x08\xde\xa0\xf7\x28\xc0" + "\xe1\x61\xac\x64\x66\xf6\xd1\x31" + "\xe4\xa4\xa9\xed\xbc\xad\x4f\x3b" + "\x59\xb9\x48\x1b\xe7\xb1\x6f\xc6" + "\xba\x40\x1c\x0b\xe7\x2f\x31\x65" + "\x85\xf5\xe9\x14\x0a\x31\xf5\xf3" + "\xc0\x1c\x20\x35\x73\x38\x0f\x8e" + "\x39\xf0\x68\xae\x08\x9c\x87\x4b" + "\x42\xfc\x22\x17\xee\x96\x51\x2a" + "\xd8\x57\x5a\x35\xea\x72\x74\xfc" + 
"\xb3\x0e\x69\x9a\xe1\x4f\x24\x90" + "\xc5\x4b\xe5\xd7\xe3\x82\x2f\xc5" + "\x62\x46\x3e\xab\x72\x4e\xe0\xf3" + "\x90\x09\x4c\xb2\xe1\xe8\xa0\xf5" + "\x46\x40\x2b\x47\x85\x3c\x21\x90" + "\x3d\xad\x25\x5a\x36\xdf\xe5\xbc" + "\x7e\x80\x4d\x53\x77\xf1\x79\xa6" + "\xec\x22\x80\x88\x68\xd6\x2d\x8b" + "\x3e\xf7\x52\xc7\x2a\x20\x42\x5c" + "\xed\x99\x4f\x32\x80\x00\x7e\x73" + "\xd7\x6d\x7f\x7d\x42\x54\x4a\xfe" + "\xff\x6f\x61\xca\x2a\xbb\x4f\xeb" + "\x4f\xe4\x4e\xaf\x2c\x4f\x82\xcd" + "\xa1\xa7\x11\xb3\x34\x33\xcf\x32" + "\x63\x0e\x24\x3a\x35\xbe\x06\xd5" + "\x17\xcb\x02\x30\x33\x6e\x8c\x49" + "\x40\x6e\x34\x8c\x07\xd4\x3e\xe6" + "\xaf\x78\x6d\x8c\x10\x5f\x21\x58" + "\x49\x26\xc5\xaf\x0d\x7d\xd4\xaf" + "\xcd\x5b\xa1\xe3\xf6\x39\x1c\x9b" + "\x8e\x00\xa1\xa7\x9e\x17\x4a\xc0" + "\x54\x56\x9e\xcf\xcf\x88\x79\x8d" + "\x50\xf7\x56\x8e\x0a\x73\x46\x6b" + "\xc3\xb9\x9b\x6c\x7d\xc4\xc8\xb6" + "\x03\x5f\x30\x62\x7d\xe6\xdb\x15" + "\xe1\x39\x02\x8c\xff\xda\xc8\x43" + "\xf2\xa9\xbf\x00\xe7\x3a\x61\x89" + "\xdf\xb0\xca\x7d\x8c\x8a\x6a\x9f" + "\x18\x89\x3d\x39\xac\x36\x6f\x05" + "\x1f\xb5\xda\x00\xea\xe1\x51\x21", + .klen = 32, + .len = 512, + }, + +}; + +/* + * Test vectors generated using https://github.com/google/hctr2 + * + * To ensure compatibility with RFC 8452, some tests were sourced from + * https://datatracker.ietf.org/doc/html/rfc8452 + */ +static const struct hash_testvec polyval_tv_template[] = { + { // From RFC 8452 + .key = "\x31\x07\x28\xd9\x91\x1f\x1f\x38" + "\x37\xb2\x43\x16\xc3\xfa\xb9\xa0", + .plaintext = "\x65\x78\x61\x6d\x70\x6c\x65\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x48\x65\x6c\x6c\x6f\x20\x77\x6f" + "\x72\x6c\x64\x00\x00\x00\x00\x00" + "\x38\x00\x00\x00\x00\x00\x00\x00" + "\x58\x00\x00\x00\x00\x00\x00\x00", + .digest = "\xad\x7f\xcf\x0b\x51\x69\x85\x16" + "\x62\x67\x2f\x3c\x5f\x95\x13\x8f", + .psize = 48, + .ksize = 16, + }, + { // From RFC 8452 + .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" + "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", + .plaintext = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .digest = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .psize = 16, + .ksize = 16, + }, + { // From RFC 8452 + .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" + "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", + .plaintext = "\x01\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x40\x00\x00\x00\x00\x00\x00\x00", + .digest = "\xeb\x93\xb7\x74\x09\x62\xc5\xe4" + "\x9d\x2a\x90\xa7\xdc\x5c\xec\x74", + .psize = 32, + .ksize = 16, + }, + { // From RFC 8452 + .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" + "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", + .plaintext = "\x01\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x02\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x03\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x80\x01\x00\x00\x00\x00\x00\x00", + .digest = "\x81\x38\x87\x46\xbc\x22\xd2\x6b" + "\x2a\xbc\x3d\xcb\x15\x75\x42\x22", + .psize = 64, + .ksize = 16, + }, + { // From RFC 8452 + .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" + "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", + .plaintext = "\x01\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x02\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x03\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x04\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x02\x00\x00\x00\x00\x00\x00", + .digest = "\x1e\x39\xb6\xd3\x34\x4d\x34\x8f" + "\x60\x44\xf8\x99\x35\xd1\xcf\x78", + .psize = 80, + .ksize = 16, + }, + { // From RFC 8452 + .key = "\xd9\xb3\x60\x27\x96\x94\x94\x1a" + "\xc5\xdb\xc6\x98\x7a\xda\x73\x77", + .plaintext = "\x01\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x02\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x03\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x04\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x05\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x08\x00\x00\x00\x00\x00\x00\x00" + "\x00\x02\x00\x00\x00\x00\x00\x00", + .digest = "\xff\xcd\x05\xd5\x77\x0f\x34\xad" + "\x92\x67\xf0\xa5\x99\x94\xb1\x5a", + .psize = 96, + .ksize = 16, + }, + { // Random ( 1) + .key = "\x90\xcc\xac\xee\xba\xd7\xd4\x68" + "\x98\xa6\x79\x70\xdf\x66\x15\x6c", + .plaintext = "", + .digest = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .psize = 0, + .ksize = 16, + }, + { // Random ( 1) + .key = "\xc1\x45\x71\xf0\x30\x07\x94\xe7" + "\x3a\xdd\xe4\xc6\x19\x2d\x02\xa2", + .plaintext = "\xc1\x5d\x47\xc7\x4c\x7c\x5e\x07" + "\x85\x14\x8f\x79\xcc\x73\x83\xf7" + "\x35\xb8\xcb\x73\x61\xf0\x53\x31" + "\xbf\x84\xde\xb6\xde\xaf\xb0\xb8" + "\xb7\xd9\x11\x91\x89\xfd\x1e\x4c" + "\x84\x4a\x1f\x2a\x87\xa4\xaf\x62" + "\x8d\x7d\x58\xf6\x43\x35\xfc\x53" + "\x8f\x1a\xf6\x12\xe1\x13\x3f\x66" + "\x91\x4b\x13\xd6\x45\xfb\xb0\x7a" + "\xe0\x8b\x8e\x99\xf7\x86\x46\x37" + "\xd1\x22\x9e\x52\xf3\x3f\xd9\x75" + "\x2c\x2c\xc6\xbb\x0e\x08\x14\x29" + "\xe8\x50\x2f\xd8\xbe\xf4\xe9\x69" + "\x4a\xee\xf7\xae\x15\x65\x35\x1e", + .digest = "\x00\x4f\x5d\xe9\x3b\xc0\xd6\x50" + "\x3e\x38\x73\x86\xc6\xda\xca\x7f", + .psize = 112, + .ksize = 16, + }, + { // Random ( 1) + .key = "\x37\xbe\x68\x16\x50\xb9\x4e\xb0" + "\x47\xde\xe2\xbd\xde\xe4\x48\x09", + .plaintext = "\x87\xfc\x68\x9f\xff\xf2\x4a\x1e" + "\x82\x3b\x73\x8f\xc1\xb2\x1b\x7a" + "\x6c\x4f\x81\xbc\x88\x9b\x6c\xa3" + "\x9c\xc2\xa5\xbc\x14\x70\x4c\x9b" + "\x0c\x9f\x59\x92\x16\x4b\x91\x3d" + "\x18\x55\x22\x68\x12\x8c\x63\xb2" + "\x51\xcb\x85\x4b\xd2\xae\x0b\x1c" + "\x5d\x28\x9d\x1d\xb1\xc8\xf0\x77" + "\xe9\xb5\x07\x4e\x06\xc8\xee\xf8" + "\x1b\xed\x72\x2a\x55\x7d\x16\xc9" + "\xf2\x54\xe7\xe9\xe0\x44\x5b\x33" + "\xb1\x49\xee\xff\x43\xfb\x82\xcd" + "\x4a\x70\x78\x81\xa4\x34\x36\xe8" + "\x4c\x28\x54\xa6\x6c\xc3\x6b\x78" + "\xe7\xc0\x5d\xc6\x5d\x81\xab\x70" + "\x08\x86\xa1\xfd\xf4\x77\x55\xfd" + "\xa3\xe9\xe2\x1b\xdf\x99\xb7\x80" + "\xf9\x0a\x4f\x72\x4a\xd3\xaf\xbb" + "\xb3\x3b\xeb\x08\x58\x0f\x79\xce" + "\xa5\x99\x05\x12\x34\xd4\xf4\x86" + "\x37\x23\x1d\xc8\x49\xc0\x92\xae" + "\xa6\xac\x9b\x31\x55\xed\x15\xc6" + "\x05\x17\x37\x8d\x90\x42\xe4\x87" + "\x89\x62\x88\x69\x1c\x6a\xfd\xe3" + "\x00\x2b\x47\x1a\x73\xc1\x51\xc2" + "\xc0\x62\x74\x6a\x9e\xb2\xe5\x21" + "\xbe\x90\xb5\xb0\x50\xca\x88\x68" + "\xe1\x9d\x7a\xdf\x6c\xb7\xb9\x98" + "\xee\x28\x62\x61\x8b\xd1\x47\xf9" + "\x04\x7a\x0b\x5d\xcd\x2b\x65\xf5" + "\x12\xa3\xfe\x1a\xaa\x2c\x78\x42" + "\xb8\xbe\x7d\x74\xeb\x59\xba\xba", + .digest = "\xae\x11\xd4\x60\x2a\x5f\x9e\x42" + "\x89\x04\xc2\x34\x8d\x55\x94\x0a", + .psize = 256, + .ksize = 16, + }, + +}; + +/* + * Test vectors generated using https://github.com/google/hctr2 + */ +static const struct cipher_testvec aes_hctr2_tv_template[] = { + { + .key = "\xe1\x15\x66\x3c\x8d\xc6\x3a\xff" + "\xef\x41\xd7\x47\xa2\xcc\x8a\xba", + .iv = "\xc3\xbe\x2a\xcb\xb5\x39\x86\xf1" + 
"\x91\xad\x6c\xf4\xde\x74\x45\x63" + "\x5c\x7a\xd5\xcc\x8b\x76\xef\x0e" + "\xcf\x2c\x60\x69\x37\xfd\x07\x96", + .ptext = "\x65\x75\xae\xd3\xe2\xbc\x43\x5c" + "\xb3\x1a\xd8\x05\xc3\xd0\x56\x29", + .ctext = "\x11\x91\xea\x74\x58\xcc\xd5\xa2" + "\xd0\x55\x9e\x3d\xfe\x7f\xc8\xfe", + .klen = 16, + .len = 16, + }, + { + .key = "\xe7\xd1\x77\x48\x76\x0b\xcd\x34" + "\x2a\x2d\xe7\x74\xca\x11\x9c\xae", + .iv = "\x71\x1c\x49\x62\xd9\x5b\x50\x5e" + "\x68\x87\xbc\xf6\x89\xff\xed\x30" + "\xe4\xe5\xbd\xb6\x10\x4f\x9f\x66" + "\x28\x06\x5a\xf4\x27\x35\xcd\xe5", + .ptext = "\x87\x03\x8f\x06\xa8\x61\x54\xda" + "\x01\x45\xd4\x01\xef\x4a\x22\xcf" + "\x78\x15\x9f\xbd\x64\xbd\x2c\xb9" + "\x40\x1d\x72\xae\x53\x63\xa5", + .ctext = "\x4e\xa1\x05\x27\xb8\x45\xe4\xa1" + "\xbb\x30\xb4\xa6\x12\x74\x63\xd6" + "\x17\xc9\xcc\x2f\x18\x64\xe0\x06" + "\x0a\xa0\xff\x72\x10\x7b\x22", + .klen = 16, + .len = 31, + }, + { + .key = "\x59\x65\x3b\x1d\x43\x5e\xc0\xae" + "\xb8\x9d\x9b\xdd\x22\x03\xbf\xca", + .iv = "\xec\x95\xfa\x5a\xcf\x5e\xd2\x93" + "\xa3\xb5\xe5\xbe\xf3\x01\x7b\x01" + "\xd1\xca\x6c\x06\x82\xf0\xbd\x67" + "\xd9\x6c\xa4\xdc\xb4\x38\x0f\x74", + .ptext = "\x45\xdf\x75\x87\xbc\x72\xce\x55" + "\xc9\xfa\xcb\xfc\x9f\x40\x82\x2b" + "\xc6\x4f\x4f\x5b\x8b\x3b\x6d\x67" + "\xa6\x93\x62\x89\x8c\x19\xf4\xe3" + "\x08\x92\x9c\xc9\x47\x2c\x6e\xd0" + "\xa3\x02\x2b\xdb\x2c\xf2\x8d\x46" + "\xcd\xb0\x9d\x26\x63\x4c\x40\x6b" + "\x79\x43\xe5\xce\x42\xa8\xec\x3b" + "\x5b\xd0\xea\xa4\xe6\xdb\x66\x55" + "\x7a\x76\xec\xab\x7d\x2a\x2b\xbd" + "\xa9\xab\x22\x64\x1a\xa1\xae\x84" + "\x86\x79\x67\xe9\xb2\x50\xbe\x12" + "\x2f\xb2\x14\xf0\xdb\x71\xd8\xa7" + "\x41\x8a\x88\xa0\x6a\x6e\x9d\x2a" + "\xfa\x11\x37\x40\x32\x09\x4c\x47" + "\x41\x07\x31\x85\x3d\xa8\xf7\x64", + .ctext = "\x2d\x4b\x9f\x93\xca\x5a\x48\x26" + "\x01\xcc\x54\xe4\x31\x50\x12\xf0" + "\x49\xff\x59\x42\x68\xbd\x87\x8f" + "\x9e\x62\x96\xcd\xb9\x24\x57\xa4" + "\x0b\x7b\xf5\x2e\x0e\xa8\x65\x07" + "\xab\x05\xd5\xca\xe7\x9c\x6c\x34" + "\x5d\x42\x34\xa4\x62\xe9\x75\x48" + "\x3d\x9e\x8f\xfa\x42\xe9\x75\x08" + "\x4e\x54\x91\x2b\xbd\x11\x0f\x8e" + "\xf0\x82\xf5\x24\xf1\xc4\xfc\xae" + "\x42\x54\x7f\xce\x15\xa8\xb2\x33" + "\xc0\x86\xb6\x2b\xe8\x44\xce\x1f" + "\x68\x57\x66\x94\x6e\xad\xeb\xf3" + "\x30\xf8\x11\xbd\x60\x00\xc6\xd5" + "\x4c\x81\xf1\x20\x2b\x4a\x5b\x99" + "\x79\x3b\xc9\x5c\x74\x23\xe6\x5d", + .klen = 16, + .len = 128, + }, + { + .key = "\x3e\x08\x5d\x64\x6c\x98\xec\xec" + "\x70\x0e\x0d\xa1\x41\x20\x99\x82", + .iv = "\x11\xb7\x77\x91\x0d\x99\xd9\x8d" + "\x35\x3a\xf7\x14\x6b\x09\x37\xe5" + "\xad\x51\xf6\xc3\x96\x4b\x64\x56" + "\xa8\xbd\x81\xcc\xbe\x94\xaf\xe4", + .ptext = "\xff\x8d\xb9\xc0\xe3\x69\xb3\xb2" + "\x8b\x11\x26\xb3\x11\xec\xfb\xb9" + "\x9c\xc1\x71\xd6\xe3\x26\x0e\xe0" + "\x68\x40\x60\xb9\x3a\x63\x56\x8a" + "\x9e\xc1\xf0\x10\xb1\x64\x32\x70" + "\xf8\xcd\xc6\xc4\x49\x4c\xe1\xce" + "\xf3\xe1\x03\xf8\x35\xae\xe0\x5e" + "\xef\x5f\xbc\x41\x75\x26\x13\xcc" + "\x37\x85\xdf\xc0\x5d\xa6\x47\x98" + "\xf1\x97\x52\x58\x04\xe6\xb5\x01" + "\xc0\xb8\x17\x6d\x74\xbd\x9a\xdf" + "\xa4\x37\x94\x86\xb0\x13\x83\x28" + "\xc9\xa2\x07\x3f\xb5\xb2\x72\x40" + "\x0e\x60\xdf\x57\x07\xb7\x2c\x66" + "\x10\x3f\x8d\xdd\x30\x0a\x47\xd5" + "\xe8\x9d\xfb\xa1\xaf\x53\xd7\x05" + "\xc7\xd2\xba\xe7\x2c\xa0\xbf\xb8" + "\xd1\x93\xe7\x41\x82\xa3\x41\x3a" + "\xaf\x12\xd6\xf8\x34\xda\x92\x46" + "\xad\xa2\x2f\xf6\x7e\x46\x96\xd8" + "\x03\xf3\x49\x64\xde\xd8\x06\x8b" + "\xa0\xbc\x63\x35\x38\xb6\x6b\xda" + "\x5b\x50\x3f\x13\xa5\x84\x1b\x1b" + "\x66\x89\x95\xb7\xc2\x16\x3c\xe9" + "\x24\xb0\x8c\x6f\x49\xef\xf7\x28" + 
"\x6a\x24\xfd\xbe\x25\xe2\xb4\x90" + "\x77\x44\x08\xb8\xda\xd2\xde\x2c" + "\xa0\x57\x45\x57\x29\x47\x6b\x89" + "\x4a\xf6\xa7\x2a\xc3\x9e\x7b\xc8" + "\xfd\x9f\x89\xab\xee\x6d\xa3\xb4" + "\x23\x90\x7a\xe9\x89\xa0\xc7\xb3" + "\x17\x41\x87\x91\xfc\x97\x42", + .ctext = "\xfc\x9b\x96\x66\xc4\x82\x2a\x4a" + "\xb1\x24\xba\xc7\x78\x5f\x79\xc1" + "\x57\x2e\x47\x29\x4d\x7b\xd2\x9a" + "\xbd\xc6\xc1\x26\x7b\x8e\x3f\x5d" + "\xd4\xb4\x9f\x6a\x02\x24\x4a\xad" + "\x0c\x00\x1b\xdf\x92\xc5\x8a\xe1" + "\x77\x79\xcc\xd5\x20\xbf\x83\xf4" + "\x4b\xad\x11\xbf\xdb\x47\x65\x70" + "\x43\xf3\x65\xdf\xb7\xdc\xb2\xb9" + "\xaa\x3f\xb3\xdf\x79\x69\x0d\xa0" + "\x86\x1c\xba\x48\x0b\x01\xc1\x88" + "\xdf\x03\xb1\x06\x3c\x1d\x56\xa1" + "\x8e\x98\xc1\xa6\x95\xa2\x5b\x72" + "\x76\x59\xd2\x26\x25\xcd\xef\x7c" + "\xc9\x60\xea\x43\xd1\x12\x8a\x8a" + "\x63\x12\x78\xcb\x2f\x88\x1e\x88" + "\x78\x59\xde\xba\x4d\x2c\x78\x61" + "\x75\x37\x54\xfd\x80\xc7\x5e\x98" + "\xcf\x14\x62\x8e\xfb\x72\xee\x4d" + "\x9f\xaf\x8b\x09\xe5\x21\x0a\x91" + "\x8f\x88\x87\xd5\xb1\x84\xab\x18" + "\x08\x57\xed\x72\x35\xa6\x0e\xc6" + "\xff\xcb\xfe\x2c\x48\x39\x14\x44" + "\xba\x59\x32\x3a\x2d\xc4\x5f\xcb" + "\xbe\x68\x8e\x7b\xee\x21\xa4\x32" + "\x11\xa0\x99\xfd\x90\xde\x59\x43" + "\xeb\xed\xd5\x87\x68\x46\xc6\xde" + "\x0b\x07\x17\x59\x6a\xab\xca\x15" + "\x65\x02\x01\xb6\x71\x8c\x3b\xaa" + "\x18\x3b\x30\xae\x38\x5b\x2c\x74" + "\xd4\xee\x4a\xfc\xf7\x1b\x09\xd4" + "\xda\x8b\x1d\x5d\x6f\x21\x6c", + .klen = 16, + .len = 255, + }, + { + .key = "\x24\xf6\xe1\x62\xe5\xaf\x99\xda" + "\x84\xec\x41\xb0\xa3\x0b\xd5\xa8" + "\xa0\x3e\x7b\xa6\xdd\x6c\x8f\xa8", + .iv = "\x7f\x80\x24\x62\x32\xdd\xab\x66" + "\xf2\x87\x29\x24\xec\xd2\x4b\x9f" + "\x0c\x33\x52\xd9\xe0\xcc\x6e\xe4" + "\x90\x85\x43\x97\xc4\x62\x14\x33", + .ptext = "\xef\x58\xe7\x7f\xa9\xd9\xb8\xd7" + "\xa2\x91\x97\x07\x27\x9e\xba\xe8" + "\xaa", + .ctext = "\xd7\xc3\x81\x91\xf2\x40\x17\x73" + "\x3e\x3b\x1c\x2a\x8e\x11\x9c\x17" + "\xf1", + .klen = 24, + .len = 17, + }, + { + .key = "\xbf\xaf\xd7\x67\x8c\x47\xcf\x21" + "\x8a\xa5\xdd\x32\x25\x47\xbe\x4f" + "\xf1\x3a\x0b\xa6\xaa\x2d\xcf\x09", + .iv = "\xd9\xe8\xf0\x92\x4e\xfc\x1d\xf2" + "\x81\x37\x7c\x8f\xf1\x59\x09\x20" + "\xf4\x46\x51\x86\x4f\x54\x8b\x32" + "\x58\xd1\x99\x8b\x8c\x03\xeb\x5d", + .ptext = "\xcd\x64\x90\xf9\x7c\xe5\x0e\x5a" + "\x75\xe7\x8e\x39\x86\xec\x20\x43" + "\x8a\x49\x09\x15\x47\xf4\x3c\x89" + "\x21\xeb\xcf\x4e\xcf\x91\xb5\x40" + "\xcd\xe5\x4d\x5c\x6f\xf2\xd2\x80" + "\xfa\xab\xb3\x76\x9f\x7f\x84\x0a", + .ctext = "\x44\x98\x64\x15\xb7\x0b\x80\xa3" + "\xb9\xca\x23\xff\x3b\x0b\x68\x74" + "\xbb\x3e\x20\x19\x9f\x28\x71\x2a" + "\x48\x3c\x7c\xe2\xef\xb5\x10\xac" + "\x82\x9f\xcd\x08\x8f\x6b\x16\x6f" + "\xc3\xbb\x07\xfb\x3c\xb0\x1b\x27", + .klen = 24, + .len = 48, + }, + { + .key = "\xb8\x35\xa2\x5f\x86\xbb\x82\x99" + "\x27\xeb\x01\x3f\x92\xaf\x80\x24" + "\x4c\x66\xa2\x89\xff\x2e\xa2\x25", + .iv = "\x0a\x1d\x96\xd3\xe0\xe8\x0c\x9b" + "\x9d\x6f\x21\x97\xc2\x17\xdb\x39" + "\x3f\xd8\x64\x48\x80\x04\xee\x43" + "\x02\xce\x88\xe2\x81\x81\x5f\x81", + .ptext = "\xb8\xf9\x16\x8b\x25\x68\xd0\x9c" + "\xd2\x28\xac\xa8\x79\xc2\x30\xc1" + "\x31\xde\x1c\x37\x1b\xa2\xb5\xe6" + "\xf0\xd0\xf8\x9c\x7f\xc6\x46\x07" + "\x5c\xc3\x06\xe4\xf0\x02\xec\xf8" + "\x59\x7c\xc2\x5d\xf8\x0c\x21\xae" + "\x9e\x82\xb1\x1a\x5f\x78\x44\x15" + "\x00\xa7\x2e\x52\xc5\x98\x98\x35" + "\x03\xae\xd0\x8e\x07\x57\xe2\x5a" + "\x17\xbf\x52\x40\x54\x5b\x74\xe5" + "\x2d\x35\xaf\x9e\x37\xf7\x7e\x4a" + "\x8c\x9e\xa1\xdc\x40\xb4\x5b\x36" + "\xdc\x3a\x68\xe6\xb7\x35\x0b\x8a" + "\x90\xec\x74\x8f\x09\x9a\x7f\x02" + 
"\x4d\x03\x46\x35\x62\xb1\xbd\x08" + "\x3f\x54\x2a\x10\x0b\xdc\x69\xaf" + "\x25\x3a\x0c\x5f\xe0\x51\xe7\x11" + "\xb7\x00\xab\xbb\x9a\xb0\xdc\x4d" + "\xc3\x7d\x1a\x6e\xd1\x09\x52\xbd" + "\x6b\x43\x55\x22\x3a\x78\x14\x7d" + "\x79\xfd\x8d\xfc\x9b\x1d\x0f\xa2" + "\xc7\xb9\xf8\x87\xd5\x96\x50\x61" + "\xa7\x5e\x1e\x57\x97\xe0\xad\x2f" + "\x93\xe6\xe8\x83\xec\x85\x26\x5e" + "\xd9\x2a\x15\xe0\xe9\x09\x25\xa1" + "\x77\x2b\x88\xdc\xa4\xa5\x48\xb6" + "\xf7\xcc\xa6\xa9\xba\xf3\x42\x5c" + "\x70\x9d\xe9\x29\xc1\xf1\x33\xdd" + "\x56\x48\x17\x86\x14\x51\x5c\x10" + "\xab\xfd\xd3\x26\x8c\x21\xf5\x93" + "\x1b\xeb\x47\x97\x73\xbb\x88\x10" + "\xf3\xfe\xf5\xde\xf3\x2e\x05\x46" + "\x1c\x0d\xa3\x10\x48\x9c\x71\x16" + "\x78\x33\x4d\x0a\x74\x3b\xe9\x34" + "\x0b\xa7\x0e\x9e\x61\xe9\xe9\xfd" + "\x85\xa0\xcb\x19\xfd\x7c\x33\xe3" + "\x0e\xce\xc2\x6f\x9d\xa4\x2d\x77" + "\xfd\xad\xee\x5e\x08\x3e\xd7\xf5" + "\xfb\xc3\xd7\x93\x96\x08\x96\xca" + "\x58\x81\x16\x9b\x98\x0a\xe2\xef" + "\x7f\xda\x40\xe4\x1f\x46\x9e\x67" + "\x2b\x84\xcb\x42\xc4\xd6\x6a\xcf" + "\x2d\xb2\x33\xc0\x56\xb3\x35\x6f" + "\x29\x36\x8f\x6a\x5b\xec\xd5\x4f" + "\xa0\x70\xff\xb6\x5b\xde\x6a\x93" + "\x20\x3c\xe2\x76\x7a\xef\x3c\x79" + "\x31\x65\xce\x3a\x0e\xd0\xbe\xa8" + "\x21\x95\xc7\x2b\x62\x8e\x67\xdd" + "\x20\x79\xe4\xe5\x01\x15\xc0\xec" + "\x0f\xd9\x23\xc8\xca\xdf\xd4\x7d" + "\x1d\xf8\x64\x4f\x56\xb1\x83\xa7" + "\x43\xbe\xfc\xcf\xc2\x8c\x33\xda" + "\x36\xd0\x52\xef\x9e\x9e\x88\xf4" + "\xa8\x21\x0f\xaa\xee\x8d\xa0\x24" + "\x4d\xcb\xb1\x72\x07\xf0\xc2\x06" + "\x60\x65\x85\x84\x2c\x60\xcf\x61" + "\xe7\x56\x43\x5b\x2b\x50\x74\xfa" + "\xdb\x4e\xea\x88\xd4\xb3\x83\x8f" + "\x6f\x97\x4b\x57\x7a\x64\x64\xae" + "\x0a\x37\x66\xc5\x03\xad\xb5\xf9" + "\x08\xb0\x3a\x74\xde\x97\x51\xff" + "\x48\x4f\x5c\xa4\xf8\x7a\xb4\x05" + "\x27\x70\x52\x86\x1b\x78\xfc\x18" + "\x06\x27\xa9\x62\xf7\xda\xd2\x8e", + .ctext = "\x3b\xe1\xdb\xb3\xc5\x9a\xde\x69" + "\x58\x05\xcc\xeb\x02\x51\x78\x4a" + "\xac\x28\xe9\xed\xd1\xc9\x15\x7d" + "\x33\x7d\xc1\x47\x12\x41\x11\xf8" + "\x4a\x2c\xb7\xa3\x41\xbe\x59\xf7" + "\x22\xdb\x2c\xda\x9c\x00\x61\x9b" + "\x73\xb3\x0b\x84\x2b\xc1\xf3\x80" + "\x84\xeb\x19\x60\x80\x09\xe1\xcd" + "\x16\x3a\x20\x23\xc4\x82\x4f\xba" + "\x3b\x8e\x55\xd7\xa9\x0b\x75\xd0" + "\xda\xce\xd2\xee\x7e\x4b\x7f\x65" + "\x4d\x28\xc5\xd3\x15\x2c\x40\x96" + "\x52\xd4\x18\x61\x2b\xe7\x83\xec" + "\x89\x62\x9c\x4c\x50\xe6\xe2\xbb" + "\x25\xa1\x0f\xa7\xb0\xb4\xb2\xde" + "\x54\x20\xae\xa3\x56\xa5\x26\x4c" + "\xd5\xcc\xe5\xcb\x28\x44\xb1\xef" + "\x67\x2e\x93\x6d\x00\x88\x83\x9a" + "\xf2\x1c\x48\x38\xec\x1a\x24\x90" + "\x73\x0a\xdb\xe8\xce\x95\x7a\x2c" + "\x8c\xe9\xb7\x07\x1d\xb3\xa3\x20" + "\xbe\xad\x61\x84\xac\xde\x76\xb5" + "\xa6\x28\x29\x47\x63\xc4\xfc\x13" + "\x3f\x71\xfb\x58\x37\x34\x82\xed" + "\x9e\x05\x19\x1f\xc1\x67\xc1\xab" + "\xf5\xfd\x7c\xea\xfa\xa4\xf8\x0a" + "\xac\x4c\x92\xdf\x65\x73\xd7\xdb" + "\xed\x2c\xe0\x84\x5f\x57\x8c\x76" + "\x3e\x05\xc0\xc3\x68\x96\x95\x0b" + "\x88\x97\xfe\x2e\x99\xd5\xc2\xb9" + "\x53\x9f\xf3\x32\x10\x1f\x1f\x5d" + "\xdf\x21\x95\x70\x91\xe8\xa1\x3e" + "\x19\x3e\xb6\x0b\xa8\xdb\xf8\xd4" + "\x54\x27\xb8\xab\x5d\x78\x0c\xe6" + "\xb7\x08\xee\xa4\xb6\x6b\xeb\x5a" + "\x89\x69\x2b\xbd\xd4\x21\x5b\xbf" + "\x79\xbb\x0f\xff\xdb\x23\x9a\xeb" + "\x8d\xf2\xc4\x39\xb4\x90\x77\x6f" + "\x68\xe2\xb8\xf3\xf1\x65\x4f\xd5" + "\x24\x80\x06\xaf\x7c\x8d\x15\x0c" + "\xfd\x56\xe5\xe3\x01\xa5\xf7\x1c" + "\x31\xd6\xa2\x01\x1e\x59\xf9\xa9" + "\x42\xd5\xc2\x34\xda\x25\xde\xc6" + "\x5d\x38\xef\xd1\x4c\xc1\xd9\x1b" + "\x98\xfd\xcd\x57\x6f\xfd\x46\x91" + 
"\x90\x3d\x52\x2b\x2c\x7d\xcf\x71" + "\xcf\xd1\x77\x23\x71\x36\xb1\xce" + "\xc7\x5d\xf0\x5b\x44\x3d\x43\x71" + "\xac\xb8\xa0\x6a\xea\x89\x5c\xff" + "\x81\x73\xd4\x83\xd1\xc9\xe9\xe2" + "\xa8\xa6\x0f\x36\xe6\xaa\x57\xd4" + "\x27\xd2\xc9\xda\x94\x02\x1f\xfb" + "\xe1\xa1\x07\xbe\xe1\x1b\x15\x94" + "\x1e\xac\x2f\x57\xbb\x41\x22\xaf" + "\x60\x5e\xcc\x66\xcb\x16\x62\xab" + "\xb8\x7c\x99\xf4\x84\x93\x0c\xc2" + "\xa2\x49\xe4\xfd\x17\x55\xe1\xa6" + "\x8d\x5b\xc6\x1b\xc8\xac\xec\x11" + "\x33\xcf\xb0\xe8\xc7\x28\x4f\xb2" + "\x5c\xa6\xe2\x71\xab\x80\x0a\xa7" + "\x5c\x59\x50\x9f\x7a\x32\xb7\xe5" + "\x24\x9a\x8e\x25\x21\x2e\xb7\x18" + "\xd0\xf2\xe7\x27\x6f\xda\xc1\x00" + "\xd9\xa6\x03\x59\xac\x4b\xcb\xba", + .klen = 24, + .len = 512, + }, + { + .key = "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4" + "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd" + "\x75\x20\x57\xea\x2c\x4f\xcd\xb2" + "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f", + .iv = "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8" + "\x33\x81\x37\x60\x7d\xfa\x73\x08" + "\xd8\x49\x6d\x80\xe8\x2f\x62\x54" + "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a", + .ptext = "\x67\xc9\xf2\x30\x84\x41\x8e\x43" + "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8", + .ctext = "\x27\x38\x78\x47\x16\xd9\x71\x35" + "\x2e\x7e\xdd\x7e\x43\x3c\xb8\x40", + .klen = 32, + .len = 16, + }, + { + .key = "\x93\xfa\x7e\xe2\x0e\x67\xc4\x39" + "\xe7\xca\x47\x95\x68\x9d\x5e\x5a" + "\x7c\x26\x19\xab\xc6\xca\x6a\x4c" + "\x45\xa6\x96\x42\xae\x6c\xff\xe7", + .iv = "\xea\x82\x47\x95\x3b\x22\xa1\x3a" + "\x6a\xca\x24\x4c\x50\x7e\x23\xcd" + "\x0e\x50\xe5\x41\xb6\x65\x29\xd8" + "\x30\x23\x00\xd2\x54\xa7\xd6\x56", + .ptext = "\xdb\x1f\x1f\xec\xad\x83\x6e\x5d" + "\x19\xa5\xf6\x3b\xb4\x93\x5a\x57" + "\x6f", + .ctext = "\xf1\x46\x6e\x9d\xb3\x01\xf0\x6b" + "\xc2\xac\x57\x88\x48\x6d\x40\x72" + "\x68", + .klen = 32, + .len = 17, + }, + { + .key = "\x36\x2b\x57\x97\xf8\x5d\xcd\x99" + "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27" + "\xcc\x16\xd7\x2b\x85\x63\x99\xd3" + "\xba\x96\xa1\xdb\xd2\x60\x68\xda", + .iv = "\xef\x58\x69\xb1\x2c\x5e\x9a\x47" + "\x24\xc1\xb1\x69\xe1\x12\x93\x8f" + "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9" + "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4", + .ptext = "\x5e\xa8\x68\x19\x85\x98\x12\x23" + "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf" + "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19" + "\x43\x5a\x46\x06\x94\x2d\xf2", + .ctext = "\xdb\xfd\xc8\x03\xd0\xec\xc1\xfe" + "\xbd\x64\x37\xb8\x82\x43\x62\x4e" + "\x7e\x54\xa3\xe2\x24\xa7\x27\xe8" + "\xa4\xd5\xb3\x6c\xb2\x26\xb4", + .klen = 32, + .len = 31, + }, + { + .key = "\x03\x65\x03\x6e\x4d\xe6\xe8\x4e" + "\x8b\xbe\x22\x19\x48\x31\xee\xd9" + "\xa0\x91\x21\xbe\x62\x89\xde\x78" + "\xd9\xb0\x36\xa3\x3c\xce\x43\xd5", + .iv = "\xa9\xc3\x4b\xe7\x0f\xfc\x6d\xbf" + "\x56\x27\x21\x1c\xfc\xd6\x04\x10" + "\x5f\x43\xe2\x30\x35\x29\x6c\x10" + "\x90\xf1\xbf\x61\xed\x0f\x8a\x91", + .ptext = "\x07\xaa\x02\x26\xb4\x98\x11\x5e" + "\x33\x41\x21\x51\x51\x63\x2c\x72" + "\x00\xab\x32\xa7\x1c\xc8\x3c\x9c" + "\x25\x0e\x8b\x9a\xdf\x85\xed\x2d" + "\xf4\xf2\xbc\x55\xca\x92\x6d\x22" + "\xfd\x22\x3b\x42\x4c\x0b\x74\xec", + .ctext = "\x7b\xb1\x43\x6d\xd8\x72\x6c\xf6" + "\x67\x6a\x00\xc4\xf1\xf0\xf5\xa4" + "\xfc\x60\x91\xab\x46\x0b\x15\xfc" + "\xd7\xc1\x28\x15\xa1\xfc\xf7\x68" + "\x8e\xcc\x27\x62\x00\x64\x56\x72" + "\xa6\x17\xd7\x3f\x67\x80\x10\x58", + .klen = 32, + .len = 48, + }, + { + .key = "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7" + "\x05\x91\x8f\xee\x85\x1f\x35\x7f" + "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e" + "\x19\x09\x00\xa9\x04\x31\x4f\x11", + .iv = "\xa1\xba\x49\x95\xff\x34\x6d\xb8" + "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb" + "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62" + 
"\xac\xa9\x8c\x41\x42\x94\x75\xb7", + .ptext = "\x69\xb4\xe8\x8c\x37\xe8\x67\x82" + "\xf1\xec\x5d\x04\xe5\x14\x91\x13" + "\xdf\xf2\x87\x1b\x69\x81\x1d\x71" + "\x70\x9e\x9c\x3b\xde\x49\x70\x11" + "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69" + "\xd7\xdb\x80\xa7\x70\x92\x68\xce" + "\x81\x04\x2c\xc6\xab\xae\xe5\x60" + "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7" + "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea" + "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f" + "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9" + "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e" + "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13" + "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8" + "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a" + "\x56\x65\xc5\x54\x23\x28\xb0\x03", + .ctext = "\xeb\xf9\x98\x86\x3c\x40\x9f\x16" + "\x84\x01\xf9\x06\x0f\xeb\x3c\xa9" + "\x4c\xa4\x8e\x5d\xc3\x8d\xe5\xd3" + "\xae\xa6\xe6\xcc\xd6\x2d\x37\x4f" + "\x99\xc8\xa3\x21\x46\xb8\x69\xf2" + "\xe3\x14\x89\xd7\xb9\xf5\x9e\x4e" + "\x07\x93\x6f\x78\x8e\x6b\xea\x8f" + "\xfb\x43\xb8\x3e\x9b\x4c\x1d\x7e" + "\x20\x9a\xc5\x87\xee\xaf\xf6\xf9" + "\x46\xc5\x18\x8a\xe8\x69\xe7\x96" + "\x52\x55\x5f\x00\x1e\x1a\xdc\xcc" + "\x13\xa5\xee\xff\x4b\x27\xca\xdc" + "\x10\xa6\x48\x76\x98\x43\x94\xa3" + "\xc7\xe2\xc9\x65\x9b\x08\x14\x26" + "\x1d\x68\xfb\x15\x0a\x33\x49\x84" + "\x84\x33\x5a\x1b\x24\x46\x31\x92", + .klen = 32, + .len = 128, + }, + { + .key = "\x36\x45\x11\xa2\x98\x5f\x96\x7c" + "\xc6\xb4\x94\x31\x0a\x67\x09\x32" + "\x6c\x6f\x6f\x00\xf0\x17\xcb\xac" + "\xa5\xa9\x47\x9e\x2e\x85\x2f\xfa", + .iv = "\x28\x88\xaa\x9b\x59\x3b\x1e\x97" + "\x82\xe5\x5c\x9e\x6d\x14\x11\x19" + "\x6e\x38\x8f\xd5\x40\x2b\xca\xf9" + "\x7b\x4c\xe4\xa3\xd0\xd2\x8a\x13", + .ptext = "\x95\xd2\xf7\x71\x1b\xca\xa5\x86" + "\xd9\x48\x01\x93\x2f\x79\x55\x29" + "\x71\x13\x15\x0e\xe6\x12\xbc\x4d" + "\x8a\x31\xe3\x40\x2a\xc6\x5e\x0d" + "\x68\xbb\x4a\x62\x8d\xc7\x45\x77" + "\xd2\xb8\xc7\x1d\xf1\xd2\x5d\x97" + "\xcf\xac\x52\xe5\x32\x77\xb6\xda" + "\x30\x85\xcf\x2b\x98\xe9\xaa\x34" + "\x62\xb5\x23\x9e\xb7\xa6\xd4\xe0" + "\xb4\x58\x18\x8c\x4d\xde\x4d\x01" + "\x83\x89\x24\xca\xfb\x11\xd4\x82" + "\x30\x7a\x81\x35\xa0\xb4\xd4\xb6" + "\x84\xea\x47\x91\x8c\x19\x86\x25" + "\xa6\x06\x8d\x78\xe6\xed\x87\xeb" + "\xda\xea\x73\x7c\xbf\x66\xb8\x72" + "\xe3\x0a\xb8\x0c\xcb\x1a\x73\xf1" + "\xa7\xca\x0a\xde\x57\x2b\xbd\x2b" + "\xeb\x8b\x24\x38\x22\xd3\x0e\x1f" + "\x17\xa0\x84\x98\x31\x77\xfd\x34" + "\x6a\x4e\x3d\x84\x4c\x0e\xfb\xed" + "\xc8\x2a\x51\xfa\xd8\x73\x21\x8a" + "\xdb\xb5\xfe\x1f\xee\xc4\xe8\x65" + "\x54\x84\xdd\x96\x6d\xfd\xd3\x31" + "\x77\x36\x52\x6b\x80\x4f\x9e\xb4" + "\xa2\x55\xbf\x66\x41\x49\x4e\x87" + "\xa7\x0c\xca\xe7\xa5\xc5\xf6\x6f" + "\x27\x56\xe2\x48\x22\xdd\x5f\x59" + "\x3c\xf1\x9f\x83\xe5\x2d\xfb\x71" + "\xad\xd1\xae\x1b\x20\x5c\x47\xb7" + "\x3b\xd3\x14\xce\x81\x42\xb1\x0a" + "\xf0\x49\xfa\xc2\xe7\x86\xbf\xcd" + "\xb0\x95\x9f\x8f\x79\x41\x54", + .ctext = "\xf6\x57\x51\xc4\x25\x61\x2d\xfa" + "\xd6\xd9\x3f\x9a\x81\x51\xdd\x8e" + "\x3d\xe7\xaa\x2d\xb1\xda\xc8\xa6" + "\x9d\xaa\x3c\xab\x62\xf2\x80\xc3" + "\x2c\xe7\x58\x72\x1d\x44\xc5\x28" + "\x7f\xb4\xf9\xbc\x9c\xb2\xab\x8e" + "\xfa\xd1\x4d\x72\xd9\x79\xf5\xa0" + "\x24\x3e\x90\x25\x31\x14\x38\x45" + "\x59\xc8\xf6\xe2\xc6\xf6\xc1\xa7" + "\xb2\xf8\xa7\xa9\x2b\x6f\x12\x3a" + "\xb0\x81\xa4\x08\x57\x59\xb1\x56" + "\x4c\x8f\x18\x55\x33\x5f\xd6\x6a" + "\xc6\xa0\x4b\xd6\x6b\x64\x3e\x9e" + "\xfd\x66\x16\xe2\xdb\xeb\x5f\xb3" + "\x50\x50\x3e\xde\x8d\x72\x76\x01" + "\xbe\xcc\xc9\x52\x09\x2d\x8d\xe7" + "\xd6\xc3\x66\xdb\x36\x08\xd1\x77" + "\xc8\x73\x46\x26\x24\x29\xbf\x68" + "\x2d\x2a\x99\x43\x56\x55\xe4\x93" + "\xaf\xae\x4d\xe7\x55\x4a\xc0\x45" + 
"\x26\xeb\x3b\x12\x90\x7c\xdc\xd1" + "\xd5\x6f\x0a\xd0\xa9\xd7\x4b\x89" + "\x0b\x07\xd8\x86\xad\xa1\xc4\x69" + "\x1f\x5e\x8b\xc4\x9e\x91\x41\x25" + "\x56\x98\x69\x78\x3a\x9e\xae\x91" + "\xd8\xd9\xfa\xfb\xff\x81\x25\x09" + "\xfc\xed\x2d\x87\xbc\x04\x62\x97" + "\x35\xe1\x26\xc2\x46\x1c\xcf\xd7" + "\x14\xed\x02\x09\xa5\xb2\xb6\xaa" + "\x27\x4e\x61\xb3\x71\x6b\x47\x16" + "\xb7\xe8\xd4\xaf\x52\xeb\x6a\x6b" + "\xdb\x4c\x65\x21\x9e\x1c\x36", + .klen = 32, + .len = 255, + }, + { + .key = "\xd3\x81\x72\x18\x23\xff\x6f\x4a" + "\x25\x74\x29\x0d\x51\x8a\x0e\x13" + "\xc1\x53\x5d\x30\x8d\xee\x75\x0d" + "\x14\xd6\x69\xc9\x15\xa9\x0c\x60", + .iv = "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4" + "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2" + "\x62\x81\x97\xc5\x81\xaa\xf9\x44" + "\xc1\x72\x59\x82\xaf\x16\xc8\x2c", + .ptext = "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09" + "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5" + "\x05\xa3\x69\x60\x91\x36\x98\x57" + "\xba\x0c\x14\xcc\xf3\x2d\x73\x03" + "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d" + "\xd0\x0b\x87\xb2\x50\x94\x7b\x58" + "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9" + "\x41\x84\xc1\xb1\x7e\x4b\x91\x12" + "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9" + "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4" + "\xa5\x20\x98\xef\xb5\xda\xe5\xc0" + "\x8a\x6a\x83\x77\x15\x84\x1e\xae" + "\x78\x94\x9d\xdf\xb7\xd1\xea\x67" + "\xaa\xb0\x14\x15\xfa\x67\x21\x84" + "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8" + "\x95\x62\xa9\x55\xf0\x80\xad\xbd" + "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36" + "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0" + "\x88\x4e\xec\x2c\x88\x10\x5e\xea" + "\x12\xc0\x16\x01\x29\xa3\xa0\x55" + "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b" + "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d" + "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3" + "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e" + "\x9c\xac\xdb\x90\xbd\x83\x72\xba" + "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf" + "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5" + "\x1e\x19\x38\x09\x16\xd2\x82\x1f" + "\x75\x18\x56\xb8\x96\x0b\xa6\xf9" + "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d" + "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee" + "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d" + "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25" + "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30" + "\xae\x23\x4f\x0e\x13\x66\x4f\xe1" + "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e" + "\x15\x85\x6b\xe3\x60\x81\x1d\x68" + "\xd7\x31\x87\x89\x09\xab\xd5\x96" + "\x1d\xf3\x6d\x67\x80\xca\x07\x31" + "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33" + "\x52\x18\xc8\x30\xfe\x2d\xca\x1e" + "\x79\x92\x7a\x60\x5c\xb6\x58\x87" + "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7" + "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63" + "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96" + "\x47\xca\xb8\x91\xf9\xf7\x94\x21" + "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b" + "\x66\x69\x6a\x72\xd0\xcb\x70\xb7" + "\x93\xb5\x37\x96\x05\x37\x4f\xe5" + "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea" + "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac" + "\x18\x7d\x52\x3b\xb3\x34\x62\x99" + "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84" + "\x17\x7c\x25\x48\x52\x67\x11\x27" + "\x67\xbb\x5a\x85\xca\x56\xb2\x5c" + "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb" + "\x22\x25\xf4\x13\xe5\x93\x4b\x9a" + "\x77\xf1\x52\x18\xfa\x16\x5e\x49" + "\x03\x45\xa8\x08\xfa\xb3\x41\x92" + "\x79\x50\x33\xca\xd0\xd7\x42\x55" + "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86" + "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc" + "\xf1\x54\x6e\x93\xa4\x65\x99\x8e" + "\xdf\x29\xc0\x64\x63\x07\xbb\xea", + .ctext = "\x9f\x72\x87\xc7\x17\xfb\x20\x15" + "\x65\xb3\x55\xa8\x1c\x8e\x52\x32" + "\xb1\x82\x8d\xbf\xb5\x9f\x10\x0a" + "\xe8\x0c\x70\x62\xef\x89\xb6\x1f" + "\x73\xcc\xe4\xcc\x7a\x3a\x75\x4a" + "\x26\xe7\xf5\xd7\x7b\x17\x39\x2d" + "\xd2\x27\x6e\xf9\x2f\x9e\xe2\xf6" + "\xfa\x16\xc2\xf2\x49\x26\xa7\x5b" + "\xe7\xca\x25\x0e\x45\xa0\x34\xc2" + "\x9a\x37\x79\x7e\x7c\x58\x18\x94" + 
"\x10\xa8\x7c\x48\xa9\xd7\x63\x89" + "\x9e\x61\x4d\x26\x34\xd9\xf0\xb1" + "\x2d\x17\x2c\x6f\x7c\x35\x0e\xbe" + "\x77\x71\x7c\x17\x5b\xab\x70\xdb" + "\x2f\x54\x0f\xa9\xc8\xf4\xf5\xab" + "\x52\x04\x3a\xb8\x03\xa7\xfd\x57" + "\x45\x5e\xbc\x77\xe1\xee\x79\x8c" + "\x58\x7b\x1f\xf7\x75\xde\x68\x17" + "\x98\x85\x8a\x18\x5c\xd2\x39\x78" + "\x7a\x6f\x26\x6e\xe1\x13\x91\xdd" + "\xdf\x0e\x6e\x67\xcc\x51\x53\xd8" + "\x17\x5e\xce\xa7\xe4\xaf\xfa\xf3" + "\x4f\x9f\x01\x9b\x04\xe7\xfc\xf9" + "\x6a\xdc\x1d\x0c\x9a\xaa\x3a\x7a" + "\x73\x03\xdf\xbf\x3b\x82\xbe\xb0" + "\xb4\xa4\xcf\x07\xd7\xde\x71\x25" + "\xc5\x10\xee\x0a\x15\x96\x8b\x4f" + "\xfe\xb8\x28\xbd\x4a\xcd\xeb\x9f" + "\x5d\x00\xc1\xee\xe8\x16\x44\xec" + "\xe9\x7b\xd6\x85\x17\x29\xcf\x58" + "\x20\xab\xf7\xce\x6b\xe7\x71\x7d" + "\x4f\xa8\xb0\xe9\x7d\x70\xd6\x0b" + "\x2e\x20\xb1\x1a\x63\x37\xaa\x2c" + "\x94\xee\xd5\xf6\x58\x2a\xf4\x7a" + "\x4c\xba\xf5\xe9\x3c\x6f\x95\x13" + "\x5f\x96\x81\x5b\xb5\x62\xf2\xd7" + "\x8d\xbe\xa1\x31\x51\xe6\xfe\xc9" + "\x07\x7d\x0f\x00\x3a\x66\x8c\x4b" + "\x94\xaa\xe5\x56\xde\xcd\x74\xa7" + "\x48\x67\x6f\xed\xc9\x6a\xef\xaf" + "\x9a\xb7\xae\x60\xfa\xc0\x37\x39" + "\xa5\x25\xe5\x22\xea\x82\x55\x68" + "\x3e\x30\xc3\x5a\xb6\x29\x73\x7a" + "\xb6\xfb\x34\xee\x51\x7c\x54\xe5" + "\x01\x4d\x72\x25\x32\x4a\xa3\x68" + "\x80\x9a\x89\xc5\x11\x66\x4c\x8c" + "\x44\x50\xbe\xd7\xa0\xee\xa6\xbb" + "\x92\x0c\xe6\xd7\x83\x51\xb1\x69" + "\x63\x40\xf3\xf4\x92\x84\xc4\x38" + "\x29\xfb\xb4\x84\xa0\x19\x75\x16" + "\x60\xbf\x0a\x9c\x89\xee\xad\xb4" + "\x43\xf9\x71\x39\x45\x7c\x24\x83" + "\x30\xbb\xee\x28\xb0\x86\x7b\xec" + "\x93\xc1\xbf\xb9\x97\x1b\x96\xef" + "\xee\x58\x35\x61\x12\x19\xda\x25" + "\x77\xe5\x80\x1a\x31\x27\x9b\xe4" + "\xda\x8b\x7e\x51\x4d\xcb\x01\x19" + "\x4f\xdc\x92\x1a\x17\xd5\x6b\xf4" + "\x50\xe3\x06\xe4\x76\x9f\x65\x00" + "\xbd\x7a\xe2\x64\x26\xf2\xe4\x7e" + "\x40\xf2\x80\xab\x62\xd5\xef\x23" + "\x8b\xfb\x6f\x24\x6e\x9b\x66\x0e" + "\xf4\x1c\x24\x1e\x1d\x26\x95\x09" + "\x94\x3c\xb2\xb6\x02\xa7\xd9\x9a", + .klen = 32, + .len = 512, + }, + +}; + #endif /* _CRYPTO_TESTMGR_H */ diff --git a/crypto/xctr.c b/crypto/xctr.c new file mode 100644 index 000000000000..5c00147e8ec4 --- /dev/null +++ b/crypto/xctr.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * XCTR: XOR Counter mode - Adapted from ctr.c + * + * (C) Copyright IBM Corp. 2007 - Joy Latten + * Copyright 2021 Google LLC + */ + +/* + * XCTR mode is a blockcipher mode of operation used to implement HCTR2. XCTR is + * closely related to the CTR mode of operation; the main difference is that CTR + * generates the keystream using E(CTR + IV) whereas XCTR generates the + * keystream using E(CTR ^ IV). This allows implementations to avoid dealing + * with multi-limb integers (as is required in CTR mode). XCTR is also specified + * using little-endian arithmetic which makes it slightly faster on LE machines. 
+ * + * See the HCTR2 paper for more details: + * Length-preserving encryption with HCTR2 + * (https://eprint.iacr.org/2021/1441.pdf) + */ + +#include <crypto/algapi.h> +#include <crypto/internal/cipher.h> +#include <crypto/internal/skcipher.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> + +/* For now this implementation is limited to 16-byte blocks for simplicity */ +#define XCTR_BLOCKSIZE 16 + +static void crypto_xctr_crypt_final(struct skcipher_walk *walk, + struct crypto_cipher *tfm, u32 byte_ctr) +{ + u8 keystream[XCTR_BLOCKSIZE]; + const u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + __le32 ctr32 = cpu_to_le32(byte_ctr / XCTR_BLOCKSIZE + 1); + + crypto_xor(walk->iv, (u8 *)&ctr32, sizeof(ctr32)); + crypto_cipher_encrypt_one(tfm, keystream, walk->iv); + crypto_xor_cpy(dst, keystream, src, nbytes); + crypto_xor(walk->iv, (u8 *)&ctr32, sizeof(ctr32)); +} + +static int crypto_xctr_crypt_segment(struct skcipher_walk *walk, + struct crypto_cipher *tfm, u32 byte_ctr) +{ + void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = + crypto_cipher_alg(tfm)->cia_encrypt; + const u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + __le32 ctr32 = cpu_to_le32(byte_ctr / XCTR_BLOCKSIZE + 1); + + do { + crypto_xor(walk->iv, (u8 *)&ctr32, sizeof(ctr32)); + fn(crypto_cipher_tfm(tfm), dst, walk->iv); + crypto_xor(dst, src, XCTR_BLOCKSIZE); + crypto_xor(walk->iv, (u8 *)&ctr32, sizeof(ctr32)); + + le32_add_cpu(&ctr32, 1); + + src += XCTR_BLOCKSIZE; + dst += XCTR_BLOCKSIZE; + } while ((nbytes -= XCTR_BLOCKSIZE) >= XCTR_BLOCKSIZE); + + return nbytes; +} + +static int crypto_xctr_crypt_inplace(struct skcipher_walk *walk, + struct crypto_cipher *tfm, u32 byte_ctr) +{ + void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = + crypto_cipher_alg(tfm)->cia_encrypt; + unsigned long alignmask = crypto_cipher_alignmask(tfm); + unsigned int nbytes = walk->nbytes; + u8 *data = walk->src.virt.addr; + u8 tmp[XCTR_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; + u8 *keystream = PTR_ALIGN(tmp + 0, alignmask + 1); + __le32 ctr32 = cpu_to_le32(byte_ctr / XCTR_BLOCKSIZE + 1); + + do { + crypto_xor(walk->iv, (u8 *)&ctr32, sizeof(ctr32)); + fn(crypto_cipher_tfm(tfm), keystream, walk->iv); + crypto_xor(data, keystream, XCTR_BLOCKSIZE); + crypto_xor(walk->iv, (u8 *)&ctr32, sizeof(ctr32)); + + le32_add_cpu(&ctr32, 1); + + data += XCTR_BLOCKSIZE; + } while ((nbytes -= XCTR_BLOCKSIZE) >= XCTR_BLOCKSIZE); + + return nbytes; +} + +static int crypto_xctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + u32 byte_ctr = 0; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes >= XCTR_BLOCKSIZE) { + if (walk.src.virt.addr == walk.dst.virt.addr) + nbytes = crypto_xctr_crypt_inplace(&walk, cipher, + byte_ctr); + else + nbytes = crypto_xctr_crypt_segment(&walk, cipher, + byte_ctr); + + byte_ctr += walk.nbytes - nbytes; + err = skcipher_walk_done(&walk, nbytes); + } + + if (walk.nbytes) { + crypto_xctr_crypt_final(&walk, cipher, byte_ctr); + err = skcipher_walk_done(&walk, 0); + } + + return err; +}
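(Illustrative aside, not part of the patch: the helpers above implement the keystream rule E_K(IV ^ ctr) with a little-endian 32-bit counter that starts at 1; the counter is XORed into the low bytes of the IV before encryption and XORed back out afterwards so the IV survives the walk. A minimal userspace sketch of that rule, with a hypothetical toy_encrypt() standing in for the real block cipher:)

#include <stdint.h>
#include <string.h>

#define BLOCKSIZE 16

/* Hypothetical stand-in for E_K; a real caller would use AES here. */
static void toy_encrypt(uint8_t out[BLOCKSIZE], const uint8_t in[BLOCKSIZE])
{
	for (int i = 0; i < BLOCKSIZE; i++)
		out[i] = (uint8_t)(in[i] ^ 0xa5);	/* placeholder, not secure */
}

/* XCTR keystream block n (counting from 1): E_K(IV ^ le32(n)). */
static void xctr_keystream_block(uint8_t out[BLOCKSIZE],
				 const uint8_t iv[BLOCKSIZE], uint32_t n)
{
	uint8_t blk[BLOCKSIZE];

	memcpy(blk, iv, BLOCKSIZE);
	blk[0] ^= (uint8_t)n;		/* little-endian: low byte first */
	blk[1] ^= (uint8_t)(n >> 8);
	blk[2] ^= (uint8_t)(n >> 16);
	blk[3] ^= (uint8_t)(n >> 24);	/* XOR combine: no carries, unlike CTR's IV + n */
	toy_encrypt(out, blk);
}

Because the counter is combined by XOR rather than by addition, no multi-limb carry propagation is needed, which is exactly the simplification the header comment claims over CTR.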
+ +static int crypto_xctr_create(struct crypto_template *tmpl, struct rtattr **tb) +{ + struct skcipher_instance *inst; + struct crypto_alg *alg; + int err; + + inst = skcipher_alloc_instance_simple(tmpl, tb); + if (IS_ERR(inst)) + return PTR_ERR(inst); + + alg = skcipher_ialg_simple(inst); + + /* Block size must be 16 bytes. */ + err = -EINVAL; + if (alg->cra_blocksize != XCTR_BLOCKSIZE) + goto out_free_inst; + + /* XCTR mode is a stream cipher. */ + inst->alg.base.cra_blocksize = 1; + + /* + * To simplify the implementation, configure the skcipher walk to only + * give a partial block at the very end, never earlier. + */ + inst->alg.chunksize = alg->cra_blocksize; + + inst->alg.encrypt = crypto_xctr_crypt; + inst->alg.decrypt = crypto_xctr_crypt; + + err = skcipher_register_instance(tmpl, inst); + if (err) { +out_free_inst: + inst->free(inst); + } + + return err; +} + +static struct crypto_template crypto_xctr_tmpl = { + .name = "xctr", + .create = crypto_xctr_create, + .module = THIS_MODULE, +}; + +static int __init crypto_xctr_module_init(void) +{ + return crypto_register_template(&crypto_xctr_tmpl); +} + +static void __exit crypto_xctr_module_exit(void) +{ + crypto_unregister_template(&crypto_xctr_tmpl); +} + +subsys_initcall(crypto_xctr_module_init); +module_exit(crypto_xctr_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("XCTR block cipher mode of operation"); +MODULE_ALIAS_CRYPTO("xctr"); +MODULE_IMPORT_NS(CRYPTO_INTERNAL); diff --git a/drivers/Kconfig b/drivers/Kconfig index 0d399ddaa185..67f44716797e 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -107,6 +107,8 @@ source "drivers/usb/Kconfig" source "drivers/mmc/Kconfig" +source "drivers/ufs/Kconfig" + source "drivers/memstick/Kconfig" source "drivers/leds/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index a110338c860c..2d5fcb5cc6f5 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -128,6 +128,7 @@ obj-$(CONFIG_PM_OPP) += opp/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-y += mmc/ +obj-y += ufs/ obj-$(CONFIG_MEMSTICK) += memstick/ obj-$(CONFIG_NEW_LEDS) += leds/ obj-$(CONFIG_INFINIBAND) += infiniband/ diff --git a/drivers/OWNERS b/drivers/OWNERS new file mode 100644 index 000000000000..c13fa056367d --- /dev/null +++ b/drivers/OWNERS @@ -0,0 +1,6 @@ +per-file base/**=gregkh@google.com,saravanak@google.com +per-file block/**=akailash@google.com +per-file md/**=akailash@google.com,paullawrence@google.com +per-file net/**=file:/net/OWNERS +per-file scsi/**=bvanassche@google.com,jaegeuk@google.com +per-file {tty,usb}/**=gregkh@google.com diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 53b22e26266c..491751ab0dbf 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -54,6 +54,61 @@ config ANDROID_BINDER_IPC_SELFTEST exhaustively with combinations of various buffer sizes and alignments. +config ANDROID_DEBUG_SYMBOLS + bool "Android Debug Symbols" + help + Enables export of debug symbols that are useful for offline debugging + of a kernel. These symbols are used by vendor modules to find the + addresses of core kernel symbols for vendor extensions. + + This driver is statically compiled into the kernel and maintains all + the required symbol addresses for vendor modules, providing the + necessary interface to those modules. + +config ANDROID_VENDOR_HOOKS + bool "Android Vendor Hooks" + depends on TRACEPOINTS + help + Enable vendor hooks implemented as tracepoints. + + Allow vendor modules to attach to tracepoint "hooks" defined via + DECLARE_HOOK or DECLARE_RESTRICTED_HOOK. + +config ANDROID_KABI_RESERVE + bool "Android KABI reserve padding" + default y + help + This option enables the padding that the Android GKI kernel adds + to many different kernel structures to support an in-kernel stable ABI + over the lifespan of support for the kernel.
+ + Only disable this option if you have a system that needs the Android + kernel drivers, but is NOT an Android GKI kernel image. If disabled, + the kernel's static and runtime images may be slightly smaller, but + the kernel will NOT be supported by the Google Android kernel team. + + If even slightly unsure, say Y. + +config ANDROID_VENDOR_OEM_DATA + bool "Android vendor and OEM data padding" + default y + help + This option enables the padding that the Android GKI kernel adds + to many different kernel structures to support an in-kernel stable ABI + over the lifespan of support for the kernel as well as OEM additional + fields that are needed by some of the Android kernel tracepoints. The + macros enabled by this option are used to add padding to vendor modules + for the purposes specified above. + + Only disable this option if you have a system that needs the Android + kernel drivers, but is NOT an Android GKI kernel image and you do NOT + use the Android kernel tracepoints. If disabled, the kernel's static + and runtime images may be slightly smaller, but the kernel will NOT be + supported by the Google Android kernel team. + + If even slightly unsure, say Y. + endif # if ANDROID endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index c9d3d0c99c25..f1ac44102987 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -4,3 +4,5 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o +obj-$(CONFIG_ANDROID_DEBUG_SYMBOLS) += android_debug_symbols.o +obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o diff --git a/drivers/android/android_debug_symbols.c b/drivers/android/android_debug_symbols.c new file mode 100644 index 000000000000..254d52b0e368 --- /dev/null +++ b/drivers/android/android_debug_symbols.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include "../../mm/slab.h" +#include +#include +#include +#include +#include + +struct ads_entry { + char *name; + void *addr; +}; + +bool ads_page_owner; +bool ads_slub_debug; +unsigned long ads_vmalloc_nr_pages; +unsigned long ads_pcpu_nr_pages; + +#define _ADS_ENTRY(index, symbol) \ + [index] = { .name = #symbol, .addr = (void *)symbol } +#define ADS_ENTRY(index, symbol) _ADS_ENTRY(index, symbol) + +#define _ADS_PER_CPU_ENTRY(index, symbol) \ + [index] = { .name = #symbol, .addr = (void *)&symbol } +#define ADS_PER_CPU_ENTRY(index, symbol) _ADS_PER_CPU_ENTRY(index, symbol) + +/* + * This module maintains a static array of symbol and address information. + * Add all required core kernel symbols and their addresses to the + * ads_entries[] array, so that vendor modules can query it to find the + * address of a non-exported symbol. + */
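(Illustrative aside, not part of the patch: given the lookup table defined just below, a vendor module's query would plausibly look as follows. The linux/android_debug_symbols.h header name is an assumption about where enum android_debug_symbol lives, and the MINIDUMP namespace import matches the EXPORT_SYMBOL_NS_GPL() calls further down in this file:)

/* Hypothetical vendor-module sketch; names flagged above are assumptions. */
#include <linux/android_debug_symbols.h>
#include <linux/err.h>
#include <linux/module.h>

MODULE_IMPORT_NS(MINIDUMP);

static int __init ads_client_init(void)
{
	/* Resolve a non-exported core kernel symbol by table index. */
	const char *banner = android_debug_symbol(ADS_LINUX_BANNER);

	if (IS_ERR(banner))	/* out-of-range index yields ERR_PTR(-EINVAL) */
		return PTR_ERR(banner);

	pr_info("linux_banner resolved at %px\n", banner);
	return 0;
}
module_init(ads_client_init);

MODULE_LICENSE("GPL");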
+static const struct ads_entry ads_entries[ADS_END] = { + ADS_ENTRY(ADS_SDATA, _sdata), + ADS_ENTRY(ADS_BSS_END, __bss_stop), + ADS_ENTRY(ADS_PER_CPU_START, __per_cpu_start), + ADS_ENTRY(ADS_PER_CPU_END, __per_cpu_end), + ADS_ENTRY(ADS_START_RO_AFTER_INIT, __start_ro_after_init), + ADS_ENTRY(ADS_END_RO_AFTER_INIT, __end_ro_after_init), + ADS_ENTRY(ADS_LINUX_BANNER, linux_banner), +#ifdef CONFIG_CMA + ADS_ENTRY(ADS_TOTAL_CMA, &totalcma_pages), +#endif + ADS_ENTRY(ADS_SLAB_CACHES, &slab_caches), + ADS_ENTRY(ADS_SLAB_MUTEX, &slab_mutex), + ADS_ENTRY(ADS_MIN_LOW_PFN, &min_low_pfn), + ADS_ENTRY(ADS_MAX_PFN, &max_pfn), + ADS_ENTRY(ADS_VMALLOC_NR_PAGES, &ads_vmalloc_nr_pages), + ADS_ENTRY(ADS_PCPU_NR_PAGES, &ads_pcpu_nr_pages), +#ifdef CONFIG_PAGE_OWNER + ADS_ENTRY(ADS_PAGE_OWNER_ENABLED, &ads_page_owner), +#endif +#ifdef CONFIG_SLUB_DEBUG + ADS_ENTRY(ADS_SLUB_DEBUG, &ads_slub_debug), +#endif +#ifdef CONFIG_SWAP + ADS_ENTRY(ADS_NR_SWAP_PAGES, &nr_swap_pages), +#endif +#ifdef CONFIG_MMU + ADS_ENTRY(ADS_MMAP_MIN_ADDR, &mmap_min_addr), +#endif + ADS_ENTRY(ADS_STACK_GUARD_GAP, &stack_guard_gap), +#ifdef CONFIG_SYSCTL + ADS_ENTRY(ADS_SYSCTL_LEGACY_VA_LAYOUT, &sysctl_legacy_va_layout), +#endif + ADS_ENTRY(ADS_SHOW_MEM, show_mem), +}; + +/* + * The ads_per_cpu_entries array contains the address information of all + * per-CPU variables. + */ +static const struct ads_entry ads_per_cpu_entries[ADS_DEBUG_PER_CPU_END] = { +#ifdef CONFIG_ARM64 + ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, irq_stack_ptr), +#endif +#ifdef CONFIG_X86 + ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, hardirq_stack_ptr), +#endif +}; + +/* + * android_debug_symbol - Provide address information of a debug symbol. + * @symbol: Index of debug symbol array. + * + * Return the address of the core kernel symbol on success and an ERR_PTR() + * encoding a negative errno in error cases. + * + */ +void *android_debug_symbol(enum android_debug_symbol symbol) +{ + if (symbol >= ADS_END) + return ERR_PTR(-EINVAL); + + return ads_entries[symbol].addr; +} +EXPORT_SYMBOL_NS_GPL(android_debug_symbol, MINIDUMP); + +/* + * android_debug_per_cpu_symbol - Provide address information of a per-cpu debug symbol. + * @symbol: Index of per cpu debug symbol array. + * + * Return the address of the core kernel symbol on success and an ERR_PTR() + * encoding a negative errno in error cases.
+ * + */ +void *android_debug_per_cpu_symbol(enum android_debug_per_cpu_symbol symbol) +{ + if (symbol >= ADS_DEBUG_PER_CPU_END) + return ERR_PTR(-EINVAL); + + return ads_per_cpu_entries[symbol].addr; +} +EXPORT_SYMBOL_NS_GPL(android_debug_per_cpu_symbol, MINIDUMP); + +static int __init debug_symbol_init(void) +{ +#ifdef CONFIG_PAGE_OWNER + ads_page_owner = page_owner_ops.need(); +#endif +#ifdef CONFIG_SLUB_DEBUG + ads_slub_debug = __slub_debug_enabled(); +#endif + ads_vmalloc_nr_pages = vmalloc_nr_pages(); + ads_pcpu_nr_pages = pcpu_nr_pages(); + return 0; +} +module_init(debug_symbol_init); + +static void __exit debug_symbol_exit(void) +{ } +module_exit(debug_symbol_exit); + +MODULE_DESCRIPTION("Debug Symbol Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 00c6c03ff822..414e95c43d67 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -66,13 +66,18 @@ #include #include #include +#include +#include #include #include #include "binder_internal.h" #include "binder_trace.h" +#include + +#include "../../kernel/sched/sched.h" static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); @@ -547,6 +552,7 @@ static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, thread = rb_entry(n, struct binder_thread, rb_node); if (thread->looper & BINDER_LOOPER_STATE_POLL && binder_available_for_proc_work_ilocked(thread)) { + trace_android_vh_binder_wakeup_ilocked(thread->task, sync, proc); if (sync) wake_up_interruptible_sync(&thread->wait); else @@ -606,6 +612,7 @@ static void binder_wakeup_thread_ilocked(struct binder_proc *proc, assert_spin_locked(&proc->inner_lock); if (thread) { + trace_android_vh_binder_wakeup_ilocked(thread->task, sync, proc); if (sync) wake_up_interruptible_sync(&thread->wait); else @@ -636,22 +643,245 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc) binder_wakeup_thread_ilocked(proc, thread, /* sync = */false); } -static void binder_set_nice(long nice) +static bool is_rt_policy(int policy) { - long min_nice; + return policy == SCHED_FIFO || policy == SCHED_RR; +} - if (can_nice(current, nice)) { - set_user_nice(current, nice); +static bool is_fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + +static bool binder_supported_policy(int policy) +{ + return is_fair_policy(policy) || is_rt_policy(policy); +} + +#ifdef CONFIG_UCLAMP_TASK +static void set_binder_prio_uclamp(struct binder_priority *prio, struct task_struct *task) +{ + if (!uclamp_is_used()) + return; + + prio->uclamp[UCLAMP_MIN] = task->uclamp_req[UCLAMP_MIN].value; + prio->uclamp[UCLAMP_MAX] = task->uclamp_req[UCLAMP_MAX].value; +} + +static void set_inherited_uclamp(struct binder_transaction *t) +{ + if (!uclamp_is_used()) + return; + + t->priority.uclamp[UCLAMP_MIN] = uclamp_eff_value(current, UCLAMP_MIN); + t->priority.uclamp[UCLAMP_MAX] = uclamp_eff_value(current, UCLAMP_MAX); +} + +static bool is_uclamp_equal(struct task_struct *task, const struct binder_priority *desired) +{ + if (!uclamp_is_used()) + return true; + + return task->uclamp_req[UCLAMP_MIN].value == desired->uclamp[UCLAMP_MIN] + && task->uclamp_req[UCLAMP_MAX].value == desired->uclamp[UCLAMP_MAX]; +} + +#else +static void set_binder_prio_uclamp(struct binder_priority *prio, struct task_struct *task) { } +static void set_inherited_uclamp(struct binder_transaction *t) { } +static bool is_uclamp_equal(struct task_struct *task, const struct binder_priority *desired) +{ + return true; +} 
+#endif + +static int to_userspace_prio(int policy, int kernel_priority) +{ + if (is_fair_policy(policy)) + return PRIO_TO_NICE(kernel_priority); + else + return MAX_RT_PRIO - 1 - kernel_priority; +} + +static int to_kernel_prio(int policy, int user_priority) +{ + if (is_fair_policy(policy)) + return NICE_TO_PRIO(user_priority); + else + return MAX_RT_PRIO - 1 - user_priority; +} + +static void binder_do_set_priority(struct binder_thread *thread, + const struct binder_priority *desired, + bool verify) +{ + struct task_struct *task = thread->task; + int priority; /* user-space prio value */ + bool has_cap_nice; + unsigned int policy = desired->sched_policy; + struct sched_attr attrs = { + .sched_flags = SCHED_FLAG_RESET_ON_FORK + }; + + if (uclamp_is_used()) { + attrs.sched_flags |= SCHED_FLAG_UTIL_CLAMP; + attrs.sched_util_min = desired->uclamp[UCLAMP_MIN]; + attrs.sched_util_max = desired->uclamp[UCLAMP_MAX]; + } + + if (task->policy == policy && task->normal_prio == desired->prio + && is_uclamp_equal(task, desired)) { + spin_lock(&thread->prio_lock); + if (thread->prio_state == BINDER_PRIO_PENDING) + thread->prio_state = BINDER_PRIO_SET; + spin_unlock(&thread->prio_lock); return; } - min_nice = rlimit_to_nice(rlimit(RLIMIT_NICE)); - binder_debug(BINDER_DEBUG_PRIORITY_CAP, - "%d: nice value %ld not allowed use %ld instead\n", - current->pid, nice, min_nice); - set_user_nice(current, min_nice); - if (min_nice <= MAX_NICE) + + has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE); + + priority = to_userspace_prio(policy, desired->prio); + + if (verify && is_rt_policy(policy) && !has_cap_nice) { + long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO); + + if (max_rtprio == 0) { + policy = SCHED_NORMAL; + priority = MIN_NICE; + } else if (priority > max_rtprio) { + priority = max_rtprio; + } + } + + if (verify && is_fair_policy(policy) && !has_cap_nice) { + long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE)); + + if (min_nice > MAX_NICE) { + binder_user_error("%d RLIMIT_NICE not set\n", + task->pid); + return; + } else if (priority < min_nice) { + priority = min_nice; + } + } + + if (policy != desired->sched_policy || + to_kernel_prio(policy, priority) != desired->prio) + binder_debug(BINDER_DEBUG_PRIORITY_CAP, + "%d: priority %d not allowed, using %d instead\n", + task->pid, desired->prio, + to_kernel_prio(policy, priority)); + + trace_binder_set_priority(task->tgid, task->pid, task->normal_prio, + to_kernel_prio(policy, priority), + desired->prio); + + spin_lock(&thread->prio_lock); + if (!verify && thread->prio_state == BINDER_PRIO_ABORT) { + /* + * A new priority has been set by an incoming nested + * transaction. Abort this priority restore and allow + * the transaction to run at the new desired priority. + */ + spin_unlock(&thread->prio_lock); + binder_debug(BINDER_DEBUG_PRIORITY_CAP, + "%d: %s: aborting priority restore\n", + thread->pid, __func__); return; - binder_user_error("%d RLIMIT_NICE not set\n", current->pid); + } + + /* Set the actual priority and uclamp */ + if (task->policy != policy || is_rt_policy(policy)) { + attrs.sched_policy = policy; + attrs.sched_priority = is_rt_policy(policy) ? 
priority : 0; + attrs.sched_nice = PRIO_TO_NICE(task->static_prio); + } else { + attrs.sched_flags |= SCHED_FLAG_KEEP_ALL; + } + + sched_setattr_nocheck(task, &attrs); + + if (is_fair_policy(policy)) + set_user_nice(task, priority); + + thread->prio_state = BINDER_PRIO_SET; + spin_unlock(&thread->prio_lock); +} + +static void binder_set_priority(struct binder_thread *thread, + const struct binder_priority *desired) +{ + binder_do_set_priority(thread, desired, /* verify = */ true); +} + +static void binder_restore_priority(struct binder_thread *thread, + const struct binder_priority *desired) +{ + binder_do_set_priority(thread, desired, /* verify = */ false); +} + +static void binder_transaction_priority(struct binder_thread *thread, + struct binder_transaction *t, + struct binder_node *node) +{ + struct task_struct *task = thread->task; + struct binder_priority desired = t->priority; + const struct binder_priority node_prio = { + .sched_policy = node->sched_policy, + .prio = node->min_priority, + }; + bool skip = false; + + if (t->set_priority_called) + return; + + t->set_priority_called = true; + + trace_android_vh_binder_priority_skip(task, &skip); + if (skip) + return; + + if (!node->inherit_rt && is_rt_policy(desired.sched_policy)) { + desired.prio = NICE_TO_PRIO(0); + desired.sched_policy = SCHED_NORMAL; + } + + if (node_prio.prio < t->priority.prio || + (node_prio.prio == t->priority.prio && + node_prio.sched_policy == SCHED_FIFO)) { + /* + * In case the minimum priority on the node is + * higher (lower value), use that priority. If + * the priority is the same, but the node uses + * SCHED_FIFO, prefer SCHED_FIFO, since it can + * run unbounded, unlike SCHED_RR. + */ + desired.prio = node_prio.prio; + desired.sched_policy = node_prio.sched_policy; + } + + spin_lock(&thread->prio_lock); + if (thread->prio_state == BINDER_PRIO_PENDING) { + /* + * Task is in the process of changing priorities; + * saving its current values would be incorrect. + * Instead, save the pending priority and signal + * the task to abort the priority restore. + */ + t->saved_priority = thread->prio_next; + thread->prio_state = BINDER_PRIO_ABORT; + binder_debug(BINDER_DEBUG_PRIORITY_CAP, + "%d: saved pending priority %d\n", + current->pid, thread->prio_next.prio); + } else { + t->saved_priority.sched_policy = task->policy; + t->saved_priority.prio = task->normal_prio; + set_binder_prio_uclamp(&t->saved_priority, task); + } + spin_unlock(&thread->prio_lock); + + binder_set_priority(thread, &desired); + trace_android_vh_binder_set_priority(t, task); } static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, @@ -704,6 +934,7 @@ static struct binder_node *binder_init_node_ilocked( binder_uintptr_t ptr = fp ? fp->binder : 0; binder_uintptr_t cookie = fp ? fp->cookie : 0; __u32 flags = fp ? fp->flags : 0; + s8 priority; assert_spin_locked(&proc->inner_lock); @@ -736,8 +967,12 @@ static struct binder_node *binder_init_node_ilocked( node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; - node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->sched_policy = (flags & FLAT_BINDER_FLAG_SCHED_POLICY_MASK) >> + FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT; + node->min_priority = to_kernel_prio(node->sched_policy, priority); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + node->inherit_rt = !!(flags & FLAT_BINDER_FLAG_INHERIT_RT); node->txn_security_ctx = !!(flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); @@ -2380,6 +2615,56 @@ static int binder_fixup_parent(struct binder_transaction *t, return 0; } +/** + * binder_can_update_transaction() - Can a txn be superseded by an updated one? + * @t1: the pending async txn in the frozen process + * @t2: the new async txn to supersede the outdated pending one + * + * Return: true if t2 can supersede t1 + * false if t2 cannot supersede t1 + */ +static bool binder_can_update_transaction(struct binder_transaction *t1, + struct binder_transaction *t2) +{ + if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) != + (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc) + return false; + if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code && + t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid && + t1->buffer->target_node->ptr == t2->buffer->target_node->ptr && + t1->buffer->target_node->cookie == t2->buffer->target_node->cookie) + return true; + return false; +} + +/** + * binder_find_outdated_transaction_ilocked() - Find the outdated transaction + * @t: new async transaction + * @target_list: list to search for the outdated transaction + * + * Return: the outdated transaction if found + * NULL if no outdated transaction can be found + * + * Requires the proc->inner_lock to be held.
+ */ +static struct binder_transaction * +binder_find_outdated_transaction_ilocked(struct binder_transaction *t, + struct list_head *target_list) +{ + struct binder_work *w; + + list_for_each_entry(w, target_list, entry) { + struct binder_transaction *t_queued; + + if (w->type != BINDER_WORK_TRANSACTION) + continue; + t_queued = container_of(w, struct binder_transaction, work); + if (binder_can_update_transaction(t_queued, t)) + return t_queued; + } + return NULL; +} + /** * binder_proc_transaction() - sends a transaction to a process and wakes it up * @t: transaction to send @@ -2396,7 +2681,10 @@ static int binder_fixup_parent(struct binder_transaction *t, * * Return: 0 if the transaction was successfully queued * BR_DEAD_REPLY if the target process or thread is dead - * BR_FROZEN_REPLY if the target process or thread is frozen + * BR_FROZEN_REPLY if the target process or thread is frozen and + * the sync transaction was rejected + * BR_TRANSACTION_PENDING_FROZEN if the target process is frozen + * and the async transaction was successfully queued */ static int binder_proc_transaction(struct binder_transaction *t, struct binder_proc *proc, @@ -2405,9 +2693,12 @@ static int binder_proc_transaction(struct binder_transaction *t, struct binder_node *node = t->buffer->target_node; bool oneway = !!(t->flags & TF_ONE_WAY); bool pending_async = false; + struct binder_transaction *t_outdated = NULL; + bool frozen = false; BUG_ON(!node); binder_node_lock(node); + if (oneway) { BUG_ON(thread); if (node->has_async_transaction) @@ -2418,26 +2709,40 @@ static int binder_proc_transaction(struct binder_transaction *t, binder_inner_proc_lock(proc); if (proc->is_frozen) { + frozen = true; proc->sync_recv |= !oneway; proc->async_recv |= oneway; } - if ((proc->is_frozen && !oneway) || proc->is_dead || + if ((frozen && !oneway) || proc->is_dead || (thread && thread->is_dead)) { binder_inner_proc_unlock(proc); binder_node_unlock(node); - return proc->is_frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY; + return frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY; } if (!thread && !pending_async) thread = binder_select_thread_ilocked(proc); - if (thread) + if (thread) { + binder_transaction_priority(thread, t, node); binder_enqueue_thread_work_ilocked(thread, &t->work); - else if (!pending_async) + } else if (!pending_async) { binder_enqueue_work_ilocked(&t->work, &proc->todo); - else + } else { + if ((t->flags & TF_UPDATE_TXN) && frozen) { + t_outdated = binder_find_outdated_transaction_ilocked(t, + &node->async_todo); + if (t_outdated) { + binder_debug(BINDER_DEBUG_TRANSACTION, + "txn %d supersedes %d\n", + t->debug_id, t_outdated->debug_id); + list_del_init(&t_outdated->work.entry); + proc->outstanding_txns--; + } + } binder_enqueue_work_ilocked(&t->work, &node->async_todo); + } if (!pending_async) binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); @@ -2446,6 +2751,25 @@ static int binder_proc_transaction(struct binder_transaction *t, binder_inner_proc_unlock(proc); binder_node_unlock(node); + /* + * To reduce potential contention, free the outdated transaction and + * buffer after releasing the locks. 
+ */ + if (t_outdated) { + struct binder_buffer *buffer = t_outdated->buffer; + + t_outdated->buffer = NULL; + buffer->transaction = NULL; + trace_binder_transaction_update_buffer_release(buffer); + binder_transaction_buffer_release(proc, NULL, buffer, 0, 0); + binder_alloc_free_buf(&proc->alloc, buffer); + kfree(t_outdated); + binder_stats_deleted(BINDER_STAT_TRANSACTION); + } + + if (oneway && frozen) + return BR_TRANSACTION_PENDING_FROZEN; + return 0; } @@ -2521,6 +2845,7 @@ static void binder_transaction(struct binder_proc *proc, u32 secctx_sz = 0; const void __user *user_buffer = (const void __user *) (uintptr_t)tr->data.ptr.buffer; + bool is_nested = false; e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -2562,7 +2887,6 @@ static void binder_transaction(struct binder_proc *proc, } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); - binder_set_nice(in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { /* annotation for sparse */ @@ -2704,6 +3028,7 @@ static void binder_transaction(struct binder_proc *proc, atomic_inc(&from->tmp_ref); target_thread = from; spin_unlock(&tmp->lock); + is_nested = true; break; } spin_unlock(&tmp->lock); @@ -2715,6 +3040,7 @@ static void binder_transaction(struct binder_proc *proc, if (target_thread) e->to_thread = target_thread->pid; e->to_proc = target_proc->pid; + trace_android_rvh_binder_transaction(target_proc, proc, thread, tr); /* TODO: reuse incoming transaction for reply */ t = kzalloc(sizeof(*t), GFP_KERNEL); @@ -2727,6 +3053,7 @@ static void binder_transaction(struct binder_proc *proc, INIT_LIST_HEAD(&t->fd_fixups); binder_stats_created(BINDER_STAT_TRANSACTION); spin_lock_init(&t->lock); + trace_android_vh_binder_transaction_init(t); tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { @@ -2767,7 +3094,19 @@ static void binder_transaction(struct binder_proc *proc, t->to_thread = target_thread; t->code = tr->code; t->flags = tr->flags; - t->priority = task_nice(current); + t->is_nested = is_nested; + if (!(t->flags & TF_ONE_WAY) && + binder_supported_policy(current->policy)) { + /* Inherit supported policies for synchronous transactions */ + t->priority.sched_policy = current->policy; + t->priority.prio = current->normal_prio; + } else { + /* Otherwise, fall back to the default priority */ + t->priority = target_proc->default_priority; + } + + if (!(t->flags & TF_ONE_WAY)) + set_inherited_uclamp(t); if (target_node && target_node->txn_security_ctx) { u32 secid; @@ -3129,7 +3468,15 @@ static void binder_transaction(struct binder_proc *proc, binder_enqueue_thread_work_ilocked(target_thread, &t->work); target_proc->outstanding_txns++; binder_inner_proc_unlock(target_proc); + if (in_reply_to->is_nested) { + spin_lock(&thread->prio_lock); + thread->prio_state = BINDER_PRIO_PENDING; + thread->prio_next = in_reply_to->saved_priority; + spin_unlock(&thread->prio_lock); + } wake_up_interruptible_sync(&target_thread->wait); + trace_android_vh_binder_restore_priority(in_reply_to, current); + binder_restore_priority(thread, &in_reply_to->saved_priority); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -3157,9 +3504,17 @@ static void binder_transaction(struct binder_proc *proc, } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); - binder_enqueue_thread_work(thread, tcomplete); return_error = 
binder_proc_transaction(t, target_proc, NULL); - if (return_error) + /* + * Let the caller know when async transaction reaches a frozen + * process and is put in a pending queue, waiting for the target + * process to be unfrozen. + */ + if (return_error == BR_TRANSACTION_PENDING_FROZEN) + tcomplete->type = BINDER_WORK_TRANSACTION_PENDING; + binder_enqueue_thread_work(thread, tcomplete); + if (return_error && + return_error != BR_TRANSACTION_PENDING_FROZEN) goto err_dead_proc_or_thread; } if (target_thread) @@ -3244,6 +3599,8 @@ err_invalid_target_handle: BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { + trace_android_vh_binder_restore_priority(in_reply_to, current); + binder_restore_priority(thread, &in_reply_to->saved_priority); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_thread_work(thread, &thread->return_error.work); binder_send_failed_reply(in_reply_to, return_error); @@ -3804,6 +4161,7 @@ static int binder_wait_for_work(struct binder_thread *thread, if (do_proc_work) list_add(&thread->waiting_thread_node, &proc->waiting_threads); + trace_android_vh_binder_wait_for_work(do_proc_work, thread, proc); binder_inner_proc_unlock(proc); schedule(); binder_inner_proc_lock(proc); @@ -3921,7 +4279,8 @@ retry: wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } - binder_set_nice(proc->default_priority); + trace_android_vh_binder_restore_priority(NULL, current); + binder_restore_priority(thread, &proc->default_priority); } if (non_block) { @@ -3989,10 +4348,13 @@ retry: binder_stat_br(proc, thread, cmd); } break; case BINDER_WORK_TRANSACTION_COMPLETE: + case BINDER_WORK_TRANSACTION_PENDING: case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: { if (proc->oneway_spam_detection_enabled && w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT) cmd = BR_ONEWAY_SPAM_SUSPECT; + else if (w->type == BINDER_WORK_TRANSACTION_PENDING) + cmd = BR_TRANSACTION_PENDING_FROZEN; else cmd = BR_TRANSACTION_COMPLETE; binder_inner_proc_unlock(proc); @@ -4151,13 +4513,7 @@ retry: trd->target.ptr = target_node->ptr; trd->cookie = target_node->cookie; - t->saved_priority = task_nice(current); - if (t->priority < target_node->min_priority && - !(t->flags & TF_ONE_WAY)) - binder_set_nice(t->priority); - else if (!(t->flags & TF_ONE_WAY) || - t->saved_priority > target_node->min_priority) - binder_set_nice(target_node->min_priority); + binder_transaction_priority(thread, t, target_node); cmd = BR_TRANSACTION; } else { trd->target.ptr = 0; @@ -4175,6 +4531,7 @@ retry: trd->sender_pid = task_tgid_nr_ns(sender, task_active_pid_ns(current)); + trace_android_vh_sync_txn_recvd(thread->task, t_from->task); } else { trd->sender_pid = 0; } @@ -4375,6 +4732,8 @@ static struct binder_thread *binder_get_thread_ilocked( binder_stats_created(BINDER_STAT_THREAD); thread->proc = proc; thread->pid = current->pid; + get_task_struct(current); + thread->task = current; atomic_set(&thread->tmp_ref, 0); init_waitqueue_head(&thread->wait); INIT_LIST_HEAD(&thread->todo); @@ -4385,6 +4744,8 @@ static struct binder_thread *binder_get_thread_ilocked( thread->return_error.cmd = BR_OK; thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; thread->reply_error.cmd = BR_OK; + spin_lock_init(&thread->prio_lock); + thread->prio_state = BINDER_PRIO_SET; INIT_LIST_HEAD(&new_thread->waiting_thread_node); return thread; } @@ -4436,6 +4797,7 @@ static void binder_free_thread(struct binder_thread *thread) BUG_ON(!list_empty(&thread->todo)); binder_stats_deleted(BINDER_STAT_THREAD); 
binder_proc_dec_tmpref(thread->proc); + put_task_struct(thread->task); kfree(thread); } @@ -5129,7 +5491,16 @@ static int binder_open(struct inode *nodp, struct file *filp) proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->freeze_wait); - proc->default_priority = task_nice(current); + if (binder_supported_policy(current->policy)) { + proc->default_priority.sched_policy = current->policy; + proc->default_priority.prio = current->normal_prio; + } else { + proc->default_priority.sched_policy = SCHED_NORMAL; + proc->default_priority.prio = NICE_TO_PRIO(0); + } + + set_binder_prio_uclamp(&proc->default_priority, current); + /* binderfs stashes devices in i_private */ if (is_binderfs_device(nodp)) { binder_dev = nodp->i_private; @@ -5452,15 +5823,17 @@ static void print_binder_transaction_ilocked(struct seq_file *m, struct binder_buffer *buffer = t->buffer; spin_lock(&t->lock); + trace_android_vh_binder_print_transaction_info(m, proc, prefix, t); to_proc = t->to_proc; seq_printf(m, - "%s %d: %pK from %d:%d to %d:%d code %x flags %x pri %ld r%d", + "%s %d: %pK from %d:%d to %d:%d code %x flags %x pri %d:%d r%d", prefix, t->debug_id, t, t->from ? t->from->proc->pid : 0, t->from ? t->from->pid : 0, to_proc ? to_proc->pid : 0, t->to_thread ? t->to_thread->pid : 0, - t->code, t->flags, t->priority, t->need_reply); + t->code, t->flags, t->priority.sched_policy, + t->priority.prio, t->need_reply); spin_unlock(&t->lock); if (proc != to_proc) { @@ -5578,8 +5951,9 @@ static void print_binder_node_nilocked(struct seq_file *m, hlist_for_each_entry(ref, &node->refs, node_entry) count++; - seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d", + seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->debug_id, (u64)node->ptr, (u64)node->cookie, + node->sched_policy, node->min_priority, node->has_strong_ref, node->has_weak_ref, node->local_strong_refs, node->local_weak_refs, node->internal_strong_refs, count, node->tmp_refs); @@ -5697,6 +6071,7 @@ static const char * const binder_return_strings[] = { "BR_FAILED_REPLY", "BR_FROZEN_REPLY", "BR_ONEWAY_SPAM_SUSPECT", + "BR_TRANSACTION_PENDING_FROZEN" }; static const char * const binder_command_strings[] = { @@ -6123,5 +6498,7 @@ device_initcall(binder_init); #define CREATE_TRACE_POINTS #include "binder_trace.h" +EXPORT_TRACEPOINT_SYMBOL_GPL(binder_transaction_received); +EXPORT_TRACEPOINT_SYMBOL_GPL(binder_txn_latency_free); MODULE_LICENSE("GPL v2"); diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h index 1ade9799c8d5..ac97ecdc2f9f 100644 --- a/drivers/android/binder_internal.h +++ b/drivers/android/binder_internal.h @@ -77,6 +77,10 @@ extern const struct file_operations binder_fops; extern char *binder_devices_param; +#ifdef CONFIG_UCLAMP_TASK +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +#endif + #ifdef CONFIG_ANDROID_BINDERFS extern bool is_binderfs_device(const struct inode *inode); extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name, @@ -133,7 +137,7 @@ enum binder_stat_types { }; struct binder_stats { - atomic_t br[_IOC_NR(BR_ONEWAY_SPAM_SUSPECT) + 1]; + atomic_t br[_IOC_NR(BR_TRANSACTION_PENDING_FROZEN) + 1]; atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1]; atomic_t obj_created[BINDER_STAT_COUNT]; atomic_t obj_deleted[BINDER_STAT_COUNT]; @@ -152,6 +156,7 @@ struct binder_work { enum binder_work_type { BINDER_WORK_TRANSACTION = 1, 
BINDER_WORK_TRANSACTION_COMPLETE, + BINDER_WORK_TRANSACTION_PENDING, BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT, BINDER_WORK_RETURN_ERROR, BINDER_WORK_NODE, @@ -215,10 +220,13 @@ struct binder_error { * and by @lock) * @has_async_transaction: async transaction to node in progress * (protected by @lock) + * @sched_policy: minimum scheduling policy for node + * (invariant after initialized) * @accept_fds: file descriptor operations supported for node * (invariant after initialized) * @min_priority: minimum scheduling priority * (invariant after initialized) + * @inherit_rt: inherit RT scheduling policy from caller * @txn_security_ctx: require sender's security context * (invariant after initialized) * @async_todo: list of async work items @@ -256,6 +264,8 @@ struct binder_node { /* * invariant after initialization */ + u8 sched_policy:2; + u8 inherit_rt:1; u8 accept_fds:1; u8 txn_security_ctx:1; u8 min_priority; @@ -324,6 +334,29 @@ struct binder_ref { struct binder_ref_death *death; }; +/** + * struct binder_priority - scheduler policy, priority and uclamp + * @sched_policy: scheduler policy + * @prio: [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT + * @uclamp: [0..1024] for UCLAMP_MIN & UCLAMP_MAX + * + * The binder driver supports inheriting the following scheduler policies: + * SCHED_NORMAL + * SCHED_BATCH + * SCHED_FIFO + * SCHED_RR + */ +struct binder_priority { + unsigned int sched_policy; + int prio; + unsigned int uclamp[UCLAMP_CNT]; +}; + +enum binder_prio_state { + BINDER_PRIO_SET, /* desired priority set */ + BINDER_PRIO_PENDING, /* initiated a saved priority restore */ + BINDER_PRIO_ABORT, /* abort the pending priority restore */ +}; + /** * struct binder_proc - binder process bookkeeping * @proc_node: element for binder_procs list @@ -424,7 +457,7 @@ struct binder_proc { int requested_threads; int requested_threads_started; int tmp_ref; - long default_priority; + struct binder_priority default_priority; struct dentry *debugfs_entry; struct binder_alloc alloc; struct binder_context *context; @@ -467,6 +500,13 @@ struct binder_proc { * @is_dead: thread is dead and awaiting free * when outstanding transactions are cleaned up * (protected by @proc->inner_lock) + * @task: struct task_struct for this thread + * @prio_lock: protects thread priority fields + * @prio_next: saved priority to be restored next + * (protected by @prio_lock) + * @prio_state: state of the priority restore process as + * defined by enum binder_prio_state + * (protected by @prio_lock) * * Bookkeeping structure for binder threads.
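Editor's note: the prio ranges documented in struct binder_priority above are the kernel's internal priority scale, not user-visible nice or rt_priority values. A minimal sketch of the encoding, assuming the standard kernel macros (MAX_RT_PRIO == 100, NICE_TO_PRIO(nice) == (nice) + 120); the helper name here is hypothetical:

	/* Sketch: encode (policy, user-visible priority) into the "prio"
	 * scale used by struct binder_priority. */
	#include <linux/sched.h>
	#include <linux/sched/prio.h>

	static int example_to_kernel_prio(int policy, int user_priority)
	{
		if (policy == SCHED_FIFO || policy == SCHED_RR)
			/* rt_priority 1..99 maps to prio 98..0 (lower = stronger) */
			return MAX_RT_PRIO - 1 - user_priority;
		/* SCHED_NORMAL/SCHED_BATCH carry a nice value: -20..19 -> 100..139 */
		return NICE_TO_PRIO(user_priority);
	}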
*/ @@ -486,6 +526,10 @@ struct binder_thread { struct binder_stats stats; atomic_t tmp_ref; bool is_dead; + struct task_struct *task; + spinlock_t prio_lock; + struct binder_priority prio_next; + enum binder_prio_state prio_state; }; /** @@ -519,8 +563,10 @@ struct binder_transaction { struct binder_buffer *buffer; unsigned int code; unsigned int flags; - long priority; - long saved_priority; + struct binder_priority priority; + struct binder_priority saved_priority; + bool set_priority_called; + bool is_nested; kuid_t sender_euid; struct list_head fd_fixups; binder_uintptr_t security_ctx; @@ -531,6 +577,7 @@ struct binder_transaction { * during thread teardown */ spinlock_t lock; + ANDROID_VENDOR_DATA(1); }; /** diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 8eeccdc64724..5d82cf8af88b 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -76,6 +76,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done); +TRACE_EVENT(binder_set_priority, + TP_PROTO(int proc, int thread, unsigned int old_prio, + unsigned int desired_prio, unsigned int new_prio), + TP_ARGS(proc, thread, old_prio, desired_prio, new_prio), + + TP_STRUCT__entry( + __field(int, proc) + __field(int, thread) + __field(unsigned int, old_prio) + __field(unsigned int, new_prio) + __field(unsigned int, desired_prio) + ), + TP_fast_assign( + __entry->proc = proc; + __entry->thread = thread; + __entry->old_prio = old_prio; + __entry->new_prio = new_prio; + __entry->desired_prio = desired_prio; + ), + TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d", + __entry->proc, __entry->thread, __entry->old_prio, + __entry->new_prio, __entry->desired_prio) +); + TRACE_EVENT(binder_wait_for_work, TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo), TP_ARGS(proc_work, transaction_stack, thread_todo), @@ -311,6 +335,10 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, TP_PROTO(struct binder_buffer *buffer), TP_ARGS(buffer)); +DEFINE_EVENT(binder_buffer_class, binder_transaction_update_buffer_release, + TP_PROTO(struct binder_buffer *buffer), + TP_ARGS(buffer)); + TRACE_EVENT(binder_update_page_range, TP_PROTO(struct binder_alloc *alloc, bool allocate, void __user *start, void __user *end), diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c new file mode 100644 index 000000000000..f6f4a0e7db0f --- /dev/null +++ b/drivers/android/vendor_hooks.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* vendor_hook.c + * + * Android Vendor Hook Support + * + * Copyright 2020 Google LLC + */ + +#define CREATE_TRACE_POINTS +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Export tracepoints that act as a bare tracehook (i.e. have no trace event + * associated with them) to allow external modules to probe them.
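Editor's note: to make the vendor-hook mechanism described above concrete, here is a hedged sketch of how a vendor module attaches to one of these bare tracepoints. DECLARE_HOOK() generates a register_trace_android_vh_<name>() helper; the probe signature shown mirrors the binder_wait_for_work hook as invoked earlier in this patch, but treat both the header path and the signature as assumptions:

	/* Sketch of a vendor module probing a bare tracepoint exported below.
	 * The probe's extra first argument is the tracepoint's private data. */
	#include <linux/module.h>
	#include <trace/hooks/binder.h>	/* assumed location of DECLARE_HOOK() */

	static void probe_wait_for_work(void *unused, bool do_proc_work,
					struct binder_thread *tsk,
					struct binder_proc *proc)
	{
		/* vendor-specific accounting or boosting would go here */
	}

	static int __init vendor_probe_init(void)
	{
		/* generated by DECLARE_HOOK(android_vh_binder_wait_for_work, ...) */
		return register_trace_android_vh_binder_wait_for_work(
				probe_wait_for_work, NULL);
	}
	module_init(vendor_probe_init);
	MODULE_LICENSE("GPL");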
+ */ +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_alloc); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_free); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_alloc); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_free); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_arch_set_freq_scale); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_is_fpsimd_save); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_transaction_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_priority_skip); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_set_priority); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_restore_priority); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wakeup_ilocked); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_send_sig_info); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_futex_plist_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sched_show_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mpam_set); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_resume); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_wq_lockup_pool); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ipi_stop); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sysrq_crash); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_hotplug); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller_id); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_ext_header); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_gic_v3_set_affinity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_set_affinity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v3_affinity_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_suspend_epoch_val); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_resume_epoch_val); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_table_limits); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_resolve_freq); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_fast_switch); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_target); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_skip_swapcache_flags); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_gfp_zone_flags); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_readahead_gfp_mask); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_disable); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_enable); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_disable); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_enable); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_attach); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_can_attach); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_online); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_size_check); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_format_check); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_dump_buffer); 
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_fill_prdt); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_complete_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_reprogram_all_keys); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_prepare_command); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sysfs); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_command); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_compl_command); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_set_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_syscall_prctl_finished); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_uic_command); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_tm_command); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_check_int_errors); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sdev); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_clock_scaling); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_attach); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_setup_dma_ops); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_alloc_iova); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_free_iova); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ptype_head); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kfree_skb); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_timer_calc_index); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_allow_domain_state); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cgroup_force_kthread_migration); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wait_for_work); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_binder_transaction); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sync_txn_recvd); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_topology_flags_workfn); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpufreq_transition); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_balance_anon_file_reclaim); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_show_max_freq); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_referenced_check_bypass); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drain_all_pages_bypass); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_drain_all_pages_bypass); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pcplist_add_cma_pages_bypass); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_slab_bypass); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_insert); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_delete); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_replace); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_lookup); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_commit_creds); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_exit_creds); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_override_creds); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_revert_creds); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_nx); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_rw); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_before_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_after_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_oom_check_panic); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_is_initialized); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_mmap_file); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_file_open); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_bpf_syscall); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf_pr_cont); 
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks_dn); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_meminfo_proc_show); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_mm); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_slowpath); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mem); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_print_slabinfo_header); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_shrink_slab); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cache_show); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_report_bug); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_watchdog_timer_softlockup); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_unfrozen); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_die_kernel_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sea); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_mem_abort); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sp_pc_abort); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_undefinstr); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_ptrauth_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_panic_unhandled); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_arm64_serror_panic); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sha256); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_expandkey); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_encrypt); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_decrypt); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_request_freq); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_target_freq); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_register); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_unregister); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery_set); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_disable_thermal_cooling_stats); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_enable_thermal_power_throttle); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_power_cap); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tk_based_time_sync); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kswapd_per_node); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_vendor_set); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_ep_action); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_synctype); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_connect); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_audio_usb_offload_disconnect); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_atomic_remove_fb); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drm_atomic_check_modeset); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_get_thermal_zone_device); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psci_tos_resident_on); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psci_cpu_suspend); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_usb_new_device_added); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_regmap_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_mutex_list_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_unlock_slowpath); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dma_buf_release); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pass_input_event); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_check_status); 
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_blk_alloc_rqs); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_blk_rq_ctx_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmap_region); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_unmap_one); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_node_memcgs); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sdio_pm_flag_set); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tcp_sendmsg_locked); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tcp_recvmsg); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_udp_sendmsg); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_udp_recvmsg); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tcp_recvmsg_stat); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_partial_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_cache_card_properties); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_print_transaction_info); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_tlb_conf); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_calc_decayed_watermark); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_watermark); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_reset); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_mq_rw_recovery); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_attach_sd); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sdhci_get_cd); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_gpio_cd_irqt); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_id_remove); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_offline); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_online); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_free); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_alloc); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_adjust); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_enable_thermal_genl_check); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mm_compaction_begin); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mm_compaction_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pagevec_drain); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_zap_pte_range_tlb_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_zap_pte_range_tlb_force_flush); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_zap_pte_range_tlb_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_bh_lru_install); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_skip_lru_disable); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_madvise_blk_plug); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_inactive_list_blk_plug); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_lruvec_blk_plug); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_reclaim_pages_plug); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_use_amu_fie); diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 31bd6f4e5dc4..729e60c812a9 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -21,6 +21,7 @@ #include #include #include +#include static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; @@ -28,6 +29,12 @@ static bool scale_freq_invariant; static bool supports_scale_freq_counters(const struct cpumask *cpus) { + bool use_amu_fie = true; + + trace_android_vh_use_amu_fie(&use_amu_fie); + if (!use_amu_fie) + return false; + return cpumask_subset(cpus, &scale_freq_counters_mask); } @@ -144,6 +151,8 @@ void topology_set_freq_scale(const 
struct cpumask *cpus, unsigned long cur_freq, scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; + trace_android_vh_arch_set_freq_scale(cpus, cur_freq, max_freq, &scale); + for_each_cpu(i, cpus) per_cpu(arch_freq_scale, i) = scale; } @@ -157,6 +166,7 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) } DEFINE_PER_CPU(unsigned long, thermal_pressure); +EXPORT_PER_CPU_SYMBOL_GPL(thermal_pressure); void topology_set_thermal_pressure(const struct cpumask *cpus, unsigned long th_pressure) @@ -202,6 +212,8 @@ static int register_cpu_capacity_sysctl(void) subsys_initcall(register_cpu_capacity_sysctl); static int update_topology; +bool topology_update_done; +EXPORT_SYMBOL_GPL(topology_update_done); int topology_update_cpu_topology(void) { @@ -216,6 +228,8 @@ static void update_topology_flags_workfn(struct work_struct *work) { update_topology = 1; rebuild_sched_domains(); + topology_update_done = true; + trace_android_vh_update_topology_flags_workfn(NULL); pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); update_topology = 0; } diff --git a/drivers/base/core.c b/drivers/base/core.c index 10e027e92692..193e04d74d6f 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1581,7 +1581,7 @@ static int __init fw_devlink_setup(char *arg) } early_param("fw_devlink", fw_devlink_setup); -static bool fw_devlink_strict; +static bool fw_devlink_strict = true; static int __init fw_devlink_strict_setup(char *arg) { return strtobool(arg, &fw_devlink_strict); } diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c index f4d794d6bb85..1c06781f7114 100644 --- a/drivers/base/devcoredump.c +++ b/drivers/base/devcoredump.c @@ -25,6 +25,47 @@ struct devcd_entry { struct device devcd_dev; void *data; size_t datalen; + /* + * Here, a mutex is required to serialize calls to the del_wk work between + * user space and kernel space. The race opens when devcd is added with + * device_add(), which sends a uevent to user space; user space reads the + * uevent and calls devcd_data_write(), which can try to modify the work + * before devcoredump has even initialized/queued it. + * + * + * + * cpu0(X) cpu1(Y) + * + * dev_coredump() uevent sent to user space + * device_add() ======================> user space process Y reads the + * uevent and writes to the devcd fd, + * which results in a call to + * + * devcd_data_write() + * mod_delayed_work() + * try_to_grab_pending() + * del_timer() + * debug_assert_init() + * INIT_DELAYED_WORK() + * schedule_delayed_work() + * + * + * Also, the mutex alone would not be enough to prevent the del_wk work + * from being scheduled again after it has been flushed by a call to + * devcd_free(), as shown below. + * + * disabled_store() + * devcd_free() + * mutex_lock() devcd_data_write() + * flush_delayed_work() + * mutex_unlock() + * mutex_lock() + * mod_delayed_work() + * mutex_unlock() + * So, the delete_work flag is required.
+ */ + struct mutex mutex; + bool delete_work; struct module *owner; ssize_t (*read)(char *buffer, loff_t offset, size_t count, void *data, size_t datalen); @@ -84,7 +125,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct devcd_entry *devcd = dev_to_devcd(dev); - mod_delayed_work(system_wq, &devcd->del_wk, 0); + mutex_lock(&devcd->mutex); + if (!devcd->delete_work) { + devcd->delete_work = true; + mod_delayed_work(system_wq, &devcd->del_wk, 0); + } + mutex_unlock(&devcd->mutex); return count; } @@ -112,7 +158,12 @@ static int devcd_free(struct device *dev, void *data) { struct devcd_entry *devcd = dev_to_devcd(dev); + mutex_lock(&devcd->mutex); + if (!devcd->delete_work) + devcd->delete_work = true; + flush_delayed_work(&devcd->del_wk); + mutex_unlock(&devcd->mutex); return 0; } @@ -122,6 +173,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr, return sysfs_emit(buf, "%d\n", devcd_disabled); } +/* + * + * disabled_store() worker() + * class_for_each_device(&devcd_class, + * NULL, NULL, devcd_free) + * ... + * ... + * while ((dev = class_dev_iter_next(&iter)) + * devcd_del() + * device_del() + * put_device() <- last reference + * error = fn(dev, data) devcd_dev_release() + * devcd_free(dev, data) kfree(devcd) + * mutex_lock(&devcd->mutex); + * + * + * In the diagram above, it looks as if disabled_store() races with a + * concurrently running devcd_del() and could hit a memory abort while + * acquiring devcd->mutex, since the mutex would be taken after the devcd + * memory has been freed by kfree() once the last reference was dropped by + * put_device(). However, this does not happen: fn(dev, data) runs while + * the class iterator holds its own reference to the device via klist_node, + * so the put_device() shown is not the last reference and the situation + * above cannot occur.
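Editor's note: distilled from the two hunks and diagrams above, the fix is a small reusable pattern, a mutex plus a one-shot flag, so that a user-triggered expedite can never re-arm work the kernel path has already flushed. A minimal standalone sketch with hypothetical names:

	#include <linux/mutex.h>
	#include <linux/workqueue.h>

	struct guarded_obj {
		struct mutex lock;
		bool delete_scheduled;
		struct delayed_work del_wk;
	};

	/* user-triggered path: expedite deletion, but only arm the work once */
	static void guarded_expedite(struct guarded_obj *obj)
	{
		mutex_lock(&obj->lock);
		if (!obj->delete_scheduled) {
			obj->delete_scheduled = true;
			mod_delayed_work(system_wq, &obj->del_wk, 0);
		}
		mutex_unlock(&obj->lock);
	}

	/* kernel path: run the teardown now and forbid any later re-arming */
	static void guarded_flush(struct guarded_obj *obj)
	{
		mutex_lock(&obj->lock);
		obj->delete_scheduled = true;
		flush_delayed_work(&obj->del_wk);
		mutex_unlock(&obj->lock);
	}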
+ */ + static ssize_t disabled_store(struct class *class, struct class_attribute *attr, const char *buf, size_t count) { @@ -278,13 +353,16 @@ void dev_coredumpm(struct device *dev, struct module *owner, devcd->read = read; devcd->free = free; devcd->failing_dev = get_device(dev); + devcd->delete_work = false; + mutex_init(&devcd->mutex); device_initialize(&devcd->devcd_dev); dev_set_name(&devcd->devcd_dev, "devcd%d", atomic_inc_return(&devcd_count)); devcd->devcd_dev.class = &devcd_class; + mutex_lock(&devcd->mutex); if (device_add(&devcd->devcd_dev)) goto put_device; @@ -301,10 +379,11 @@ void dev_coredumpm(struct device *dev, struct module *owner, INIT_DELAYED_WORK(&devcd->del_wk, devcd_del); schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT); - + mutex_unlock(&devcd->mutex); return; put_device: put_device(&devcd->devcd_dev); + mutex_unlock(&devcd->mutex); put_module: module_put(owner); free: diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 04ede46f7512..6d151626b764 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -464,21 +464,71 @@ static int fw_decompress_xz(struct device *dev, struct fw_priv *fw_priv, #endif /* CONFIG_FW_LOADER_COMPRESS */ /* direct firmware loading support */ -static char fw_path_para[256]; +#define CUSTOM_FW_PATH_COUNT 10 +#define PATH_SIZE 255 +static char fw_path_para[CUSTOM_FW_PATH_COUNT][PATH_SIZE]; static const char * const fw_path[] = { - fw_path_para, + fw_path_para[0], + fw_path_para[1], + fw_path_para[2], + fw_path_para[3], + fw_path_para[4], + fw_path_para[5], + fw_path_para[6], + fw_path_para[7], + fw_path_para[8], + fw_path_para[9], "/lib/firmware/updates/" UTS_RELEASE, "/lib/firmware/updates", "/lib/firmware/" UTS_RELEASE, "/lib/firmware" }; +static char strpath[PATH_SIZE * CUSTOM_FW_PATH_COUNT]; +static int firmware_param_path_set(const char *val, const struct kernel_param *kp) +{ + int i; + char *path, *end; + + strcpy(strpath, val); + /* Remove leading and trailing spaces from path */ + path = strim(strpath); + for (i = 0; path && i < CUSTOM_FW_PATH_COUNT; i++) { + end = strchr(path, ','); + + /* Skip empty tokens from consecutive delimiters, for example ',,,' */ + if (end == path) { + i--; + path = ++end; + continue; + } + + if (end != NULL) + *end = '\0'; + else { + /* end of string reached and no further ',' tokens */ + strncpy(fw_path_para[i], path, PATH_SIZE); + break; + } + + strncpy(fw_path_para[i], path, PATH_SIZE); + path = ++end; + } + + return 0; +} + /* - * Typical usage is that passing 'firmware_class.path=$CUSTOMIZED_PATH' + * Typical usage is that passing 'firmware_class.path=/vendor,/firmware_mnt' * from kernel command line because firmware_class is generally built in - * kernel instead of module. + * kernel instead of module. ',' is used as the delimiter for setting up + * to 10 custom firmware loader paths.
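Editor's note: as a sanity check on the parsing rules above (comma-separated, empty tokens skipped, at most 10 entries), here is a small stand-alone user-space illustration. It uses strtok(), which collapses consecutive delimiters just like the kernel loop's explicit skip; all names are local to the example:

	#include <stdio.h>
	#include <string.h>

	#define CUSTOM_FW_PATH_COUNT 10

	int main(void)
	{
		char arg[] = "/vendor/firmware,,/firmware_mnt";
		char paths[CUSTOM_FW_PATH_COUNT][256];
		int n = 0;

		/* strtok() skips empty fields, mirroring the ',,,' handling */
		for (char *tok = strtok(arg, ","); tok && n < CUSTOM_FW_PATH_COUNT;
		     tok = strtok(NULL, ","))
			snprintf(paths[n++], sizeof(paths[0]), "%s", tok);

		for (int i = 0; i < n; i++)
			printf("fw_path_para[%d] = %s\n", i, paths[i]);
		return 0;
	}

Running it prints two entries, /vendor/firmware and /firmware_mnt, confirming that the double comma contributes nothing.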
*/ -module_param_string(path, fw_path_para, sizeof(fw_path_para), 0644); + +static const struct kernel_param_ops firmware_param_ops = { + .set = firmware_param_path_set, +}; +module_param_cb(path, &firmware_param_ops, NULL, 0200); MODULE_PARM_DESC(path, "customized firmware image search path with a higher priority than default path"); static int diff --git a/drivers/base/node.c b/drivers/base/node.c index 5366d1b5359c..28274f4bf913 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -433,6 +433,7 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d ShadowCallStack:%8lu kB\n" #endif "Node %d PageTables: %8lu kB\n" + "Node %d SecPageTables: %8lu kB\n" "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" "Node %d WritebackTmp: %8lu kB\n" @@ -459,6 +460,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif nid, K(node_page_state(pgdat, NR_PAGETABLE)), + nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), nid, 0UL, nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c index cd08c5885190..22424e7dbc34 100644 --- a/drivers/base/power/domain_governor.c +++ b/drivers/base/power/domain_governor.c @@ -12,6 +12,8 @@ #include #include +#include + static int dev_update_qos_constraint(struct device *dev, void *data) { s64 *constraint_ns_p = data; @@ -174,6 +176,11 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd, struct pm_domain_data *pdd; s64 min_off_time_ns; s64 off_on_time_ns; + bool allow = true; + + trace_android_vh_allow_domain_state(genpd, state, &allow); + if (!allow) + return false; off_on_time_ns = genpd->states[state].power_off_latency_ns + genpd->states[state].power_on_latency_ns; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 8c4819fe73d4..595646768b29 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "../base.h" #include "power.h" @@ -1244,6 +1245,8 @@ Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; + log_suspend_abort_reason("Callback failed on %s in %pS returned %d", + dev_name(dev), callback, error); goto Complete; } @@ -1440,6 +1443,8 @@ Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; + log_suspend_abort_reason("Callback failed on %s in %pS returned %d", + dev_name(dev), callback, error); goto Complete; } dpm_propagate_wakeup_to_parent(dev); @@ -1715,6 +1720,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); + } else { + log_suspend_abort_reason("Callback failed on %s in %pS returned %d", + dev_name(dev), callback, error); } device_unlock(dev); @@ -1928,6 +1936,9 @@ int dpm_prepare(pm_message_t state) } else { dev_info(dev, "not prepared for power transition: code %d\n", error); + log_suspend_abort_reason("Device %s not prepared for power transition: code %d", + dev_name(dev), error); + dpm_save_failed_dev(dev_name(dev)); } mutex_unlock(&dpm_list_mtx); diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index 8e93167f1783..0489db3a1f60 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -137,6 +137,7 @@ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) return ret; } 
+EXPORT_SYMBOL_GPL(dev_pm_qos_read_value); /** * apply_constraint - Add/modify/remove device PM QoS request. diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 8666590201c9..8c8ffa38404a 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -15,6 +15,9 @@ #include #include #include +#include +#include +#include #include #include "power.h" @@ -874,6 +877,37 @@ void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) } EXPORT_SYMBOL_GPL(pm_wakeup_dev_event); +void pm_get_active_wakeup_sources(char *pending_wakeup_source, size_t max) +{ + struct wakeup_source *ws, *last_active_ws = NULL; + int len = 0; + bool active = false; + + rcu_read_lock(); + list_for_each_entry_rcu(ws, &wakeup_sources, entry) { + if (ws->active && len < max) { + if (!active) + len += scnprintf(pending_wakeup_source, max, + "Pending Wakeup Sources: "); + len += scnprintf(pending_wakeup_source + len, max - len, + "%s ", ws->name); + active = true; + } else if (!active && + (!last_active_ws || + ktime_to_ns(ws->last_time) > + ktime_to_ns(last_active_ws->last_time))) { + last_active_ws = ws; + } + } + if (!active && last_active_ws) { + scnprintf(pending_wakeup_source, max, + "Last active Wakeup Source: %s", + last_active_ws->name); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(pm_get_active_wakeup_sources); + void pm_print_active_wakeup_sources(void) { struct wakeup_source *ws; @@ -912,6 +946,7 @@ bool pm_wakeup_pending(void) { unsigned long flags; bool ret = false; + char suspend_abort[MAX_SUSPEND_ABORT_LEN]; raw_spin_lock_irqsave(&events_lock, flags); if (events_check_enabled) { @@ -926,6 +961,10 @@ bool pm_wakeup_pending(void) if (ret) { pm_pr_dbg("Wakeup pending, aborting suspend\n"); pm_print_active_wakeup_sources(); + pm_get_active_wakeup_sources(suspend_abort, + MAX_SUSPEND_ABORT_LEN); + log_suspend_abort_reason(suspend_abort); + pr_info("PM: %s\n", suspend_abort); } return ret || atomic_read(&pm_abort_suspend) > 0; @@ -975,8 +1014,20 @@ void pm_system_irq_wakeup(unsigned int irq_number) raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags); - if (irq_number) + if (irq_number) { + struct irq_desc *desc; + const char *name = "null"; + + desc = irq_to_desc(irq_number); + if (desc == NULL) + name = "stray irq"; + else if (desc->action && desc->action->name) + name = desc->action->name; + + log_irq_wakeup_reason(irq_number); + pr_warn("%s: %d triggered %s\n", __func__, irq_number, name); pm_system_wakeup(); + } } unsigned int pm_wakeup_irq(void) diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index 13db1f78d2ce..3a9527dc545c 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -10,6 +10,7 @@ #include #include #include +#include static LIST_HEAD(syscore_ops_list); static DEFINE_MUTEX(syscore_ops_lock); @@ -73,7 +74,9 @@ int syscore_suspend(void) return 0; err_out: - pr_err("PM: System core suspend callback %pS failed.\n", ops->suspend); + log_suspend_abort_reason("System core suspend callback %pS failed", + ops->suspend); + pr_err("PM: System core suspend callback %pS failed.\n", ops->suspend); list_for_each_entry_continue(ops, &syscore_ops_list, node) if (ops->resume) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 68a0c0fe64dd..cd9d484af6db 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -86,6 +86,7 @@ #include #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) +#define LOOP_DEFAULT_HW_Q_DEPTH (128) static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); @@ -2105,6 +2106,24 @@
module_param(max_loop, int, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); + +static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; + +static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) +{ + int ret = kstrtoint(s, 10, &hw_queue_depth); + + return (ret || (hw_queue_depth < 1)) ? -EINVAL : 0; +} + +static const struct kernel_param_ops loop_hw_qdepth_param_ops = { + .set = loop_set_hw_queue_depth, + .get = param_get_int, +}; + +device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 128"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -2337,7 +2356,7 @@ static int loop_add(int i) err = -ENOMEM; lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; - lo->tag_set.queue_depth = 128; + lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index d2ba849bb8d1..b6670108872c 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -313,8 +313,6 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, int err; bool notify = false; - BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); - err = virtblk_setup_cmd(vblk->vdev, req, vbr); if (unlikely(err)) return err; @@ -552,6 +550,10 @@ static int init_vq(struct virtio_blk *vblk) &num_vqs); if (err) num_vqs = 1; + if (!err && !num_vqs) { + dev_err(&vdev->dev, "MQ advertised but zero queues reported\n"); + return -EINVAL; + } num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6383c81ac5b3..0da494afd62b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -474,7 +474,7 @@ static ssize_t backing_dev_store(struct device *dev, if (sz > 0 && file_name[sz - 1] == '\n') file_name[sz - 1] = 0x00; - backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0); + backing_dev = filp_open_block(file_name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(backing_dev)) { err = PTR_ERR(backing_dev); backing_dev = NULL; @@ -1376,13 +1376,14 @@ compress_again: __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | - __GFP_MOVABLE); + __GFP_MOVABLE | + __GFP_CMA); if (!handle) { zcomp_stream_put(zram->comp); atomic64_inc(&zram->stats.writestall); handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | - __GFP_MOVABLE); + __GFP_MOVABLE | __GFP_CMA); if (handle) goto compress_again; return -ENOMEM; diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 77bc993d7513..35025f283bf6 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -28,6 +28,7 @@ #include "../tty/hvc/hvc_console.h" #define is_rproc_enabled IS_ENABLED(CONFIG_REMOTEPROC) +#define VIRTCONS_MAX_PORTS 0x8000 /* * This is a global struct for storing common data for all the devices @@ -2043,6 +2044,14 @@ static int virtcons_probe(struct virtio_device *vdev) virtio_cread_feature(vdev, VIRTIO_CONSOLE_F_MULTIPORT, struct virtio_console_config, max_nr_ports, &portdev->max_nr_ports) == 0) { + if (portdev->max_nr_ports == 0 || + portdev->max_nr_ports > VIRTCONS_MAX_PORTS) { + dev_err(&vdev->dev, + "Invalid max_nr_ports %d",
portdev->max_nr_ports); + err = -EINVAL; + goto free; + } multiport = true; } diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 510a9965633b..ba8d4d8cf8dd 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -362,6 +362,7 @@ struct clk *clk_register_composite(struct device *dev, const char *name, return ERR_CAST(hw); return hw->clk; } +EXPORT_SYMBOL_GPL(clk_register_composite); struct clk *clk_register_composite_pdata(struct device *dev, const char *name, const struct clk_parent_data *parent_data, diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 24e243860e27..7480359f5a93 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -72,6 +72,8 @@ struct clk_core { unsigned long flags; bool orphan; bool rpm_enabled; + bool need_sync; + bool boot_enabled; unsigned int enable_count; unsigned int prepare_count; unsigned int protect_count; @@ -1289,6 +1291,10 @@ static void __init clk_unprepare_unused_subtree(struct clk_core *core) hlist_for_each_entry(child, &core->children, child_node) clk_unprepare_unused_subtree(child); + if (dev_has_sync_state(core->dev) && + !(core->flags & CLK_DONT_HOLD_STATE)) + return; + if (core->prepare_count) return; @@ -1320,6 +1326,10 @@ static void __init clk_disable_unused_subtree(struct clk_core *core) hlist_for_each_entry(child, &core->children, child_node) clk_disable_unused_subtree(child); + if (dev_has_sync_state(core->dev) && + !(core->flags & CLK_DONT_HOLD_STATE)) + return; + if (core->flags & CLK_OPS_PARENT_ENABLE) clk_core_prepare_enable(core->parent); @@ -1393,6 +1403,38 @@ static int __init clk_disable_unused(void) } late_initcall_sync(clk_disable_unused); +static void clk_unprepare_disable_dev_subtree(struct clk_core *core, + struct device *dev) +{ + struct clk_core *child; + + lockdep_assert_held(&prepare_lock); + + hlist_for_each_entry(child, &core->children, child_node) + clk_unprepare_disable_dev_subtree(child, dev); + + if (core->dev != dev || !core->need_sync) + return; + + clk_core_disable_unprepare(core); +} + +void clk_sync_state(struct device *dev) +{ + struct clk_core *core; + + clk_prepare_lock(); + + hlist_for_each_entry(core, &clk_root_list, child_node) + clk_unprepare_disable_dev_subtree(core, dev); + + hlist_for_each_entry(core, &clk_orphan_list, child_node) + clk_unprepare_disable_dev_subtree(core, dev); + + clk_prepare_unlock(); +} +EXPORT_SYMBOL_GPL(clk_sync_state); + static int clk_core_determine_round_nolock(struct clk_core *core, struct clk_rate_request *req) { @@ -1837,6 +1879,33 @@ int clk_hw_get_parent_index(struct clk_hw *hw) } EXPORT_SYMBOL_GPL(clk_hw_get_parent_index); +static void clk_core_hold_state(struct clk_core *core) +{ + if (core->need_sync || !core->boot_enabled) + return; + + if (core->orphan || !dev_has_sync_state(core->dev)) + return; + + if (core->flags & CLK_DONT_HOLD_STATE) + return; + + core->need_sync = !clk_core_prepare_enable(core); +} + +static void __clk_core_update_orphan_hold_state(struct clk_core *core) +{ + struct clk_core *child; + + if (core->orphan) + return; + + clk_core_hold_state(core); + + hlist_for_each_entry(child, &core->children, child_node) + __clk_core_update_orphan_hold_state(child); +} + /* * Update the orphan status of @core and all its children. 
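Editor's note on the clk core hunks above: clocks the bootloader left running (boot_enabled) under a provider that implements sync_state keep a prepare/enable hold (need_sync) through the late "disable unused" pass, and clk_sync_state() drops those holds per device once all of its consumers have probed. A hedged sketch of how a provider opts in; the driver name is hypothetical, and it is assumed this tree exposes the clk_sync_state() prototype via clk-provider.h:

	#include <linux/clk-provider.h>
	#include <linux/module.h>
	#include <linux/platform_device.h>

	static int foo_cc_probe(struct platform_device *pdev)
	{
		/* clock registration elided; clocks the bootloader left on
		 * keep a boot-time hold until clk_sync_state() runs */
		return 0;
	}

	static const struct of_device_id foo_cc_match[] = {
		{ .compatible = "example,foo-cc" },	/* hypothetical */
		{ }
	};

	static struct platform_driver foo_cc_driver = {
		.probe = foo_cc_probe,
		.driver = {
			.name = "foo-cc",
			.of_match_table = foo_cc_match,
			/* called by the driver core once every consumer of
			 * this device has probed; releases the boot holds */
			.sync_state = clk_sync_state,
		},
	};
	module_platform_driver(foo_cc_driver);
	MODULE_LICENSE("GPL");

The qcom dispcc/gcc/gpucc/videocc diffs later in this patch wire up .sync_state in exactly this way.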
*/ @@ -2157,6 +2226,13 @@ static struct clk_core *clk_propagate_rate_change(struct clk_core *core, fail_clk = core; } + if (core->ops->pre_rate_change) { + ret = core->ops->pre_rate_change(core->hw, core->rate, + core->new_rate); + if (ret) + fail_clk = core; + } + hlist_for_each_entry(child, &core->children, child_node) { /* Skip children who will be reparented to another clock */ if (child->new_parent && child->new_parent != core) @@ -2251,6 +2327,9 @@ static void clk_change_rate(struct clk_core *core) if (core->flags & CLK_RECALC_NEW_RATES) (void)clk_calc_new_rates(core, core->new_rate); + if (core->ops->post_rate_change) + core->ops->post_rate_change(core->hw, old_rate, core->rate); + /* * Use safe iteration, as change_rate can actually swap parents * for certain clock types. @@ -3194,7 +3273,7 @@ static int clk_dump_show(struct seq_file *s, void *data) } DEFINE_SHOW_ATTRIBUTE(clk_dump); -#undef CLOCK_ALLOW_WRITE_DEBUGFS +#define CLOCK_ALLOW_WRITE_DEBUGFS #ifdef CLOCK_ALLOW_WRITE_DEBUGFS /* * This can be dangerous, therefore don't provide any real compile time @@ -3480,24 +3559,6 @@ static int __init clk_debug_init(void) { struct clk_core *core; -#ifdef CLOCK_ALLOW_WRITE_DEBUGFS - pr_warn("\n"); - pr_warn("********************************************************************\n"); - pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warn("** **\n"); - pr_warn("** WRITEABLE clk DebugFS SUPPORT HAS BEEN ENABLED IN THIS KERNEL **\n"); - pr_warn("** **\n"); - pr_warn("** This means that this kernel is built to expose clk operations **\n"); - pr_warn("** such as parent or rate setting, enabling, disabling, etc. **\n"); - pr_warn("** to userspace, which may compromise security on your system. **\n"); - pr_warn("** **\n"); - pr_warn("** If you see this message and you are not debugging the **\n"); - pr_warn("** kernel, report this immediately to your vendor! **\n"); - pr_warn("** **\n"); - pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warn("********************************************************************\n"); -#endif - rootdir = debugfs_create_dir("clk", NULL); debugfs_create_file("clk_summary", 0444, rootdir, &all_lists, @@ -3550,6 +3611,7 @@ static void clk_core_reparent_orphans_nolock(void) __clk_set_parent_after(orphan, parent, NULL); __clk_recalc_accuracies(orphan); __clk_recalc_rates(orphan, 0); + __clk_core_update_orphan_hold_state(orphan); /* * __clk_init_parent() will set the initial req_rate to @@ -3729,6 +3791,8 @@ static int __clk_core_init(struct clk_core *core) rate = 0; core->rate = core->req_rate = rate; + core->boot_enabled = clk_core_is_enabled(core); + /* * Enable CLK_IS_CRITICAL clocks so newly added critical clocks * don't get accidentally disabled when walking the orphan tree and @@ -3751,6 +3815,7 @@ static int __clk_core_init(struct clk_core *core) } } + clk_core_hold_state(core); clk_core_reparent_orphans_nolock(); diff --git a/drivers/clk/mediatek/Kconfig b/drivers/clk/mediatek/Kconfig index 439b7c8d0d07..d201bbd0a3ab 100644 --- a/drivers/clk/mediatek/Kconfig +++ b/drivers/clk/mediatek/Kconfig @@ -6,7 +6,7 @@ menu "Clock driver for MediaTek SoC" depends on ARCH_MEDIATEK || COMPILE_TEST config COMMON_CLK_MEDIATEK - bool + tristate select RESET_CONTROLLER help MediaTek SoCs' clock support. @@ -204,7 +204,7 @@ config COMMON_CLK_MT6765_MIPI2BSYS This driver supports MediaTek MT6765 mipi2bsys clocks. 
config COMMON_CLK_MT6779 - bool "Clock driver for MediaTek MT6779" + tristate "Clock driver for MediaTek MT6779" depends on (ARCH_MEDIATEK && ARM64) || COMPILE_TEST select COMMON_CLK_MEDIATEK default ARCH_MEDIATEK && ARM64 @@ -212,49 +212,49 @@ config COMMON_CLK_MT6779 This driver supports MediaTek MT6779 basic clocks. config COMMON_CLK_MT6779_MMSYS - bool "Clock driver for MediaTek MT6779 mmsys" + tristate "Clock driver for MediaTek MT6779 mmsys" depends on COMMON_CLK_MT6779 help This driver supports MediaTek MT6779 mmsys clocks. config COMMON_CLK_MT6779_IMGSYS - bool "Clock driver for MediaTek MT6779 imgsys" + tristate "Clock driver for MediaTek MT6779 imgsys" depends on COMMON_CLK_MT6779 help This driver supports MediaTek MT6779 imgsys clocks. config COMMON_CLK_MT6779_IPESYS - bool "Clock driver for MediaTek MT6779 ipesys" + tristate "Clock driver for MediaTek MT6779 ipesys" depends on COMMON_CLK_MT6779 help This driver supports MediaTek MT6779 ipesys clocks. config COMMON_CLK_MT6779_CAMSYS - bool "Clock driver for MediaTek MT6779 camsys" + tristate "Clock driver for MediaTek MT6779 camsys" depends on COMMON_CLK_MT6779 help This driver supports MediaTek MT6779 camsys clocks. config COMMON_CLK_MT6779_VDECSYS - bool "Clock driver for MediaTek MT6779 vdecsys" + tristate "Clock driver for MediaTek MT6779 vdecsys" depends on COMMON_CLK_MT6779 help This driver supports MediaTek MT6779 vdecsys clocks. config COMMON_CLK_MT6779_VENCSYS - bool "Clock driver for MediaTek MT6779 vencsys" + tristate "Clock driver for MediaTek MT6779 vencsys" depends on COMMON_CLK_MT6779 help This driver supports MediaTek MT6779 vencsys clocks. config COMMON_CLK_MT6779_MFGCFG - bool "Clock driver for MediaTek MT6779 mfgcfg" + tristate "Clock driver for MediaTek MT6779 mfgcfg" depends on COMMON_CLK_MT6779 help This driver supports MediaTek MT6779 mfgcfg clocks. config COMMON_CLK_MT6779_AUDSYS - bool "Clock driver for Mediatek MT6779 audsys" + tristate "Clock driver for Mediatek MT6779 audsys" depends on COMMON_CLK_MT6779 help This driver supports Mediatek MT6779 audsys clocks. 
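Editor's note: the long run of MediaTek clock-driver diffs that follows applies one mechanical recipe per file, condensed here as a template (the foo names are placeholders): the Kconfig symbol becomes tristate, builtin_platform_driver() becomes module_platform_driver(), and a MODULE_LICENSE() tag is added so the object can be loaded as a .ko.

	/* Template of the per-driver conversion repeated below. */
	#include <linux/module.h>
	#include <linux/platform_device.h>

	static int clk_foo_probe(struct platform_device *pdev)
	{
		return 0;	/* clock registration elided */
	}

	static struct platform_driver clk_foo_drv = {
		.probe = clk_foo_probe,
		.driver = {
			.name = "clk-foo",
		},
	};

	/* was: builtin_platform_driver(clk_foo_drv); */
	module_platform_driver(clk_foo_drv);
	MODULE_LICENSE("GPL");	/* required for a loadable module */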
diff --git a/drivers/clk/mediatek/clk-apmixed.c b/drivers/clk/mediatek/clk-apmixed.c index 258d128370f2..caa9119413f1 100644 --- a/drivers/clk/mediatek/clk-apmixed.c +++ b/drivers/clk/mediatek/clk-apmixed.c @@ -5,6 +5,7 @@ */ #include +#include #include #include @@ -97,3 +98,5 @@ struct clk * __init mtk_clk_register_ref2usb_tx(const char *name, return clk; } + +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-cpumux.c b/drivers/clk/mediatek/clk-cpumux.c index 61eeae4e60fb..e188018bc906 100644 --- a/drivers/clk/mediatek/clk-cpumux.c +++ b/drivers/clk/mediatek/clk-cpumux.c @@ -6,6 +6,7 @@ #include #include +#include #include #include "clk-mtk.h" @@ -106,3 +107,5 @@ int mtk_clk_register_cpumuxes(struct device_node *node, return 0; } + +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-gate.c b/drivers/clk/mediatek/clk-gate.c index a35cf0b22150..b02d2f74dd0d 100644 --- a/drivers/clk/mediatek/clk-gate.c +++ b/drivers/clk/mediatek/clk-gate.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "clk-mtk.h" #include "clk-gate.h" @@ -122,24 +123,28 @@ const struct clk_ops mtk_clk_gate_ops_setclr = { .enable = mtk_cg_enable, .disable = mtk_cg_disable, }; +EXPORT_SYMBOL_GPL(mtk_clk_gate_ops_setclr); const struct clk_ops mtk_clk_gate_ops_setclr_inv = { .is_enabled = mtk_cg_bit_is_set, .enable = mtk_cg_enable_inv, .disable = mtk_cg_disable_inv, }; +EXPORT_SYMBOL_GPL(mtk_clk_gate_ops_setclr_inv); const struct clk_ops mtk_clk_gate_ops_no_setclr = { .is_enabled = mtk_cg_bit_is_cleared, .enable = mtk_cg_enable_no_setclr, .disable = mtk_cg_disable_no_setclr, }; +EXPORT_SYMBOL_GPL(mtk_clk_gate_ops_no_setclr); const struct clk_ops mtk_clk_gate_ops_no_setclr_inv = { .is_enabled = mtk_cg_bit_is_set, .enable = mtk_cg_enable_inv_no_setclr, .disable = mtk_cg_disable_inv_no_setclr, }; +EXPORT_SYMBOL_GPL(mtk_clk_gate_ops_no_setclr_inv); struct clk *mtk_clk_register_gate( const char *name, @@ -181,3 +186,6 @@ struct clk *mtk_clk_register_gate( return clk; } +EXPORT_SYMBOL_GPL(mtk_clk_register_gate); + +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mt6779-aud.c b/drivers/clk/mediatek/clk-mt6779-aud.c index 11b209f95e25..9e889e4c361a 100644 --- a/drivers/clk/mediatek/clk-mt6779-aud.c +++ b/drivers/clk/mediatek/clk-mt6779-aud.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include #include @@ -114,4 +115,5 @@ static struct platform_driver clk_mt6779_aud_drv = { }, }; -builtin_platform_driver(clk_mt6779_aud_drv); +module_platform_driver(clk_mt6779_aud_drv); +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mt6779-cam.c b/drivers/clk/mediatek/clk-mt6779-cam.c index 244d4208b7fb..7f07a2a139ac 100644 --- a/drivers/clk/mediatek/clk-mt6779-cam.c +++ b/drivers/clk/mediatek/clk-mt6779-cam.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include #include @@ -63,4 +64,5 @@ static struct platform_driver clk_mt6779_cam_drv = { }, }; -builtin_platform_driver(clk_mt6779_cam_drv); +module_platform_driver(clk_mt6779_cam_drv); +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mt6779-img.c b/drivers/clk/mediatek/clk-mt6779-img.c index 26292a45c613..f0961fa1a286 100644 --- a/drivers/clk/mediatek/clk-mt6779-img.c +++ b/drivers/clk/mediatek/clk-mt6779-img.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include #include @@ -55,4 +56,5 @@ static struct platform_driver clk_mt6779_img_drv = { }, }; -builtin_platform_driver(clk_mt6779_img_drv); +module_platform_driver(clk_mt6779_img_drv); +MODULE_LICENSE("GPL"); diff --git 
a/drivers/clk/mediatek/clk-mt6779-ipe.c b/drivers/clk/mediatek/clk-mt6779-ipe.c index bb519075639c..8c6f3e154bf3 100644 --- a/drivers/clk/mediatek/clk-mt6779-ipe.c +++ b/drivers/clk/mediatek/clk-mt6779-ipe.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include #include @@ -57,4 +58,5 @@ static struct platform_driver clk_mt6779_ipe_drv = { }, }; -builtin_platform_driver(clk_mt6779_ipe_drv); +module_platform_driver(clk_mt6779_ipe_drv); +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mt6779-mfg.c b/drivers/clk/mediatek/clk-mt6779-mfg.c index c6ee2a89c070..9f3372886e6b 100644 --- a/drivers/clk/mediatek/clk-mt6779-mfg.c +++ b/drivers/clk/mediatek/clk-mt6779-mfg.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include @@ -52,4 +53,5 @@ static struct platform_driver clk_mt6779_mfg_drv = { }, }; -builtin_platform_driver(clk_mt6779_mfg_drv); +module_platform_driver(clk_mt6779_mfg_drv); +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mt6779-mm.c b/drivers/clk/mediatek/clk-mt6779-mm.c index 059c1a41ac7a..33946e647122 100644 --- a/drivers/clk/mediatek/clk-mt6779-mm.c +++ b/drivers/clk/mediatek/clk-mt6779-mm.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include #include @@ -105,4 +106,5 @@ static struct platform_driver clk_mt6779_mm_drv = { }, }; -builtin_platform_driver(clk_mt6779_mm_drv); +module_platform_driver(clk_mt6779_mm_drv); +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mt6779-vdec.c b/drivers/clk/mediatek/clk-mt6779-vdec.c index 1900da2586a1..f4358844c2e0 100644 --- a/drivers/clk/mediatek/clk-mt6779-vdec.c +++ b/drivers/clk/mediatek/clk-mt6779-vdec.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include @@ -64,4 +65,5 @@ static struct platform_driver clk_mt6779_vdec_drv = { }, }; -builtin_platform_driver(clk_mt6779_vdec_drv); +module_platform_driver(clk_mt6779_vdec_drv); +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mt6779-venc.c b/drivers/clk/mediatek/clk-mt6779-venc.c index b41d1f859edc..ff67084af5aa 100644 --- a/drivers/clk/mediatek/clk-mt6779-venc.c +++ b/drivers/clk/mediatek/clk-mt6779-venc.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include @@ -55,4 +56,5 @@ static struct platform_driver clk_mt6779_venc_drv = { }, }; -builtin_platform_driver(clk_mt6779_venc_drv); +module_platform_driver(clk_mt6779_venc_drv); +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mt6779.c b/drivers/clk/mediatek/clk-mt6779.c index 6e0d3a166729..9825385c9f94 100644 --- a/drivers/clk/mediatek/clk-mt6779.c +++ b/drivers/clk/mediatek/clk-mt6779.c @@ -4,6 +4,7 @@ * Author: Wendell Lin */ +#include #include #include #include @@ -1314,3 +1315,4 @@ static int __init clk_mt6779_init(void) } arch_initcall(clk_mt6779_init); +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mtk.c b/drivers/clk/mediatek/clk-mtk.c index 4b6096c44d74..6624f00763d6 100644 --- a/drivers/clk/mediatek/clk-mtk.c +++ b/drivers/clk/mediatek/clk-mtk.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ err_out: return NULL; } +EXPORT_SYMBOL_GPL(mtk_alloc_clk_data); void mtk_clk_register_fixed_clks(const struct mtk_fixed_clk *clks, int num, struct clk_onecell_data *clk_data) @@ -68,6 +70,7 @@ void mtk_clk_register_fixed_clks(const struct mtk_fixed_clk *clks, clk_data->clks[rc->id] = clk; } } +EXPORT_SYMBOL_GPL(mtk_clk_register_fixed_clks); void mtk_clk_register_factors(const struct mtk_fixed_factor *clks, int num, struct clk_onecell_data 
*clk_data) @@ -94,6 +97,7 @@ void mtk_clk_register_factors(const struct mtk_fixed_factor *clks, clk_data->clks[ff->id] = clk; } } +EXPORT_SYMBOL_GPL(mtk_clk_register_factors); int mtk_clk_register_gates_with_dev(struct device_node *node, const struct mtk_gate *clks, @@ -146,6 +150,7 @@ int mtk_clk_register_gates(struct device_node *node, return mtk_clk_register_gates_with_dev(node, clks, num, clk_data, NULL); } +EXPORT_SYMBOL_GPL(mtk_clk_register_gates); struct clk *mtk_clk_register_composite(const struct mtk_composite *mc, void __iomem *base, spinlock_t *lock) @@ -259,6 +264,7 @@ void mtk_clk_register_composites(const struct mtk_composite *mcs, clk_data->clks[mc->id] = clk; } } +EXPORT_SYMBOL_GPL(mtk_clk_register_composites); void mtk_clk_register_dividers(const struct mtk_clk_divider *mcds, int num, void __iomem *base, spinlock_t *lock, @@ -309,3 +315,5 @@ int mtk_clk_simple_probe(struct platform_device *pdev) return of_clk_add_provider(node, of_clk_src_onecell_get, clk_data); } + +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-mux.c b/drivers/clk/mediatek/clk-mux.c index 855b0a1f7eb9..6d3a50eb7d6f 100644 --- a/drivers/clk/mediatek/clk-mux.c +++ b/drivers/clk/mediatek/clk-mux.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "clk-mtk.h" #include "clk-mux.h" @@ -120,6 +121,7 @@ const struct clk_ops mtk_mux_clr_set_upd_ops = { .get_parent = mtk_clk_mux_get_parent, .set_parent = mtk_clk_mux_set_parent_setclr_lock, }; +EXPORT_SYMBOL_GPL(mtk_mux_clr_set_upd_ops); const struct clk_ops mtk_mux_gate_clr_set_upd_ops = { .enable = mtk_clk_mux_enable_setclr, @@ -128,6 +130,7 @@ const struct clk_ops mtk_mux_gate_clr_set_upd_ops = { .get_parent = mtk_clk_mux_get_parent, .set_parent = mtk_clk_mux_set_parent_setclr_lock, }; +EXPORT_SYMBOL_GPL(mtk_mux_gate_clr_set_upd_ops); static struct clk *mtk_clk_register_mux(const struct mtk_mux *mux, struct regmap *regmap, @@ -195,3 +198,6 @@ int mtk_clk_register_muxes(const struct mtk_mux *muxes, return 0; } +EXPORT_SYMBOL_GPL(mtk_clk_register_muxes); + +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/clk-pll.c b/drivers/clk/mediatek/clk-pll.c index 7fb001a4e7d8..20bf8a15a276 100644 --- a/drivers/clk/mediatek/clk-pll.c +++ b/drivers/clk/mediatek/clk-pll.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -385,3 +386,6 @@ void mtk_clk_register_plls(struct device_node *node, clk_data->clks[pll->id] = clk; } } +EXPORT_SYMBOL_GPL(mtk_clk_register_plls); + +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/mediatek/reset.c b/drivers/clk/mediatek/reset.c index d311da574499..e038c33d0a95 100644 --- a/drivers/clk/mediatek/reset.c +++ b/drivers/clk/mediatek/reset.c @@ -137,3 +137,5 @@ void mtk_register_reset_controller_set_clr(struct device_node *np, mtk_register_reset_controller_common(np, num_regs, regofs, &mtk_reset_ops_set_clr); } + +MODULE_LICENSE("GPL"); diff --git a/drivers/clk/qcom/dispcc-sdm845.c b/drivers/clk/qcom/dispcc-sdm845.c index 735adfefc379..414ffb6feb02 100644 --- a/drivers/clk/qcom/dispcc-sdm845.c +++ b/drivers/clk/qcom/dispcc-sdm845.c @@ -3,6 +3,7 @@ * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved. 
*/ +#include #include #include #include @@ -869,6 +870,7 @@ static struct platform_driver disp_cc_sdm845_driver = { .driver = { .name = "disp_cc-sdm845", .of_match_table = disp_cc_sdm845_match_table, + .sync_state = clk_sync_state, }, }; diff --git a/drivers/clk/qcom/gcc-msm8998.c b/drivers/clk/qcom/gcc-msm8998.c index 050c91af888e..2523e2bd1330 100644 --- a/drivers/clk/qcom/gcc-msm8998.c +++ b/drivers/clk/qcom/gcc-msm8998.c @@ -3162,6 +3162,7 @@ static struct platform_driver gcc_msm8998_driver = { .driver = { .name = "gcc-msm8998", .of_match_table = gcc_msm8998_match_table, + .sync_state = clk_sync_state, }, }; diff --git a/drivers/clk/qcom/gcc-sdm845.c b/drivers/clk/qcom/gcc-sdm845.c index 58aa3ec9a7fc..c2f093f6ae61 100644 --- a/drivers/clk/qcom/gcc-sdm845.c +++ b/drivers/clk/qcom/gcc-sdm845.c @@ -3624,6 +3624,7 @@ static struct platform_driver gcc_sdm845_driver = { .driver = { .name = "gcc-sdm845", .of_match_table = gcc_sdm845_match_table, + .sync_state = clk_sync_state, }, }; diff --git a/drivers/clk/qcom/gpucc-sdm845.c b/drivers/clk/qcom/gpucc-sdm845.c index 110b54401bc6..8749aabd3b0d 100644 --- a/drivers/clk/qcom/gpucc-sdm845.c +++ b/drivers/clk/qcom/gpucc-sdm845.c @@ -205,6 +205,7 @@ static struct platform_driver gpu_cc_sdm845_driver = { .driver = { .name = "sdm845-gpucc", .of_match_table = gpu_cc_sdm845_match_table, + .sync_state = clk_sync_state, }, }; diff --git a/drivers/clk/qcom/videocc-sdm845.c b/drivers/clk/qcom/videocc-sdm845.c index c77a4dd5d39c..f678ade82656 100644 --- a/drivers/clk/qcom/videocc-sdm845.c +++ b/drivers/clk/qcom/videocc-sdm845.c @@ -337,6 +337,7 @@ static struct platform_driver video_cc_sdm845_driver = { .driver = { .name = "sdm845-videocc", .of_match_table = video_cc_sdm845_match_table, + .sync_state = clk_sync_state, }, }; diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 08f8cb944a2a..265c4dc858ab 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -128,7 +128,7 @@ config RDA_TIMER Enables the support for the RDA Micro timer driver. 
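The qcom hunks above all wire the same callback into the driver core: .sync_state = clk_sync_state. The sync_state mechanism lets the clk core keep boot-enabled but otherwise unclaimed clocks running until every consumer of the provider has probed, and only then disable them. Below is a minimal sketch of how a clock-provider platform driver hooks this up; the driver name, compatible string, and probe body are hypothetical, and clk_sync_state is assumed to be the common helper these diffs reference rather than a driver-local function.

#include <linux/clk-provider.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>

/* Hypothetical clock controller, for illustration only. */
static const struct of_device_id example_cc_match_table[] = {
	{ .compatible = "vendor,example-cc" },
	{ /* sentinel */ }
};

static int example_cc_probe(struct platform_device *pdev)
{
	/* A real driver registers its clocks with the clk framework here. */
	return 0;
}

static struct platform_driver example_cc_driver = {
	.probe = example_cc_probe,
	.driver = {
		.name = "example-cc",
		.of_match_table = example_cc_match_table,
		/*
		 * Called by the driver core once all consumers of this
		 * device have probed; the clk core can then drop clocks
		 * it was keeping on only for not-yet-probed consumers.
		 */
		.sync_state = clk_sync_state,
	},
};
module_platform_driver(example_cc_driver);

MODULE_DESCRIPTION("Illustrative clock provider wiring a sync_state callback");
MODULE_LICENSE("GPL");

Registering the callback per provider, as each of the diffs above does, lets every clock controller opt in independently once its own consumers are accounted for.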
config SUN4I_TIMER - bool "Sun4i timer driver" if COMPILE_TEST + bool "Sun4i timer driver" depends on HAS_IOMEM select CLKSRC_MMIO select TIMER_OF diff --git a/drivers/clocksource/mmio.c b/drivers/clocksource/mmio.c index 9de751531831..826dcc42629c 100644 --- a/drivers/clocksource/mmio.c +++ b/drivers/clocksource/mmio.c @@ -21,6 +21,7 @@ u64 clocksource_mmio_readl_up(struct clocksource *c) { return (u64)readl_relaxed(to_mmio_clksrc(c)->reg); } +EXPORT_SYMBOL_GPL(clocksource_mmio_readl_up); u64 clocksource_mmio_readl_down(struct clocksource *c) { @@ -46,7 +47,7 @@ u64 clocksource_mmio_readw_down(struct clocksource *c) * @bits: Number of valid bits * @read: One of clocksource_mmio_read*() above */ -int __init clocksource_mmio_init(void __iomem *base, const char *name, +int clocksource_mmio_init(void __iomem *base, const char *name, unsigned long hz, int rating, unsigned bits, u64 (*read)(struct clocksource *)) { @@ -68,3 +69,4 @@ int __init clocksource_mmio_init(void __iomem *base, const char *name, return clocksource_register_hz(&cs->clksrc, hz); } +EXPORT_SYMBOL_GPL(clocksource_mmio_init); diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c index c3f54d9912be..d11c9de303df 100644 --- a/drivers/clocksource/timer-of.c +++ b/drivers/clocksource/timer-of.c @@ -19,7 +19,7 @@ * * Free the irq resource */ -static __init void timer_of_irq_exit(struct of_timer_irq *of_irq) +static void timer_of_irq_exit(struct of_timer_irq *of_irq) { struct timer_of *to = container_of(of_irq, struct timer_of, of_irq); @@ -47,7 +47,7 @@ static __init void timer_of_irq_exit(struct of_timer_irq *of_irq) * * Returns 0 on success, < 0 otherwise */ -static __init int timer_of_irq_init(struct device_node *np, +static int timer_of_irq_init(struct device_node *np, struct of_timer_irq *of_irq) { int ret; @@ -91,7 +91,7 @@ static __init int timer_of_irq_init(struct device_node *np, * * Disables and releases the refcount on the clk */ -static __init void timer_of_clk_exit(struct of_timer_clk *of_clk) +static void timer_of_clk_exit(struct of_timer_clk *of_clk) { of_clk->rate = 0; clk_disable_unprepare(of_clk->clk); @@ -107,7 +107,7 @@ static __init void timer_of_clk_exit(struct of_timer_clk *of_clk) * * Returns 0 on success, < 0 otherwise */ -static __init int timer_of_clk_init(struct device_node *np, +static int timer_of_clk_init(struct device_node *np, struct of_timer_clk *of_clk) { int ret; @@ -146,12 +146,12 @@ out_clk_put: goto out; } -static __init void timer_of_base_exit(struct of_timer_base *of_base) +static void timer_of_base_exit(struct of_timer_base *of_base) { iounmap(of_base->base); } -static __init int timer_of_base_init(struct device_node *np, +static int timer_of_base_init(struct device_node *np, struct of_timer_base *of_base) { of_base->base = of_base->name ? @@ -165,7 +165,7 @@ static __init int timer_of_base_init(struct device_node *np, return 0; } -int __init timer_of_init(struct device_node *np, struct timer_of *to) +int timer_of_init(struct device_node *np, struct timer_of *to) { int ret = -EINVAL; int flags = 0; @@ -209,6 +209,7 @@ out_fail: timer_of_base_exit(&to->of_base); return ret; } +EXPORT_SYMBOL_GPL(timer_of_init); /** * timer_of_cleanup - release timer_of resources @@ -217,7 +218,7 @@ out_fail: * Release the resources that has been used in timer_of_init(). 
* This function should be called in init error cases */ -void __init timer_of_cleanup(struct timer_of *to) +void timer_of_cleanup(struct timer_of *to) { if (to->flags & TIMER_OF_IRQ) timer_of_irq_exit(&to->of_irq); diff --git a/drivers/clocksource/timer-of.h b/drivers/clocksource/timer-of.h index a5478f3e8589..1b8cfac5900a 100644 --- a/drivers/clocksource/timer-of.h +++ b/drivers/clocksource/timer-of.h @@ -66,9 +66,9 @@ static inline unsigned long timer_of_period(struct timer_of *to) return to->of_clk.period; } -extern int __init timer_of_init(struct device_node *np, +extern int timer_of_init(struct device_node *np, struct timer_of *to); -extern void __init timer_of_cleanup(struct timer_of *to); +extern void timer_of_cleanup(struct timer_of *to); #endif diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index c3038cdc6865..c63de9ed3dc0 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -35,6 +35,13 @@ config CPU_FREQ_STAT If in doubt, say N. +config CPU_FREQ_TIMES + bool "CPU frequency time-in-state statistics" + help + Export CPU time-in-state information through procfs. + + If in doubt, say N. + choice prompt "Default CPUFreq governor" default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ @@ -227,6 +234,15 @@ config CPUFREQ_DT_PLATDEV If in doubt, say N. +config CPUFREQ_DUMMY + tristate "Dummy CPU frequency driver" + help + This option adds a generic dummy CPUfreq driver, which sets a fake + 2-frequency table when initializing each policy and otherwise does + nothing. + + If in doubt, say N + if X86 source "drivers/cpufreq/Kconfig.x86" endif diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 48ee5859030c..f357aa757a09 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -5,7 +5,10 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o # CPUfreq stats obj-$(CONFIG_CPU_FREQ_STAT) += cpufreq_stats.o -# CPUfreq governors +# CPUfreq times +obj-$(CONFIG_CPU_FREQ_TIMES) += cpufreq_times.o + +# CPUfreq governors obj-$(CONFIG_CPU_FREQ_GOV_PERFORMANCE) += cpufreq_performance.o obj-$(CONFIG_CPU_FREQ_GOV_POWERSAVE) += cpufreq_powersave.o obj-$(CONFIG_CPU_FREQ_GOV_USERSPACE) += cpufreq_userspace.o @@ -17,6 +20,8 @@ obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET) += cpufreq_governor_attr_set.o obj-$(CONFIG_CPUFREQ_DT) += cpufreq-dt.o obj-$(CONFIG_CPUFREQ_DT_PLATDEV) += cpufreq-dt-platdev.o +obj-$(CONFIG_CPUFREQ_DUMMY) += dummy-cpufreq.o + ################################################################################## # x86 drivers. # Link order matters. 
K8 is preferred to ACPI because of firmware bugs in early diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b998b5083953..6bfaaead22a6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -30,6 +31,8 @@ #include #include #include +#include +#include static LIST_HEAD(cpufreq_policy_list); @@ -387,7 +390,9 @@ static void cpufreq_notify_transition(struct cpufreq_policy *policy, CPUFREQ_POSTCHANGE, freqs); cpufreq_stats_record_transition(policy, freqs->new); + cpufreq_times_record_transition(policy, freqs->new); policy->cur = freqs->new; + trace_android_rvh_cpufreq_transition(policy); } } @@ -529,8 +534,10 @@ static unsigned int __resolve_freq(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { unsigned int idx; + unsigned int old_target_freq = target_freq; target_freq = clamp_val(target_freq, policy->min, policy->max); + trace_android_vh_cpufreq_resolve_freq(policy, &target_freq, old_target_freq); if (!policy->freq_table) return target_freq; @@ -689,8 +696,15 @@ static ssize_t show_##file_name \ return sprintf(buf, "%u\n", policy->object); \ } +static ssize_t show_cpuinfo_max_freq(struct cpufreq_policy *policy, char *buf) +{ + unsigned int max_freq = policy->cpuinfo.max_freq; + + trace_android_rvh_show_max_freq(policy, &max_freq); + return sprintf(buf, "%u\n", max_freq); +} + show_one(cpuinfo_min_freq, cpuinfo.min_freq); -show_one(cpuinfo_max_freq, cpuinfo.max_freq); show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); show_one(scaling_min_freq, min); show_one(scaling_max_freq, max); @@ -1487,6 +1501,7 @@ static int cpufreq_online(unsigned int cpu) goto out_destroy_policy; cpufreq_stats_create_table(policy); + cpufreq_times_create_policy(policy); write_lock_irqsave(&cpufreq_driver_lock, flags); list_add(&policy->policy_list, &cpufreq_policy_list); @@ -1517,8 +1532,10 @@ static int cpufreq_online(unsigned int cpu) kobject_uevent(&policy->kobj, KOBJ_ADD); - if (cpufreq_thermal_control_enabled(cpufreq_driver)) + if (cpufreq_thermal_control_enabled(cpufreq_driver)) { policy->cdev = of_cpufreq_cooling_register(policy); + trace_android_vh_thermal_register(policy); + } pr_debug("initialization complete\n"); @@ -1613,6 +1630,7 @@ static int cpufreq_offline(unsigned int cpu) if (cpufreq_thermal_control_enabled(cpufreq_driver)) { cpufreq_cooling_unregister(policy->cdev); + trace_android_vh_thermal_unregister(policy); policy->cdev = NULL; } @@ -2098,9 +2116,11 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { unsigned int freq; + unsigned int old_target_freq = target_freq; int cpu; target_freq = clamp_val(target_freq, policy->min, policy->max); + trace_android_vh_cpufreq_fast_switch(policy, &target_freq, old_target_freq); freq = cpufreq_driver->fast_switch(policy, target_freq); if (!freq) @@ -2110,6 +2130,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, arch_set_freq_scale(policy->related_cpus, freq, policy->cpuinfo.max_freq); cpufreq_stats_record_transition(policy, freq); + trace_android_rvh_cpufreq_transition(policy); if (trace_cpu_frequency_enabled()) { for_each_cpu(cpu, policy->cpus) @@ -2257,6 +2278,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, target_freq = __resolve_freq(policy, target_freq, relation); + trace_android_vh_cpufreq_target(policy, &target_freq, old_target_freq); + pr_debug("target for CPU %u: %u kHz, relation %u, requested %u 
kHz\n", policy->cpu, target_freq, relation, old_target_freq); @@ -2578,7 +2601,6 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, ret = cpufreq_start_governor(policy); if (!ret) { pr_debug("governor change\n"); - sched_cpufreq_governor_change(policy, old_gov); return 0; } cpufreq_exit_governor(policy); @@ -2596,6 +2618,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, return ret; } +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency_limits); /** * cpufreq_update_policy - Re-evaluate an existing cpufreq policy. diff --git a/drivers/cpufreq/cpufreq_times.c b/drivers/cpufreq/cpufreq_times.c new file mode 100644 index 000000000000..4df55b323fc1 --- /dev/null +++ b/drivers/cpufreq/cpufreq_times.c @@ -0,0 +1,211 @@ +/* drivers/cpufreq/cpufreq_times.c + * + * Copyright (C) 2018 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(task_time_in_state_lock); /* task->time_in_state */ + +/** + * struct cpu_freqs - per-cpu frequency information + * @offset: start of these freqs' stats in task time_in_state array + * @max_state: number of entries in freq_table + * @last_index: index in freq_table of last frequency switched to + * @freq_table: list of available frequencies + */ +struct cpu_freqs { + unsigned int offset; + unsigned int max_state; + unsigned int last_index; + unsigned int freq_table[0]; +}; + +static struct cpu_freqs *all_freqs[NR_CPUS]; + +static unsigned int next_offset; + +void cpufreq_task_times_init(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&task_time_in_state_lock, flags); + p->time_in_state = NULL; + spin_unlock_irqrestore(&task_time_in_state_lock, flags); + p->max_state = 0; +} + +void cpufreq_task_times_alloc(struct task_struct *p) +{ + void *temp; + unsigned long flags; + unsigned int max_state = READ_ONCE(next_offset); + + /* We use one array to avoid multiple allocs per task */ + temp = kcalloc(max_state, sizeof(p->time_in_state[0]), GFP_ATOMIC); + if (!temp) + return; + + spin_lock_irqsave(&task_time_in_state_lock, flags); + p->time_in_state = temp; + spin_unlock_irqrestore(&task_time_in_state_lock, flags); + p->max_state = max_state; +} + +/* Caller must hold task_time_in_state_lock */ +static int cpufreq_task_times_realloc_locked(struct task_struct *p) +{ + void *temp; + unsigned int max_state = READ_ONCE(next_offset); + + temp = krealloc(p->time_in_state, max_state * sizeof(u64), GFP_ATOMIC); + if (!temp) + return -ENOMEM; + p->time_in_state = temp; + memset(p->time_in_state + p->max_state, 0, + (max_state - p->max_state) * sizeof(u64)); + p->max_state = max_state; + return 0; +} + +void cpufreq_task_times_exit(struct task_struct *p) +{ + unsigned long flags; + void *temp; + + if (!p->time_in_state) + return; + + spin_lock_irqsave(&task_time_in_state_lock, flags); + temp = p->time_in_state; + p->time_in_state = NULL; + spin_unlock_irqrestore(&task_time_in_state_lock, flags); + kfree(temp); +} + +int proc_time_in_state_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, 
struct task_struct *p) +{ + unsigned int cpu, i; + u64 cputime; + unsigned long flags; + struct cpu_freqs *freqs; + struct cpu_freqs *last_freqs = NULL; + + spin_lock_irqsave(&task_time_in_state_lock, flags); + for_each_possible_cpu(cpu) { + freqs = all_freqs[cpu]; + if (!freqs || freqs == last_freqs) + continue; + last_freqs = freqs; + + seq_printf(m, "cpu%u\n", cpu); + for (i = 0; i < freqs->max_state; i++) { + cputime = 0; + if (freqs->offset + i < p->max_state && + p->time_in_state) + cputime = p->time_in_state[freqs->offset + i]; + seq_printf(m, "%u %lu\n", freqs->freq_table[i], + (unsigned long)nsec_to_clock_t(cputime)); + } + } + spin_unlock_irqrestore(&task_time_in_state_lock, flags); + return 0; +} + +void cpufreq_acct_update_power(struct task_struct *p, u64 cputime) +{ + unsigned long flags; + unsigned int state; + struct cpu_freqs *freqs = all_freqs[task_cpu(p)]; + + if (!freqs || is_idle_task(p) || p->flags & PF_EXITING) + return; + + state = freqs->offset + READ_ONCE(freqs->last_index); + + spin_lock_irqsave(&task_time_in_state_lock, flags); + if ((state < p->max_state || !cpufreq_task_times_realloc_locked(p)) && + p->time_in_state) + p->time_in_state[state] += cputime; + spin_unlock_irqrestore(&task_time_in_state_lock, flags); +} + +static int cpufreq_times_get_index(struct cpu_freqs *freqs, unsigned int freq) +{ + int index; + for (index = 0; index < freqs->max_state; ++index) { + if (freqs->freq_table[index] == freq) + return index; + } + return -1; +} + +void cpufreq_times_create_policy(struct cpufreq_policy *policy) +{ + int cpu, index = 0; + unsigned int count = 0; + struct cpufreq_frequency_table *pos, *table; + struct cpu_freqs *freqs; + void *tmp; + + if (all_freqs[policy->cpu]) + return; + + table = policy->freq_table; + if (!table) + return; + + cpufreq_for_each_valid_entry(pos, table) + count++; + + tmp = kzalloc(sizeof(*freqs) + sizeof(freqs->freq_table[0]) * count, + GFP_KERNEL); + if (!tmp) + return; + + freqs = tmp; + freqs->max_state = count; + + cpufreq_for_each_valid_entry(pos, table) + freqs->freq_table[index++] = pos->frequency; + + index = cpufreq_times_get_index(freqs, policy->cur); + if (index >= 0) + WRITE_ONCE(freqs->last_index, index); + + freqs->offset = next_offset; + WRITE_ONCE(next_offset, freqs->offset + count); + for_each_cpu(cpu, policy->related_cpus) + all_freqs[cpu] = freqs; +} + +void cpufreq_times_record_transition(struct cpufreq_policy *policy, + unsigned int new_freq) +{ + int index; + struct cpu_freqs *freqs = all_freqs[policy->cpu]; + if (!freqs) + return; + + index = cpufreq_times_get_index(freqs, new_freq); + if (index >= 0) + WRITE_ONCE(freqs->last_index, index); +} diff --git a/drivers/cpufreq/dummy-cpufreq.c b/drivers/cpufreq/dummy-cpufreq.c new file mode 100644 index 000000000000..e74ef6782e5a --- /dev/null +++ b/drivers/cpufreq/dummy-cpufreq.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. 
+ */ +#include <linux/cpufreq.h> +#include <linux/module.h> + +static struct cpufreq_frequency_table freq_table[] = { + { .frequency = 1 }, + { .frequency = 2 }, + { .frequency = CPUFREQ_TABLE_END }, +}; + +static int dummy_cpufreq_target_index(struct cpufreq_policy *policy, + unsigned int index) +{ + return 0; +} + +static int dummy_cpufreq_driver_init(struct cpufreq_policy *policy) +{ + policy->freq_table = freq_table; + return 0; +} + +static unsigned int dummy_cpufreq_get(unsigned int cpu) +{ + return 1; +} + +static int dummy_cpufreq_verify(struct cpufreq_policy_data *data) +{ + return 0; +} + +static struct cpufreq_driver dummy_cpufreq_driver = { + .name = "dummy", + .target_index = dummy_cpufreq_target_index, + .init = dummy_cpufreq_driver_init, + .get = dummy_cpufreq_get, + .verify = dummy_cpufreq_verify, + .attr = cpufreq_generic_attr, +}; + +static int __init dummy_cpufreq_init(void) +{ + return cpufreq_register_driver(&dummy_cpufreq_driver); +} + +static void __exit dummy_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&dummy_cpufreq_driver); +} + +module_init(dummy_cpufreq_init); +module_exit(dummy_cpufreq_exit); + +MODULE_AUTHOR("Connor O'Brien "); +MODULE_DESCRIPTION("dummy cpufreq driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 67e56cf638ef..24d3115c8350 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -9,6 +9,7 @@ #include #include +#include /********************************************************************* * FREQUENCY TABLE HELPERS * @@ -51,6 +52,7 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, max_freq = freq; } + trace_android_vh_freq_table_limits(policy, min_freq, max_freq); policy->min = policy->cpuinfo.min_freq = min_freq; policy->max = max_freq; /* diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index 540105ca0781..cfefef7483ac 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -26,6 +26,7 @@ #include #include +#include #include "cpuidle-psci.h" #include "dt_idle_states.h" @@ -68,8 +69,12 @@ static int __psci_enter_domain_idle_state(struct cpuidle_device *dev, if (ret) return -1; + /* Do runtime PM to manage a hierarchical CPU topology. */ rcu_irq_enter_irqson(); + + trace_android_vh_cpuidle_psci_enter(dev, s2idle); + if (s2idle) dev_pm_genpd_suspend(pd_dev); else @@ -87,6 +92,9 @@ static int __psci_enter_domain_idle_state(struct cpuidle_device *dev, dev_pm_genpd_resume(pd_dev); else pm_runtime_get_sync(pd_dev); + + trace_android_vh_cpuidle_psci_exit(dev, s2idle); + rcu_irq_exit_irqson(); cpu_pm_exit(); diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index ef2ea1b12cd8..a01f04b31d50 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "cpuidle.h" @@ -202,10 +203,21 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, { int entered_state; - struct cpuidle_state *target_state = &drv->states[index]; - bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP); + struct cpuidle_state *target_state; + bool broadcast; ktime_t time_start, time_end; + /* + * The vendor hook may modify index, which means target_state and + * broadcast must be assigned after the vendor hook.
+ */ + trace_android_vh_cpu_idle_enter(&index, dev); + if (index < 0) + return index; + + target_state = &drv->states[index]; + broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP); + /* * Tell the time framework to switch to a broadcast timer because our * local timer will be shut down. If a local timer is used from another @@ -242,6 +254,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu); + trace_android_vh_cpu_idle_exit(entered_state, dev); /* The cpu is no longer idle or about to enter idle. */ sched_idle_set_state(NULL); diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index f70aa17e2a8e..5bfdb8663e99 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -385,3 +385,4 @@ unlock: mutex_unlock(&cpuidle_lock); } +EXPORT_SYMBOL_GPL(cpuidle_driver_state_disabled); diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index 29acaf48e575..0e51ed25665e 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -102,6 +102,7 @@ int cpuidle_register_governor(struct cpuidle_governor *gov) return ret; } +EXPORT_SYMBOL_GPL(cpuidle_register_governor); /** * cpuidle_governor_latency_req - Compute a latency constraint for CPU @@ -118,3 +119,4 @@ s64 cpuidle_governor_latency_req(unsigned int cpu) return (s64)device_req * NSEC_PER_USEC; } +EXPORT_SYMBOL_GPL(cpuidle_governor_latency_req); diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 053baadcada9..b989b9c79d20 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dma-buf-sysfs-stats.h" @@ -168,10 +169,46 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +static void sysfs_add_workfn(struct work_struct *work) +{ + struct dma_buf_sysfs_entry *sysfs_entry = + container_of(work, struct dma_buf_sysfs_entry, sysfs_add_work); + struct dma_buf *dmabuf = sysfs_entry->dmabuf; + + /* + * A dmabuf is ref-counted via its file member. If this handler holds the only + * reference to the dmabuf, there is no need for sysfs kobject creation. This is an + * optimization and a race; when the reference count drops to 1 immediately after + * this check it is not harmful as the sysfs entry will still get cleaned up in + * dma_buf_stats_teardown, which won't get called until the final dmabuf reference + * is released, and that can't happen until the end of this function. + */ + if (file_count(dmabuf->file) > 1) { + /* + * kobject_init_and_add expects kobject to be zero-filled, but we have populated it + * (the sysfs_add_work union member) to trigger this work function. + */ + memset(&dmabuf->sysfs_entry->kobj, 0, sizeof(dmabuf->sysfs_entry->kobj)); + dmabuf->sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; + if (kobject_init_and_add(&dmabuf->sysfs_entry->kobj, &dma_buf_ktype, NULL, + "%lu", file_inode(dmabuf->file)->i_ino)) { + kobject_put(&dmabuf->sysfs_entry->kobj); + dmabuf->sysfs_entry = NULL; + } + } else { + /* + * Free the sysfs_entry and reset the pointer so dma_buf_stats_teardown doesn't + * attempt to operate on it. 
+ */ + kfree(dmabuf->sysfs_entry); + dmabuf->sysfs_entry = NULL; + } + dma_buf_put(dmabuf); +} + int dma_buf_stats_setup(struct dma_buf *dmabuf) { struct dma_buf_sysfs_entry *sysfs_entry; - int ret; if (!dmabuf || !dmabuf->file) return -EINVAL; @@ -181,25 +218,16 @@ int dma_buf_stats_setup(struct dma_buf *dmabuf) return -EINVAL; } - sysfs_entry = kzalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); + sysfs_entry = kmalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); if (!sysfs_entry) return -ENOMEM; - sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; sysfs_entry->dmabuf = dmabuf; - dmabuf->sysfs_entry = sysfs_entry; - /* create the directory for buffer stats */ - ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL, - "%lu", file_inode(dmabuf->file)->i_ino); - if (ret) - goto err_sysfs_dmabuf; + INIT_WORK(&dmabuf->sysfs_entry->sysfs_add_work, sysfs_add_workfn); + get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn. */ + schedule_work(&dmabuf->sysfs_entry->sysfs_add_work); return 0; - -err_sysfs_dmabuf: - kobject_put(&sysfs_entry->kobj); - dmabuf->sysfs_entry = NULL; - return ret; } diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 968c3df2810e..25a204333d62 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -29,9 +29,9 @@ #include #include -#include "dma-buf-sysfs-stats.h" +#include -static inline int is_dma_buf_file(struct file *); +#include "dma-buf-sysfs-stats.h" struct dma_buf_list { struct list_head head; @@ -40,6 +40,30 @@ struct dma_buf_list { static struct dma_buf_list db_list; +/* + * This function helps in traversing the db_list and calls the + * callback function which can extract required info out of each + * dmabuf. + */ +int get_each_dmabuf(int (*callback)(const struct dma_buf *dmabuf, + void *private), void *private) +{ + struct dma_buf *buf; + int ret = mutex_lock_interruptible(&db_list.lock); + + if (ret) + return ret; + + list_for_each_entry(buf, &db_list.head, list_node) { + ret = callback(buf, private); + if (ret) + break; + } + mutex_unlock(&db_list.lock); + return ret; +} +EXPORT_SYMBOL_NS_GPL(get_each_dmabuf, MINIDUMP); + static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen) { struct dma_buf *dmabuf; @@ -76,6 +100,7 @@ static void dma_buf_release(struct dentry *dentry) dma_buf_stats_teardown(dmabuf); dmabuf->ops->release(dmabuf); + trace_android_vh_dma_buf_release(dmabuf); if (dmabuf->resv == (struct dma_resv *)&dmabuf[1]) dma_resv_fini(dmabuf->resv); @@ -322,6 +347,25 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll) return events; } +static long _dma_buf_set_name(struct dma_buf *dmabuf, const char *name) +{ + long ret = 0; + + dma_resv_lock(dmabuf->resv, NULL); + if (!list_empty(&dmabuf->attachments)) { + ret = -EBUSY; + goto out_unlock; + } + spin_lock(&dmabuf->name_lock); + kfree(dmabuf->name); + dmabuf->name = name; + spin_unlock(&dmabuf->name_lock); + +out_unlock: + dma_resv_unlock(dmabuf->resv); + return ret; +} + /** * dma_buf_set_name - Set a name to a specific dma_buf to track the usage. * The name of the dma-buf buffer can only be set when the dma-buf is not @@ -337,7 +381,23 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll) * devices, return -EBUSY. 
* */ -static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) +long dma_buf_set_name(struct dma_buf *dmabuf, const char *name) +{ + long ret = 0; + char *buf = kstrndup(name, DMA_BUF_NAME_LEN, GFP_KERNEL); + + if (!buf) + return -ENOMEM; + + ret = _dma_buf_set_name(dmabuf, buf); + if (ret) + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(dma_buf_set_name); + +static long dma_buf_set_name_user(struct dma_buf *dmabuf, const char __user *buf) { char *name = strndup_user(buf, DMA_BUF_NAME_LEN); long ret = 0; @@ -345,19 +405,10 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) if (IS_ERR(name)) return PTR_ERR(name); - dma_resv_lock(dmabuf->resv, NULL); - if (!list_empty(&dmabuf->attachments)) { - ret = -EBUSY; + ret = _dma_buf_set_name(dmabuf, name); + if (ret) kfree(name); - goto out_unlock; - } - spin_lock(&dmabuf->name_lock); - kfree(dmabuf->name); - dmabuf->name = name; - spin_unlock(&dmabuf->name_lock); -out_unlock: - dma_resv_unlock(dmabuf->resv); return ret; } @@ -402,7 +453,7 @@ static long dma_buf_ioctl(struct file *file, case DMA_BUF_SET_NAME_A: case DMA_BUF_SET_NAME_B: - return dma_buf_set_name(dmabuf, (const char __user *)arg); + return dma_buf_set_name_user(dmabuf, (const char __user *)arg); default: return -ENOTTY; @@ -436,10 +487,11 @@ static const struct file_operations dma_buf_fops = { /* * is_dma_buf_file - Check if struct file* is associated with dma_buf */ -static inline int is_dma_buf_file(struct file *file) +int is_dma_buf_file(struct file *file) { return file->f_op == &dma_buf_fops; } +EXPORT_SYMBOL_NS_GPL(is_dma_buf_file, MINIDUMP); static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags) { @@ -1218,6 +1270,58 @@ int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, } EXPORT_SYMBOL_GPL(dma_buf_begin_cpu_access); + +/** + * dma_buf_begin_cpu_access_partial - An alternative to dma_buf_begin_cpu_access + * which applies to a subset of the buffer instead of the entire buffer. This + * can offer better performance than dma_buf_begin_cpu_access if implemented + * by an exporter. If an exporter does not implement partial buffer access then + * this function will return -EOPNOTSUPP and it is the responsibility of the + * caller to fallback to dma_buf_begin_cpu_access or fail entirely. + * + * @dmabuf: [in] buffer to prepare cpu access for. + * @direction: [in] direction for access. + * @offset: [in] offset in bytes from beginning of buffer for partial + range. + * @len: [in] length in bytes of partial range beginning at @offset. + * + * After the cpu access is complete the caller should call + * dma_buf_end_cpu_access_partial(). Only when cpu access is bracketed by both + * calls is it guaranteed to be coherent with other DMA access. + * + * This function will also wait for any DMA transactions tracked through + * implicit synchronization in &dma_buf.resv. For DMA transactions with explicit + * synchronization this function will only ensure cache coherency, callers must + * ensure synchronization with such DMA transactions on their own. + * + * Can return negative error values, returns 0 on success. 
+ */ +int dma_buf_begin_cpu_access_partial(struct dma_buf *dmabuf, + enum dma_data_direction direction, + unsigned int offset, unsigned int len) +{ + int ret; + + if (WARN_ON(!dmabuf)) + return -EINVAL; + + if (!dmabuf->ops->begin_cpu_access_partial) + return -EOPNOTSUPP; + + ret = dmabuf->ops->begin_cpu_access_partial(dmabuf, direction, + offset, len); + + /* Ensure that all fences are waited upon - but we first allow + * the native handler the chance to do so more efficiently if it + * chooses. A double invocation here will be reasonably cheap no-op. + */ + if (ret == 0) + ret = __dma_buf_begin_cpu_access(dmabuf, direction); + + return ret; +} +EXPORT_SYMBOL_GPL(dma_buf_begin_cpu_access_partial); + /** * dma_buf_end_cpu_access - Must be called after accessing a dma_buf from the * cpu in the kernel context. Calls end_cpu_access to allow exporter-specific @@ -1246,6 +1350,37 @@ int dma_buf_end_cpu_access(struct dma_buf *dmabuf, } EXPORT_SYMBOL_GPL(dma_buf_end_cpu_access); +/** + * dma_buf_end_cpu_access_partial - An alternative to dma_buf_end_cpu_access + * which applies to a subset of the buffer instead of the entire buffer. This + * can offer better performance than dma_buf_end_cpu_access if implemented + * by an exporter. If an exporter does not implement partial buffer access then + * this function will return -EOPNOTSUPP and it is the responsibility of the + * caller to fallback to dma_buf_end_cpu_access or fail entirely. + * + * @dmabuf: [in] buffer to complete cpu access for. + * @direction: [in] direction for access. + * @offset: [in] offset in bytes from beginning of buffer for partial + range. + * @len: [in] length in bytes of partial range beginning at @offset. + * + * This terminates CPU access started with dma_buf_begin_cpu_access_partial(). + * + * Can return negative error values, returns 0 on success. 
+ */ +int dma_buf_end_cpu_access_partial(struct dma_buf *dmabuf, + enum dma_data_direction direction, + unsigned int offset, unsigned int len) +{ + WARN_ON(!dmabuf); + + if (!dmabuf->ops->end_cpu_access_partial) + return -EOPNOTSUPP; + + return dmabuf->ops->end_cpu_access_partial(dmabuf, direction, + offset, len); +} +EXPORT_SYMBOL_GPL(dma_buf_end_cpu_access_partial); /** * dma_buf_mmap - Setup up a userspace mmap with the given vma @@ -1366,6 +1501,20 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map) } EXPORT_SYMBOL_GPL(dma_buf_vunmap); +int dma_buf_get_flags(struct dma_buf *dmabuf, unsigned long *flags) +{ + int ret = 0; + + if (WARN_ON(!dmabuf) || !flags) + return -EINVAL; + + if (dmabuf->ops->get_flags) + ret = dmabuf->ops->get_flags(dmabuf, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(dma_buf_get_flags); + #ifdef CONFIG_DEBUG_FS static int dma_buf_debug_show(struct seq_file *s, void *unused) { @@ -1393,6 +1542,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) if (ret) goto error_unlock; + spin_lock(&buf_obj->name_lock); seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n", buf_obj->size, buf_obj->file->f_flags, buf_obj->file->f_mode, @@ -1400,6 +1550,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) buf_obj->exp_name, file_inode(buf_obj->file)->i_ino, buf_obj->name ?: ""); + spin_unlock(&buf_obj->name_lock); robj = buf_obj->resv; fence = dma_resv_excl_fence(robj); diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c index 59d158873f4c..53d8ee80a26d 100644 --- a/drivers/dma-buf/dma-heap.c +++ b/drivers/dma-buf/dma-heap.c @@ -31,6 +31,7 @@ * @heap_devt heap device node * @list list head connecting to list of heaps * @heap_cdev heap char device + * @heap_dev heap device struct * * Represents a heap of memory from which buffers can be made. */ @@ -41,6 +42,8 @@ struct dma_heap { dev_t heap_devt; struct list_head list; struct cdev heap_cdev; + struct kref refcount; + struct device *heap_dev; }; static LIST_HEAD(heap_list); @@ -49,22 +52,60 @@ static dev_t dma_heap_devt; static struct class *dma_heap_class; static DEFINE_XARRAY_ALLOC(dma_heap_minors); -static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len, - unsigned int fd_flags, - unsigned int heap_flags) +struct dma_heap *dma_heap_find(const char *name) { - struct dma_buf *dmabuf; - int fd; + struct dma_heap *h; + mutex_lock(&heap_list_lock); + list_for_each_entry(h, &heap_list, list) { + if (!strcmp(h->name, name)) { + kref_get(&h->refcount); + mutex_unlock(&heap_list_lock); + return h; + } + } + mutex_unlock(&heap_list_lock); + return NULL; +} +EXPORT_SYMBOL_GPL(dma_heap_find); + + +void dma_heap_buffer_free(struct dma_buf *dmabuf) +{ + dma_buf_put(dmabuf); +} +EXPORT_SYMBOL_GPL(dma_heap_buffer_free); + +struct dma_buf *dma_heap_buffer_alloc(struct dma_heap *heap, size_t len, + unsigned int fd_flags, + unsigned int heap_flags) +{ + if (fd_flags & ~DMA_HEAP_VALID_FD_FLAGS) + return ERR_PTR(-EINVAL); + + if (heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS) + return ERR_PTR(-EINVAL); /* * Allocations from all heaps have to begin * and end on page boundaries. 
*/ len = PAGE_ALIGN(len); if (!len) - return -EINVAL; + return ERR_PTR(-EINVAL); + + return heap->ops->allocate(heap, len, fd_flags, heap_flags); +} +EXPORT_SYMBOL_GPL(dma_heap_buffer_alloc); + +int dma_heap_bufferfd_alloc(struct dma_heap *heap, size_t len, + unsigned int fd_flags, + unsigned int heap_flags) +{ + struct dma_buf *dmabuf; + int fd; + + dmabuf = dma_heap_buffer_alloc(heap, len, fd_flags, heap_flags); - dmabuf = heap->ops->allocate(heap, len, fd_flags, heap_flags); if (IS_ERR(dmabuf)) return PTR_ERR(dmabuf); @@ -74,7 +115,9 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len, /* just return, as put will call release and that will free */ } return fd; + } +EXPORT_SYMBOL_GPL(dma_heap_bufferfd_alloc); static int dma_heap_open(struct inode *inode, struct file *file) { @@ -102,15 +145,9 @@ static long dma_heap_ioctl_allocate(struct file *file, void *data) if (heap_allocation->fd) return -EINVAL; - if (heap_allocation->fd_flags & ~DMA_HEAP_VALID_FD_FLAGS) - return -EINVAL; - - if (heap_allocation->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS) - return -EINVAL; - - fd = dma_heap_buffer_alloc(heap, heap_allocation->len, - heap_allocation->fd_flags, - heap_allocation->heap_flags); + fd = dma_heap_bufferfd_alloc(heap, heap_allocation->len, + heap_allocation->fd_flags, + heap_allocation->heap_flags); if (fd < 0) return fd; @@ -203,6 +240,47 @@ void *dma_heap_get_drvdata(struct dma_heap *heap) { return heap->priv; } +EXPORT_SYMBOL_GPL(dma_heap_get_drvdata); + +static void dma_heap_release(struct kref *ref) +{ + struct dma_heap *heap = container_of(ref, struct dma_heap, refcount); + int minor = MINOR(heap->heap_devt); + + /* Note: we are already holding the heap_list_lock here */ + list_del(&heap->list); + + device_destroy(dma_heap_class, heap->heap_devt); + cdev_del(&heap->heap_cdev); + xa_erase(&dma_heap_minors, minor); + + kfree(heap); +} + +void dma_heap_put(struct dma_heap *h) +{ + /* + * Take the heap_list_lock now to avoid racing with code + * scanning the list and then taking a kref. + */ + mutex_lock(&heap_list_lock); + kref_put(&h->refcount, dma_heap_release); + mutex_unlock(&heap_list_lock); +} +EXPORT_SYMBOL_GPL(dma_heap_put); + +/** + * dma_heap_get_dev() - get device struct for the heap + * @heap: DMA-Heap to retrieve device struct from + * + * Returns: + * The device struct for the heap.
+ */ +struct device *dma_heap_get_dev(struct dma_heap *heap) +{ + return heap->heap_dev; +} +EXPORT_SYMBOL_GPL(dma_heap_get_dev); /** * dma_heap_get_name() - get heap name @@ -215,11 +293,11 @@ const char *dma_heap_get_name(struct dma_heap *heap) { return heap->name; } +EXPORT_SYMBOL_GPL(dma_heap_get_name); struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info) { struct dma_heap *heap, *h, *err_ret; - struct device *dev_ret; unsigned int minor; int ret; @@ -237,6 +315,7 @@ struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info) if (!heap) return ERR_PTR(-ENOMEM); + kref_init(&heap->refcount); heap->name = exp_info->name; heap->ops = exp_info->ops; heap->priv = exp_info->priv; @@ -261,17 +340,20 @@ struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info) goto err1; } - dev_ret = device_create(dma_heap_class, - NULL, - heap->heap_devt, - NULL, - heap->name); - if (IS_ERR(dev_ret)) { + heap->heap_dev = device_create(dma_heap_class, + NULL, + heap->heap_devt, + NULL, + heap->name); + if (IS_ERR(heap->heap_dev)) { pr_err("dma_heap: Unable to create device\n"); - err_ret = ERR_CAST(dev_ret); + err_ret = ERR_CAST(heap->heap_dev); goto err2; } + /* Make sure it doesn't disappear on us */ + heap->heap_dev = get_device(heap->heap_dev); + mutex_lock(&heap_list_lock); /* check the name is unique */ list_for_each_entry(h, &heap_list, list) { @@ -280,6 +362,7 @@ struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info) pr_err("dma_heap: Already registered heap named %s\n", exp_info->name); err_ret = ERR_PTR(-EINVAL); + put_device(heap->heap_dev); goto err3; } } @@ -300,27 +383,88 @@ err0: kfree(heap); return err_ret; } +EXPORT_SYMBOL_GPL(dma_heap_add); static char *dma_heap_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "dma_heap/%s", dev_name(dev)); } +static ssize_t total_pools_kb_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct dma_heap *heap; + u64 total_pool_size = 0; + + mutex_lock(&heap_list_lock); + list_for_each_entry(heap, &heap_list, list) { + if (heap->ops->get_pool_size) + total_pool_size += heap->ops->get_pool_size(heap); + } + mutex_unlock(&heap_list_lock); + + return sysfs_emit(buf, "%llu\n", total_pool_size / 1024); +} + +static struct kobj_attribute total_pools_kb_attr = + __ATTR_RO(total_pools_kb); + +static struct attribute *dma_heap_sysfs_attrs[] = { + &total_pools_kb_attr.attr, + NULL, +}; + +ATTRIBUTE_GROUPS(dma_heap_sysfs); + +static struct kobject *dma_heap_kobject; + +static int dma_heap_sysfs_setup(void) +{ + int ret; + + dma_heap_kobject = kobject_create_and_add("dma_heap", kernel_kobj); + if (!dma_heap_kobject) + return -ENOMEM; + + ret = sysfs_create_groups(dma_heap_kobject, dma_heap_sysfs_groups); + if (ret) { + kobject_put(dma_heap_kobject); + return ret; + } + + return 0; +} + +static void dma_heap_sysfs_teardown(void) +{ + kobject_put(dma_heap_kobject); +} + static int dma_heap_init(void) { int ret; - ret = alloc_chrdev_region(&dma_heap_devt, 0, NUM_HEAP_MINORS, DEVNAME); + ret = dma_heap_sysfs_setup(); if (ret) return ret; + ret = alloc_chrdev_region(&dma_heap_devt, 0, NUM_HEAP_MINORS, DEVNAME); + if (ret) + goto err_chrdev; + dma_heap_class = class_create(THIS_MODULE, DEVNAME); if (IS_ERR(dma_heap_class)) { - unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS); - return PTR_ERR(dma_heap_class); + ret = PTR_ERR(dma_heap_class); + goto err_class; } dma_heap_class->devnode = dma_heap_devnode; return 0; + +err_class: + 
unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS); +err_chrdev: + dma_heap_sysfs_teardown(); + return ret; } subsys_initcall(dma_heap_init); diff --git a/drivers/dma-buf/heaps/Kconfig b/drivers/dma-buf/heaps/Kconfig index a5eef06c4226..ff52efa83f39 100644 --- a/drivers/dma-buf/heaps/Kconfig +++ b/drivers/dma-buf/heaps/Kconfig @@ -1,12 +1,22 @@ +menuconfig DMABUF_HEAPS_DEFERRED_FREE + bool "DMA-BUF heaps deferred-free library" + help + Choose this option to enable the DMA-BUF heaps deferred-free library. + +menuconfig DMABUF_HEAPS_PAGE_POOL + bool "DMA-BUF heaps page-pool library" + help + Choose this option to enable the DMA-BUF heaps page-pool library. + config DMABUF_HEAPS_SYSTEM - bool "DMA-BUF System Heap" - depends on DMABUF_HEAPS + tristate "DMA-BUF System Heap" + depends on DMABUF_HEAPS && DMABUF_HEAPS_DEFERRED_FREE && DMABUF_HEAPS_PAGE_POOL help Choose this option to enable the system dmabuf heap. The system heap is backed by pages from the buddy allocator. If in doubt, say Y. config DMABUF_HEAPS_CMA - bool "DMA-BUF CMA Heap" + tristate "DMA-BUF CMA Heap" depends on DMABUF_HEAPS && DMA_CMA help Choose this option to enable dma-buf CMA heap. This heap is backed diff --git a/drivers/dma-buf/heaps/Makefile b/drivers/dma-buf/heaps/Makefile index 974467791032..4d4cd94a3a4a 100644 --- a/drivers/dma-buf/heaps/Makefile +++ b/drivers/dma-buf/heaps/Makefile @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DMABUF_HEAPS_DEFERRED_FREE) += deferred-free-helper.o +obj-$(CONFIG_DMABUF_HEAPS_PAGE_POOL) += page_pool.o obj-$(CONFIG_DMABUF_HEAPS_SYSTEM) += system_heap.o obj-$(CONFIG_DMABUF_HEAPS_CMA) += cma_heap.o diff --git a/drivers/dma-buf/heaps/cma_heap.c b/drivers/dma-buf/heaps/cma_heap.c index 83f02bd51dda..86f3cfee1266 100644 --- a/drivers/dma-buf/heaps/cma_heap.c +++ b/drivers/dma-buf/heaps/cma_heap.c @@ -99,9 +99,10 @@ static struct sg_table *cma_heap_map_dma_buf(struct dma_buf_attachment *attachme { struct dma_heap_attachment *a = attachment->priv; struct sg_table *table = &a->table; + int attrs = attachment->dma_map_attrs; int ret; - ret = dma_map_sgtable(attachment->dev, table, direction, 0); + ret = dma_map_sgtable(attachment->dev, table, direction, attrs); if (ret) return ERR_PTR(-ENOMEM); a->mapped = true; @@ -113,9 +114,10 @@ static void cma_heap_unmap_dma_buf(struct dma_buf_attachment *attachment, enum dma_data_direction direction) { struct dma_heap_attachment *a = attachment->priv; + int attrs = attachment->dma_map_attrs; a->mapped = false; - dma_unmap_sgtable(attachment->dev, table, direction, 0); + dma_unmap_sgtable(attachment->dev, table, direction, attrs); } static int cma_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf, @@ -300,7 +302,7 @@ static struct dma_buf *cma_heap_allocate(struct dma_heap *heap, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - cma_pages = cma_alloc(cma_heap->cma, pagecount, align, false); + cma_pages = cma_alloc(cma_heap->cma, pagecount, align, GFP_KERNEL); if (!cma_pages) goto free_buffer; diff --git a/drivers/dma-buf/heaps/deferred-free-helper.c b/drivers/dma-buf/heaps/deferred-free-helper.c new file mode 100644 index 000000000000..e19c8b68dfeb --- /dev/null +++ b/drivers/dma-buf/heaps/deferred-free-helper.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Deferred dmabuf freeing helper + * + * Copyright (C) 2020 Linaro, Ltd. + * + * Based on the ION page pool code + * Copyright (C) 2011 Google, Inc. 
+ */ + +#include +#include +#include +#include +#include + +#include "deferred-free-helper.h" + +static LIST_HEAD(free_list); +static size_t list_nr_pages; +wait_queue_head_t freelist_waitqueue; +struct task_struct *freelist_task; +static DEFINE_SPINLOCK(free_list_lock); + +void deferred_free(struct deferred_freelist_item *item, + void (*free)(struct deferred_freelist_item*, + enum df_reason), + size_t nr_pages) +{ + unsigned long flags; + + INIT_LIST_HEAD(&item->list); + item->nr_pages = nr_pages; + item->free = free; + + spin_lock_irqsave(&free_list_lock, flags); + list_add(&item->list, &free_list); + list_nr_pages += nr_pages; + spin_unlock_irqrestore(&free_list_lock, flags); + wake_up(&freelist_waitqueue); +} +EXPORT_SYMBOL_GPL(deferred_free); + +static size_t free_one_item(enum df_reason reason) +{ + unsigned long flags; + size_t nr_pages; + struct deferred_freelist_item *item; + + spin_lock_irqsave(&free_list_lock, flags); + if (list_empty(&free_list)) { + spin_unlock_irqrestore(&free_list_lock, flags); + return 0; + } + item = list_first_entry(&free_list, struct deferred_freelist_item, list); + list_del(&item->list); + nr_pages = item->nr_pages; + list_nr_pages -= nr_pages; + spin_unlock_irqrestore(&free_list_lock, flags); + + item->free(item, reason); + return nr_pages; +} + +static unsigned long get_freelist_nr_pages(void) +{ + unsigned long nr_pages; + unsigned long flags; + + spin_lock_irqsave(&free_list_lock, flags); + nr_pages = list_nr_pages; + spin_unlock_irqrestore(&free_list_lock, flags); + return nr_pages; +} + +static unsigned long freelist_shrink_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + return get_freelist_nr_pages(); +} + +static unsigned long freelist_shrink_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long total_freed = 0; + + if (sc->nr_to_scan == 0) + return 0; + + while (total_freed < sc->nr_to_scan) { + size_t pages_freed = free_one_item(DF_UNDER_PRESSURE); + + if (!pages_freed) + break; + + total_freed += pages_freed; + } + + return total_freed; +} + +static struct shrinker freelist_shrinker = { + .count_objects = freelist_shrink_count, + .scan_objects = freelist_shrink_scan, + .seeks = DEFAULT_SEEKS, + .batch = 0, +}; + +static int deferred_free_thread(void *data) +{ + while (true) { + wait_event_freezable(freelist_waitqueue, + get_freelist_nr_pages() > 0); + + free_one_item(DF_NORMAL); + } + + return 0; +} + +static int deferred_freelist_init(void) +{ + list_nr_pages = 0; + + init_waitqueue_head(&freelist_waitqueue); + freelist_task = kthread_run(deferred_free_thread, NULL, + "%s", "dmabuf-deferred-free-worker"); + if (IS_ERR(freelist_task)) { + pr_err("Creating thread for deferred free failed\n"); + return -1; + } + sched_set_normal(freelist_task, 19); + + return register_shrinker(&freelist_shrinker); +} +module_init(deferred_freelist_init); +MODULE_LICENSE("GPL v2"); + diff --git a/drivers/dma-buf/heaps/deferred-free-helper.h b/drivers/dma-buf/heaps/deferred-free-helper.h new file mode 100644 index 000000000000..11940328ce3f --- /dev/null +++ b/drivers/dma-buf/heaps/deferred-free-helper.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef DEFERRED_FREE_HELPER_H +#define DEFERRED_FREE_HELPER_H + +/** + * df_reason - enum for reason why item was freed + * + * This provides a reason for why the free function was called + * on the item. This is useful when deferred_free is used in + * combination with a pagepool, so under pressure the page can + * be immediately freed. 
+ * + * DF_NORMAL: Normal deferred free + * + * DF_UNDER_PRESSURE: Free was called because the system + * is under memory pressure. Usually + * from a shrinker. Avoid allocating + * memory in the free call, as it may + * fail. + */ +enum df_reason { + DF_NORMAL, + DF_UNDER_PRESSURE, +}; + +/** + * deferred_freelist_item - item structure for deferred freelist + * + * This is to be added to the structure for whatever you want to + * defer freeing on. + * + * @nr_pages: number of pages used by item to be freed + * @free: function pointer to be called when freeing the item + * @list: list entry for the deferred list + */ +struct deferred_freelist_item { + size_t nr_pages; + void (*free)(struct deferred_freelist_item *i, + enum df_reason reason); + struct list_head list; +}; + +/** + * deferred_free - call to add item to the deferred free list + * + * @item: Pointer to deferred_freelist_item field of a structure + * @free: Function pointer to the free call + * @nr_pages: number of pages to be freed + */ +void deferred_free(struct deferred_freelist_item *item, + void (*free)(struct deferred_freelist_item *i, + enum df_reason reason), + size_t nr_pages); +#endif diff --git a/drivers/dma-buf/heaps/page_pool.c b/drivers/dma-buf/heaps/page_pool.c new file mode 100644 index 000000000000..7d46c36d96a7 --- /dev/null +++ b/drivers/dma-buf/heaps/page_pool.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DMA BUF page pool system + * + * Copyright (C) 2020 Linaro Ltd. + * + * Based on the ION page pool code + * Copyright (C) 2011 Google, Inc. + */ + +#include "page_pool.h" + +#include +#include +#include +#include +#include + +static LIST_HEAD(pool_list); +static DEFINE_MUTEX(pool_list_lock); + +static inline +struct page *dmabuf_page_pool_alloc_pages(struct dmabuf_page_pool *pool) +{ + if (fatal_signal_pending(current)) + return NULL; + return alloc_pages(pool->gfp_mask, pool->order); +} + +static inline void dmabuf_page_pool_free_pages(struct dmabuf_page_pool *pool, + struct page *page) +{ + __free_pages(page, pool->order); +} + +static void dmabuf_page_pool_add(struct dmabuf_page_pool *pool, struct page *page) +{ + int index; + + if (PageHighMem(page)) + index = POOL_HIGHPAGE; + else + index = POOL_LOWPAGE; + + spin_lock(&pool->lock); + list_add_tail(&page->lru, &pool->items[index]); + pool->count[index]++; + spin_unlock(&pool->lock); + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, + 1 << pool->order); +} + +static struct page *dmabuf_page_pool_remove(struct dmabuf_page_pool *pool, int index) +{ + struct page *page; + + spin_lock(&pool->lock); + page = list_first_entry_or_null(&pool->items[index], struct page, lru); + if (page) { + pool->count[index]--; + list_del(&page->lru); + spin_unlock(&pool->lock); + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, + -(1 << pool->order)); + goto out; + } + spin_unlock(&pool->lock); +out: + return page; +} + +static struct page *dmabuf_page_pool_fetch(struct dmabuf_page_pool *pool) +{ + struct page *page = NULL; + + page = dmabuf_page_pool_remove(pool, POOL_HIGHPAGE); + if (!page) + page = dmabuf_page_pool_remove(pool, POOL_LOWPAGE); + + return page; +} + +struct page *dmabuf_page_pool_alloc(struct dmabuf_page_pool *pool) +{ + struct page *page = NULL; + + if (WARN_ON(!pool)) + return NULL; + + page = dmabuf_page_pool_fetch(pool); + + if (!page) + page = dmabuf_page_pool_alloc_pages(pool); + return page; +} +EXPORT_SYMBOL_GPL(dmabuf_page_pool_alloc); + +void dmabuf_page_pool_free(struct dmabuf_page_pool *pool, 
struct page *page) +{ + if (WARN_ON(pool->order != compound_order(page))) + return; + + dmabuf_page_pool_add(pool, page); +} +EXPORT_SYMBOL_GPL(dmabuf_page_pool_free); + +static int dmabuf_page_pool_total(struct dmabuf_page_pool *pool, bool high) +{ + int count = pool->count[POOL_LOWPAGE]; + + if (high) + count += pool->count[POOL_HIGHPAGE]; + + return count << pool->order; +} + +struct dmabuf_page_pool *dmabuf_page_pool_create(gfp_t gfp_mask, unsigned int order) +{ + struct dmabuf_page_pool *pool = kmalloc(sizeof(*pool), GFP_KERNEL); + int i; + + if (!pool) + return NULL; + + for (i = 0; i < POOL_TYPE_SIZE; i++) { + pool->count[i] = 0; + INIT_LIST_HEAD(&pool->items[i]); + } + pool->gfp_mask = gfp_mask | __GFP_COMP; + pool->order = order; + spin_lock_init(&pool->lock); + + mutex_lock(&pool_list_lock); + list_add(&pool->list, &pool_list); + mutex_unlock(&pool_list_lock); + + return pool; +} +EXPORT_SYMBOL_GPL(dmabuf_page_pool_create); + +void dmabuf_page_pool_destroy(struct dmabuf_page_pool *pool) +{ + struct page *page; + int i; + + /* Remove us from the pool list */ + mutex_lock(&pool_list_lock); + list_del(&pool->list); + mutex_unlock(&pool_list_lock); + + /* Free any remaining pages in the pool */ + for (i = 0; i < POOL_TYPE_SIZE; i++) { + while ((page = dmabuf_page_pool_remove(pool, i))) + dmabuf_page_pool_free_pages(pool, page); + } + + kfree(pool); +} +EXPORT_SYMBOL_GPL(dmabuf_page_pool_destroy); + +static int dmabuf_page_pool_do_shrink(struct dmabuf_page_pool *pool, gfp_t gfp_mask, + int nr_to_scan) +{ + int freed = 0; + bool high; + + if (current_is_kswapd()) + high = true; + else + high = !!(gfp_mask & __GFP_HIGHMEM); + + if (nr_to_scan == 0) + return dmabuf_page_pool_total(pool, high); + + while (freed < nr_to_scan) { + struct page *page; + + /* Try to free low pages first */ + page = dmabuf_page_pool_remove(pool, POOL_LOWPAGE); + if (!page) + page = dmabuf_page_pool_remove(pool, POOL_HIGHPAGE); + + if (!page) + break; + + dmabuf_page_pool_free_pages(pool, page); + freed += (1 << pool->order); + } + + return freed; +} + +static int dmabuf_page_pool_shrink(gfp_t gfp_mask, int nr_to_scan) +{ + struct dmabuf_page_pool *pool; + int nr_total = 0; + int nr_freed; + int only_scan = 0; + + if (!nr_to_scan) + only_scan = 1; + + mutex_lock(&pool_list_lock); + list_for_each_entry(pool, &pool_list, list) { + if (only_scan) { + nr_total += dmabuf_page_pool_do_shrink(pool, + gfp_mask, + nr_to_scan); + } else { + nr_freed = dmabuf_page_pool_do_shrink(pool, + gfp_mask, + nr_to_scan); + nr_to_scan -= nr_freed; + nr_total += nr_freed; + if (nr_to_scan <= 0) + break; + } + } + mutex_unlock(&pool_list_lock); + + return nr_total; +} + +static unsigned long dmabuf_page_pool_shrink_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + return dmabuf_page_pool_shrink(sc->gfp_mask, 0); +} + +static unsigned long dmabuf_page_pool_shrink_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + if (sc->nr_to_scan == 0) + return 0; + return dmabuf_page_pool_shrink(sc->gfp_mask, sc->nr_to_scan); +} + +struct shrinker pool_shrinker = { + .count_objects = dmabuf_page_pool_shrink_count, + .scan_objects = dmabuf_page_pool_shrink_scan, + .seeks = DEFAULT_SEEKS, + .batch = 0, +}; + +static int dmabuf_page_pool_init_shrinker(void) +{ + return register_shrinker(&pool_shrinker); +} +module_init(dmabuf_page_pool_init_shrinker); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma-buf/heaps/page_pool.h b/drivers/dma-buf/heaps/page_pool.h new file mode 100644 index 000000000000..891adee7491f --- 
/dev/null +++ b/drivers/dma-buf/heaps/page_pool.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DMA BUF PagePool implementation + * Based on earlier ION code by Google + * + * Copyright (C) 2011 Google, Inc. + * Copyright (C) 2020 Linaro Ltd. + */ + +#ifndef _DMABUF_PAGE_POOL_H +#define _DMABUF_PAGE_POOL_H + +#include +#include +#include + +/* page types we track in the pool */ +enum { + POOL_LOWPAGE, /* Clean lowmem pages */ + POOL_HIGHPAGE, /* Clean highmem pages */ + + POOL_TYPE_SIZE, +}; + +/** + * struct dmabuf_page_pool - pagepool struct + * @count[]: array of number of pages of that type in the pool + * @items[]: array of list of pages of the specific type + * @lock: lock protecting this struct and especially the count + * item list + * @gfp_mask: gfp_mask to use from alloc + * @order: order of pages in the pool + * @list: list node for list of pools + * + * Allows you to keep a pool of pre allocated pages to use + */ +struct dmabuf_page_pool { + int count[POOL_TYPE_SIZE]; + struct list_head items[POOL_TYPE_SIZE]; + struct spinlock lock; + gfp_t gfp_mask; + unsigned int order; + struct list_head list; +}; + +struct dmabuf_page_pool *dmabuf_page_pool_create(gfp_t gfp_mask, + unsigned int order); +void dmabuf_page_pool_destroy(struct dmabuf_page_pool *pool); +struct page *dmabuf_page_pool_alloc(struct dmabuf_page_pool *pool); +void dmabuf_page_pool_free(struct dmabuf_page_pool *pool, struct page *page); + +#endif /* _DMABUF_PAGE_POOL_H */ diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c index 8660508f3684..af4e6faea8d0 100644 --- a/drivers/dma-buf/heaps/system_heap.c +++ b/drivers/dma-buf/heaps/system_heap.c @@ -21,7 +21,10 @@ #include #include +#include "page_pool.h" + static struct dma_heap *sys_heap; +static struct dma_heap *sys_uncached_heap; struct system_heap_buffer { struct dma_heap *heap; @@ -31,6 +34,8 @@ struct system_heap_buffer { struct sg_table sg_table; int vmap_cnt; void *vaddr; + + bool uncached; }; struct dma_heap_attachment { @@ -38,13 +43,16 @@ struct dma_heap_attachment { struct sg_table *table; struct list_head list; bool mapped; + + bool uncached; }; +#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO | __GFP_COMP) +#define MID_ORDER_GFP (LOW_ORDER_GFP | __GFP_NOWARN) #define HIGH_ORDER_GFP (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \ | __GFP_NORETRY) & ~__GFP_RECLAIM) \ | __GFP_COMP) -#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO | __GFP_COMP) -static gfp_t order_flags[] = {HIGH_ORDER_GFP, LOW_ORDER_GFP, LOW_ORDER_GFP}; +static gfp_t order_flags[] = {HIGH_ORDER_GFP, MID_ORDER_GFP, LOW_ORDER_GFP}; /* * The selection of the orders used for allocation (1MB, 64K, 4K) is designed * to match with the sizes often found in IOMMUs. 
Using order 4 pages instead @@ -53,6 +61,7 @@ static gfp_t order_flags[] = {HIGH_ORDER_GFP, LOW_ORDER_GFP, LOW_ORDER_GFP}; */ static const unsigned int orders[] = {8, 4, 0}; #define NUM_ORDERS ARRAY_SIZE(orders) +struct dmabuf_page_pool *pools[NUM_ORDERS]; static struct sg_table *dup_sg_table(struct sg_table *table) { @@ -100,7 +109,7 @@ static int system_heap_attach(struct dma_buf *dmabuf, a->dev = attachment->dev; INIT_LIST_HEAD(&a->list); a->mapped = false; - + a->uncached = buffer->uncached; attachment->priv = a; mutex_lock(&buffer->lock); @@ -130,9 +139,13 @@ static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attac { struct dma_heap_attachment *a = attachment->priv; struct sg_table *table = a->table; + int attr = attachment->dma_map_attrs; int ret; - ret = dma_map_sgtable(attachment->dev, table, direction, 0); + if (a->uncached) + attr |= DMA_ATTR_SKIP_CPU_SYNC; + + ret = dma_map_sgtable(attachment->dev, table, direction, attr); if (ret) return ERR_PTR(ret); @@ -145,9 +158,12 @@ static void system_heap_unmap_dma_buf(struct dma_buf_attachment *attachment, enum dma_data_direction direction) { struct dma_heap_attachment *a = attachment->priv; + int attr = attachment->dma_map_attrs; + if (a->uncached) + attr |= DMA_ATTR_SKIP_CPU_SYNC; a->mapped = false; - dma_unmap_sgtable(attachment->dev, table, direction, 0); + dma_unmap_sgtable(attachment->dev, table, direction, attr); } static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf, @@ -161,10 +177,12 @@ static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf, if (buffer->vmap_cnt) invalidate_kernel_vmap_range(buffer->vaddr, buffer->len); - list_for_each_entry(a, &buffer->attachments, list) { - if (!a->mapped) - continue; - dma_sync_sgtable_for_cpu(a->dev, a->table, direction); + if (!buffer->uncached) { + list_for_each_entry(a, &buffer->attachments, list) { + if (!a->mapped) + continue; + dma_sync_sgtable_for_cpu(a->dev, a->table, direction); + } } mutex_unlock(&buffer->lock); @@ -182,10 +200,12 @@ static int system_heap_dma_buf_end_cpu_access(struct dma_buf *dmabuf, if (buffer->vmap_cnt) flush_kernel_vmap_range(buffer->vaddr, buffer->len); - list_for_each_entry(a, &buffer->attachments, list) { - if (!a->mapped) - continue; - dma_sync_sgtable_for_device(a->dev, a->table, direction); + if (!buffer->uncached) { + list_for_each_entry(a, &buffer->attachments, list) { + if (!a->mapped) + continue; + dma_sync_sgtable_for_device(a->dev, a->table, direction); + } } mutex_unlock(&buffer->lock); @@ -200,6 +220,9 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma) struct sg_page_iter piter; int ret; + if (buffer->uncached) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + for_each_sgtable_page(table, &piter, vma->vm_pgoff) { struct page *page = sg_page_iter_page(&piter); @@ -221,17 +244,21 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer) struct page **pages = vmalloc(sizeof(struct page *) * npages); struct page **tmp = pages; struct sg_page_iter piter; + pgprot_t pgprot = PAGE_KERNEL; void *vaddr; if (!pages) return ERR_PTR(-ENOMEM); + if (buffer->uncached) + pgprot = pgprot_writecombine(PAGE_KERNEL); + for_each_sgtable_page(table, &piter, 0) { WARN_ON(tmp - pages >= npages); *tmp++ = sg_page_iter_page(&piter); } - vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL); + vaddr = vmap(pages, npages, VM_MAP, pgprot); vfree(pages); if (!vaddr) @@ -281,18 +308,43 @@ static void system_heap_vunmap(struct dma_buf *dmabuf, struct dma_buf_map 
*map) dma_buf_map_clear(map); } +static int system_heap_zero_buffer(struct system_heap_buffer *buffer) +{ + struct sg_table *sgt = &buffer->sg_table; + struct sg_page_iter piter; + struct page *p; + void *vaddr; + int ret = 0; + + for_each_sgtable_page(sgt, &piter, 0) { + p = sg_page_iter_page(&piter); + vaddr = kmap_atomic(p); + memset(vaddr, 0, PAGE_SIZE); + kunmap_atomic(vaddr); + } + + return ret; +} + static void system_heap_dma_buf_release(struct dma_buf *dmabuf) { struct system_heap_buffer *buffer = dmabuf->priv; struct sg_table *table; struct scatterlist *sg; - int i; + int i, j; + + /* Zero the buffer pages before adding back to the pool */ + system_heap_zero_buffer(buffer); table = &buffer->sg_table; for_each_sgtable_sg(table, sg, i) { struct page *page = sg_page(sg); - __free_pages(page, compound_order(page)); + for (j = 0; j < NUM_ORDERS; j++) { + if (compound_order(page) == orders[j]) + break; + } + dmabuf_page_pool_free(pools[j], page); } sg_free_table(table); kfree(buffer); @@ -322,8 +374,7 @@ static struct page *alloc_largest_available(unsigned long size, continue; if (max_order < orders[i]) continue; - - page = alloc_pages(order_flags[i], orders[i]); + page = dmabuf_page_pool_alloc(pools[i]); if (!page) continue; return page; @@ -331,10 +382,11 @@ static struct page *alloc_largest_available(unsigned long size, return NULL; } -static struct dma_buf *system_heap_allocate(struct dma_heap *heap, - unsigned long len, - unsigned long fd_flags, - unsigned long heap_flags) +static struct dma_buf *system_heap_do_allocate(struct dma_heap *heap, + unsigned long len, + unsigned long fd_flags, + unsigned long heap_flags, + bool uncached) { struct system_heap_buffer *buffer; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); @@ -355,6 +407,7 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap, mutex_init(&buffer->lock); buffer->heap = heap; buffer->len = len; + buffer->uncached = uncached; INIT_LIST_HEAD(&pages); i = 0; @@ -400,6 +453,18 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap, ret = PTR_ERR(dmabuf); goto free_pages; } + + /* + * For uncached buffers, we need to initially flush cpu cache, since + * the __GFP_ZERO on the allocation means the zeroing was done by the + * cpu and thus it is likely cached. Map (and implicitly flush) and + * unmap it now so we don't get corruption later on. 
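+	 * (Illustrative aside, not in the original comment: on non-coherent
+	 * systems dma_map_sgtable() without DMA_ATTR_SKIP_CPU_SYNC does the
+	 * CPU cache clean as part of creating the mapping, which is why no
+	 * explicit flush helper needs to be called here.)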
+ */ + if (buffer->uncached) { + dma_map_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL, 0); + dma_unmap_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL, 0); + } + return dmabuf; free_pages: @@ -417,13 +482,73 @@ free_buffer: return ERR_PTR(ret); } +static struct dma_buf *system_heap_allocate(struct dma_heap *heap, + unsigned long len, + unsigned long fd_flags, + unsigned long heap_flags) +{ + return system_heap_do_allocate(heap, len, fd_flags, heap_flags, false); +} + +static long system_get_pool_size(struct dma_heap *heap) +{ + int i; + long num_pages = 0; + struct dmabuf_page_pool **pool; + + pool = pools; + for (i = 0; i < NUM_ORDERS; i++, pool++) { + num_pages += ((*pool)->count[POOL_LOWPAGE] + + (*pool)->count[POOL_HIGHPAGE]) << (*pool)->order; + } + + return num_pages << PAGE_SHIFT; +} + static const struct dma_heap_ops system_heap_ops = { .allocate = system_heap_allocate, + .get_pool_size = system_get_pool_size, +}; + +static struct dma_buf *system_uncached_heap_allocate(struct dma_heap *heap, + unsigned long len, + unsigned long fd_flags, + unsigned long heap_flags) +{ + return system_heap_do_allocate(heap, len, fd_flags, heap_flags, true); +} + +/* Dummy function to be used until we can call coerce_mask_and_coherent */ +static struct dma_buf *system_uncached_heap_not_initialized(struct dma_heap *heap, + unsigned long len, + unsigned long fd_flags, + unsigned long heap_flags) +{ + return ERR_PTR(-EBUSY); +} + +static struct dma_heap_ops system_uncached_heap_ops = { + /* After system_heap_create is complete, we will swap this */ + .allocate = system_uncached_heap_not_initialized, }; static int system_heap_create(void) { struct dma_heap_export_info exp_info; + int i; + + for (i = 0; i < NUM_ORDERS; i++) { + pools[i] = dmabuf_page_pool_create(order_flags[i], orders[i]); + + if (IS_ERR(pools[i])) { + int j; + + pr_err("%s: page pool creation failed!\n", __func__); + for (j = 0; j < i; j++) + dmabuf_page_pool_destroy(pools[j]); + return PTR_ERR(pools[i]); + } + } exp_info.name = "system"; exp_info.ops = &system_heap_ops; @@ -433,6 +558,18 @@ static int system_heap_create(void) if (IS_ERR(sys_heap)) return PTR_ERR(sys_heap); + exp_info.name = "system-uncached"; + exp_info.ops = &system_uncached_heap_ops; + exp_info.priv = NULL; + + sys_uncached_heap = dma_heap_add(&exp_info); + if (IS_ERR(sys_uncached_heap)) + return PTR_ERR(sys_uncached_heap); + + dma_coerce_mask_and_coherent(dma_heap_get_dev(sys_uncached_heap), DMA_BIT_MASK(64)); + mb(); /* make sure we only set allocate after dma_mask is set */ + system_uncached_heap_ops.allocate = system_uncached_heap_allocate; + return 0; } module_init(system_heap_create); diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c index f305503ec27e..1d5df1472db4 100644 --- a/drivers/extcon/extcon.c +++ b/drivers/extcon/extcon.c @@ -486,7 +486,7 @@ EXPORT_SYMBOL_GPL(extcon_sync); * * Returns 0 if success or error number if fail. */ -int extcon_get_state(struct extcon_dev *edev, const unsigned int id) +int extcon_get_state(struct extcon_dev *edev, unsigned int id) { int index, state; unsigned long flags; @@ -576,19 +576,7 @@ EXPORT_SYMBOL_GPL(extcon_set_state); */ int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, bool state) { - int ret, index; - unsigned long flags; - - index = find_cable_index_by_id(edev, id); - if (index < 0) - return index; - - /* Check whether the external connector's state is changed. 
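Returning to the heap registration above: once "system-uncached" is exported, userspace allocates from it through the generic dma-heap char device. A minimal sketch (illustrative; error handling trimmed, the helper name is invented):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/dma-heap.h>

	/* Returns a dma-buf fd backed by uncached system-heap pages, or -1. */
	static int example_alloc_uncached(__u64 len)
	{
		struct dma_heap_allocation_data data = {
			.len = len,
			.fd_flags = O_RDWR | O_CLOEXEC,
		};
		int heap = open("/dev/dma_heap/system-uncached", O_RDONLY | O_CLOEXEC);
		int ret;

		if (heap < 0)
			return -1;
		ret = ioctl(heap, DMA_HEAP_IOCTL_ALLOC, &data);
		close(heap);
		return ret < 0 ? -1 : (int)data.fd;
	}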
*/ - spin_lock_irqsave(&edev->lock, flags); - ret = is_extcon_changed(edev, index, state); - spin_unlock_irqrestore(&edev->lock, flags); - if (!ret) - return 0; + int ret; ret = extcon_set_state(edev, id, state); if (ret < 0) diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index 814d3bf32489..bed57b9147fe 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -36,81 +36,6 @@ #include "common.h" #define FFA_DRIVER_VERSION FFA_VERSION_1_0 - -#define FFA_SMC(calling_convention, func_num) \ - ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, (calling_convention), \ - ARM_SMCCC_OWNER_STANDARD, (func_num)) - -#define FFA_SMC_32(func_num) FFA_SMC(ARM_SMCCC_SMC_32, (func_num)) -#define FFA_SMC_64(func_num) FFA_SMC(ARM_SMCCC_SMC_64, (func_num)) - -#define FFA_ERROR FFA_SMC_32(0x60) -#define FFA_SUCCESS FFA_SMC_32(0x61) -#define FFA_INTERRUPT FFA_SMC_32(0x62) -#define FFA_VERSION FFA_SMC_32(0x63) -#define FFA_FEATURES FFA_SMC_32(0x64) -#define FFA_RX_RELEASE FFA_SMC_32(0x65) -#define FFA_RXTX_MAP FFA_SMC_32(0x66) -#define FFA_FN64_RXTX_MAP FFA_SMC_64(0x66) -#define FFA_RXTX_UNMAP FFA_SMC_32(0x67) -#define FFA_PARTITION_INFO_GET FFA_SMC_32(0x68) -#define FFA_ID_GET FFA_SMC_32(0x69) -#define FFA_MSG_POLL FFA_SMC_32(0x6A) -#define FFA_MSG_WAIT FFA_SMC_32(0x6B) -#define FFA_YIELD FFA_SMC_32(0x6C) -#define FFA_RUN FFA_SMC_32(0x6D) -#define FFA_MSG_SEND FFA_SMC_32(0x6E) -#define FFA_MSG_SEND_DIRECT_REQ FFA_SMC_32(0x6F) -#define FFA_FN64_MSG_SEND_DIRECT_REQ FFA_SMC_64(0x6F) -#define FFA_MSG_SEND_DIRECT_RESP FFA_SMC_32(0x70) -#define FFA_FN64_MSG_SEND_DIRECT_RESP FFA_SMC_64(0x70) -#define FFA_MEM_DONATE FFA_SMC_32(0x71) -#define FFA_FN64_MEM_DONATE FFA_SMC_64(0x71) -#define FFA_MEM_LEND FFA_SMC_32(0x72) -#define FFA_FN64_MEM_LEND FFA_SMC_64(0x72) -#define FFA_MEM_SHARE FFA_SMC_32(0x73) -#define FFA_FN64_MEM_SHARE FFA_SMC_64(0x73) -#define FFA_MEM_RETRIEVE_REQ FFA_SMC_32(0x74) -#define FFA_FN64_MEM_RETRIEVE_REQ FFA_SMC_64(0x74) -#define FFA_MEM_RETRIEVE_RESP FFA_SMC_32(0x75) -#define FFA_MEM_RELINQUISH FFA_SMC_32(0x76) -#define FFA_MEM_RECLAIM FFA_SMC_32(0x77) -#define FFA_MEM_OP_PAUSE FFA_SMC_32(0x78) -#define FFA_MEM_OP_RESUME FFA_SMC_32(0x79) -#define FFA_MEM_FRAG_RX FFA_SMC_32(0x7A) -#define FFA_MEM_FRAG_TX FFA_SMC_32(0x7B) -#define FFA_NORMAL_WORLD_RESUME FFA_SMC_32(0x7C) - -/* - * For some calls it is necessary to use SMC64 to pass or return 64-bit values. - * For such calls FFA_FN_NATIVE(name) will choose the appropriate - * (native-width) function ID. - */ -#ifdef CONFIG_64BIT -#define FFA_FN_NATIVE(name) FFA_FN64_##name -#else -#define FFA_FN_NATIVE(name) FFA_##name -#endif - -/* FFA error codes. 
*/ -#define FFA_RET_SUCCESS (0) -#define FFA_RET_NOT_SUPPORTED (-1) -#define FFA_RET_INVALID_PARAMETERS (-2) -#define FFA_RET_NO_MEMORY (-3) -#define FFA_RET_BUSY (-4) -#define FFA_RET_INTERRUPTED (-5) -#define FFA_RET_DENIED (-6) -#define FFA_RET_RETRY (-7) -#define FFA_RET_ABORTED (-8) - -#define MAJOR_VERSION_MASK GENMASK(30, 16) -#define MINOR_VERSION_MASK GENMASK(15, 0) -#define MAJOR_VERSION(x) ((u16)(FIELD_GET(MAJOR_VERSION_MASK, (x)))) -#define MINOR_VERSION(x) ((u16)(FIELD_GET(MINOR_VERSION_MASK, (x)))) -#define PACK_VERSION_INFO(major, minor) \ - (FIELD_PREP(MAJOR_VERSION_MASK, (major)) | \ - FIELD_PREP(MINOR_VERSION_MASK, (minor))) -#define FFA_VERSION_1_0 PACK_VERSION_INFO(1, 0) #define FFA_MIN_VERSION FFA_VERSION_1_0 #define SENDER_ID_MASK GENMASK(31, 16) @@ -120,12 +45,6 @@ #define PACK_TARGET_INFO(s, r) \ (FIELD_PREP(SENDER_ID_MASK, (s)) | FIELD_PREP(RECEIVER_ID_MASK, (r))) -/* - * FF-A specification mentions explicitly about '4K pages'. This should - * not be confused with the kernel PAGE_SIZE, which is the translation - * granule kernel is configured and may be one among 4K, 16K and 64K. - */ -#define FFA_PAGE_SIZE SZ_4K /* * Keeping RX TX buffer size as 4K for now * 64K may be preferred to keep it min a page in 64K PAGE_SIZE config @@ -167,6 +86,27 @@ struct ffa_drv_info { static struct ffa_drv_info *drv_info; +/* + * The driver must be able to support all the versions from the earliest + * supported FFA_MIN_VERSION to the latest supported FFA_DRIVER_VERSION. + * The specification states that if firmware supports a FFA implementation + * that is incompatible with and at a greater version number than specified + * by the caller(FFA_DRIVER_VERSION passed as parameter to FFA_VERSION), + * it must return the NOT_SUPPORTED error code. + */ +static u32 ffa_compatible_version_find(u32 version) +{ + u16 major = FFA_MAJOR_VERSION(version), minor = FFA_MINOR_VERSION(version); + u16 drv_major = FFA_MAJOR_VERSION(FFA_DRIVER_VERSION); + u16 drv_minor = FFA_MINOR_VERSION(FFA_DRIVER_VERSION); + + if ((major < drv_major) || (major == drv_major && minor <= drv_minor)) + return version; + + pr_info("Firmware version higher than driver version, downgrading\n"); + return FFA_DRIVER_VERSION; +} + static int ffa_version_check(u32 *version) { ffa_value_t ver; @@ -180,15 +120,20 @@ static int ffa_version_check(u32 *version) return -EOPNOTSUPP; } - if (ver.a0 < FFA_MIN_VERSION || ver.a0 > FFA_DRIVER_VERSION) { - pr_err("Incompatible version %d.%d found\n", - MAJOR_VERSION(ver.a0), MINOR_VERSION(ver.a0)); + if (ver.a0 < FFA_MIN_VERSION) { + pr_err("Incompatible v%d.%d! 
Earliest supported v%d.%d\n", + FFA_MAJOR_VERSION(ver.a0), FFA_MINOR_VERSION(ver.a0), + FFA_MAJOR_VERSION(FFA_MIN_VERSION), + FFA_MINOR_VERSION(FFA_MIN_VERSION)); return -EINVAL; } - *version = ver.a0; - pr_info("Version %d.%d found\n", MAJOR_VERSION(ver.a0), - MINOR_VERSION(ver.a0)); + pr_info("Driver version %d.%d\n", FFA_MAJOR_VERSION(FFA_DRIVER_VERSION), + FFA_MINOR_VERSION(FFA_DRIVER_VERSION)); + pr_info("Firmware version %d.%d found\n", FFA_MAJOR_VERSION(ver.a0), + FFA_MINOR_VERSION(ver.a0)); + *version = ffa_compatible_version_find(ver.a0); + return 0; } @@ -586,6 +531,22 @@ ffa_memory_share(struct ffa_device *dev, struct ffa_mem_ops_args *args) return ffa_memory_ops(FFA_FN_NATIVE(MEM_SHARE), args); } +static int +ffa_memory_lend(struct ffa_device *dev, struct ffa_mem_ops_args *args) +{ + /* Note that upon a successful MEM_LEND request the caller + * must ensure that the memory region specified is not accessed + * until a successful MEM_RECLAIM call has been made. + * On systems with a hypervisor present this will be enforced, + * however on systems without a hypervisor the responsibility + * falls to the calling kernel driver to prevent access. + */ + if (dev->mode_32bit) + return ffa_memory_ops(FFA_MEM_LEND, args); + + return ffa_memory_ops(FFA_FN_NATIVE(MEM_LEND), args); +} + static const struct ffa_dev_ops ffa_ops = { .api_version_get = ffa_api_version_get, .partition_info_get = ffa_partition_info_get, @@ -593,6 +554,7 @@ static const struct ffa_dev_ops ffa_ops = { .sync_send_receive = ffa_sync_send_receive, .memory_reclaim = ffa_memory_reclaim, .memory_share = ffa_memory_share, + .memory_lend = ffa_memory_lend, }; const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 9cc556013d08..23c0b8bca642 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -23,8 +23,8 @@ efi_status_t check_platform_features(void) if (IS_ENABLED(CONFIG_ARM64_4K_PAGES)) return EFI_SUCCESS; - tg = (read_cpuid(ID_AA64MMFR0_EL1) >> ID_AA64MMFR0_TGRAN_SHIFT) & 0xf; - if (tg < ID_AA64MMFR0_TGRAN_SUPPORTED_MIN || tg > ID_AA64MMFR0_TGRAN_SUPPORTED_MAX) { + tg = (read_cpuid(ID_AA64MMFR0_EL1) >> ID_AA64MMFR0_EL1_TGRAN_SHIFT) & 0xf; + if (tg < ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN || tg > ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX) { if (IS_ENABLED(CONFIG_ARM64_64K_PAGES)) efi_err("This 64 KB granular kernel is not supported by your CPU\n"); else diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index cfb448eabdaa..b41cacaaf961 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * While a 64-bit OS can make calls with SMC32 calling conventions, for some @@ -52,6 +53,12 @@ static enum arm_smccc_conduit psci_conduit = SMCCC_CONDUIT_NONE; bool psci_tos_resident_on(int cpu) { + bool resident = false; + + trace_android_vh_psci_tos_resident_on(cpu, &resident); + if (resident) + return resident; + return cpu == resident_cpu; } @@ -169,6 +176,11 @@ int psci_set_osi_mode(bool enable) static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) { int err; + bool deny = false; + + trace_android_vh_psci_cpu_suspend(state, &deny); + if (deny) + return -EPERM; err = invoke_psci_fn(fn, state, entry_point, 0); return psci_to_linux_errno(err); diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index 
2d3e866decaa..56169e73252a 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -9,6 +9,8 @@ #include +void __weak kvm_arm_init_hyp_services(void) {} + static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { }; void __init kvm_init_hyp_services(void) @@ -38,6 +40,8 @@ void __init kvm_init_hyp_services(void) pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n", res.a3, res.a2, res.a1, res.a0); + + kvm_arm_init_hyp_services(); } bool kvm_arm_hyp_service_available(u32 func_id) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index e54f086b5ba7..f91f52c7fedd 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -42,6 +42,8 @@ #include #include +#include + #include "drm_crtc_helper_internal.h" #include "drm_crtc_internal.h" @@ -616,6 +618,7 @@ drm_atomic_helper_check_modeset(struct drm_device *dev, struct drm_connector_state *old_connector_state, *new_connector_state; int i, ret; unsigned int connectors_mask = 0; + bool allow = false; for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) { bool has_connectors = @@ -722,6 +725,10 @@ drm_atomic_helper_check_modeset(struct drm_device *dev, if (ret != 0) return ret; + trace_android_vh_drm_atomic_check_modeset(state, crtc, &allow); + if (allow) + continue; + ret = drm_atomic_add_affected_planes(state, crtc); if (ret != 0) return ret; diff --git a/drivers/gpu/drm/drm_framebuffer.c b/drivers/gpu/drm/drm_framebuffer.c index 8683b59bcbcb..8e267e06f998 100644 --- a/drivers/gpu/drm/drm_framebuffer.c +++ b/drivers/gpu/drm/drm_framebuffer.c @@ -35,6 +35,8 @@ #include #include +#include + #include "drm_crtc_internal.h" #include "drm_internal.h" @@ -965,6 +967,7 @@ static int atomic_remove_fb(struct drm_framebuffer *fb) int i, ret; unsigned plane_mask; bool disable_crtcs = false; + bool allow = false; retry_disable: drm_modeset_acquire_init(&ctx, 0); @@ -1034,6 +1037,10 @@ retry: goto unlock; } + trace_android_vh_atomic_remove_fb(fb, &allow); + if (allow) + goto unlock; + if (plane_mask) ret = drm_atomic_commit(state); diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index fb5e6f86dea2..d5f76682991a 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -691,9 +691,9 @@ static const struct drm_ioctl_desc drm_ioctls[] = { DRM_IOCTL_DEF(DRM_IOCTL_MODE_RMFB, drm_mode_rmfb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_PAGE_FLIP, drm_mode_page_flip_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DIRTYFB, drm_mode_dirtyfb_ioctl, DRM_MASTER), - DRM_IOCTL_DEF(DRM_IOCTL_MODE_CREATE_DUMB, drm_mode_create_dumb_ioctl, 0), - DRM_IOCTL_DEF(DRM_IOCTL_MODE_MAP_DUMB, drm_mode_mmap_dumb_ioctl, 0), - DRM_IOCTL_DEF(DRM_IOCTL_MODE_DESTROY_DUMB, drm_mode_destroy_dumb_ioctl, 0), + DRM_IOCTL_DEF(DRM_IOCTL_MODE_CREATE_DUMB, drm_mode_create_dumb_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF(DRM_IOCTL_MODE_MAP_DUMB, drm_mode_mmap_dumb_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF(DRM_IOCTL_MODE_DESTROY_DUMB, drm_mode_destroy_dumb_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_MODE_OBJ_GETPROPERTIES, drm_mode_obj_get_properties_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_OBJ_SETPROPERTY, drm_mode_obj_set_property_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CURSOR2, drm_mode_cursor2_ioctl, DRM_MASTER), diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index 0f492d7c651a..ec699c4951cb 100644 --- a/drivers/gpu/drm/drm_modes.c +++ 
b/drivers/gpu/drm/drm_modes.c @@ -1987,6 +1987,7 @@ void drm_mode_convert_to_umode(struct drm_mode_modeinfo *out, strncpy(out->name, in->name, DRM_DISPLAY_MODE_LEN); out->name[DRM_DISPLAY_MODE_LEN-1] = 0; } +EXPORT_SYMBOL_GPL(drm_mode_convert_to_umode); /** * drm_mode_convert_umode - convert a modeinfo into a drm_display_mode @@ -2063,6 +2064,7 @@ int drm_mode_convert_umode(struct drm_device *dev, return 0; } +EXPORT_SYMBOL_GPL(drm_mode_convert_umode); /** * drm_mode_is_420_only - if a given videomode can be only supported in YCBCR420 diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9cf6ac575de1..4c5584f10257 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -494,7 +494,7 @@ static void error_print_context(struct drm_i915_error_state_buf *m, } static struct i915_vma_coredump * -__find_vma(struct i915_vma_coredump *vma, const char *name) +__i915_find_vma(struct i915_vma_coredump *vma, const char *name) { while (vma) { if (strcmp(vma->name, name) == 0) @@ -508,7 +508,7 @@ __find_vma(struct i915_vma_coredump *vma, const char *name) static struct i915_vma_coredump * find_batch(const struct intel_engine_coredump *ee) { - return __find_vma(ee->vma, "batch"); + return __i915_find_vma(ee->vma, "batch"); } static void error_print_engine(struct drm_i915_error_state_buf *m, diff --git a/drivers/gpu/drm/virtio/virtgpu_display.c b/drivers/gpu/drm/virtio/virtgpu_display.c index ef1f19083cd3..938dfcfc095d 100644 --- a/drivers/gpu/drm/virtio/virtgpu_display.c +++ b/drivers/gpu/drm/virtio/virtgpu_display.c @@ -300,10 +300,6 @@ virtio_gpu_user_framebuffer_create(struct drm_device *dev, struct virtio_gpu_framebuffer *virtio_gpu_fb; int ret; - if (mode_cmd->pixel_format != DRM_FORMAT_HOST_XRGB8888 && - mode_cmd->pixel_format != DRM_FORMAT_HOST_ARGB8888) - return ERR_PTR(-ENOENT); - /* lookup object associated with res handle */ obj = drm_gem_object_lookup(file_priv, mode_cmd->handles[0]); if (!obj) @@ -337,7 +333,6 @@ int virtio_gpu_modeset_init(struct virtio_gpu_device *vgdev) if (ret) return ret; - vgdev->ddev->mode_config.quirk_addfb_prefer_host_byte_order = true; vgdev->ddev->mode_config.funcs = &virtio_gpu_mode_funcs; /* modes will be validated against the framebuffer size */ diff --git a/drivers/gpu/drm/virtio/virtgpu_plane.c b/drivers/gpu/drm/virtio/virtgpu_plane.c index 7e8cbcee1722..04d6cbcac9f7 100644 --- a/drivers/gpu/drm/virtio/virtgpu_plane.c +++ b/drivers/gpu/drm/virtio/virtgpu_plane.c @@ -31,7 +31,14 @@ #include "virtgpu_drv.h" static const uint32_t virtio_gpu_formats[] = { - DRM_FORMAT_HOST_XRGB8888, + DRM_FORMAT_XRGB8888, + DRM_FORMAT_ARGB8888, + DRM_FORMAT_BGRX8888, + DRM_FORMAT_BGRA8888, + DRM_FORMAT_RGBX8888, + DRM_FORMAT_RGBA8888, + DRM_FORMAT_XBGR8888, + DRM_FORMAT_ABGR8888, }; static const uint32_t virtio_gpu_cursor_formats[] = { @@ -43,6 +50,32 @@ uint32_t virtio_gpu_translate_format(uint32_t drm_fourcc) uint32_t format; switch (drm_fourcc) { +#ifdef __BIG_ENDIAN + case DRM_FORMAT_XRGB8888: + format = VIRTIO_GPU_FORMAT_X8R8G8B8_UNORM; + break; + case DRM_FORMAT_ARGB8888: + format = VIRTIO_GPU_FORMAT_A8R8G8B8_UNORM; + break; + case DRM_FORMAT_BGRX8888: + format = VIRTIO_GPU_FORMAT_B8G8R8X8_UNORM; + break; + case DRM_FORMAT_BGRA8888: + format = VIRTIO_GPU_FORMAT_B8G8R8A8_UNORM; + break; + case DRM_FORMAT_RGBX8888: + format = VIRTIO_GPU_FORMAT_R8G8B8X8_UNORM; + break; + case DRM_FORMAT_RGBA8888: + format = VIRTIO_GPU_FORMAT_R8G8B8A8_UNORM; + break; + case DRM_FORMAT_XBGR8888: + format = 
VIRTIO_GPU_FORMAT_X8B8G8R8_UNORM; + break; + case DRM_FORMAT_ABGR8888: + format = VIRTIO_GPU_FORMAT_A8B8G8R8_UNORM; + break; +#else case DRM_FORMAT_XRGB8888: format = VIRTIO_GPU_FORMAT_B8G8R8X8_UNORM; break; @@ -55,6 +88,19 @@ uint32_t virtio_gpu_translate_format(uint32_t drm_fourcc) case DRM_FORMAT_BGRA8888: format = VIRTIO_GPU_FORMAT_A8R8G8B8_UNORM; break; + case DRM_FORMAT_RGBX8888: + format = VIRTIO_GPU_FORMAT_X8B8G8R8_UNORM; + break; + case DRM_FORMAT_RGBA8888: + format = VIRTIO_GPU_FORMAT_A8B8G8R8_UNORM; + break; + case DRM_FORMAT_XBGR8888: + format = VIRTIO_GPU_FORMAT_R8G8B8X8_UNORM; + break; + case DRM_FORMAT_ABGR8888: + format = VIRTIO_GPU_FORMAT_R8G8B8A8_UNORM; + break; +#endif default: /* * This should not happen, we handle everything listed diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 9235ab7161e3..df1740750e3d 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -731,6 +731,17 @@ config HID_MULTITOUCH To compile this driver as a module, choose M here: the module will be called hid-multitouch. +config HID_NINTENDO + tristate "Nintendo Joy-Con and Pro Controller support" + depends on HID + help + Adds support for the Nintendo Switch Joy-Cons and Pro Controller. + All controllers support bluetooth, and the Pro Controller also supports + its USB mode. + + To compile this driver as a module, choose M here: the + module will be called hid-nintendo. + config HID_NTI tristate "NTI keyboard adapters" help diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index e29efcb1c040..7a71371e3adf 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -78,6 +78,7 @@ obj-$(CONFIG_HID_MAYFLASH) += hid-mf.o obj-$(CONFIG_HID_MICROSOFT) += hid-microsoft.o obj-$(CONFIG_HID_MONTEREY) += hid-monterey.o obj-$(CONFIG_HID_MULTITOUCH) += hid-multitouch.o +obj-$(CONFIG_HID_NINTENDO) += hid-nintendo.o obj-$(CONFIG_HID_NTI) += hid-nti.o obj-$(CONFIG_HID_NTRIG) += hid-ntrig.o obj-$(CONFIG_HID_ORTEK) += hid-ortek.o diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 53facf157ecb..1d617eb4e369 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -929,6 +929,9 @@ #define USB_VENDOR_ID_NINTENDO 0x057e #define USB_DEVICE_ID_NINTENDO_WIIMOTE 0x0306 #define USB_DEVICE_ID_NINTENDO_WIIMOTE2 0x0330 +#define USB_DEVICE_ID_NINTENDO_JOYCONL 0x2006 +#define USB_DEVICE_ID_NINTENDO_JOYCONR 0x2007 +#define USB_DEVICE_ID_NINTENDO_PROCON 0x2009 #define USB_VENDOR_ID_NOVATEK 0x0603 #define USB_DEVICE_ID_NOVATEK_PCT 0x0600 diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c new file mode 100644 index 000000000000..3695b96694bd --- /dev/null +++ b/drivers/hid/hid-nintendo.c @@ -0,0 +1,820 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * HID driver for Nintendo Switch Joy-Cons and Pro Controllers + * + * Copyright (c) 2019 Daniel J. Ogorchock + * + * The following resources/projects were referenced for this driver: + * https://github.com/dekuNukem/Nintendo_Switch_Reverse_Engineering + * https://gitlab.com/pjranki/joycon-linux-kernel (Peter Rankin) + * https://github.com/FrotBot/SwitchProConLinuxUSB + * https://github.com/MTCKC/ProconXInput + * hid-wiimote kernel hid driver + * hid-logitech-hidpp driver + * + * This driver supports the Nintendo Switch Joy-Cons and Pro Controllers. The + * Pro Controllers can either be used over USB or Bluetooth. + * + * The driver will retrieve the factory calibration info from the controllers, + * so little to no user calibration should be required. 
+ * + */ + +#include "hid-ids.h" +#include +#include +#include +#include +#include +#include + +/* + * Reference the url below for the following HID report defines: + * https://github.com/dekuNukem/Nintendo_Switch_Reverse_Engineering + */ + +/* Output Reports */ +static const u8 JC_OUTPUT_RUMBLE_AND_SUBCMD = 0x01; +static const u8 JC_OUTPUT_FW_UPDATE_PKT = 0x03; +static const u8 JC_OUTPUT_RUMBLE_ONLY = 0x10; +static const u8 JC_OUTPUT_MCU_DATA = 0x11; +static const u8 JC_OUTPUT_USB_CMD = 0x80; + +/* Subcommand IDs */ +static const u8 JC_SUBCMD_STATE /*= 0x00*/; +static const u8 JC_SUBCMD_MANUAL_BT_PAIRING = 0x01; +static const u8 JC_SUBCMD_REQ_DEV_INFO = 0x02; +static const u8 JC_SUBCMD_SET_REPORT_MODE = 0x03; +static const u8 JC_SUBCMD_TRIGGERS_ELAPSED = 0x04; +static const u8 JC_SUBCMD_GET_PAGE_LIST_STATE = 0x05; +static const u8 JC_SUBCMD_SET_HCI_STATE = 0x06; +static const u8 JC_SUBCMD_RESET_PAIRING_INFO = 0x07; +static const u8 JC_SUBCMD_LOW_POWER_MODE = 0x08; +static const u8 JC_SUBCMD_SPI_FLASH_READ = 0x10; +static const u8 JC_SUBCMD_SPI_FLASH_WRITE = 0x11; +static const u8 JC_SUBCMD_RESET_MCU = 0x20; +static const u8 JC_SUBCMD_SET_MCU_CONFIG = 0x21; +static const u8 JC_SUBCMD_SET_MCU_STATE = 0x22; +static const u8 JC_SUBCMD_SET_PLAYER_LIGHTS = 0x30; +static const u8 JC_SUBCMD_GET_PLAYER_LIGHTS = 0x31; +static const u8 JC_SUBCMD_SET_HOME_LIGHT = 0x38; +static const u8 JC_SUBCMD_ENABLE_IMU = 0x40; +static const u8 JC_SUBCMD_SET_IMU_SENSITIVITY = 0x41; +static const u8 JC_SUBCMD_WRITE_IMU_REG = 0x42; +static const u8 JC_SUBCMD_READ_IMU_REG = 0x43; +static const u8 JC_SUBCMD_ENABLE_VIBRATION = 0x48; +static const u8 JC_SUBCMD_GET_REGULATED_VOLTAGE = 0x50; + +/* Input Reports */ +static const u8 JC_INPUT_BUTTON_EVENT = 0x3F; +static const u8 JC_INPUT_SUBCMD_REPLY = 0x21; +static const u8 JC_INPUT_IMU_DATA = 0x30; +static const u8 JC_INPUT_MCU_DATA = 0x31; +static const u8 JC_INPUT_USB_RESPONSE = 0x81; + +/* Feature Reports */ +static const u8 JC_FEATURE_LAST_SUBCMD = 0x02; +static const u8 JC_FEATURE_OTA_FW_UPGRADE = 0x70; +static const u8 JC_FEATURE_SETUP_MEM_READ = 0x71; +static const u8 JC_FEATURE_MEM_READ = 0x72; +static const u8 JC_FEATURE_ERASE_MEM_SECTOR = 0x73; +static const u8 JC_FEATURE_MEM_WRITE = 0x74; +static const u8 JC_FEATURE_LAUNCH = 0x75; + +/* USB Commands */ +static const u8 JC_USB_CMD_CONN_STATUS = 0x01; +static const u8 JC_USB_CMD_HANDSHAKE = 0x02; +static const u8 JC_USB_CMD_BAUDRATE_3M = 0x03; +static const u8 JC_USB_CMD_NO_TIMEOUT = 0x04; +static const u8 JC_USB_CMD_EN_TIMEOUT = 0x05; +static const u8 JC_USB_RESET = 0x06; +static const u8 JC_USB_PRE_HANDSHAKE = 0x91; +static const u8 JC_USB_SEND_UART = 0x92; + +/* SPI storage addresses of factory calibration data */ +static const u16 JC_CAL_DATA_START = 0x603d; +static const u16 JC_CAL_DATA_END = 0x604e; +#define JC_CAL_DATA_SIZE (JC_CAL_DATA_END - JC_CAL_DATA_START + 1) + + +/* The raw analog joystick values will be mapped in terms of this magnitude */ +static const u16 JC_MAX_STICK_MAG = 32767; +static const u16 JC_STICK_FUZZ = 250; +static const u16 JC_STICK_FLAT = 500; + +/* States for controller state machine */ +enum joycon_ctlr_state { + JOYCON_CTLR_STATE_INIT, + JOYCON_CTLR_STATE_READ, +}; + +struct joycon_stick_cal { + s32 max; + s32 min; + s32 center; +}; + +/* + * All the controller's button values are stored in a u32. + * They can be accessed with bitwise ANDs. 
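+ * For example (illustrative): once hid_field_extract() has packed the
+ * three status bytes into a u32, (btns & JC_BTN_A) is non-zero exactly
+ * while the A button is held; joycon_parse_report() below feeds these
+ * tests straight into input_report_key().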
+ */ +static const u32 JC_BTN_Y = BIT(0); +static const u32 JC_BTN_X = BIT(1); +static const u32 JC_BTN_B = BIT(2); +static const u32 JC_BTN_A = BIT(3); +static const u32 JC_BTN_SR_R = BIT(4); +static const u32 JC_BTN_SL_R = BIT(5); +static const u32 JC_BTN_R = BIT(6); +static const u32 JC_BTN_ZR = BIT(7); +static const u32 JC_BTN_MINUS = BIT(8); +static const u32 JC_BTN_PLUS = BIT(9); +static const u32 JC_BTN_RSTICK = BIT(10); +static const u32 JC_BTN_LSTICK = BIT(11); +static const u32 JC_BTN_HOME = BIT(12); +static const u32 JC_BTN_CAP = BIT(13); /* capture button */ +static const u32 JC_BTN_DOWN = BIT(16); +static const u32 JC_BTN_UP = BIT(17); +static const u32 JC_BTN_RIGHT = BIT(18); +static const u32 JC_BTN_LEFT = BIT(19); +static const u32 JC_BTN_SR_L = BIT(20); +static const u32 JC_BTN_SL_L = BIT(21); +static const u32 JC_BTN_L = BIT(22); +static const u32 JC_BTN_ZL = BIT(23); + +enum joycon_msg_type { + JOYCON_MSG_TYPE_NONE, + JOYCON_MSG_TYPE_USB, + JOYCON_MSG_TYPE_SUBCMD, +}; + +struct joycon_subcmd_request { + u8 output_id; /* must be 0x01 for subcommand, 0x10 for rumble only */ + u8 packet_num; /* incremented every send */ + u8 rumble_data[8]; + u8 subcmd_id; + u8 data[0]; /* length depends on the subcommand */ +} __packed; + +struct joycon_subcmd_reply { + u8 ack; /* MSB 1 for ACK, 0 for NACK */ + u8 id; /* id of requested subcmd */ + u8 data[0]; /* will be at most 35 bytes */ +} __packed; + +struct joycon_input_report { + u8 id; + u8 timer; + u8 bat_con; /* battery and connection info */ + u8 button_status[3]; + u8 left_stick[3]; + u8 right_stick[3]; + u8 vibrator_report; + + /* + * If support for firmware updates, gyroscope data, and/or NFC/IR + * are added in the future, this can be swapped for a union. + */ + struct joycon_subcmd_reply reply; +} __packed; + +#define JC_MAX_RESP_SIZE (sizeof(struct joycon_input_report) + 35) + +/* Each physical controller is associated with a joycon_ctlr struct */ +struct joycon_ctlr { + struct hid_device *hdev; + struct input_dev *input; + enum joycon_ctlr_state ctlr_state; + + /* The following members are used for synchronous sends/receives */ + enum joycon_msg_type msg_type; + u8 subcmd_num; + struct mutex output_mutex; + u8 input_buf[JC_MAX_RESP_SIZE]; + wait_queue_head_t wait; + bool received_resp; + u8 usb_ack_match; + u8 subcmd_ack_match; + + /* factory calibration data */ + struct joycon_stick_cal left_stick_cal_x; + struct joycon_stick_cal left_stick_cal_y; + struct joycon_stick_cal right_stick_cal_x; + struct joycon_stick_cal right_stick_cal_y; + +}; + +static int __joycon_hid_send(struct hid_device *hdev, u8 *data, size_t len) +{ + u8 *buf; + int ret; + + buf = kmemdup(data, len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + ret = hid_hw_output_report(hdev, buf, len); + kfree(buf); + if (ret < 0) + hid_dbg(hdev, "Failed to send output report ret=%d\n", ret); + return ret; +} + +static int joycon_hid_send_sync(struct joycon_ctlr *ctlr, u8 *data, size_t len) +{ + int ret; + + ret = __joycon_hid_send(ctlr->hdev, data, len); + if (ret < 0) { + memset(ctlr->input_buf, 0, JC_MAX_RESP_SIZE); + return ret; + } + + if (!wait_event_timeout(ctlr->wait, ctlr->received_resp, HZ)) { + hid_dbg(ctlr->hdev, "synchronous send/receive timed out\n"); + memset(ctlr->input_buf, 0, JC_MAX_RESP_SIZE); + return -ETIMEDOUT; + } + + ctlr->received_resp = false; + return 0; +} + +static int joycon_send_usb(struct joycon_ctlr *ctlr, u8 cmd) +{ + int ret; + u8 buf[2] = {JC_OUTPUT_USB_CMD}; + + buf[1] = cmd; + ctlr->usb_ack_match = cmd; + ctlr->msg_type = 
JOYCON_MSG_TYPE_USB; + ret = joycon_hid_send_sync(ctlr, buf, sizeof(buf)); + if (ret) + hid_dbg(ctlr->hdev, "send usb command failed; ret=%d\n", ret); + return ret; +} + +static int joycon_send_subcmd(struct joycon_ctlr *ctlr, + struct joycon_subcmd_request *subcmd, + size_t data_len) +{ + int ret; + + subcmd->output_id = JC_OUTPUT_RUMBLE_AND_SUBCMD; + subcmd->packet_num = ctlr->subcmd_num; + if (++ctlr->subcmd_num > 0xF) + ctlr->subcmd_num = 0; + ctlr->subcmd_ack_match = subcmd->subcmd_id; + ctlr->msg_type = JOYCON_MSG_TYPE_SUBCMD; + + ret = joycon_hid_send_sync(ctlr, (u8 *)subcmd, + sizeof(*subcmd) + data_len); + if (ret < 0) + hid_dbg(ctlr->hdev, "send subcommand failed; ret=%d\n", ret); + else + ret = 0; + return ret; +} + +/* Supply nibbles for flash and on. Ones correspond to active */ +static int joycon_set_player_leds(struct joycon_ctlr *ctlr, u8 flash, u8 on) +{ + struct joycon_subcmd_request *req; + u8 buffer[sizeof(*req) + 1] = { 0 }; + + req = (struct joycon_subcmd_request *)buffer; + req->subcmd_id = JC_SUBCMD_SET_PLAYER_LIGHTS; + req->data[0] = (flash << 4) | on; + + hid_dbg(ctlr->hdev, "setting player leds\n"); + return joycon_send_subcmd(ctlr, req, 1); +} + +static const u16 DFLT_STICK_CAL_CEN = 2000; +static const u16 DFLT_STICK_CAL_MAX = 3500; +static const u16 DFLT_STICK_CAL_MIN = 500; +static int joycon_request_calibration(struct joycon_ctlr *ctlr) +{ + struct joycon_subcmd_request *req; + u8 buffer[sizeof(*req) + 5] = { 0 }; + struct joycon_input_report *report; + struct joycon_stick_cal *cal_x; + struct joycon_stick_cal *cal_y; + s32 x_max_above; + s32 x_min_below; + s32 y_max_above; + s32 y_min_below; + u8 *data; + u8 *raw_cal; + int ret; + + req = (struct joycon_subcmd_request *)buffer; + req->subcmd_id = JC_SUBCMD_SPI_FLASH_READ; + data = req->data; + data[0] = 0xFF & JC_CAL_DATA_START; + data[1] = 0xFF & (JC_CAL_DATA_START >> 8); + data[2] = 0xFF & (JC_CAL_DATA_START >> 16); + data[3] = 0xFF & (JC_CAL_DATA_START >> 24); + data[4] = JC_CAL_DATA_SIZE; + + hid_dbg(ctlr->hdev, "requesting cal data\n"); + ret = joycon_send_subcmd(ctlr, req, 5); + if (ret) { + hid_warn(ctlr->hdev, + "Failed to read stick cal, using defaults; ret=%d\n", + ret); + + ctlr->left_stick_cal_x.center = DFLT_STICK_CAL_CEN; + ctlr->left_stick_cal_x.max = DFLT_STICK_CAL_MAX; + ctlr->left_stick_cal_x.min = DFLT_STICK_CAL_MIN; + + ctlr->left_stick_cal_y.center = DFLT_STICK_CAL_CEN; + ctlr->left_stick_cal_y.max = DFLT_STICK_CAL_MAX; + ctlr->left_stick_cal_y.min = DFLT_STICK_CAL_MIN; + + ctlr->right_stick_cal_x.center = DFLT_STICK_CAL_CEN; + ctlr->right_stick_cal_x.max = DFLT_STICK_CAL_MAX; + ctlr->right_stick_cal_x.min = DFLT_STICK_CAL_MIN; + + ctlr->right_stick_cal_y.center = DFLT_STICK_CAL_CEN; + ctlr->right_stick_cal_y.max = DFLT_STICK_CAL_MAX; + ctlr->right_stick_cal_y.min = DFLT_STICK_CAL_MIN; + + return ret; + } + + report = (struct joycon_input_report *)ctlr->input_buf; + raw_cal = &report->reply.data[5]; + + /* left stick calibration parsing */ + cal_x = &ctlr->left_stick_cal_x; + cal_y = &ctlr->left_stick_cal_y; + + x_max_above = hid_field_extract(ctlr->hdev, (raw_cal + 0), 0, 12); + y_max_above = hid_field_extract(ctlr->hdev, (raw_cal + 1), 4, 12); + cal_x->center = hid_field_extract(ctlr->hdev, (raw_cal + 3), 0, 12); + cal_y->center = hid_field_extract(ctlr->hdev, (raw_cal + 4), 4, 12); + x_min_below = hid_field_extract(ctlr->hdev, (raw_cal + 6), 0, 12); + y_min_below = hid_field_extract(ctlr->hdev, (raw_cal + 7), 4, 12); + cal_x->max = cal_x->center + x_max_above; + cal_x->min = 
cal_x->center - x_min_below; + cal_y->max = cal_y->center + y_max_above; + cal_y->min = cal_y->center - y_min_below; + + /* right stick calibration parsing */ + raw_cal += 9; + cal_x = &ctlr->right_stick_cal_x; + cal_y = &ctlr->right_stick_cal_y; + + cal_x->center = hid_field_extract(ctlr->hdev, (raw_cal + 0), 0, 12); + cal_y->center = hid_field_extract(ctlr->hdev, (raw_cal + 1), 4, 12); + x_min_below = hid_field_extract(ctlr->hdev, (raw_cal + 3), 0, 12); + y_min_below = hid_field_extract(ctlr->hdev, (raw_cal + 4), 4, 12); + x_max_above = hid_field_extract(ctlr->hdev, (raw_cal + 6), 0, 12); + y_max_above = hid_field_extract(ctlr->hdev, (raw_cal + 7), 4, 12); + cal_x->max = cal_x->center + x_max_above; + cal_x->min = cal_x->center - x_min_below; + cal_y->max = cal_y->center + y_max_above; + cal_y->min = cal_y->center - y_min_below; + + hid_dbg(ctlr->hdev, "calibration:\n" + "l_x_c=%d l_x_max=%d l_x_min=%d\n" + "l_y_c=%d l_y_max=%d l_y_min=%d\n" + "r_x_c=%d r_x_max=%d r_x_min=%d\n" + "r_y_c=%d r_y_max=%d r_y_min=%d\n", + ctlr->left_stick_cal_x.center, + ctlr->left_stick_cal_x.max, + ctlr->left_stick_cal_x.min, + ctlr->left_stick_cal_y.center, + ctlr->left_stick_cal_y.max, + ctlr->left_stick_cal_y.min, + ctlr->right_stick_cal_x.center, + ctlr->right_stick_cal_x.max, + ctlr->right_stick_cal_x.min, + ctlr->right_stick_cal_y.center, + ctlr->right_stick_cal_y.max, + ctlr->right_stick_cal_y.min); + + return 0; +} + +static int joycon_set_report_mode(struct joycon_ctlr *ctlr) +{ + struct joycon_subcmd_request *req; + u8 buffer[sizeof(*req) + 1] = { 0 }; + + req = (struct joycon_subcmd_request *)buffer; + req->subcmd_id = JC_SUBCMD_SET_REPORT_MODE; + req->data[0] = 0x30; /* standard, full report mode */ + + hid_dbg(ctlr->hdev, "setting controller report mode\n"); + return joycon_send_subcmd(ctlr, req, 1); +} + +static s32 joycon_map_stick_val(struct joycon_stick_cal *cal, s32 val) +{ + s32 center = cal->center; + s32 min = cal->min; + s32 max = cal->max; + s32 new_val; + + if (val > center) { + new_val = (val - center) * JC_MAX_STICK_MAG; + new_val /= (max - center); + } else { + new_val = (center - val) * -JC_MAX_STICK_MAG; + new_val /= (center - min); + } + new_val = clamp(new_val, (s32)-JC_MAX_STICK_MAG, (s32)JC_MAX_STICK_MAG); + return new_val; +} + +static void joycon_parse_report(struct joycon_ctlr *ctlr, + struct joycon_input_report *rep) +{ + struct input_dev *dev = ctlr->input; + u32 btns; + u32 id = ctlr->hdev->product; + + btns = hid_field_extract(ctlr->hdev, rep->button_status, 0, 24); + + if (id != USB_DEVICE_ID_NINTENDO_JOYCONR) { + u16 raw_x; + u16 raw_y; + s32 x; + s32 y; + + /* get raw stick values */ + raw_x = hid_field_extract(ctlr->hdev, rep->left_stick, 0, 12); + raw_y = hid_field_extract(ctlr->hdev, + rep->left_stick + 1, 4, 12); + /* map the stick values */ + x = joycon_map_stick_val(&ctlr->left_stick_cal_x, raw_x); + y = -joycon_map_stick_val(&ctlr->left_stick_cal_y, raw_y); + /* report sticks */ + input_report_abs(dev, ABS_X, x); + input_report_abs(dev, ABS_Y, y); + + /* report buttons */ + input_report_key(dev, BTN_TL, btns & JC_BTN_L); + input_report_key(dev, BTN_TL2, btns & JC_BTN_ZL); + if (id != USB_DEVICE_ID_NINTENDO_PROCON) { + /* Report the S buttons as the non-existent triggers */ + input_report_key(dev, BTN_TR, btns & JC_BTN_SL_L); + input_report_key(dev, BTN_TR2, btns & JC_BTN_SR_L); + } + input_report_key(dev, BTN_SELECT, btns & JC_BTN_MINUS); + input_report_key(dev, BTN_THUMBL, btns & JC_BTN_LSTICK); + input_report_key(dev, BTN_Z, btns & JC_BTN_CAP); + 
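+		/*
+		 * (Worked example for joycon_map_stick_val() above,
+		 * illustrative: with center = 2000, max = 3500 and a raw
+		 * reading of 3000, new_val = (3000 - 2000) * 32767 /
+		 * (3500 - 2000) = 21844, i.e. about two thirds of full
+		 * positive deflection.)
+		 */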
input_report_key(dev, BTN_DPAD_DOWN, btns & JC_BTN_DOWN); + input_report_key(dev, BTN_DPAD_UP, btns & JC_BTN_UP); + input_report_key(dev, BTN_DPAD_RIGHT, btns & JC_BTN_RIGHT); + input_report_key(dev, BTN_DPAD_LEFT, btns & JC_BTN_LEFT); + } + if (id != USB_DEVICE_ID_NINTENDO_JOYCONL) { + u16 raw_x; + u16 raw_y; + s32 x; + s32 y; + + /* get raw stick values */ + raw_x = hid_field_extract(ctlr->hdev, rep->right_stick, 0, 12); + raw_y = hid_field_extract(ctlr->hdev, + rep->right_stick + 1, 4, 12); + /* map stick values */ + x = joycon_map_stick_val(&ctlr->right_stick_cal_x, raw_x); + y = -joycon_map_stick_val(&ctlr->right_stick_cal_y, raw_y); + /* report sticks */ + input_report_abs(dev, ABS_RX, x); + input_report_abs(dev, ABS_RY, y); + + /* report buttons */ + input_report_key(dev, BTN_TR, btns & JC_BTN_R); + input_report_key(dev, BTN_TR2, btns & JC_BTN_ZR); + if (id != USB_DEVICE_ID_NINTENDO_PROCON) { + /* Report the S buttons as the non-existent triggers */ + input_report_key(dev, BTN_TL, btns & JC_BTN_SL_R); + input_report_key(dev, BTN_TL2, btns & JC_BTN_SR_R); + } + input_report_key(dev, BTN_START, btns & JC_BTN_PLUS); + input_report_key(dev, BTN_THUMBR, btns & JC_BTN_RSTICK); + input_report_key(dev, BTN_MODE, btns & JC_BTN_HOME); + input_report_key(dev, BTN_WEST, btns & JC_BTN_Y); + input_report_key(dev, BTN_NORTH, btns & JC_BTN_X); + input_report_key(dev, BTN_EAST, btns & JC_BTN_A); + input_report_key(dev, BTN_SOUTH, btns & JC_BTN_B); + } + + input_sync(dev); +} + + +static const unsigned int joycon_button_inputs_l[] = { + BTN_SELECT, BTN_Z, BTN_THUMBL, + BTN_DPAD_UP, BTN_DPAD_DOWN, BTN_DPAD_LEFT, BTN_DPAD_RIGHT, + BTN_TL, BTN_TL2, + 0 /* 0 signals end of array */ +}; + +static const unsigned int joycon_button_inputs_r[] = { + BTN_START, BTN_MODE, BTN_THUMBR, + BTN_SOUTH, BTN_EAST, BTN_NORTH, BTN_WEST, + BTN_TR, BTN_TR2, + 0 /* 0 signals end of array */ +}; + +static DEFINE_MUTEX(joycon_input_num_mutex); +static int joycon_input_create(struct joycon_ctlr *ctlr) +{ + struct hid_device *hdev; + static int input_num = 1; + const char *name; + int ret; + int i; + + hdev = ctlr->hdev; + + switch (hdev->product) { + case USB_DEVICE_ID_NINTENDO_PROCON: + name = "Nintendo Switch Pro Controller"; + break; + case USB_DEVICE_ID_NINTENDO_JOYCONL: + name = "Nintendo Switch Left Joy-Con"; + break; + case USB_DEVICE_ID_NINTENDO_JOYCONR: + name = "Nintendo Switch Right Joy-Con"; + break; + default: /* Should be impossible */ + hid_err(hdev, "Invalid hid product\n"); + return -EINVAL; + } + + ctlr->input = devm_input_allocate_device(&hdev->dev); + if (!ctlr->input) + return -ENOMEM; + ctlr->input->id.bustype = hdev->bus; + ctlr->input->id.vendor = hdev->vendor; + ctlr->input->id.product = hdev->product; + ctlr->input->id.version = hdev->version; + ctlr->input->name = name; + input_set_drvdata(ctlr->input, ctlr); + + + /* set up sticks */ + if (hdev->product != USB_DEVICE_ID_NINTENDO_JOYCONR) { + input_set_abs_params(ctlr->input, ABS_X, + -JC_MAX_STICK_MAG, JC_MAX_STICK_MAG, + JC_STICK_FUZZ, JC_STICK_FLAT); + input_set_abs_params(ctlr->input, ABS_Y, + -JC_MAX_STICK_MAG, JC_MAX_STICK_MAG, + JC_STICK_FUZZ, JC_STICK_FLAT); + } + if (hdev->product != USB_DEVICE_ID_NINTENDO_JOYCONL) { + input_set_abs_params(ctlr->input, ABS_RX, + -JC_MAX_STICK_MAG, JC_MAX_STICK_MAG, + JC_STICK_FUZZ, JC_STICK_FLAT); + input_set_abs_params(ctlr->input, ABS_RY, + -JC_MAX_STICK_MAG, JC_MAX_STICK_MAG, + JC_STICK_FUZZ, JC_STICK_FLAT); + } + + /* set up buttons */ + if (hdev->product != USB_DEVICE_ID_NINTENDO_JOYCONR) { + for (i = 
0; joycon_button_inputs_l[i] > 0; i++) + input_set_capability(ctlr->input, EV_KEY, + joycon_button_inputs_l[i]); + } + if (hdev->product != USB_DEVICE_ID_NINTENDO_JOYCONL) { + for (i = 0; joycon_button_inputs_r[i] > 0; i++) + input_set_capability(ctlr->input, EV_KEY, + joycon_button_inputs_r[i]); + } + + ret = input_register_device(ctlr->input); + if (ret) + return ret; + + /* Set the default controller player leds based on controller number */ + mutex_lock(&joycon_input_num_mutex); + mutex_lock(&ctlr->output_mutex); + ret = joycon_set_player_leds(ctlr, 0, 0xF >> (4 - input_num)); + if (ret) + hid_warn(ctlr->hdev, "Failed to set leds; ret=%d\n", ret); + mutex_unlock(&ctlr->output_mutex); + if (++input_num > 4) + input_num = 1; + mutex_unlock(&joycon_input_num_mutex); + + return 0; +} + +/* Common handler for parsing inputs */ +static int joycon_ctlr_read_handler(struct joycon_ctlr *ctlr, u8 *data, + int size) +{ + int ret = 0; + + if (data[0] == JC_INPUT_SUBCMD_REPLY || data[0] == JC_INPUT_IMU_DATA || + data[0] == JC_INPUT_MCU_DATA) { + if (size >= 12) /* make sure it contains the input report */ + joycon_parse_report(ctlr, + (struct joycon_input_report *)data); + } + + return ret; +} + +static int joycon_ctlr_handle_event(struct joycon_ctlr *ctlr, u8 *data, + int size) +{ + int ret = 0; + bool match = false; + struct joycon_input_report *report; + + if (unlikely(mutex_is_locked(&ctlr->output_mutex)) && + ctlr->msg_type != JOYCON_MSG_TYPE_NONE) { + switch (ctlr->msg_type) { + case JOYCON_MSG_TYPE_USB: + if (size < 2) + break; + if (data[0] == JC_INPUT_USB_RESPONSE && + data[1] == ctlr->usb_ack_match) + match = true; + break; + case JOYCON_MSG_TYPE_SUBCMD: + if (size < sizeof(struct joycon_input_report) || + data[0] != JC_INPUT_SUBCMD_REPLY) + break; + report = (struct joycon_input_report *)data; + if (report->reply.id == ctlr->subcmd_ack_match) + match = true; + break; + default: + break; + } + + if (match) { + memcpy(ctlr->input_buf, data, + min(size, (int)JC_MAX_RESP_SIZE)); + ctlr->msg_type = JOYCON_MSG_TYPE_NONE; + ctlr->received_resp = true; + wake_up(&ctlr->wait); + + /* This message has been handled */ + return 1; + } + } + + if (ctlr->ctlr_state == JOYCON_CTLR_STATE_READ) + ret = joycon_ctlr_read_handler(ctlr, data, size); + + return ret; +} + +static int nintendo_hid_event(struct hid_device *hdev, + struct hid_report *report, u8 *raw_data, int size) +{ + struct joycon_ctlr *ctlr = hid_get_drvdata(hdev); + + if (size < 1) + return -EINVAL; + + return joycon_ctlr_handle_event(ctlr, raw_data, size); +} + +static int nintendo_hid_probe(struct hid_device *hdev, + const struct hid_device_id *id) +{ + int ret; + struct joycon_ctlr *ctlr; + + hid_dbg(hdev, "probe - start\n"); + + ctlr = devm_kzalloc(&hdev->dev, sizeof(*ctlr), GFP_KERNEL); + if (!ctlr) { + ret = -ENOMEM; + goto err; + } + + ctlr->hdev = hdev; + ctlr->ctlr_state = JOYCON_CTLR_STATE_INIT; + hid_set_drvdata(hdev, ctlr); + mutex_init(&ctlr->output_mutex); + init_waitqueue_head(&ctlr->wait); + + ret = hid_parse(hdev); + if (ret) { + hid_err(hdev, "HID parse failed\n"); + goto err; + } + + ret = hid_hw_start(hdev, HID_CONNECT_HIDRAW); + if (ret) { + hid_err(hdev, "HW start failed\n"); + goto err; + } + + ret = hid_hw_open(hdev); + if (ret) { + hid_err(hdev, "cannot start hardware I/O\n"); + goto err_stop; + } + + hid_device_io_start(hdev); + + /* Initialize the controller */ + mutex_lock(&ctlr->output_mutex); + /* if handshake command fails, assume ble pro controller */ + if (hdev->product == USB_DEVICE_ID_NINTENDO_PROCON && + 
!joycon_send_usb(ctlr, JC_USB_CMD_HANDSHAKE)) { + hid_dbg(hdev, "detected USB controller\n"); + /* set baudrate for improved latency */ + ret = joycon_send_usb(ctlr, JC_USB_CMD_BAUDRATE_3M); + if (ret) { + hid_err(hdev, "Failed to set baudrate; ret=%d\n", ret); + goto err_mutex; + } + /* handshake */ + ret = joycon_send_usb(ctlr, JC_USB_CMD_HANDSHAKE); + if (ret) { + hid_err(hdev, "Failed handshake; ret=%d\n", ret); + goto err_mutex; + } + /* + * Set no timeout (to keep controller in USB mode). + * This doesn't send a response, so ignore the timeout. + */ + joycon_send_usb(ctlr, JC_USB_CMD_NO_TIMEOUT); + } + + /* get controller calibration data, and parse it */ + ret = joycon_request_calibration(ctlr); + if (ret) { + /* + * We can function with default calibration, but it may be + * inaccurate. Provide a warning, and continue on. + */ + hid_warn(hdev, "Analog stick positions may be inaccurate\n"); + } + + /* Set the reporting mode to 0x30, which is the full report mode */ + ret = joycon_set_report_mode(ctlr); + if (ret) { + hid_err(hdev, "Failed to set report mode; ret=%d\n", ret); + goto err_mutex; + } + + mutex_unlock(&ctlr->output_mutex); + + ret = joycon_input_create(ctlr); + if (ret) { + hid_err(hdev, "Failed to create input device; ret=%d\n", ret); + goto err_close; + } + + ctlr->ctlr_state = JOYCON_CTLR_STATE_READ; + + hid_dbg(hdev, "probe - success\n"); + return 0; + +err_mutex: + mutex_unlock(&ctlr->output_mutex); +err_close: + hid_hw_close(hdev); +err_stop: + hid_hw_stop(hdev); +err: + hid_err(hdev, "probe - fail = %d\n", ret); + return ret; +} + +static void nintendo_hid_remove(struct hid_device *hdev) +{ + hid_dbg(hdev, "remove\n"); + hid_hw_close(hdev); + hid_hw_stop(hdev); +} + +static const struct hid_device_id nintendo_hid_devices[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_NINTENDO, + USB_DEVICE_ID_NINTENDO_PROCON) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_NINTENDO, + USB_DEVICE_ID_NINTENDO_PROCON) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_NINTENDO, + USB_DEVICE_ID_NINTENDO_JOYCONL) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_NINTENDO, + USB_DEVICE_ID_NINTENDO_JOYCONR) }, + { } +}; +MODULE_DEVICE_TABLE(hid, nintendo_hid_devices); + +static struct hid_driver nintendo_hid_driver = { + .name = "nintendo", + .id_table = nintendo_hid_devices, + .probe = nintendo_hid_probe, + .remove = nintendo_hid_remove, + .raw_event = nintendo_hid_event, +}; +module_hid_driver(nintendo_hid_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel J. Ogorchock "); +MODULE_DESCRIPTION("Driver for Nintendo Switch Controllers"); diff --git a/drivers/hwtracing/coresight/Kconfig b/drivers/hwtracing/coresight/Kconfig index f026e5c0e777..514a9b8086e3 100644 --- a/drivers/hwtracing/coresight/Kconfig +++ b/drivers/hwtracing/coresight/Kconfig @@ -150,6 +150,19 @@ config CORESIGHT_CPU_DEBUG To compile this driver as a module, choose M here: the module will be called coresight-cpu-debug. +config CORESIGHT_CPU_DEBUG_DEFAULT_ON + bool "Enable CoreSight CPU Debug by default" + depends on CORESIGHT_CPU_DEBUG + help + Say Y here to enable the CoreSight Debug panic-debug by default. This + can also be enabled via debugfs, but this ensures the debug feature + is enabled as early as possible. + + Has the same effect as setting coresight_cpu_debug.enable=1 on the + kernel command line. + + Say N if unsure. 
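(For reference, the runtime equivalents this help text refers to are coresight_cpu_debug.enable=1 on the kernel command line and, after boot, echo 1 > /sys/kernel/debug/coresight_cpu_debug/enable, per the driver's documentation.)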
+ config CORESIGHT_CTI tristate "CoreSight Cross Trigger Interface (CTI) driver" depends on ARM || ARM64 diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c index c60442970c2a..1874df7c6a73 100644 --- a/drivers/hwtracing/coresight/coresight-cpu-debug.c +++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c @@ -105,7 +105,7 @@ static DEFINE_PER_CPU(struct debug_drvdata *, debug_drvdata); static int debug_count; static struct dentry *debug_debugfs_dir; -static bool debug_enable; +static bool debug_enable = IS_ENABLED(CONFIG_CORESIGHT_CPU_DEBUG_DEFAULT_ON); module_param_named(enable, debug_enable, bool, 0600); MODULE_PARM_DESC(enable, "Control to enable coresight CPU debug functionality"); diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c index f775cbee12b8..efa39820acec 100644 --- a/drivers/hwtracing/coresight/coresight-etb10.c +++ b/drivers/hwtracing/coresight/coresight-etb10.c @@ -557,9 +557,8 @@ static unsigned long etb_update_buffer(struct coresight_device *csdev, /* * In snapshot mode we simply increment the head by the number of byte - * that were written. User space function cs_etm_find_snapshot() will - * figure out how many bytes to get from the AUX buffer based on the - * position of the head. + * that were written. User space will figure out how many bytes to get + * from the AUX buffer based on the position of the head. */ if (buf->snapshot) handle->head += to_read; diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index 8ebd728d3a80..c039b6ae206f 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -452,9 +452,14 @@ static void etm_event_start(struct perf_event *event, int flags) * sink from this ETM. We can't do much in this case if * the sink was specified or hinted to the driver. For * now, simply don't record anything on this ETM. + * + * As such we pretend that everything is fine, and let + * it continue without actually tracing. The event could + * continue tracing when it moves to a CPU where it is + * reachable to a sink. */ if (!cpumask_test_cpu(cpu, &event_data->mask)) - goto fail_end_stop; + goto out; path = etm_event_cpu_path(event_data, cpu); /* We need a sink, no need to continue without one */ @@ -466,26 +471,32 @@ static void etm_event_start(struct perf_event *event, int flags) if (coresight_enable_path(path, CS_MODE_PERF, handle)) goto fail_end_stop; - /* Tell the perf core the event is alive */ - event->hw.state = 0; - /* Finally enable the tracer */ if (source_ops(csdev)->enable(csdev, event, CS_MODE_PERF)) goto fail_disable_path; +out: + /* Tell the perf core the event is alive */ + event->hw.state = 0; /* Save the event_data for this ETM */ ctxt->event_data = event_data; -out: return; fail_disable_path: coresight_disable_path(path); fail_end_stop: - perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); - perf_aux_output_end(handle, 0); + /* + * Check if the handle is still associated with the event, + * to handle cases where if the sink failed to start the + * trace and TRUNCATED the handle already. 
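+	 * (Illustrative: perf_aux_output_end() clears handle->event when it
+	 * closes the handle, so the READ_ONCE() check below is what prevents
+	 * a second, invalid end-of-output against the same handle.)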
+ */ + if (READ_ONCE(handle->event)) { + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); + perf_aux_output_end(handle, 0); + } fail: event->hw.state = PERF_HES_STOPPED; - goto out; + return; } static void etm_event_stop(struct perf_event *event, int mode) @@ -517,6 +528,19 @@ static void etm_event_stop(struct perf_event *event, int mode) if (WARN_ON(!event_data)) return; + /* + * Check if this ETM was allowed to trace, as decided at + * etm_setup_aux(). If it wasn't allowed to trace, then + * nothing needs to be torn down other than outputting a + * zero sized record. + */ + if (handle->event && (mode & PERF_EF_UPDATE) && + !cpumask_test_cpu(cpu, &event_data->mask)) { + event->hw.state = PERF_HES_STOPPED; + perf_aux_output_end(handle, 0); + return; + } + if (!csdev) return; @@ -550,7 +574,21 @@ static void etm_event_stop(struct perf_event *event, int mode) size = sink_ops(sink)->update_buffer(sink, handle, event_data->snk_config); - perf_aux_output_end(handle, size); + /* + * Make sure the handle is still valid as the + * sink could have closed it from an IRQ. + * The sink driver must handle the race with + * update_buffer() and IRQ. Thus either we + * should get a valid handle and valid size + * (which may be 0). + * + * But we should never get a non-zero size with + * an invalid handle. + */ + if (READ_ONCE(handle->event)) + perf_aux_output_end(handle, size); + else + WARN_ON(size); } /* Disabling the path make its elements available to other sessions */ diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index e24252eaf8e4..93899e71b195 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -40,6 +40,7 @@ #include "coresight-etm4x.h" #include "coresight-etm-perf.h" #include "coresight-etm4x-cfg.h" +#include "coresight-self-hosted-trace.h" #include "coresight-syscfg.h" static int boot_enable; @@ -238,6 +239,45 @@ struct etm4_enable_arg { int rc; }; +/* + * etm4x_prohibit_trace - Prohibit the CPU from tracing at all ELs. + * When the CPU supports FEAT_TRF, we could move the ETM to a trace + * prohibited state by filtering the Exception levels via TRFCR_EL1. + */ +static void etm4x_prohibit_trace(struct etmv4_drvdata *drvdata) +{ + /* If the CPU doesn't support FEAT_TRF, nothing to do */ + if (!drvdata->trfcr) + return; + cpu_prohibit_trace(); +} + +/* + * etm4x_allow_trace - Allow CPU tracing in the respective ELs, + * as configured by the drvdata->config.mode for the current + * session. Even though we have TRCVICTLR bits to filter the + * trace in the ELs, it doesn't prevent the ETM from generating + * a packet (e.g, TraceInfo) that might contain the addresses from + * the excluded levels. Thus we use the additional controls provided + * via the Trace Filtering controls (FEAT_TRF) to make sure no trace + * is generated for the excluded ELs. 
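+ * For example (illustrative): a perf session that excludes the kernel
+ * sets ETM_MODE_EXCL_KERN, so the code below clears TRFCR_ELx_ExTRE and
+ * the CPU emits no EL1 trace even though TRCVICTLR filtering alone could
+ * still let a TraceInfo packet carry kernel addresses.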
+ */ +static void etm4x_allow_trace(struct etmv4_drvdata *drvdata) +{ + u64 trfcr = drvdata->trfcr; + + /* If the CPU doesn't support FEAT_TRF, nothing to do */ + if (!trfcr) + return; + + if (drvdata->config.mode & ETM_MODE_EXCL_KERN) + trfcr &= ~TRFCR_ELx_ExTRE; + if (drvdata->config.mode & ETM_MODE_EXCL_USER) + trfcr &= ~TRFCR_ELx_E0TRE; + + write_trfcr(trfcr); +} + #ifdef CONFIG_ETM4X_IMPDEF_FEATURE #define HISI_HIP08_AMBA_ID 0x000b6d01 @@ -442,6 +482,7 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata) if (etm4x_is_ete(drvdata)) etm4x_relaxed_write32(csa, TRCRSR_TA, TRCRSR); + etm4x_allow_trace(drvdata); /* Enable the trace unit */ etm4x_relaxed_write32(csa, 1, TRCPRGCTLR); @@ -737,7 +778,6 @@ static int etm4_enable(struct coresight_device *csdev, static void etm4_disable_hw(void *info) { u32 control; - u64 trfcr; struct etmv4_drvdata *drvdata = info; struct etmv4_config *config = &drvdata->config; struct coresight_device *csdev = drvdata->csdev; @@ -764,12 +804,7 @@ static void etm4_disable_hw(void *info) * If the CPU supports v8.4 Trace filter Control, * set the ETM to trace prohibited region. */ - if (drvdata->trfc) { - trfcr = read_sysreg_s(SYS_TRFCR_EL1); - write_sysreg_s(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE), - SYS_TRFCR_EL1); - isb(); - } + etm4x_prohibit_trace(drvdata); /* * Make sure everything completes before disabling, as recommended * by section 7.3.77 ("TRCVICTLR, ViewInst Main Control Register, @@ -785,9 +820,6 @@ static void etm4_disable_hw(void *info) if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1)) dev_err(etm_dev, "timeout while waiting for PM stable Trace Status\n"); - if (drvdata->trfc) - write_sysreg_s(trfcr, SYS_TRFCR_EL1); - /* read the status of the single shot comparators */ for (i = 0; i < drvdata->nr_ss_cmp; i++) { config->ss_status[i] = @@ -907,7 +939,7 @@ static inline bool cpu_supports_sysreg_trace(void) { u64 dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1); - return ((dfr0 >> ID_AA64DFR0_TRACEVER_SHIFT) & 0xfUL) > 0; + return ((dfr0 >> ID_AA64DFR0_EL1_TraceVer_SHIFT) & 0xfUL) > 0; } static bool etm4_init_sysreg_access(struct etmv4_drvdata *drvdata, @@ -989,15 +1021,15 @@ static bool etm4_init_csdev_access(struct etmv4_drvdata *drvdata, return false; } -static void cpu_enable_tracing(struct etmv4_drvdata *drvdata) +static void cpu_detect_trace_filtering(struct etmv4_drvdata *drvdata) { u64 dfr0 = read_sysreg(id_aa64dfr0_el1); u64 trfcr; - if (!cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRACE_FILT_SHIFT)) + drvdata->trfcr = 0; + if (!cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) return; - drvdata->trfc = true; /* * If the CPU supports v8.4 SelfHosted Tracing, enable * tracing at the kernel EL and EL0, forcing to use the @@ -1011,7 +1043,7 @@ static void cpu_enable_tracing(struct etmv4_drvdata *drvdata) if (is_kernel_in_hyp_mode()) trfcr |= TRFCR_EL2_CX; - write_sysreg_s(trfcr, SYS_TRFCR_EL1); + drvdata->trfcr = trfcr; } static void etm4_init_arch_data(void *info) @@ -1202,7 +1234,7 @@ static void etm4_init_arch_data(void *info) /* NUMCNTR, bits[30:28] number of counters available for tracing */ drvdata->nr_cntr = BMVAL(etmidr5, 28, 30); etm4_cs_lock(drvdata, csa); - cpu_enable_tracing(drvdata); + cpu_detect_trace_filtering(drvdata); } static inline u32 etm4_get_victlr_access_type(struct etmv4_config *config) @@ -1554,7 +1586,7 @@ static void etm4_init_trace_id(struct etmv4_drvdata *drvdata) drvdata->trcid = coresight_get_trace_id(drvdata->cpu); } -static int etm4_cpu_save(struct 
etmv4_drvdata *drvdata) +static int __etm4_cpu_save(struct etmv4_drvdata *drvdata) { int i, ret = 0; struct etmv4_save_state *state; @@ -1693,7 +1725,23 @@ out: return ret; } -static void etm4_cpu_restore(struct etmv4_drvdata *drvdata) +static int etm4_cpu_save(struct etmv4_drvdata *drvdata) +{ + int ret = 0; + + /* Save the TRFCR irrespective of whether the ETM is ON */ + if (drvdata->trfcr) + drvdata->save_trfcr = read_trfcr(); + /* + * Save and restore the ETM Trace registers only if + * the ETM is active. + */ + if (local_read(&drvdata->mode) && drvdata->save_state) + ret = __etm4_cpu_save(drvdata); + return ret; +} + +static void __etm4_cpu_restore(struct etmv4_drvdata *drvdata) { int i; struct etmv4_save_state *state = drvdata->save_state; @@ -1789,6 +1837,14 @@ static void etm4_cpu_restore(struct etmv4_drvdata *drvdata) etm4_cs_lock(drvdata, csa); } +static void etm4_cpu_restore(struct etmv4_drvdata *drvdata) +{ + if (drvdata->trfcr) + write_trfcr(drvdata->save_trfcr); + if (drvdata->state_needs_restore) + __etm4_cpu_restore(drvdata); +} + static int etm4_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd, void *v) { @@ -1800,23 +1856,17 @@ static int etm4_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd, drvdata = etmdrvdata[cpu]; - if (!drvdata->save_state) - return NOTIFY_OK; - if (WARN_ON_ONCE(drvdata->cpu != cpu)) return NOTIFY_BAD; switch (cmd) { case CPU_PM_ENTER: - /* save the state if self-hosted coresight is in use */ - if (local_read(&drvdata->mode)) - if (etm4_cpu_save(drvdata)) - return NOTIFY_BAD; + if (etm4_cpu_save(drvdata)) + return NOTIFY_BAD; break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: - if (drvdata->state_needs_restore) - etm4_cpu_restore(drvdata); + etm4_cpu_restore(drvdata); break; default: return NOTIFY_DONE; @@ -2099,6 +2149,7 @@ static const struct amba_id etm4_ids[] = { CS_AMBA_UCI_ID(0x000bb803, uci_id_etm4),/* Qualcomm Kryo 385 Cortex-A75 */ CS_AMBA_UCI_ID(0x000bb805, uci_id_etm4),/* Qualcomm Kryo 4XX Cortex-A55 */ CS_AMBA_UCI_ID(0x000bb804, uci_id_etm4),/* Qualcomm Kryo 4XX Cortex-A76 */ + CS_AMBA_UCI_ID(0x000bbd0d, uci_id_etm4),/* Qualcomm Kryo 5XX Cortex-A77 */ CS_AMBA_UCI_ID(0x000cc0af, uci_id_etm4),/* Marvell ThunderX2 */ CS_AMBA_UCI_ID(0x000b6d01, uci_id_etm4),/* HiSilicon-Hip08 */ CS_AMBA_UCI_ID(0x000b6d02, uci_id_etm4),/* HiSilicon-Hip09 */ diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h index 794b29639035..dcd52a64fc21 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.h +++ b/drivers/hwtracing/coresight/coresight-etm4x.h @@ -920,8 +920,12 @@ struct etmv4_save_state { * @nooverflow: Indicate if overflow prevention is supported. * @atbtrig: If the implementation can support ATB triggers * @lpoverride: If the implementation can support low-power state over. - * @trfc: If the implementation supports Arm v8.4 trace filter controls. + * @trfcr: If the CPU supports FEAT_TRF, value of the TRFCR_ELx that + * allows tracing at all ELs. We don't want to compute this + * at runtime, due to the additional setting of TRFCR_CX when + * in EL2. Otherwise, 0. * @config: structure holding configuration parameters. + * @save_trfcr: Saved TRFCR_EL1 register during a CPU PM event. 
* @save_state: State to be preserved across power loss * @state_needs_restore: True when there is context to restore after PM exit * @skip_power_up: Indicates if an implementation can skip powering up @@ -972,8 +976,9 @@ struct etmv4_drvdata { bool nooverflow; bool atbtrig; bool lpoverride; - bool trfc; + u64 trfcr; struct etmv4_config config; + u64 save_trfcr; struct etmv4_save_state *save_state; bool state_needs_restore; bool skip_power_up; diff --git a/drivers/hwtracing/coresight/coresight-self-hosted-trace.h b/drivers/hwtracing/coresight/coresight-self-hosted-trace.h new file mode 100644 index 000000000000..53840a2c41f2 --- /dev/null +++ b/drivers/hwtracing/coresight/coresight-self-hosted-trace.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Arm v8 Self-Hosted trace support. + * + * Copyright (C) 2021 ARM Ltd. + */ + +#ifndef __CORESIGHT_SELF_HOSTED_TRACE_H +#define __CORESIGHT_SELF_HOSTED_TRACE_H + +#include <asm/sysreg.h> + +static inline u64 read_trfcr(void) +{ + return read_sysreg_s(SYS_TRFCR_EL1); +} + +static inline void write_trfcr(u64 val) +{ + write_sysreg_s(val, SYS_TRFCR_EL1); + isb(); +} + +static inline u64 cpu_prohibit_trace(void) +{ + u64 trfcr = read_trfcr(); + + /* Prohibit tracing at EL0 & the kernel EL */ + write_trfcr(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE)); + /* Return the original value of the TRFCR */ + return trfcr; +} +#endif /* __CORESIGHT_SELF_HOSTED_TRACE_H */ diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c index 74c6323d4d6a..d0276af82494 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-core.c +++ b/drivers/hwtracing/coresight/coresight-tmc-core.c @@ -432,6 +432,21 @@ static u32 tmc_etr_get_default_buffer_size(struct device *dev) return size; } +static u32 tmc_etr_get_max_burst_size(struct device *dev) +{ + u32 burst_size; + + if (fwnode_property_read_u32(dev->fwnode, "arm,max-burst-size", + &burst_size)) + return TMC_AXICTL_WR_BURST_16; + + /* Only permissible values are 0 to 15 */ + if (burst_size > 0xF) + burst_size = TMC_AXICTL_WR_BURST_16; + + return burst_size; +} + static int tmc_probe(struct amba_device *adev, const struct amba_id *id) { int ret = 0; @@ -469,10 +484,12 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id) /* This device is not associated with a session */ drvdata->pid = -1; - if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) + if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) { drvdata->size = tmc_etr_get_default_buffer_size(dev); - else + drvdata->max_burst_size = tmc_etr_get_max_burst_size(dev); + } else { drvdata->size = readl_relaxed(drvdata->base + TMC_RSZ) * 4; + } desc.dev = dev; desc.groups = coresight_tmc_groups; diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c index cd0fb7bfba68..4c4cbd1f7258 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etf.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c @@ -546,13 +546,17 @@ static unsigned long tmc_update_etf_buffer(struct coresight_device *csdev, /* * In snapshot mode we simply increment the head by the number of byte - * that were written. User space function cs_etm_find_snapshot() will - * figure out how many bytes to get from the AUX buffer based on the - * position of the head. + * that were written. User space will figure out how many bytes to get + * from the AUX buffer based on the position of the head.
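
As a usage sketch of the helpers above (hypothetical call site; TRFCR_ELx_ExTRE and TRFCR_ELx_E0TRE come from <asm/sysreg.h>), a caller can pause and later restore trace generation like this:

	u64 trfcr;

	trfcr = cpu_prohibit_trace();	/* stop trace generation, keep the old value */
	/* ... safely drain or manipulate the trace buffer here ... */
	write_trfcr(trfcr);		/* restore the previous filter controls */
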
*/ if (buf->snapshot) handle->head += to_read; + /* + * CS_LOCK() contains mb() so it can ensure visibility of the AUX trace + * data before the aux_head is updated via perf_aux_output_end(), which + * is expected by the perf ring buffer. + */ CS_LOCK(drvdata->base); out: spin_unlock_irqrestore(&drvdata->spinlock, flags); diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index acdb59e0e661..867ad8bb9b0c 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -32,7 +32,6 @@ struct etr_flat_buf { * @etr_buf - Actual buffer used by the ETR * @pid - The PID this etr_perf_buffer belongs to. * @snaphost - Perf session mode - * @head - handle->head at the beginning of the session. * @nr_pages - Number of pages in the ring buffer. * @pages - Array of Pages in the ring buffer. */ @@ -41,7 +40,6 @@ struct etr_perf_buffer { struct etr_buf *etr_buf; pid_t pid; bool snapshot; - unsigned long head; int nr_pages; void **pages; }; @@ -609,8 +607,9 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata, if (!flat_buf) return -ENOMEM; - flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size, - &flat_buf->daddr, GFP_KERNEL); + flat_buf->vaddr = dma_alloc_noncoherent(real_dev, etr_buf->size, + &flat_buf->daddr, + DMA_FROM_DEVICE, GFP_KERNEL); if (!flat_buf->vaddr) { kfree(flat_buf); return -ENOMEM; @@ -631,14 +630,18 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf) if (flat_buf && flat_buf->daddr) { struct device *real_dev = flat_buf->dev->parent; - dma_free_coherent(real_dev, flat_buf->size, - flat_buf->vaddr, flat_buf->daddr); + dma_free_noncoherent(real_dev, etr_buf->size, + flat_buf->vaddr, flat_buf->daddr, + DMA_FROM_DEVICE); } kfree(flat_buf); } static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) { + struct etr_flat_buf *flat_buf = etr_buf->private; + struct device *real_dev = flat_buf->dev->parent; + /* * Adjust the buffer to point to the beginning of the trace data * and update the available trace data. @@ -648,6 +651,19 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp) etr_buf->len = etr_buf->size; else etr_buf->len = rwp - rrp; + + /* + * The driver always starts tracing at the beginning of the buffer, + * the only reason why we would get a wrap around is when the buffer + * is full. Sync the entire buffer in one go for this case. + */ + if (etr_buf->offset + etr_buf->len > etr_buf->size) + dma_sync_single_for_cpu(real_dev, flat_buf->daddr, + etr_buf->size, DMA_FROM_DEVICE); + else + dma_sync_single_for_cpu(real_dev, + flat_buf->daddr + etr_buf->offset, + etr_buf->len, DMA_FROM_DEVICE); } static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf, @@ -982,7 +998,8 @@ static void __tmc_etr_enable_hw(struct tmc_drvdata *drvdata) axictl = readl_relaxed(drvdata->base + TMC_AXICTL); axictl &= ~TMC_AXICTL_CLEAR_MASK; - axictl |= (TMC_AXICTL_PROT_CTL_B1 | TMC_AXICTL_WR_BURST_16); + axictl |= TMC_AXICTL_PROT_CTL_B1; + axictl |= TMC_AXICTL_WR_BURST(drvdata->max_burst_size); axictl |= TMC_AXICTL_AXCACHE_OS; if (tmc_etr_has_cap(drvdata, TMC_ETR_AXI_ARCACHE)) { @@ -1437,16 +1454,16 @@ free_etr_perf_buffer: * buffer to the perf ring buffer. 
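
The move from dma_alloc_coherent() to dma_alloc_noncoherent() above follows the usual streaming-DMA ownership rules; a condensed, hypothetical sketch of the pattern (dev and size are placeholders):

static int example_alloc_and_read(struct device *dev, size_t size)
{
	dma_addr_t daddr;
	void *vaddr;

	vaddr = dma_alloc_noncoherent(dev, size, &daddr, DMA_FROM_DEVICE,
				      GFP_KERNEL);
	if (!vaddr)
		return -ENOMEM;
	/* ... the device (here, the ETR) writes trace into the buffer ... */
	dma_sync_single_for_cpu(dev, daddr, size, DMA_FROM_DEVICE);
	/* ... the CPU may now read the trace data safely ... */
	dma_free_noncoherent(dev, size, vaddr, daddr, DMA_FROM_DEVICE);
	return 0;
}
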
*/ static void tmc_etr_sync_perf_buffer(struct etr_perf_buffer *etr_perf, + unsigned long head, unsigned long src_offset, unsigned long to_copy) { long bytes; long pg_idx, pg_offset; - unsigned long head = etr_perf->head; char **dst_pages, *src_buf; struct etr_buf *etr_buf = etr_perf->etr_buf; - head = etr_perf->head; + head = PERF_IDX2OFF(head, etr_perf); pg_idx = head >> PAGE_SHIFT; pg_offset = head & (PAGE_SIZE - 1); dst_pages = (char **)etr_perf->pages; @@ -1553,16 +1570,23 @@ tmc_update_etr_buffer(struct coresight_device *csdev, /* Insert barrier packets at the beginning, if there was an overflow */ if (lost) tmc_etr_buf_insert_barrier_packet(etr_buf, offset); - tmc_etr_sync_perf_buffer(etr_perf, offset, size); + tmc_etr_sync_perf_buffer(etr_perf, handle->head, offset, size); /* * In snapshot mode we simply increment the head by the number of byte - * that were written. User space function cs_etm_find_snapshot() will - * figure out how many bytes to get from the AUX buffer based on the - * position of the head. + * that were written. User space will figure out how many bytes to get + * from the AUX buffer based on the position of the head. */ if (etr_perf->snapshot) handle->head += size; + + /* + * Ensure that the AUX trace data is visible before the aux_head + * is updated via perf_aux_output_end(), as expected by the + * perf ring buffer. + */ + smp_wmb(); + out: /* * Don't set the TRUNCATED flag in snapshot mode because 1) the @@ -1605,8 +1629,6 @@ static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, void *data) goto unlock_out; } - etr_perf->head = PERF_IDX2OFF(handle->head, etr_perf); - /* * No HW configuration is needed if the sink is already in * use for this session. diff --git a/drivers/hwtracing/coresight/coresight-tmc.h b/drivers/hwtracing/coresight/coresight-tmc.h index b91ec7dde7bc..6bec20a392b3 100644 --- a/drivers/hwtracing/coresight/coresight-tmc.h +++ b/drivers/hwtracing/coresight/coresight-tmc.h @@ -70,7 +70,8 @@ #define TMC_AXICTL_PROT_CTL_B0 BIT(0) #define TMC_AXICTL_PROT_CTL_B1 BIT(1) #define TMC_AXICTL_SCT_GAT_MODE BIT(7) -#define TMC_AXICTL_WR_BURST_16 0xF00 +#define TMC_AXICTL_WR_BURST(v) (((v) & 0xf) << 8) +#define TMC_AXICTL_WR_BURST_16 0xf /* Write-back Read and Write-allocate */ #define TMC_AXICTL_AXCACHE_OS (0xf << 2) #define TMC_AXICTL_ARCACHE_OS (0xf << 16) @@ -174,6 +175,8 @@ struct etr_buf { * @etr_buf: details of buffer used in TMC-ETR * @len: size of the available trace for ETF/ETB. * @size: trace buffer size for this TMC (common for all modes). + * @max_burst_size: The maximum burst size that can be initiated by + * TMC-ETR on AXI bus. * @mode: how this TMC is being used. * @config_type: TMC variant, must be of type @tmc_config_type. * @memwidth: width of the memory interface databus, in bytes. @@ -198,6 +201,7 @@ struct tmc_drvdata { }; u32 len; u32 size; + u32 max_burst_size; u32 mode; enum tmc_config_type config_type; enum tmc_mem_intf_width memwidth; diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index fac63d092c7b..22ae53ffa976 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -16,6 +16,9 @@ #define pr_fmt(fmt) DRVNAME ": " fmt #include +#include + +#include "coresight-self-hosted-trace.h" #include "coresight-trbe.h" #define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) @@ -56,6 +59,8 @@ struct trbe_buf { * trbe_limit sibling pointers. 
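
For illustration, how the reworked TMC_AXICTL_WR_BURST() macro from the coresight-tmc.h hunk above composes with the raw burst value (the axictl variable is hypothetical):

	u32 axictl = 0;

	/* the burst value is kept as a plain 0..15 number, e.g. from "arm,max-burst-size" */
	axictl |= TMC_AXICTL_WR_BURST(TMC_AXICTL_WR_BURST_16);
	/* (0xf & 0xf) << 8 == 0xF00, i.e. the old hard-coded field value */
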
*/ unsigned long trbe_base; + /* The base programmed into the TRBE */ + unsigned long trbe_hw_base; unsigned long trbe_limit; unsigned long trbe_write; int nr_pages; @@ -64,13 +69,63 @@ struct trbe_buf { struct trbe_cpudata *cpudata; }; +/* + * TRBE erratum list + * + * The errata are defined in the arm64 generic cpu_errata framework. + * Since the errata workarounds could be applied individually + * to the affected CPUs inside the TRBE driver, we need to know if + * a given CPU is affected by the erratum. Unlike the other erratum + * workarounds, the TRBE driver needs to check multiple times during + * a trace session. Thus we need quick access to the per-CPU + * errata, rather than issuing a costly this_cpu_has_cap() every time. + * We keep a set of the affected errata in trbe_cpudata, per TRBE. + * + * We rely on the corresponding cpucaps to be defined for a given + * TRBE erratum. We map the given cpucap into a TRBE internal number + * to make the tracking of the errata lean. + * + * This helps in: + * - Not duplicating the detection logic + * - Streamlining the detection of errata across the system + */ +#define TRBE_WORKAROUND_OVERWRITE_FILL_MODE 0 +#define TRBE_WORKAROUND_WRITE_OUT_OF_RANGE 1 + +static int trbe_errata_cpucaps[] = { + [TRBE_WORKAROUND_OVERWRITE_FILL_MODE] = ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE, + [TRBE_WORKAROUND_WRITE_OUT_OF_RANGE] = ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE, + -1, /* Sentinel, must be the last entry */ +}; + +/* The total number of listed errata in trbe_errata_cpucaps */ +#define TRBE_ERRATA_MAX (ARRAY_SIZE(trbe_errata_cpucaps) - 1) + +/* + * Safe limit for the number of bytes that may be overwritten + * when ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE is triggered. + */ +#define TRBE_WORKAROUND_OVERWRITE_FILL_MODE_SKIP_BYTES 256 + +/* + * struct trbe_cpudata: TRBE instance specific data + * @trbe_flag - TRBE dirty/access flag support + * @trbe_hw_align - Actual TRBE alignment required for TRBPTR_EL1. + * @trbe_align - Software alignment used for the TRBPTR_EL1. + * @cpu - CPU this TRBE belongs to. + * @mode - Mode of current operation. (perf/disabled) + * @drvdata - TRBE specific drvdata + * @errata - Bitmap of the errata on this TRBE.
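
A condensed sketch of the detect-once, test-cheaply pattern described above, using the helpers introduced in this patch:

	/* At probe time, on the CPU that owns this TRBE instance: */
	trbe_check_errata(cpudata);	/* runs this_cpu_has_cap() once per cap */

	/* In the hot paths, only a cheap bitmap test remains: */
	if (trbe_may_overwrite_in_fill_mode(cpudata))
		buf->trbe_write += TRBE_WORKAROUND_OVERWRITE_FILL_MODE_SKIP_BYTES;
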
+ */ struct trbe_cpudata { bool trbe_flag; + u64 trbe_hw_align; u64 trbe_align; int cpu; enum cs_mode mode; struct trbe_buf *buf; struct trbe_drvdata *drvdata; + DECLARE_BITMAP(errata, TRBE_ERRATA_MAX); }; struct trbe_drvdata { @@ -83,6 +138,35 @@ struct trbe_drvdata { struct platform_device *pdev; }; +static void trbe_check_errata(struct trbe_cpudata *cpudata) +{ + int i; + + for (i = 0; i < TRBE_ERRATA_MAX; i++) { + int cap = trbe_errata_cpucaps[i]; + + if (WARN_ON_ONCE(cap < 0)) + return; + if (this_cpu_has_cap(cap)) + set_bit(i, cpudata->errata); + } +} + +static inline bool trbe_has_erratum(struct trbe_cpudata *cpudata, int i) +{ + return (i < TRBE_ERRATA_MAX) && test_bit(i, cpudata->errata); +} + +static inline bool trbe_may_overwrite_in_fill_mode(struct trbe_cpudata *cpudata) +{ + return trbe_has_erratum(cpudata, TRBE_WORKAROUND_OVERWRITE_FILL_MODE); +} + +static inline bool trbe_may_write_out_of_range(struct trbe_cpudata *cpudata) +{ + return trbe_has_erratum(cpudata, TRBE_WORKAROUND_WRITE_OUT_OF_RANGE); +} + static int trbe_alloc_node(struct perf_event *event) { if (event->cpu == -1) @@ -120,6 +204,25 @@ static void trbe_reset_local(void) write_sysreg_s(0, SYS_TRBSR_EL1); } +static void trbe_report_wrap_event(struct perf_output_handle *handle) +{ + /* + * Mark the buffer to indicate that there was a WRAP event by + * setting the COLLISION flag. This indicates to the user that + * the TRBE trace collection was stopped without stopping the + * ETE and thus there might be some amount of trace that was + * lost between the time the WRAP was detected and the IRQ + * was consumed by the CPU. + * + * Setting the TRUNCATED flag would move the event to STOPPED + * state unnecessarily, even when there is space left in the + * ring buffer. Using the COLLISION flag doesn't have this side + * effect. We only set TRUNCATED flag when there is no space + * left in the ring buffer. + */ + perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION); +} + static void trbe_stop_and_truncate_event(struct perf_output_handle *handle) { struct trbe_buf *buf = etm_perf_sink_config(handle); @@ -133,6 +236,7 @@ static void trbe_stop_and_truncate_event(struct perf_output_handle *handle) */ trbe_drain_and_disable_local(); perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); + perf_aux_output_end(handle, 0); *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL; } @@ -178,12 +282,18 @@ static void trbe_stop_and_truncate_event(struct perf_output_handle *handle) * consumed from the user space. The enabled TRBE buffer area is a moving subset of * the allocated perf auxiliary buffer. 
*/ + +static void __trbe_pad_buf(struct trbe_buf *buf, u64 offset, int len) +{ + memset((void *)buf->trbe_base + offset, ETE_IGNORE_PACKET, len); +} + static void trbe_pad_buf(struct perf_output_handle *handle, int len) { struct trbe_buf *buf = etm_perf_sink_config(handle); u64 head = PERF_IDX2OFF(handle->head, buf); - memset((void *)buf->trbe_base + head, ETE_IGNORE_PACKET, len); + __trbe_pad_buf(buf, head, len); if (!buf->snapshot) perf_aux_output_skip(handle, len); } @@ -200,6 +310,25 @@ static unsigned long trbe_snapshot_offset(struct perf_output_handle *handle) return buf->nr_pages * PAGE_SIZE; } +static u64 trbe_min_trace_buf_size(struct perf_output_handle *handle) +{ + u64 size = TRBE_TRACE_MIN_BUF_SIZE; + struct trbe_buf *buf = etm_perf_sink_config(handle); + struct trbe_cpudata *cpudata = buf->cpudata; + + /* + * When the TRBE is affected by an erratum that could make it + * write to the next "virtually addressed" page beyond the LIMIT, + * we need to make sure there is always a PAGE after the LIMIT + * within the buffer. Thus we ensure there is at least one extra + * page more than normal. With this we can then adjust the LIMIT + * pointer down by a PAGE later. + */ + if (trbe_may_write_out_of_range(cpudata)) + size += PAGE_SIZE; + return size; +} + /* * TRBE Limit Calculation * @@ -252,13 +381,9 @@ static unsigned long __trbe_normal_offset(struct perf_output_handle *handle) * trbe_base trbe_base + nr_pages * * Perf aux buffer does not have any space for the driver to write into. - * Just communicate trace truncation event to the user space by marking - * it with PERF_AUX_FLAG_TRUNCATED. */ - if (!handle->size) { - perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); + if (!handle->size) return 0; - } /* Compute the tail and wakeup indices now that we've aligned head */ tail = PERF_IDX2OFF(handle->head + handle->size, buf); @@ -360,7 +485,6 @@ static unsigned long __trbe_normal_offset(struct perf_output_handle *handle) return limit; trbe_pad_buf(handle, handle->size); - perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); return 0; } @@ -374,10 +498,14 @@ static unsigned long trbe_normal_offset(struct perf_output_handle *handle) * If the head is too close to the limit and we don't * have space for a meaningful run, we rather pad it * and start fresh. + * + * We might have to do this more than once to make sure + * we have enough space.
*/ - if (limit && (limit - head < TRBE_TRACE_MIN_BUF_SIZE)) { + while (limit && ((limit - head) < trbe_min_trace_buf_size(handle))) { trbe_pad_buf(handle, limit - head); limit = __trbe_normal_offset(handle); + head = PERF_IDX2OFF(handle->head, buf); } return limit; } @@ -448,12 +576,13 @@ static void set_trbe_limit_pointer_enabled(unsigned long addr) static void trbe_enable_hw(struct trbe_buf *buf) { - WARN_ON(buf->trbe_write < buf->trbe_base); + WARN_ON(buf->trbe_hw_base < buf->trbe_base); + WARN_ON(buf->trbe_write < buf->trbe_hw_base); WARN_ON(buf->trbe_write >= buf->trbe_limit); set_trbe_disabled(); isb(); clr_trbe_status(); - set_trbe_base_pointer(buf->trbe_base); + set_trbe_base_pointer(buf->trbe_hw_base); set_trbe_write_pointer(buf->trbe_write); /* @@ -464,10 +593,13 @@ static void trbe_enable_hw(struct trbe_buf *buf) set_trbe_limit_pointer_enabled(buf->trbe_limit); } -static enum trbe_fault_action trbe_get_fault_act(u64 trbsr) +static enum trbe_fault_action trbe_get_fault_act(struct perf_output_handle *handle, + u64 trbsr) { int ec = get_trbe_ec(trbsr); int bsc = get_trbe_bsc(trbsr); + struct trbe_buf *buf = etm_perf_sink_config(handle); + struct trbe_cpudata *cpudata = buf->cpudata; WARN_ON(is_trbe_running(trbsr)); if (is_trbe_trg(trbsr) || is_trbe_abort(trbsr)) @@ -476,13 +608,71 @@ static enum trbe_fault_action trbe_get_fault_act(u64 trbsr) if ((ec == TRBE_EC_STAGE1_ABORT) || (ec == TRBE_EC_STAGE2_ABORT)) return TRBE_FAULT_ACT_FATAL; - if (is_trbe_wrap(trbsr) && (ec == TRBE_EC_OTHERS) && (bsc == TRBE_BSC_FILLED)) { - if (get_trbe_write_pointer() == get_trbe_base_pointer()) - return TRBE_FAULT_ACT_WRAP; - } + /* + * If the TRBE is affected by TRBE_WORKAROUND_OVERWRITE_FILL_MODE, + * it might write data after a WRAP event in the fill mode. + * Thus the check TRBPTR == TRBBASER will not be honored. + */ + if ((is_trbe_wrap(trbsr) && (ec == TRBE_EC_OTHERS) && (bsc == TRBE_BSC_FILLED)) && + (trbe_may_overwrite_in_fill_mode(cpudata) || + get_trbe_write_pointer() == get_trbe_base_pointer())) + return TRBE_FAULT_ACT_WRAP; + return TRBE_FAULT_ACT_SPURIOUS; } +static unsigned long trbe_get_trace_size(struct perf_output_handle *handle, + struct trbe_buf *buf, bool wrap) +{ + u64 write; + u64 start_off, end_off; + u64 size; + u64 overwrite_skip = TRBE_WORKAROUND_OVERWRITE_FILL_MODE_SKIP_BYTES; + + /* + * If the TRBE has wrapped around, the write pointer has + * wrapped and should be treated as the limit. + * + * When the TRBE is affected by TRBE_WORKAROUND_WRITE_OUT_OF_RANGE, + * it may write up to 64 bytes beyond the "LIMIT". The driver already + * keeps a valid page next to the LIMIT and we could potentially + * consume the trace data that may have been collected there. But we + * cannot really be sure it is available, and the TRBPTR may not + * indicate the same. Affected cores are also affected by another + * erratum which forces PAGE_SIZE alignment on the TRBPTR, and thus + * we could potentially pad an entire PAGE_SIZE - 64 bytes to get those + * 64 bytes. Thus we ignore the potential triggering of the erratum + * on WRAP and limit the data to LIMIT. + */ + if (wrap) + write = get_trbe_limit_pointer(); + else + write = get_trbe_write_pointer(); + + /* + * TRBE may use a different base address than the base + * of the ring buffer. Thus use the beginning of the ring + * buffer to compute the offsets.
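
A worked example of the offset arithmetic that follows (the numbers are illustrative, not from the patch):

	u64 start_off = 0x200, end_off, size;

	end_off = 0x1a00 - 0x1000;	/* write (or limit) minus trbe_base = 0xa00 */
	size = end_off - start_off;	/* 0x800 bytes of newly collected trace */
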
+ */ + end_off = write - buf->trbe_base; + start_off = PERF_IDX2OFF(handle->head, buf); + + if (WARN_ON_ONCE(end_off < start_off)) + return 0; + + size = end_off - start_off; + /* + * If the TRBE is affected by the following erratum, we must fill + * the space we skipped with IGNORE packets. And we are always + * guaranteed to have at least a PAGE_SIZE of space in the buffer. + */ + if (trbe_has_erratum(buf->cpudata, TRBE_WORKAROUND_OVERWRITE_FILL_MODE) && + !WARN_ON(size < overwrite_skip)) + __trbe_pad_buf(buf, start_off, overwrite_skip); + + return size; +} + static void *arm_trbe_alloc_buffer(struct coresight_device *csdev, struct perf_event *event, void **pages, int nr_pages, bool snapshot) @@ -544,9 +734,9 @@ static unsigned long arm_trbe_update_buffer(struct coresight_device *csdev, struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev); struct trbe_buf *buf = config; enum trbe_fault_action act; - unsigned long size, offset; - unsigned long write, base, status; + unsigned long size, status; unsigned long flags; + bool wrap = false; WARN_ON(buf->cpudata != cpudata); WARN_ON(cpudata->cpu != smp_processor_id()); @@ -554,8 +744,6 @@ static unsigned long arm_trbe_update_buffer(struct coresight_device *csdev, if (cpudata->mode != CS_MODE_PERF) return 0; - perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW); - /* * We are about to disable the TRBE. And this could in turn * fill up the buffer triggering, an IRQ. This could be consumed @@ -588,8 +776,6 @@ static unsigned long arm_trbe_update_buffer(struct coresight_device *csdev, * handle gets freed in etm_event_stop(). */ trbe_drain_and_disable_local(); - write = get_trbe_write_pointer(); - base = get_trbe_base_pointer(); /* Check if there is a pending interrupt and handle it here */ status = read_sysreg_s(SYS_TRBSR_EL1); @@ -603,7 +789,7 @@ static unsigned long arm_trbe_update_buffer(struct coresight_device *csdev, clr_trbe_irq(); isb(); - act = trbe_get_fault_act(status); + act = trbe_get_fault_act(handle, status); /* * If this was not due to a WRAP event, we have some * errors and as such buffer is empty. */ if (act != TRBE_FAULT_ACT_WRAP) { size = 0; goto done; } - /* - * Otherwise, the buffer is full and the write pointer - * has reached base. Adjust this back to the Limit pointer - * for correct size. Also, mark the buffer truncated. - */ - write = get_trbe_limit_pointer(); - perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); + trbe_report_wrap_event(handle); + wrap = true; } - offset = write - base; - if (WARN_ON_ONCE(offset < PERF_IDX2OFF(handle->head, buf))) - size = 0; - else - size = offset - PERF_IDX2OFF(handle->head, buf); + size = trbe_get_trace_size(handle, buf, wrap); done: local_irq_restore(flags); @@ -636,6 +813,148 @@ done: return size; } + +static int trbe_apply_work_around_before_enable(struct trbe_buf *buf) +{ + /* + * TRBE_WORKAROUND_OVERWRITE_FILL_MODE causes the TRBE to overwrite a few + * cache lines' worth of data from the "TRBBASER_EL1" in the event of a "FILL". + * Thus, we could lose some amount of the trace at the base. + * + * Before Fix: + * + * normal-BASE head (normal-TRBPTR) tail (normal-LIMIT) + * | \/ / + * ------------------------------------------------------------- + * | Pg0 | Pg1 | | | PgN | + * ------------------------------------------------------------- + * + * In the normal course of action, we would set the TRBBASER to the + * beginning of the ring-buffer (normal-BASE).
But with the erratum, + * the TRBE could overwrite the contents at the "normal-BASE", after + * hitting the "normal-LIMIT", since it doesn't stop as expected. And + * this is wrong. This could result in overwriting trace collected in + * one of the previous runs, being consumed by the user. So we must + * always make sure that the TRBBASER is within the region + * [head, head+size]. Note that TRBBASER must be PAGE aligned. + * + * After moving the BASE: + * + * normal-BASE head (normal-TRBPTR) tail (normal-LIMIT) + * | \/ / + * ------------------------------------------------------------- + * | | |xyzdef. |.. tuvw| | + * ------------------------------------------------------------- + * / + * New-BASER + * + * Also, we would set the TRBPTR to head (after adjusting for + * alignment) at normal-PTR. This would mean that the last few bytes + * of the trace (say, "xyz") might overwrite the first few bytes of + * trace written ("abc"). More importantly they will appear in what + * userspace sees as the beginning of the trace, which is wrong. We may + * not always have space to move the latest trace "xyz" to the correct + * order as it must appear beyond the LIMIT. (i.e., [head..head+size]). + * Thus it is easier to ignore those bytes than to complicate the + * driver to move them, assuming that the erratum was triggered and + * doing additional checks to see if there is indeed allowed space at + * TRBLIMITR.LIMIT. + * + * Thus the full workaround will move the BASE and the PTR and would + * look like (after padding the skipped bytes at the end of + * the session): + * + * normal-BASE head (normal-TRBPTR) tail (normal-LIMIT) + * | \/ / + * ------------------------------------------------------------- + * | | |///abc.. |.. rst| | + * ------------------------------------------------------------- + * / | + * New-BASER New-TRBPTR + * + * To summarize, with the workaround: + * + * - We always align the offset for the next session to PAGE_SIZE + * (This is to ensure we can program the TRBBASER to this offset + * within the region [head...head+size]). + * + * - At TRBE enable: + * - Set the TRBBASER to the page aligned offset of the current + * proposed write offset. (which is guaranteed to be aligned + * as above) + * - Move the TRBPTR to skip the first 256 bytes (that might be + * overwritten due to the erratum). This ensures that the trace + * generated in the session is not re-written. + * + * - At trace collection: + * - Pad the 256 bytes skipped above with IGNORE packets. + */ + if (trbe_has_erratum(buf->cpudata, TRBE_WORKAROUND_OVERWRITE_FILL_MODE)) { + if (WARN_ON(!IS_ALIGNED(buf->trbe_write, PAGE_SIZE))) + return -EINVAL; + buf->trbe_hw_base = buf->trbe_write; + buf->trbe_write += TRBE_WORKAROUND_OVERWRITE_FILL_MODE_SKIP_BYTES; + } + + /* + * TRBE_WORKAROUND_WRITE_OUT_OF_RANGE could cause the TRBE to write to + * the next page after the TRBLIMITR.LIMIT. For perf, the "next page" + * may be: + * - The page beyond the ring buffer. This could mean the TRBE could + * corrupt another entity (kernel / user) + * - A portion of the "ring buffer" consumed by the userspace. + * i.e., a page outside [head, head + size]. + * + * We work around this by: + * - Making sure that we have at least an extra PAGE of space left + * in the ring buffer [head, head + size] compared to what we normally + * need without the erratum. See trbe_min_trace_buf_size(). + * + * - Adjust the TRBLIMITR.LIMIT to leave the extra PAGE outside + * the TRBE's range (i.e., [TRBBASER, TRBLIMITR.LIMIT]).
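
To make the second workaround concrete, a sketch of the limit adjustment with illustrative numbers (assuming a 4K PAGE_SIZE):

	unsigned long write = 0x2000, limit = 0x6000;	/* 16K proposed range */

	/* Keep one page out of the hardware's range as a guard: */
	if ((limit - write) > SZ_4K && IS_ALIGNED(limit, SZ_4K))
		limit -= SZ_4K;	/* HW may overshoot into [0x5000, 0x6000) harmlessly */
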
+ */ + if (trbe_has_erratum(buf->cpudata, TRBE_WORKAROUND_WRITE_OUT_OF_RANGE)) { + s64 space = buf->trbe_limit - buf->trbe_write; + /* + * We must have more than a PAGE_SIZE worth of space in the proposed + * range for the TRBE. + */ + if (WARN_ON(space <= PAGE_SIZE || + !IS_ALIGNED(buf->trbe_limit, PAGE_SIZE))) + return -EINVAL; + buf->trbe_limit -= PAGE_SIZE; + } + + return 0; +} + +static int __arm_trbe_enable(struct trbe_buf *buf, + struct perf_output_handle *handle) +{ + int ret = 0; + + perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW); + buf->trbe_limit = compute_trbe_buffer_limit(handle); + buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf); + if (buf->trbe_limit == buf->trbe_base) { + ret = -ENOSPC; + goto err; + } + /* Set the base of the TRBE to the buffer base */ + buf->trbe_hw_base = buf->trbe_base; + + ret = trbe_apply_work_around_before_enable(buf); + if (ret) + goto err; + + *this_cpu_ptr(buf->cpudata->drvdata->handle) = handle; + trbe_enable_hw(buf); + return 0; +err: + trbe_stop_and_truncate_event(handle); + return ret; +} + static int arm_trbe_enable(struct coresight_device *csdev, u32 mode, void *data) { struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); @@ -648,18 +967,11 @@ static int arm_trbe_enable(struct coresight_device *csdev, u32 mode, void *data) if (mode != CS_MODE_PERF) return -EINVAL; - *this_cpu_ptr(drvdata->handle) = handle; cpudata->buf = buf; cpudata->mode = mode; buf->cpudata = cpudata; - buf->trbe_limit = compute_trbe_buffer_limit(handle); - buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf); - if (buf->trbe_limit == buf->trbe_base) { - trbe_stop_and_truncate_event(handle); - return 0; - } - trbe_enable_hw(buf); - return 0; + + return __arm_trbe_enable(buf, handle); } static int arm_trbe_disable(struct coresight_device *csdev) @@ -683,35 +995,30 @@ static int arm_trbe_disable(struct coresight_device *csdev) static void trbe_handle_spurious(struct perf_output_handle *handle) { - struct trbe_buf *buf = etm_perf_sink_config(handle); + u64 limitr = read_sysreg_s(SYS_TRBLIMITR_EL1); - buf->trbe_limit = compute_trbe_buffer_limit(handle); - buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf); - if (buf->trbe_limit == buf->trbe_base) { - trbe_drain_and_disable_local(); - return; - } - trbe_enable_hw(buf); + /* + * If the IRQ was spurious, simply re-enable the TRBE + * without modifying the buffer parameters, to + * retain the trace collected so far. + */ + limitr |= TRBLIMITR_ENABLE; + write_sysreg_s(limitr, SYS_TRBLIMITR_EL1); + isb(); } -static void trbe_handle_overflow(struct perf_output_handle *handle) +static int trbe_handle_overflow(struct perf_output_handle *handle) { struct perf_event *event = handle->event; struct trbe_buf *buf = etm_perf_sink_config(handle); - unsigned long offset, size; + unsigned long size; struct etm_event_data *event_data; - offset = get_trbe_limit_pointer() - get_trbe_base_pointer(); - size = offset - PERF_IDX2OFF(handle->head, buf); + size = trbe_get_trace_size(handle, buf, true); if (buf->snapshot) handle->head += size; - /* - * Mark the buffer as truncated, as we have stopped the trace - * collection upon the WRAP event, without stopping the source.
- */ - perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW | - PERF_AUX_FLAG_TRUNCATED); + trbe_report_wrap_event(handle); perf_aux_output_end(handle, size); event_data = perf_aux_output_begin(handle, event); if (!event_data) { @@ -723,16 +1030,10 @@ static void trbe_handle_overflow(struct perf_output_handle *handle) */ trbe_drain_and_disable_local(); *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL; - return; + return -EINVAL; } - buf->trbe_limit = compute_trbe_buffer_limit(handle); - buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf); - if (buf->trbe_limit == buf->trbe_base) { - trbe_stop_and_truncate_event(handle); - return; - } - *this_cpu_ptr(buf->cpudata->drvdata->handle) = handle; - trbe_enable_hw(buf); + + return __arm_trbe_enable(buf, handle); } static bool is_perf_trbe(struct perf_output_handle *handle) @@ -742,7 +1043,7 @@ static bool is_perf_trbe(struct perf_output_handle *handle) struct trbe_drvdata *drvdata = cpudata->drvdata; int cpu = smp_processor_id(); - WARN_ON(buf->trbe_base != get_trbe_base_pointer()); + WARN_ON(buf->trbe_hw_base != get_trbe_base_pointer()); WARN_ON(buf->trbe_limit != get_trbe_limit_pointer()); if (cpudata->mode != CS_MODE_PERF) @@ -763,13 +1064,10 @@ static irqreturn_t arm_trbe_irq_handler(int irq, void *dev) struct perf_output_handle *handle = *handle_ptr; enum trbe_fault_action act; u64 status; + bool truncated = false; + u64 trfcr; - /* - * Ensure the trace is visible to the CPUs and - * any external aborts have been resolved. - */ - trbe_drain_and_disable_local(); - + /* Reads of TRBSR_EL1 are fine when the TRBE is active */ status = read_sysreg_s(SYS_TRBSR_EL1); /* * If the pending IRQ was handled by update_buffer callback */ if (!is_trbe_irq(status)) return IRQ_NONE; + /* Prohibit the CPU from tracing before we disable the TRBE */ + trfcr = cpu_prohibit_trace(); + /* + * Ensure the trace is visible to the CPUs and + * any external aborts have been resolved. + */ + trbe_drain_and_disable_local(); clr_trbe_irq(); isb(); @@ -787,24 +1092,32 @@ static irqreturn_t arm_trbe_irq_handler(int irq, void *dev) if (!is_perf_trbe(handle)) return IRQ_NONE; - /* - * Ensure perf callbacks have completed, which may disable - * the trace buffer in response to a TRUNCATION flag. - */ - irq_work_run(); - - act = trbe_get_fault_act(status); + act = trbe_get_fault_act(handle, status); switch (act) { case TRBE_FAULT_ACT_WRAP: - trbe_handle_overflow(handle); + truncated = !!trbe_handle_overflow(handle); break; case TRBE_FAULT_ACT_SPURIOUS: trbe_handle_spurious(handle); break; case TRBE_FAULT_ACT_FATAL: trbe_stop_and_truncate_event(handle); + truncated = true; break; } + + /* + * If the buffer was truncated, ensure perf callbacks + * have completed, which will disable the event. + * + * Otherwise, restore the trace filter controls to + * allow tracing. + */ + if (truncated) + irq_work_run(); + else + write_trfcr(trfcr); + return IRQ_HANDLED; } @@ -824,7 +1137,7 @@ static ssize_t align_show(struct device *dev, struct device_attribute *attr, cha { struct trbe_cpudata *cpudata = dev_get_drvdata(dev); - return sprintf(buf, "%llx\n", cpudata->trbe_align); + return sprintf(buf, "%llx\n", cpudata->trbe_hw_align); } static DEVICE_ATTR_RO(align); @@ -895,6 +1208,9 @@ cpu_clear: cpumask_clear_cpu(cpu, &drvdata->supported_cpus); } +/* + * Must be called with preemption disabled, for trbe_check_errata().
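
The constraint exists because trbe_check_errata() uses this_cpu_has_cap(), which inspects the capabilities of the executing CPU; a sketch of a safe caller, matching the arm_trbe_probe_hotplugged_cpu() helper added below:

	preempt_disable();	/* keep this_cpu_has_cap() on the intended CPU */
	arm_trbe_probe_cpu(drvdata);
	preempt_enable();
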
+ */ static void arm_trbe_probe_cpu(void *info) { struct trbe_drvdata *drvdata = info; @@ -916,11 +1232,34 @@ goto cpu_clear; } - cpudata->trbe_align = 1ULL << get_trbe_address_align(trbidr); - if (cpudata->trbe_align > SZ_2K) { + cpudata->trbe_hw_align = 1ULL << get_trbe_address_align(trbidr); + if (cpudata->trbe_hw_align > SZ_2K) { pr_err("Unsupported alignment on cpu %d\n", cpu); goto cpu_clear; } + + /* + * Run the TRBE erratum checks, now that we know + * this instance is about to be registered. + */ + trbe_check_errata(cpudata); + + /* + * If the TRBE is affected by erratum TRBE_WORKAROUND_OVERWRITE_FILL_MODE, + * we must always program the TRBPTR_EL1 256 bytes from a page + * boundary, with TRBBASER_EL1 set to the page, to prevent + * the TRBE overwriting 256 bytes at TRBBASER_EL1 on a FILL event. + * + * Thus make sure we always align our write pointer to a PAGE_SIZE, + * which also guarantees that we have at least a PAGE_SIZE of space in + * the buffer (TRBLIMITR is PAGE aligned) and thus we can skip + * the required bytes at the base. + */ + if (trbe_may_overwrite_in_fill_mode(cpudata)) + cpudata->trbe_align = PAGE_SIZE; + else + cpudata->trbe_align = cpudata->trbe_hw_align; + cpudata->trbe_flag = get_trbe_flag_update(trbidr); cpudata->cpu = cpu; cpudata->drvdata = drvdata; @@ -975,6 +1314,13 @@ static int arm_trbe_remove_coresight(struct trbe_drvdata *drvdata) return 0; } +static void arm_trbe_probe_hotplugged_cpu(struct trbe_drvdata *drvdata) +{ + preempt_disable(); + arm_trbe_probe_cpu(drvdata); + preempt_enable(); +} + static int arm_trbe_cpu_startup(unsigned int cpu, struct hlist_node *node) { struct trbe_drvdata *drvdata = hlist_entry_safe(node, struct trbe_drvdata, hotplug_node); @@ -986,7 +1332,7 @@ * initialize it now.
*/ if (!coresight_get_percpu_sink(cpu)) { - arm_trbe_probe_cpu(drvdata); + arm_trbe_probe_hotplugged_cpu(drvdata); if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) arm_trbe_register_coresight_cpu(drvdata, cpu); if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) diff --git a/drivers/hwtracing/coresight/coresight-trbe.h b/drivers/hwtracing/coresight/coresight-trbe.h index abf3e36082f0..963a5ff47dda 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.h +++ b/drivers/hwtracing/coresight/coresight-trbe.h @@ -20,7 +20,8 @@ static inline bool is_trbe_available(void) { u64 aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1); - unsigned int trbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_TRBE_SHIFT); + unsigned int trbe = cpuid_feature_extract_unsigned_field(aa64dfr0, + ID_AA64DFR0_EL1_TraceBuffer_SHIFT); return trbe >= 0b0001; } diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 95f90699d2b1..e4605ff1919c 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -24,6 +24,7 @@ #include #include #include "input-compat.h" +#include struct evdev { int open; @@ -214,6 +215,9 @@ static int evdev_set_clk_type(struct evdev_client *client, unsigned int clkid) static void __pass_event(struct evdev_client *client, const struct input_event *event) { + trace_android_vh_pass_input_event(client->head, client->tail, client->bufsize, + event->type, event->code, event->value); + client->buffer[client->head++] = *event; client->head &= client->bufsize - 1; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index e2e80eb2840c..6ebdfffc0af3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -150,7 +150,7 @@ static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm) } reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - par = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_PARANGE_SHIFT); + par = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_EL1_PARANGE_SHIFT); tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_IPS, par); cd->ttbr = virt_to_phys(mm->pgd); @@ -428,13 +428,13 @@ bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) * addresses larger than what we support. */ reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_PARANGE_SHIFT); + fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_EL1_PARANGE_SHIFT); oas = id_aa64mmfr0_parange_to_phys_shift(fld); if (smmu->oas < oas) return false; /* We can support bigger ASIDs than the CPU, but not smaller */ - fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_ASID_SHIFT); + fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_EL1_ASIDBITS_SHIFT); asid_bits = fld ? 16 : 8; if (smmu->asid_bits < asid_bits) return false; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 48c6f7ff4aef..cb96759f348f 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -25,6 +25,7 @@ #include #include #include +#include struct iommu_dma_msi_page { struct list_head list; @@ -406,6 +407,24 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, return iova_reserve_iommu_regions(dev, domain); } +/* + * Should be called prior to using dma-apis. 
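
As a usage sketch (hypothetical client driver; not part of this patch), a device would opt in before issuing any DMA mappings:

static int example_probe(struct device *dev)
{
	int ret;

	ret = iommu_dma_enable_best_fit_algo(dev);
	if (ret)
		return ret;	/* no IOMMU DMA domain for this device */
	/* dma_map_*() calls from here on use best-fit IOVA allocation */
	return 0;
}
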
+ */ +int iommu_dma_enable_best_fit_algo(struct device *dev) +{ + struct iommu_domain *domain; + struct iova_domain *iovad; + + domain = iommu_get_domain_for_dev(dev); + if (!domain || !domain->iova_cookie) + return -EINVAL; + + iovad = &((struct iommu_dma_cookie *)domain->iova_cookie)->iovad; + iovad->best_fit = true; + return 0; +} +EXPORT_SYMBOL(iommu_dma_enable_best_fit_algo); + /** * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API * page flags. @@ -422,6 +441,10 @@ static int dma_info_to_prot(enum dma_data_direction dir, bool coherent, if (attrs & DMA_ATTR_PRIVILEGED) prot |= IOMMU_PRIV; + if (attrs & DMA_ATTR_SYS_CACHE_ONLY) + prot |= IOMMU_SYS_CACHE; + if (attrs & DMA_ATTR_SYS_CACHE_ONLY_NWA) + prot |= IOMMU_SYS_CACHE_NWA; switch (dir) { case DMA_BIDIRECTIONAL: @@ -472,6 +495,8 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true); + trace_android_vh_iommu_iovad_alloc_iova(dev, iovad, (dma_addr_t)iova << shift, size); + return (dma_addr_t)iova << shift; } @@ -490,6 +515,8 @@ static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, else free_iova_fast(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad)); + + trace_android_vh_iommu_iovad_free_iova(iovad, iova, size); } static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 94ff319ae8ac..f2ff2243444d 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -118,10 +118,14 @@ #define ARM_LPAE_MAIR_ATTR_NC 0x44 #define ARM_LPAE_MAIR_ATTR_INC_OWBRWA 0xf4 #define ARM_LPAE_MAIR_ATTR_WBRWA 0xff +#define ARM_LPAE_MAIR_ATTR_INC_OWBRANWA 0xe4ULL +#define ARM_LPAE_MAIR_ATTR_IWBRWA_OWBRANWA 0xefULL #define ARM_LPAE_MAIR_ATTR_IDX_NC 0 #define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1 #define ARM_LPAE_MAIR_ATTR_IDX_DEV 2 #define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE 3 +#define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE_NWA 4 +#define ARM_LPAE_MAIR_ATTR_IDX_ICACHE_OCACHE_NWA 5 #define ARM_MALI_LPAE_TTBR_ADRMODE_TABLE (3u << 0) #define ARM_MALI_LPAE_TTBR_READ_INNER BIT(2) @@ -446,9 +450,19 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, if (prot & IOMMU_MMIO) pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if ((prot & IOMMU_CACHE) && (prot & IOMMU_SYS_CACHE_NWA)) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_ICACHE_OCACHE_NWA + << ARM_LPAE_PTE_ATTRINDX_SHIFT); + /* IOMMU_CACHE + IOMMU_SYS_CACHE equivalent to IOMMU_CACHE */ else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (prot & IOMMU_SYS_CACHE) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (prot & IOMMU_SYS_CACHE_NWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE_NWA + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } /* @@ -904,7 +918,11 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) (ARM_LPAE_MAIR_ATTR_DEVICE << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) | (ARM_LPAE_MAIR_ATTR_INC_OWBRWA - << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE)); + << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE)) | + (ARM_LPAE_MAIR_ATTR_INC_OWBRANWA + << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE_NWA)) | + (ARM_LPAE_MAIR_ATTR_IWBRWA_OWBRANWA + << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_ICACHE_OCACHE_NWA)); cfg->arm_lpae_s1_cfg.mair = reg; diff --git a/drivers/iommu/iommu.c 
b/drivers/iommu/iommu.c index 7f409e9eea4b..6e8249d83e5b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -221,13 +221,23 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_device *iommu_dev; struct iommu_group *group; + static DEFINE_MUTEX(iommu_probe_device_lock); int ret; if (!ops) return -ENODEV; - - if (!dev_iommu_get(dev)) - return -ENOMEM; + /* + * Serialise to avoid races between IOMMU drivers registering in + * parallel and/or the "replay" calls from ACPI/OF code via client + * driver probe. Once the latter have been cleaned up we should + * probably be able to use device_lock() here to minimise the scope, + * but for now enforcing a simple global ordering is fine. + */ + mutex_lock(&iommu_probe_device_lock); + if (!dev_iommu_get(dev)) { + ret = -ENOMEM; + goto err_unlock; + } if (!try_module_get(ops->owner)) { ret = -EINVAL; @@ -247,11 +257,14 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list ret = PTR_ERR(group); goto out_release; } - iommu_group_put(group); + mutex_lock(&group->mutex); if (group_list && !group->default_domain && list_empty(&group->entry)) list_add_tail(&group->entry, group_list); + mutex_unlock(&group->mutex); + iommu_group_put(group); + mutex_unlock(&iommu_probe_device_lock); iommu_device_link(iommu_dev, dev); return 0; @@ -265,6 +278,9 @@ out_module_put: err_free: dev_iommu_free(dev); +err_unlock: + mutex_unlock(&iommu_probe_device_lock); + return ret; } @@ -1807,11 +1823,11 @@ int bus_iommu_probe(struct bus_type *bus) return ret; list_for_each_entry_safe(group, next, &group_list, entry) { + mutex_lock(&group->mutex); + /* Remove item from the list */ list_del_init(&group->entry); - mutex_lock(&group->mutex); - /* Try to allocate default domain */ probe_alloc_default_domain(bus, group); @@ -2666,6 +2682,18 @@ static ssize_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, unsigned int i = 0; int ret; + if (ops->map_sg) { + ret = ops->map_sg(domain, iova, sg, nents, prot, gfp, &mapped); + + if (ops->iotlb_sync_map) + ops->iotlb_sync_map(domain, iova, mapped); + + if (ret) + goto out_err; + + return mapped; + } + while (i <= nents) { phys_addr_t s_phys = sg_phys(sg); diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 0835f32e040a..20867c6796af 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -5,6 +5,7 @@ * Author: Anil S Keshavamurthy */ +#include #include #include #include @@ -15,6 +16,9 @@ /* The anchor node sits above the top of the usable address space */ #define IOVA_ANCHOR ~0UL +#define IOMMU_DEFAULT_IOVA_MAX_ALIGN_SHIFT 9 +static unsigned long iommu_max_align_shift __read_mostly = IOMMU_DEFAULT_IOVA_MAX_ALIGN_SHIFT; + static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn, unsigned long size); @@ -27,6 +31,29 @@ static void free_iova_rcaches(struct iova_domain *iovad); static void fq_destroy_all_entries(struct iova_domain *iovad); static void fq_flush_timeout(struct timer_list *t); +static unsigned long limit_align_shift(struct iova_domain *iovad, unsigned long shift) +{ + unsigned long max_align_shift; + + max_align_shift = iommu_max_align_shift + PAGE_SHIFT - iova_shift(iovad); + return min_t(unsigned long, max_align_shift, shift); +} + +#ifndef MODULE +static int __init iommu_set_def_max_align_shift(char *str) +{ + unsigned long max_align_shift; + + int ret = kstrtoul(str, 10, &max_align_shift); + + if (!ret) + iommu_max_align_shift = 
max_align_shift; + + return 0; +} +early_param("iommu.max_align_shift", iommu_set_def_max_align_shift); +#endif + static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node) { struct iova_domain *iovad; @@ -69,6 +96,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node); rb_insert_color(&iovad->anchor.node, &iovad->rbroot); cpuhp_state_add_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, &iovad->cpuhp_dead); + iovad->best_fit = false; init_iova_rcaches(iovad); } EXPORT_SYMBOL_GPL(init_iova_domain); @@ -242,7 +270,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn; if (size_aligned) - align_mask <<= fls_long(size - 1); + align_mask <<= limit_align_shift(iovad, fls_long(size - 1)); /* Walk the tree backwards */ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); @@ -291,6 +319,70 @@ iova32_full: return -ENOMEM; } +static int __alloc_and_insert_iova_best_fit(struct iova_domain *iovad, + unsigned long size, + unsigned long limit_pfn, + struct iova *new, bool size_aligned) +{ + struct rb_node *curr, *prev; + struct iova *curr_iova, *prev_iova; + unsigned long flags; + unsigned long align_mask = ~0UL; + struct rb_node *candidate_rb_parent; + unsigned long new_pfn, candidate_pfn = ~0UL; + unsigned long gap, candidate_gap = ~0UL; + + if (size_aligned) + align_mask <<= limit_align_shift(iovad, fls_long(size - 1)); + + /* Walk the tree backwards */ + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + curr = &iovad->anchor.node; + prev = rb_prev(curr); + for (; prev; curr = prev, prev = rb_prev(curr)) { + curr_iova = rb_entry(curr, struct iova, node); + prev_iova = rb_entry(prev, struct iova, node); + + limit_pfn = min(limit_pfn, curr_iova->pfn_lo); + new_pfn = (limit_pfn - size) & align_mask; + gap = curr_iova->pfn_lo - prev_iova->pfn_hi - 1; + if ((limit_pfn >= size) && (new_pfn > prev_iova->pfn_hi) + && (gap < candidate_gap)) { + candidate_gap = gap; + candidate_pfn = new_pfn; + candidate_rb_parent = curr; + if (gap == size) + goto insert; + } + } + + curr_iova = rb_entry(curr, struct iova, node); + limit_pfn = min(limit_pfn, curr_iova->pfn_lo); + new_pfn = (limit_pfn - size) & align_mask; + gap = curr_iova->pfn_lo - iovad->start_pfn; + if (limit_pfn >= size && new_pfn >= iovad->start_pfn && + gap < candidate_gap) { + candidate_gap = gap; + candidate_pfn = new_pfn; + candidate_rb_parent = curr; + } + +insert: + if (candidate_pfn == ~0UL) { + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + return -ENOMEM; + } + + /* pfn_lo will point to size aligned address if size_aligned is set */ + new->pfn_lo = candidate_pfn; + new->pfn_hi = new->pfn_lo + size - 1; + + /* If we have 'prev', it's a valid place to start the insertion. 
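
For the alignment clamp used above, a worked example assuming a 4K IOVA granule and the default iommu_max_align_shift of 9:

	unsigned long shift;

	shift = fls_long((SZ_16M >> 12) - 1);	/* natural shift for 16M: 12 */
	shift = min_t(unsigned long, 9UL + PAGE_SHIFT - 12, shift);
	/* clamped to 9: the 16M allocation is aligned to 2^9 * 4K = 2M, not 16M */
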
*/ + iova_insert_rbtree(&iovad->rbroot, new, candidate_rb_parent); + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + return 0; +} + static struct kmem_cache *iova_cache; static unsigned int iova_cache_users; static DEFINE_MUTEX(iova_cache_mutex); @@ -377,8 +469,13 @@ alloc_iova(struct iova_domain *iovad, unsigned long size, if (!new_iova) return NULL; - ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1, - new_iova, size_aligned); + if (iovad->best_fit) { + ret = __alloc_and_insert_iova_best_fit(iovad, size, + limit_pfn + 1, new_iova, size_aligned); + } else { + ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1, + new_iova, size_aligned); + } if (ret) { free_iova_mem(new_iova); diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 2ae46fa6b3de..f2ad8149fcf4 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -118,6 +118,7 @@ #define WR_THROT_EN BIT(6) #define HAS_LEGACY_IVRP_PADDR BIT(7) #define IOVA_34_EN BIT(8) +#define PGTABLE_PA_35_EN BIT(9) #define MTK_IOMMU_HAS_FLAG(pdata, _x) \ ((((pdata)->flags) & (_x)) == (_x)) @@ -125,6 +126,7 @@ struct mtk_iommu_domain { struct io_pgtable_cfg cfg; struct io_pgtable_ops *iop; + u32 ttbr; struct mtk_iommu_data *data; struct iommu_domain domain; @@ -393,6 +395,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, if (data->m4u_dom) { dom->iop = data->m4u_dom->iop; dom->cfg = data->m4u_dom->cfg; + dom->ttbr = data->m4u_dom->ttbr; dom->domain.pgsize_bitmap = data->m4u_dom->cfg.pgsize_bitmap; goto update_iova_region; } @@ -406,6 +409,9 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, .iommu_dev = data->dev, }; + if (MTK_IOMMU_HAS_FLAG(data->plat_data, PGTABLE_PA_35_EN)) + dom->cfg.quirks |= IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT; + if (MTK_IOMMU_HAS_FLAG(data->plat_data, HAS_4GB_MODE)) dom->cfg.oas = data->enable_4GB ? 33 : 32; else @@ -416,6 +422,9 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, dev_err(data->dev, "Failed to alloc io pgtable\n"); return -EINVAL; } + dom->ttbr = dom->cfg.quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT ? 
+ dom->cfg.arm_v7s_cfg.ttbr : + dom->cfg.arm_v7s_cfg.ttbr & MMU_PT_ADDR_MASK; /* Update our support page sizes bitmap */ dom->domain.pgsize_bitmap = dom->cfg.pgsize_bitmap; @@ -481,8 +490,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, goto err_unlock; } data->m4u_dom = dom; - writel(dom->cfg.arm_v7s_cfg.ttbr & MMU_PT_ADDR_MASK, - data->base + REG_MMU_PT_BASE_ADDR); + writel(data->m4u_dom->ttbr, data->base + REG_MMU_PT_BASE_ADDR); pm_runtime_put(m4udev); } @@ -1025,7 +1033,7 @@ static int __maybe_unused mtk_iommu_runtime_resume(struct device *dev) writel_relaxed(reg->int_main_control, base + REG_MMU_INT_MAIN_CONTROL); writel_relaxed(reg->ivrp_paddr, base + REG_MMU_IVRP_PADDR); writel_relaxed(reg->vld_pa_rng, base + REG_MMU_VLD_PA_RNG); - writel(m4u_dom->cfg.arm_v7s_cfg.ttbr & MMU_PT_ADDR_MASK, base + REG_MMU_PT_BASE_ADDR); + writel(m4u_dom->ttbr, base + REG_MMU_PT_BASE_ADDR); return 0; } @@ -1046,7 +1054,8 @@ static const struct mtk_iommu_plat_data mt2712_data = { static const struct mtk_iommu_plat_data mt6779_data = { .m4u_plat = M4U_MT6779, - .flags = HAS_SUB_COMM | OUT_ORDER_WR_EN | WR_THROT_EN, + .flags = HAS_SUB_COMM | OUT_ORDER_WR_EN | WR_THROT_EN | + PGTABLE_PA_35_EN, .inv_sel_reg = REG_MMU_INV_SEL_GEN2, .iova_region = single_domain, .iova_region_nr = ARRAY_SIZE(single_domain), diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index ae1b9f59abc5..1ae1fc60f73a 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -401,8 +401,9 @@ config IRQ_UNIPHIER_AIDET Support for the UniPhier AIDET (ARM Interrupt Detector). config MESON_IRQ_GPIO - bool "Meson GPIO Interrupt Multiplexer" - depends on ARCH_MESON + tristate "Meson GPIO Interrupt Multiplexer" + depends on ARCH_MESON || COMPILE_TEST + default ARCH_MESON select IRQ_DOMAIN_HIERARCHY help Support Meson SoC Family GPIO Interrupt Multiplexer diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 9507989bf2e1..bbee60844ee7 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -18,6 +18,10 @@ #include #include #include +#include +#include +#include + #include #include @@ -29,6 +33,8 @@ #include #include +#include + #include "irq-gic-common.h" #define GICD_INT_NMI_PRI (GICD_INT_DEF_PRI & ~0x80) @@ -44,20 +50,6 @@ struct redist_region { bool single_redist; }; -struct gic_chip_data { - struct fwnode_handle *fwnode; - void __iomem *dist_base; - struct redist_region *redist_regions; - struct rdists rdists; - struct irq_domain *domain; - u64 redist_stride; - u32 nr_redist_regions; - u64 flags; - bool has_rss; - unsigned int ppi_nr; - struct partition_desc **ppi_descs; -}; - static struct gic_chip_data gic_data __read_mostly; static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key); @@ -756,6 +748,7 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs if (handle_domain_irq(gic_data.domain, irqnr, regs)) { WARN_ONCE(true, "Unexpected interrupt received!\n"); + log_abnormal_wakeup_reason("unexpected HW IRQ %u", irqnr); gic_deactivate_unhandled(irqnr); } } @@ -850,11 +843,15 @@ static void __init gic_dist_init(void) * enabled. 
*/ affinity = gic_mpidr_to_affinity(cpu_logical_map(smp_processor_id())); - for (i = 32; i < GIC_LINE_NR; i++) + for (i = 32; i < GIC_LINE_NR; i++) { + trace_android_vh_gic_v3_affinity_init(i, GICD_IROUTER, &affinity); gic_write_irouter(affinity, base + GICD_IROUTER + i * 8); + } - for (i = 0; i < GIC_ESPI_NR; i++) + for (i = 0; i < GIC_ESPI_NR; i++) { + trace_android_vh_gic_v3_affinity_init(i, GICD_IROUTERnE, &affinity); gic_write_irouter(affinity, base + GICD_IROUTERnE + i * 8); + } } static int gic_iterate_rdists(int (*fn)(struct redist_region *, void __iomem *)) @@ -1302,6 +1299,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, reg = gic_dist_base(d) + offset + (index * 8); val = gic_mpidr_to_affinity(cpu_logical_map(cpu)); + trace_android_rvh_gic_v3_set_affinity(d, mask_val, &val, force, gic_dist_base(d), + gic_data.redist_regions[0].redist_base, + gic_data.redist_stride); gic_write_irouter(val, reg); /* @@ -1356,6 +1356,28 @@ static void gic_cpu_pm_init(void) static inline void gic_cpu_pm_init(void) { } #endif /* CONFIG_CPU_PM */ +#ifdef CONFIG_PM +void gic_resume(void) +{ + trace_android_vh_gic_resume(&gic_data); +} +EXPORT_SYMBOL_GPL(gic_resume); + +static struct syscore_ops gic_syscore_ops = { + .resume = gic_resume, +}; + +static void gic_syscore_init(void) +{ + register_syscore_ops(&gic_syscore_ops); +} + +#else +static inline void gic_syscore_init(void) { } +void gic_resume(void) { } +#endif + + static struct irq_chip gic_chip = { .name = "GICv3", .irq_mask = gic_mask_irq, @@ -1846,6 +1868,7 @@ static int __init gic_init_bases(void __iomem *dist_base, gic_cpu_init(); gic_smp_init(); gic_cpu_pm_init(); + gic_syscore_init(); if (gic_dist_supports_lpis()) { its_init(handle, &gic_data.rdists, gic_data.domain); diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 4ea71b28f9f5..a6277dea4c7a 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -94,7 +94,7 @@ bool gic_cpuif_has_vsgi(void) { unsigned long fld, reg = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64PFR0_GIC_SHIFT); + fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64PFR0_EL1_GIC_SHIFT); return fld >= 0x3; } diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 99077f30f699..a02bdad09318 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -815,6 +816,8 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, writeb_relaxed(gic_cpu_map[cpu], reg); irq_data_update_effective_affinity(d, cpumask_of(cpu)); + trace_android_vh_gic_set_affinity(d, mask_val, force, gic_cpu_map, reg); + return IRQ_SET_MASK_OK_DONE; } diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index e50676ce2ec8..cfc5cf5224cb 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -15,6 +15,7 @@ #include #include #include +#include #define NUM_CHANNEL 8 #define MAX_INPUT_MUX 256 @@ -136,6 +137,7 @@ static const struct of_device_id meson_irq_gpio_matches[] = { struct meson_gpio_irq_controller { const struct meson_gpio_irq_params *params; void __iomem *base; + struct irq_domain *domain; u32 channel_irqs[NUM_CHANNEL]; DECLARE_BITMAP(channel_map, NUM_CHANNEL); spinlock_t lock; @@ -436,8 +438,8 @@ static const struct irq_domain_ops meson_gpio_irq_domain_ops = { .translate = meson_gpio_irq_domain_translate, }; -static int 
__init meson_gpio_irq_parse_dt(struct device_node *node, - struct meson_gpio_irq_controller *ctl) +static int meson_gpio_irq_parse_dt(struct device_node *node, + struct meson_gpio_irq_controller *ctl) { const struct of_device_id *match; int ret; @@ -463,63 +465,84 @@ static int __init meson_gpio_irq_parse_dt(struct device_node *node, return 0; } -static int __init meson_gpio_irq_of_init(struct device_node *node, - struct device_node *parent) +static int meson_gpio_intc_probe(struct platform_device *pdev) { - struct irq_domain *domain, *parent_domain; + struct device_node *node = pdev->dev.of_node, *parent; struct meson_gpio_irq_controller *ctl; + struct irq_domain *parent_domain; + struct resource *res; int ret; + parent = of_irq_find_parent(node); if (!parent) { - pr_err("missing parent interrupt node\n"); + dev_err(&pdev->dev, "missing parent interrupt node\n"); return -ENODEV; } parent_domain = irq_find_host(parent); if (!parent_domain) { - pr_err("unable to obtain parent domain\n"); + dev_err(&pdev->dev, "unable to obtain parent domain\n"); return -ENXIO; } - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + ctl = devm_kzalloc(&pdev->dev, sizeof(*ctl), GFP_KERNEL); if (!ctl) return -ENOMEM; spin_lock_init(&ctl->lock); - ctl->base = of_iomap(node, 0); - if (!ctl->base) { - ret = -ENOMEM; - goto free_ctl; - } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ctl->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ctl->base)) + return PTR_ERR(ctl->base); ret = meson_gpio_irq_parse_dt(node, ctl); if (ret) - goto free_channel_irqs; + return ret; - domain = irq_domain_create_hierarchy(parent_domain, 0, - ctl->params->nr_hwirq, - of_node_to_fwnode(node), - &meson_gpio_irq_domain_ops, - ctl); - if (!domain) { - pr_err("failed to add domain\n"); - ret = -ENODEV; - goto free_channel_irqs; + ctl->domain = irq_domain_create_hierarchy(parent_domain, 0, + ctl->params->nr_hwirq, + of_node_to_fwnode(node), + &meson_gpio_irq_domain_ops, + ctl); + if (!ctl->domain) { + dev_err(&pdev->dev, "failed to add domain\n"); + return -ENODEV; } - pr_info("%d to %d gpio interrupt mux initialized\n", - ctl->params->nr_hwirq, NUM_CHANNEL); + platform_set_drvdata(pdev, ctl); + + dev_info(&pdev->dev, "%d to %d gpio interrupt mux initialized\n", + ctl->params->nr_hwirq, NUM_CHANNEL); return 0; - -free_channel_irqs: - iounmap(ctl->base); -free_ctl: - kfree(ctl); - - return ret; } -IRQCHIP_DECLARE(meson_gpio_intc, "amlogic,meson-gpio-intc", - meson_gpio_irq_of_init); +static int meson_gpio_intc_remove(struct platform_device *pdev) +{ + struct meson_gpio_irq_controller *ctl = platform_get_drvdata(pdev); + + irq_domain_remove(ctl->domain); + + return 0; +} + +static const struct of_device_id meson_gpio_intc_of_match[] = { + { .compatible = "amlogic,meson-gpio-intc", }, + {}, +}; +MODULE_DEVICE_TABLE(of, meson_gpio_intc_of_match); + +static struct platform_driver meson_gpio_intc_driver = { + .probe = meson_gpio_intc_probe, + .remove = meson_gpio_intc_remove, + .driver = { + .name = "meson-gpio-intc", + .of_match_table = meson_gpio_intc_of_match, + }, +}; +module_platform_driver(meson_gpio_intc_driver); + +MODULE_AUTHOR("Jerome Brunet "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:meson-gpio-intc"); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index f45fb372e51b..0ced36aee890 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -289,6 +289,27 @@ config DM_CRYPT If unsure, say N. 
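The probe routine above drops the old iounmap()/kfree() unwind labels by switching to device-managed helpers. A minimal sketch of that pattern, under the assumption of a generic platform device (the function name is illustrative; platform_get_resource() and devm_ioremap_resource() are the same calls the converted driver now uses):

#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

/*
 * Illustrative probe: devm_-managed resources are released by the
 * driver core on probe failure or device removal, so error paths
 * reduce to plain returns.
 */
static int demo_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))
		return PTR_ERR(base);	/* nothing to unmap by hand */

	return 0;
}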
+config DM_DEFAULT_KEY + tristate "Default-key target support" + depends on BLK_DEV_DM + depends on BLK_INLINE_ENCRYPTION + # dm-default-key doesn't require -o inlinecrypt, but it does currently + # rely on the inline encryption hooks being built into the kernel. + depends on FS_ENCRYPTION_INLINE_CRYPT + help + This device-mapper target allows you to create a device that + assigns a default encryption key to bios that aren't for the + contents of an encrypted file. + + This ensures that all blocks on-disk will be encrypted with + some key, without the performance hit of file contents being + encrypted twice when fscrypt (File-Based Encryption) is used. + + It is only appropriate to use dm-default-key when key + configuration is tightly controlled, like it is in Android, + such that all fscrypt keys are at least as hard to compromise + as the default key. + config DM_SNAPSHOT tristate "Snapshot target" depends on BLK_DEV_DM @@ -642,4 +663,30 @@ config DM_ZONED If unsure, say N. +config DM_BOW + tristate "Backup block device" + depends on BLK_DEV_DM + select DM_BUFIO + help + This device-mapper target takes a device and keeps a log of all + changes using free blocks identified by issuing a trim command. + This can then be restored by running a command line utility, + or committed by simply replacing the target. + + If unsure, say N. + +config DM_USER + tristate "Block device in userspace" + depends on BLK_DEV_DM + default y + help + This device-mapper target allows a userspace daemon to provide the + contents of a block device. See + for more information. + + To compile this code as a module, choose M here: the module will be + called dm-user. + + If unsure, say N. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 816945eeed7f..7890bcc727f5 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o +obj-$(CONFIG_DM_DEFAULT_KEY) += dm-default-key.o obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_DUST) += dm-dust.o obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o @@ -83,6 +84,8 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o obj-$(CONFIG_DM_ZONED) += dm-zoned.o obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o +obj-$(CONFIG_DM_BOW) += dm-bow.o +obj-$(CONFIG_DM_USER) += dm-user.o ifeq ($(CONFIG_DM_INIT),y) dm-mod-objs += dm-init.o diff --git a/drivers/md/dm-bow.c b/drivers/md/dm-bow.c new file mode 100644 index 000000000000..728dd25675de --- /dev/null +++ b/drivers/md/dm-bow.c @@ -0,0 +1,1309 @@ +/* + * Copyright (C) 2018 Google Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-core.h" + +#include +#include +#include + +#define DM_MSG_PREFIX "bow" + +struct log_entry { + u64 source; + u64 dest; + u32 size; + u32 checksum; +} __packed; + +struct log_sector { + u32 magic; + u16 header_version; + u16 header_size; + u32 block_size; + u32 count; + u32 sequence; + sector_t sector0; + struct log_entry entries[]; +} __packed; + +/* + * MAGIC is BOW in ascii + */ +#define MAGIC 0x00574f42 +#define HEADER_VERSION 0x0100 + +/* + * A sorted set of ranges representing the state of the data on the device. + * Use an rb_tree for fast lookup of a given sector + * Consecutive ranges are always of different type - operations on this + * set must merge matching consecutive ranges. 
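dm-bow is driven from userspace through the sysfs node registered later in this file (states TRIM = 0, CHECKPOINT = 1, COMMITTED = 2, and each write must advance the state by exactly one). A hedged sketch of that sequence; the path follows from the "bow" kobject being added under the dm disk, but the exact device name is an assumption:

#include <fcntl.h>
#include <unistd.h>

/*
 * Hypothetical userspace helper: advance the bow state machine by
 * writing the next state digit to e.g. /sys/block/dm-0/bow/state
 * ("1" takes the checkpoint, "2" commits; restoring a checkpoint is
 * done offline by a separate command-line utility).
 */
static int bow_set_state(const char *state_path, char next_state)
{
	int fd = open(state_path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, &next_state, 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}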
+ * + * Top range is always of type TOP + */ +struct bow_range { + struct rb_node node; + sector_t sector; + enum { + INVALID, /* Type not set */ + SECTOR0, /* First sector - holds log record */ + SECTOR0_CURRENT,/* Live contents of sector0 */ + UNCHANGED, /* Original contents */ + TRIMMED, /* Range has been trimmed */ + CHANGED, /* Range has been changed */ + BACKUP, /* Range is being used as a backup */ + TOP, /* Final range - sector is size of device */ + } type; + struct list_head trimmed_list; /* list of TRIMMED ranges */ +}; + +static const char * const readable_type[] = { + "Invalid", + "Sector0", + "Sector0_current", + "Unchanged", + "Free", + "Changed", + "Backup", + "Top", +}; + +enum state { + TRIM, + CHECKPOINT, + COMMITTED, +}; + +struct bow_context { + struct dm_dev *dev; + u32 block_size; + u32 block_shift; + struct workqueue_struct *workqueue; + struct dm_bufio_client *bufio; + struct mutex ranges_lock; /* Hold to access this struct and/or ranges */ + struct rb_root ranges; + struct dm_kobject_holder kobj_holder; /* for sysfs attributes */ + atomic_t state; /* One of the enum state values above */ + u64 trims_total; + struct log_sector *log_sector; + struct list_head trimmed_list; + bool forward_trims; +}; + +sector_t range_top(struct bow_range *br) +{ + return container_of(rb_next(&br->node), struct bow_range, node) + ->sector; +} + +u64 range_size(struct bow_range *br) +{ + return (range_top(br) - br->sector) * SECTOR_SIZE; +} + +static sector_t bvec_top(struct bvec_iter *bi_iter) +{ + return bi_iter->bi_sector + bi_iter->bi_size / SECTOR_SIZE; +} + +/* + * Find the first range that overlaps with bi_iter + * bi_iter is set to the size of the overlapping sub-range + */ +static struct bow_range *find_first_overlapping_range(struct rb_root *ranges, + struct bvec_iter *bi_iter) +{ + struct rb_node *node = ranges->rb_node; + struct bow_range *br; + + while (node) { + br = container_of(node, struct bow_range, node); + + if (br->sector <= bi_iter->bi_sector + && bi_iter->bi_sector < range_top(br)) + break; + + if (bi_iter->bi_sector < br->sector) + node = node->rb_left; + else + node = node->rb_right; + } + + WARN_ON(!node); + if (!node) + return NULL; + + if (range_top(br) - bi_iter->bi_sector + < bi_iter->bi_size >> SECTOR_SHIFT) + bi_iter->bi_size = (range_top(br) - bi_iter->bi_sector) + << SECTOR_SHIFT; + + return br; +} + +void add_before(struct rb_root *ranges, struct bow_range *new_br, + struct bow_range *existing) +{ + struct rb_node *parent = &(existing->node); + struct rb_node **link = &(parent->rb_left); + + while (*link) { + parent = *link; + link = &((*link)->rb_right); + } + + rb_link_node(&new_br->node, parent, link); + rb_insert_color(&new_br->node, ranges); +} + +/* + * Given a range br returned by find_first_overlapping_range, split br into a + * leading range, a range matching the bi_iter and a trailing range. + * Leading and trailing may end up size 0 and will then be deleted. The + * new range matching the bi_iter is then returned and should have its type + * and type specific fields populated. 
+ * If bi_iter runs off the end of the range, bi_iter is truncated accordingly + */ +static int split_range(struct bow_context *bc, struct bow_range **br, + struct bvec_iter *bi_iter) +{ + struct bow_range *new_br; + + if (bi_iter->bi_sector < (*br)->sector) { + WARN_ON(true); + return BLK_STS_IOERR; + } + + if (bi_iter->bi_sector > (*br)->sector) { + struct bow_range *leading_br = + kzalloc(sizeof(*leading_br), GFP_KERNEL); + + if (!leading_br) + return BLK_STS_RESOURCE; + + *leading_br = **br; + if (leading_br->type == TRIMMED) + list_add(&leading_br->trimmed_list, &bc->trimmed_list); + + add_before(&bc->ranges, leading_br, *br); + (*br)->sector = bi_iter->bi_sector; + } + + if (bvec_top(bi_iter) >= range_top(*br)) { + bi_iter->bi_size = (range_top(*br) - (*br)->sector) + * SECTOR_SIZE; + return BLK_STS_OK; + } + + /* new_br will be the beginning, existing br will be the tail */ + new_br = kzalloc(sizeof(*new_br), GFP_KERNEL); + if (!new_br) + return BLK_STS_RESOURCE; + + new_br->sector = (*br)->sector; + (*br)->sector = bvec_top(bi_iter); + add_before(&bc->ranges, new_br, *br); + *br = new_br; + + return BLK_STS_OK; +} + +/* + * Sets type of a range. May merge range into surrounding ranges + * Since br may be invalidated, always sets br to NULL to prevent + * usage after this is called + */ +static void set_type(struct bow_context *bc, struct bow_range **br, int type) +{ + struct bow_range *prev = container_of(rb_prev(&(*br)->node), + struct bow_range, node); + struct bow_range *next = container_of(rb_next(&(*br)->node), + struct bow_range, node); + + if ((*br)->type == TRIMMED) { + bc->trims_total -= range_size(*br); + list_del(&(*br)->trimmed_list); + } + + if (type == TRIMMED) { + bc->trims_total += range_size(*br); + list_add(&(*br)->trimmed_list, &bc->trimmed_list); + } + + (*br)->type = type; + + if (next->type == type) { + if (type == TRIMMED) + list_del(&next->trimmed_list); + rb_erase(&next->node, &bc->ranges); + kfree(next); + } + + if (prev->type == type) { + if (type == TRIMMED) + list_del(&(*br)->trimmed_list); + rb_erase(&(*br)->node, &bc->ranges); + kfree(*br); + } + + *br = NULL; +} + +static struct bow_range *find_free_range(struct bow_context *bc) +{ + if (list_empty(&bc->trimmed_list)) { + DMERR("Unable to find free space to back up to"); + return NULL; + } + + return list_first_entry(&bc->trimmed_list, struct bow_range, + trimmed_list); +} + +static sector_t sector_to_page(struct bow_context const *bc, sector_t sector) +{ + WARN_ON((sector & (((sector_t)1 << (bc->block_shift - SECTOR_SHIFT)) - 1)) + != 0); + return sector >> (bc->block_shift - SECTOR_SHIFT); +} + +static int copy_data(struct bow_context const *bc, + struct bow_range *source, struct bow_range *dest, + u32 *checksum) +{ + int i; + + if (range_size(source) != range_size(dest)) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + if (checksum) + *checksum = sector_to_page(bc, source->sector); + + for (i = 0; i < range_size(source) >> bc->block_shift; ++i) { + struct dm_buffer *read_buffer, *write_buffer; + u8 *read, *write; + sector_t page = sector_to_page(bc, source->sector) + i; + + read = dm_bufio_read(bc->bufio, page, &read_buffer); + if (IS_ERR(read)) { + DMERR("Cannot read page %llu", + (unsigned long long)page); + return PTR_ERR(read); + } + + if (checksum) + *checksum = crc32(*checksum, read, bc->block_size); + + write = dm_bufio_new(bc->bufio, + sector_to_page(bc, dest->sector) + i, + &write_buffer); + if (IS_ERR(write)) { + DMERR("Cannot write sector"); + dm_bufio_release(read_buffer); + return 
PTR_ERR(write);
+		}
+
+		memcpy(write, read, bc->block_size);
+
+		dm_bufio_mark_buffer_dirty(write_buffer);
+		dm_bufio_release(write_buffer);
+		dm_bufio_release(read_buffer);
+	}
+
+	dm_bufio_write_dirty_buffers(bc->bufio);
+	return BLK_STS_OK;
+}
+
+/****** logging functions ******/
+
+static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest,
+			 unsigned int size, u32 checksum);
+
+static int backup_log_sector(struct bow_context *bc)
+{
+	struct bow_range *first_br, *free_br;
+	struct bvec_iter bi_iter;
+	u32 checksum = 0;
+	int ret;
+
+	first_br = container_of(rb_first(&bc->ranges), struct bow_range, node);
+
+	if (first_br->type != SECTOR0) {
+		WARN_ON(1);
+		return BLK_STS_IOERR;
+	}
+
+	if (range_size(first_br) != bc->block_size) {
+		WARN_ON(1);
+		return BLK_STS_IOERR;
+	}
+
+	free_br = find_free_range(bc);
+	/* No space left - return this error to userspace */
+	if (!free_br)
+		return BLK_STS_NOSPC;
+	bi_iter.bi_sector = free_br->sector;
+	bi_iter.bi_size = bc->block_size;
+	ret = split_range(bc, &free_br, &bi_iter);
+	if (ret)
+		return ret;
+	if (bi_iter.bi_size != bc->block_size) {
+		WARN_ON(1);
+		return BLK_STS_IOERR;
+	}
+
+	ret = copy_data(bc, first_br, free_br, &checksum);
+	if (ret)
+		return ret;
+
+	bc->log_sector->count = 0;
+	bc->log_sector->sequence++;
+	ret = add_log_entry(bc, first_br->sector, free_br->sector,
+			    range_size(first_br), checksum);
+	if (ret)
+		return ret;
+
+	set_type(bc, &free_br, BACKUP);
+	return BLK_STS_OK;
+}
+
+static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest,
+			 unsigned int size, u32 checksum)
+{
+	struct dm_buffer *sector_buffer;
+	u8 *sector;
+
+	if (sizeof(struct log_sector) +
+	    sizeof(struct log_entry) * (bc->log_sector->count + 1)
+	    > bc->block_size) {
+		int ret = backup_log_sector(bc);
+
+		if (ret)
+			return ret;
+	}
+
+	sector = dm_bufio_new(bc->bufio, 0, &sector_buffer);
+	if (IS_ERR(sector)) {
+		DMERR("Cannot write boot sector");
+		dm_bufio_release(sector_buffer);
+		return BLK_STS_NOSPC;
+	}
+
+	bc->log_sector->entries[bc->log_sector->count].source = source;
+	bc->log_sector->entries[bc->log_sector->count].dest = dest;
+	bc->log_sector->entries[bc->log_sector->count].size = size;
+	bc->log_sector->entries[bc->log_sector->count].checksum = checksum;
+	bc->log_sector->count++;
+
+	memcpy(sector, bc->log_sector, bc->block_size);
+	dm_bufio_mark_buffer_dirty(sector_buffer);
+	dm_bufio_release(sector_buffer);
+	dm_bufio_write_dirty_buffers(bc->bufio);
+	return BLK_STS_OK;
+}
+
+static int prepare_log(struct bow_context *bc)
+{
+	struct bow_range *free_br, *first_br;
+	struct bvec_iter bi_iter;
+	u32 checksum = 0;
+	int ret;
+
+	/* Carve out first sector as log sector */
+	first_br = container_of(rb_first(&bc->ranges), struct bow_range, node);
+	if (first_br->type != UNCHANGED) {
+		WARN_ON(1);
+		return BLK_STS_IOERR;
+	}
+
+	if (range_size(first_br) < bc->block_size) {
+		WARN_ON(1);
+		return BLK_STS_IOERR;
+	}
+	bi_iter.bi_sector = 0;
+	bi_iter.bi_size = bc->block_size;
+	ret = split_range(bc, &first_br, &bi_iter);
+	if (ret)
+		return ret;
+	first_br->type = SECTOR0;
+	if (range_size(first_br) != bc->block_size) {
+		WARN_ON(1);
+		return BLK_STS_IOERR;
+	}
+
+	/* Find free sector for active sector0 reads/writes */
+	free_br = find_free_range(bc);
+	if (!free_br)
+		return BLK_STS_NOSPC;
+	bi_iter.bi_sector = free_br->sector;
+	bi_iter.bi_size = bc->block_size;
+	ret = split_range(bc, &free_br, &bi_iter);
+	if (ret)
+		return ret;
+
+	/* Copy data */
+	ret = copy_data(bc, first_br, free_br, NULL);
+	if
(ret) + return ret; + + bc->log_sector->sector0 = free_br->sector; + + set_type(bc, &free_br, SECTOR0_CURRENT); + + /* Find free sector to back up original sector zero */ + free_br = find_free_range(bc); + if (!free_br) + return BLK_STS_NOSPC; + bi_iter.bi_sector = free_br->sector; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &free_br, &bi_iter); + if (ret) + return ret; + + /* Back up */ + ret = copy_data(bc, first_br, free_br, &checksum); + if (ret) + return ret; + + /* + * Set up our replacement boot sector - it will get written when we + * add the first log entry, which we do immediately + */ + bc->log_sector->magic = MAGIC; + bc->log_sector->header_version = HEADER_VERSION; + bc->log_sector->header_size = sizeof(*bc->log_sector); + bc->log_sector->block_size = bc->block_size; + bc->log_sector->count = 0; + bc->log_sector->sequence = 0; + + /* Add log entry */ + ret = add_log_entry(bc, first_br->sector, free_br->sector, + range_size(first_br), checksum); + if (ret) + return ret; + + set_type(bc, &free_br, BACKUP); + return BLK_STS_OK; +} + +static struct bow_range *find_sector0_current(struct bow_context *bc) +{ + struct bvec_iter bi_iter; + + bi_iter.bi_sector = bc->log_sector->sector0; + bi_iter.bi_size = bc->block_size; + return find_first_overlapping_range(&bc->ranges, &bi_iter); +} + +/****** sysfs interface functions ******/ + +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&bc->state)); +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + enum state state, original_state; + int ret; + + state = buf[0] - '0'; + if (state < TRIM || state > COMMITTED) { + DMERR("State value %d out of range", state); + return -EINVAL; + } + + mutex_lock(&bc->ranges_lock); + original_state = atomic_read(&bc->state); + if (state != original_state + 1) { + DMERR("Invalid state change from %d to %d", + original_state, state); + ret = -EINVAL; + goto bad; + } + + DMINFO("Switching to state %s", state == CHECKPOINT ? "Checkpoint" + : state == COMMITTED ? 
"Committed" : "Unknown"); + + if (state == CHECKPOINT) { + ret = prepare_log(bc); + if (ret) { + DMERR("Failed to switch to checkpoint state"); + goto bad; + } + } else if (state == COMMITTED) { + struct bow_range *br = find_sector0_current(bc); + struct bow_range *sector0_br = + container_of(rb_first(&bc->ranges), struct bow_range, + node); + + ret = copy_data(bc, br, sector0_br, 0); + if (ret) { + DMERR("Failed to switch to committed state"); + goto bad; + } + } + atomic_inc(&bc->state); + ret = count; + +bad: + mutex_unlock(&bc->ranges_lock); + return ret; +} + +static ssize_t free_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + u64 trims_total; + + mutex_lock(&bc->ranges_lock); + trims_total = bc->trims_total; + mutex_unlock(&bc->ranges_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", trims_total); +} + +static struct kobj_attribute attr_state = __ATTR_RW(state); +static struct kobj_attribute attr_free = __ATTR_RO(free); + +static struct attribute *bow_attrs[] = { + &attr_state.attr, + &attr_free.attr, + NULL +}; + +static struct kobj_type bow_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = bow_attrs, + .release = dm_kobject_release +}; + +/****** constructor/destructor ******/ + +static void dm_bow_dtr(struct dm_target *ti) +{ + struct bow_context *bc = (struct bow_context *) ti->private; + struct kobject *kobj; + + if (bc->workqueue) + destroy_workqueue(bc->workqueue); + if (bc->bufio) + dm_bufio_client_destroy(bc->bufio); + + kobj = &bc->kobj_holder.kobj; + if (kobj->state_initialized) { + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); + } + + while (rb_first(&bc->ranges)) { + struct bow_range *br = container_of(rb_first(&bc->ranges), + struct bow_range, node); + + rb_erase(&br->node, &bc->ranges); + kfree(br); + } + + mutex_destroy(&bc->ranges_lock); + kfree(bc->log_sector); + kfree(bc); +} + +static void dm_bow_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct bow_context *bc = ti->private; + const unsigned int block_size = bc->block_size; + + limits->logical_block_size = + max_t(unsigned int, limits->logical_block_size, block_size); + limits->physical_block_size = + max_t(unsigned int, limits->physical_block_size, block_size); + limits->io_min = max_t(unsigned int, limits->io_min, block_size); + + if (limits->max_discard_sectors == 0) { + limits->discard_granularity = 1 << 12; + limits->max_hw_discard_sectors = 1 << 15; + limits->max_discard_sectors = 1 << 15; + bc->forward_trims = false; + } else { + limits->discard_granularity = 1 << 12; + bc->forward_trims = true; + } +} + +static int dm_bow_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct bow_context *bc = ti->private; + struct dm_arg_set as; + static const struct dm_arg _args[] = { + {0, 1, "Invalid number of feature args"}, + }; + unsigned int opt_params; + const char *opt_string; + int err; + char dummy; + + as.argc = argc; + as.argv = argv; + + err = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (err) + return err; + + while (opt_params--) { + opt_string = dm_shift_arg(&as); + if (!opt_string) { + ti->error = "Not enough feature arguments"; + return -EINVAL; + } + + if (sscanf(opt_string, "block_size:%u%c", + &bc->block_size, &dummy) == 1) { + if (bc->block_size < SECTOR_SIZE || + bc->block_size > 4096 || + !is_power_of_2(bc->block_size)) { + ti->error = "Invalid block_size"; + return -EINVAL; + 
} + } else { + ti->error = "Invalid feature arguments"; + return -EINVAL; + } + } + + return 0; +} + +static int dm_bow_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct bow_context *bc; + struct bow_range *br; + int ret; + + if (argc < 1) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + bc = kzalloc(sizeof(*bc), GFP_KERNEL); + if (!bc) { + ti->error = "Cannot allocate bow context"; + return -ENOMEM; + } + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_write_same_bios = 1; + ti->private = bc; + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &bc->dev); + if (ret) { + ti->error = "Device lookup failed"; + goto bad; + } + + bc->block_size = + bdev_get_queue(bc->dev->bdev)->limits.logical_block_size; + if (argc > 1) { + ret = dm_bow_ctr_optional(ti, argc - 1, &argv[1]); + if (ret) + goto bad; + } + + bc->block_shift = ilog2(bc->block_size); + bc->log_sector = kzalloc(bc->block_size, GFP_KERNEL); + if (!bc->log_sector) { + ti->error = "Cannot allocate log sector"; + goto bad; + } + + init_completion(&bc->kobj_holder.completion); + mutex_init(&bc->ranges_lock); + bc->ranges = RB_ROOT; + bc->bufio = dm_bufio_client_create(bc->dev->bdev, bc->block_size, 1, 0, + NULL, NULL); + if (IS_ERR(bc->bufio)) { + ti->error = "Cannot initialize dm-bufio"; + ret = PTR_ERR(bc->bufio); + bc->bufio = NULL; + goto bad; + } + + bc->workqueue = alloc_workqueue("dm-bow", + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM + | WQ_UNBOUND, num_online_cpus()); + if (!bc->workqueue) { + ti->error = "Cannot allocate workqueue"; + ret = -ENOMEM; + goto bad; + } + + INIT_LIST_HEAD(&bc->trimmed_list); + + br = kzalloc(sizeof(*br), GFP_KERNEL); + if (!br) { + ti->error = "Cannot allocate ranges"; + ret = -ENOMEM; + goto bad; + } + + br->sector = ti->len; + br->type = TOP; + rb_link_node(&br->node, NULL, &bc->ranges.rb_node); + rb_insert_color(&br->node, &bc->ranges); + + br = kzalloc(sizeof(*br), GFP_KERNEL); + if (!br) { + ti->error = "Cannot allocate ranges"; + ret = -ENOMEM; + goto bad; + } + + br->sector = 0; + br->type = UNCHANGED; + rb_link_node(&br->node, bc->ranges.rb_node, + &bc->ranges.rb_node->rb_left); + rb_insert_color(&br->node, &bc->ranges); + + ti->discards_supported = true; + + return 0; + +bad: + dm_bow_dtr(ti); + return ret; +} + +void dm_bow_resume(struct dm_target *ti) +{ + struct mapped_device *md = dm_table_get_md(ti->table); + struct bow_context *bc = ti->private; + int ret; + + if (bc->kobj_holder.kobj.state_initialized) + return; + + ret = kobject_init_and_add(&bc->kobj_holder.kobj, &bow_ktype, + &disk_to_dev(dm_disk(md))->kobj, "%s", + "bow"); + if (ret) + ti->error = "Cannot create sysfs node"; +} + +/****** Handle writes ******/ + +static int prepare_unchanged_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter, + bool record_checksum) +{ + struct bow_range *backup_br; + struct bvec_iter backup_bi; + sector_t log_source, log_dest; + unsigned int log_size; + u32 checksum = 0; + int ret; + int original_type; + sector_t sector0; + + /* Find a free range */ + backup_br = find_free_range(bc); + if (!backup_br) + return BLK_STS_NOSPC; + + /* Carve out a backup range. This may be smaller than the br given */ + backup_bi.bi_sector = backup_br->sector; + backup_bi.bi_size = min(range_size(backup_br), (u64) bi_iter->bi_size); + ret = split_range(bc, &backup_br, &backup_bi); + if (ret) + return ret; + + /* + * Carve out a changed range. 
This will not be smaller than the backup + * br since the backup br is smaller than the source range and iterator + */ + bi_iter->bi_size = backup_bi.bi_size; + ret = split_range(bc, &br, bi_iter); + if (ret) + return ret; + if (range_size(br) != range_size(backup_br)) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + + /* Copy data over */ + ret = copy_data(bc, br, backup_br, record_checksum ? &checksum : NULL); + if (ret) + return ret; + + /* Add an entry to the log */ + log_source = br->sector; + log_dest = backup_br->sector; + log_size = range_size(br); + + /* + * Set the types. Note that since set_type also amalgamates ranges + * we have to set both sectors to their final type before calling + * set_type on either + */ + original_type = br->type; + sector0 = backup_br->sector; + bc->trims_total -= range_size(backup_br); + if (backup_br->type == TRIMMED) + list_del(&backup_br->trimmed_list); + backup_br->type = br->type == SECTOR0_CURRENT ? SECTOR0_CURRENT + : BACKUP; + br->type = CHANGED; + set_type(bc, &backup_br, backup_br->type); + + /* + * Add the log entry after marking the backup sector, since adding a log + * can cause another backup + */ + ret = add_log_entry(bc, log_source, log_dest, log_size, checksum); + if (ret) { + br->type = original_type; + return ret; + } + + /* Now it is safe to mark this backup successful */ + if (original_type == SECTOR0_CURRENT) + bc->log_sector->sector0 = sector0; + + set_type(bc, &br, br->type); + return ret; +} + +static int prepare_free_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter) +{ + int ret; + + ret = split_range(bc, &br, bi_iter); + if (ret) + return ret; + set_type(bc, &br, CHANGED); + return BLK_STS_OK; +} + +static int prepare_changed_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter) +{ + /* Nothing to do ... */ + return BLK_STS_OK; +} + +static int prepare_one_range(struct bow_context *bc, + struct bvec_iter *bi_iter) +{ + struct bow_range *br = find_first_overlapping_range(&bc->ranges, + bi_iter); + switch (br->type) { + case CHANGED: + return prepare_changed_range(bc, br, bi_iter); + + case TRIMMED: + return prepare_free_range(bc, br, bi_iter); + + case UNCHANGED: + case BACKUP: + return prepare_unchanged_range(bc, br, bi_iter, true); + + /* + * We cannot track the checksum for the active sector0, since it + * may change at any point. 
+ */ + case SECTOR0_CURRENT: + return prepare_unchanged_range(bc, br, bi_iter, false); + + case SECTOR0: /* Handled in the dm_bow_map */ + case TOP: /* Illegal - top is off the end of the device */ + default: + WARN_ON(1); + return BLK_STS_IOERR; + } +} + +struct write_work { + struct work_struct work; + struct bow_context *bc; + struct bio *bio; +}; + +static void bow_write(struct work_struct *work) +{ + struct write_work *ww = container_of(work, struct write_work, work); + struct bow_context *bc = ww->bc; + struct bio *bio = ww->bio; + struct bvec_iter bi_iter = bio->bi_iter; + int ret = BLK_STS_OK; + + kfree(ww); + + mutex_lock(&bc->ranges_lock); + do { + ret = prepare_one_range(bc, &bi_iter); + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + } while (!ret && bi_iter.bi_size); + + mutex_unlock(&bc->ranges_lock); + + if (!ret) { + bio_set_dev(bio, bc->dev->bdev); + submit_bio(bio); + } else { + DMERR("Write failure with error %d", -ret); + bio->bi_status = ret; + bio_endio(bio); + } +} + +static int queue_write(struct bow_context *bc, struct bio *bio) +{ + struct write_work *ww = kmalloc(sizeof(*ww), GFP_NOIO | __GFP_NORETRY + | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (!ww) { + DMERR("Failed to allocate write_work"); + return -ENOMEM; + } + + INIT_WORK(&ww->work, bow_write); + ww->bc = bc; + ww->bio = bio; + queue_work(bc->workqueue, &ww->work); + return DM_MAPIO_SUBMITTED; +} + +static int handle_sector0(struct bow_context *bc, struct bio *bio) +{ + int ret = DM_MAPIO_REMAPPED; + + if (bio->bi_iter.bi_size > bc->block_size) { + struct bio * split = bio_split(bio, + bc->block_size >> SECTOR_SHIFT, + GFP_NOIO, + &fs_bio_set); + if (!split) { + DMERR("Failed to split bio"); + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + bio_chain(split, bio); + split->bi_iter.bi_sector = bc->log_sector->sector0; + bio_set_dev(split, bc->dev->bdev); + submit_bio(split); + + if (bio_data_dir(bio) == WRITE) + ret = queue_write(bc, bio); + } else { + bio->bi_iter.bi_sector = bc->log_sector->sector0; + } + + return ret; +} + +static int add_trim(struct bow_context *bc, struct bio *bio) +{ + struct bow_range *br; + struct bvec_iter bi_iter = bio->bi_iter; + + DMDEBUG("add_trim: %llu, %u", + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); + + do { + br = find_first_overlapping_range(&bc->ranges, &bi_iter); + + switch (br->type) { + case UNCHANGED: + if (!split_range(bc, &br, &bi_iter)) + set_type(bc, &br, TRIMMED); + break; + + case TRIMMED: + /* Nothing to do */ + break; + + default: + /* No other case is legal in TRIM state */ + WARN_ON(true); + break; + } + + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + + } while (bi_iter.bi_size); + + bio_endio(bio); + return DM_MAPIO_SUBMITTED; +} + +static int remove_trim(struct bow_context *bc, struct bio *bio) +{ + struct bow_range *br; + struct bvec_iter bi_iter = bio->bi_iter; + + DMDEBUG("remove_trim: %llu, %u", + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); + + do { + br = find_first_overlapping_range(&bc->ranges, &bi_iter); + + switch (br->type) { + case UNCHANGED: + /* Nothing to do */ + break; + + case TRIMMED: + if (!split_range(bc, &br, &bi_iter)) + set_type(bc, &br, UNCHANGED); + break; + + default: + /* No other case is legal in TRIM state */ + 
WARN_ON(true); + break; + } + + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + + } while (bi_iter.bi_size); + + return DM_MAPIO_REMAPPED; +} + +int remap_unless_illegal_trim(struct bow_context *bc, struct bio *bio) +{ + if (!bc->forward_trims && bio_op(bio) == REQ_OP_DISCARD) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } else { + bio_set_dev(bio, bc->dev->bdev); + return DM_MAPIO_REMAPPED; + } +} + +/****** dm interface ******/ + +static int dm_bow_map(struct dm_target *ti, struct bio *bio) +{ + int ret = DM_MAPIO_REMAPPED; + struct bow_context *bc = ti->private; + + if (likely(bc->state.counter == COMMITTED)) + return remap_unless_illegal_trim(bc, bio); + + if (bio_data_dir(bio) == READ && bio->bi_iter.bi_sector != 0) + return remap_unless_illegal_trim(bc, bio); + + if (atomic_read(&bc->state) != COMMITTED) { + enum state state; + + mutex_lock(&bc->ranges_lock); + state = atomic_read(&bc->state); + if (state == TRIM) { + if (bio_op(bio) == REQ_OP_DISCARD) + ret = add_trim(bc, bio); + else if (bio_data_dir(bio) == WRITE) + ret = remove_trim(bc, bio); + else + /* pass-through */; + } else if (state == CHECKPOINT) { + if (bio->bi_iter.bi_sector == 0) + ret = handle_sector0(bc, bio); + else if (bio_data_dir(bio) == WRITE) + ret = queue_write(bc, bio); + else + /* pass-through */; + } else { + /* pass-through */ + } + mutex_unlock(&bc->ranges_lock); + } + + if (ret == DM_MAPIO_REMAPPED) + return remap_unless_illegal_trim(bc, bio); + + return ret; +} + +static void dm_bow_tablestatus(struct dm_target *ti, char *result, + unsigned int maxlen) +{ + char *end = result + maxlen; + struct bow_context *bc = ti->private; + struct rb_node *i; + int trimmed_list_length = 0; + int trimmed_range_count = 0; + struct bow_range *br; + + if (maxlen == 0) + return; + result[0] = 0; + + list_for_each_entry(br, &bc->trimmed_list, trimmed_list) + if (br->type == TRIMMED) { + ++trimmed_list_length; + } else { + scnprintf(result, end - result, + "ERROR: non-trimmed entry in trimmed_list"); + return; + } + + if (!rb_first(&bc->ranges)) { + scnprintf(result, end - result, "ERROR: Empty ranges"); + return; + } + + if (container_of(rb_first(&bc->ranges), struct bow_range, node) + ->sector) { + scnprintf(result, end - result, + "ERROR: First range does not start at sector 0"); + return; + } + + for (i = rb_first(&bc->ranges); i; i = rb_next(i)) { + struct bow_range *br = container_of(i, struct bow_range, node); + + result += scnprintf(result, end - result, "%s: %llu", + readable_type[br->type], + (unsigned long long)br->sector); + if (result >= end) + return; + + result += scnprintf(result, end - result, "\n"); + if (result >= end) + return; + + if (br->type == TRIMMED) + ++trimmed_range_count; + + if (br->type == TOP) { + if (br->sector != ti->len) { + scnprintf(result, end - result, + "\nERROR: Top sector is incorrect"); + } + + if (&br->node != rb_last(&bc->ranges)) { + scnprintf(result, end - result, + "\nERROR: Top sector is not last"); + } + + break; + } + + if (!rb_next(i)) { + scnprintf(result, end - result, + "\nERROR: Last range not of type TOP"); + return; + } + + if (br->sector > range_top(br)) { + scnprintf(result, end - result, + "\nERROR: sectors out of order"); + return; + } + } + + if (trimmed_range_count != trimmed_list_length) + scnprintf(result, end - result, + "\nERROR: not all trimmed ranges in trimmed list"); +} + +static void 
dm_bow_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + switch (type) { + case STATUSTYPE_INFO: + case STATUSTYPE_IMA: + if (maxlen) + result[0] = 0; + break; + + case STATUSTYPE_TABLE: + dm_bow_tablestatus(ti, result, maxlen); + break; + } +} + +int dm_bow_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct bow_context *bc = ti->private; + struct dm_dev *dev = bc->dev; + + *bdev = dev->bdev; + /* Only pass ioctls through if the device sizes match exactly. */ + return ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; +} + +static int dm_bow_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct bow_context *bc = ti->private; + + return fn(ti, bc->dev, 0, ti->len, data); +} + +static struct target_type bow_target = { + .name = "bow", + .version = {1, 2, 0}, + .features = DM_TARGET_PASSES_CRYPTO, + .module = THIS_MODULE, + .ctr = dm_bow_ctr, + .resume = dm_bow_resume, + .dtr = dm_bow_dtr, + .map = dm_bow_map, + .status = dm_bow_status, + .prepare_ioctl = dm_bow_prepare_ioctl, + .iterate_devices = dm_bow_iterate_devices, + .io_hints = dm_bow_io_hints, +}; + +int __init dm_bow_init(void) +{ + int r = dm_register_target(&bow_target); + + if (r < 0) + DMERR("registering bow failed %d", r); + return r; +} + +void dm_bow_exit(void) +{ + dm_unregister_target(&bow_target); +} + +MODULE_LICENSE("GPL"); + +module_init(dm_bow_init); +module_exit(dm_bow_exit); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 5a7d270b32c0..adb9604e85ac 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include @@ -202,7 +202,7 @@ struct dm_table { struct dm_md_mempools *mempools; #ifdef CONFIG_BLK_INLINE_ENCRYPTION - struct blk_keyslot_manager *ksm; + struct blk_crypto_profile *crypto_profile; #endif }; diff --git a/drivers/md/dm-default-key.c b/drivers/md/dm-default-key.c new file mode 100644 index 000000000000..577c5fdb6bbc --- /dev/null +++ b/drivers/md/dm-default-key.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Google, Inc. + */ + +#include +#include +#include + +#define DM_MSG_PREFIX "default-key" + +static const struct dm_default_key_cipher { + const char *name; + enum blk_crypto_mode_num mode_num; + int key_size; +} dm_default_key_ciphers[] = { + { + .name = "aes-xts-plain64", + .mode_num = BLK_ENCRYPTION_MODE_AES_256_XTS, + .key_size = 64, + }, { + .name = "xchacha12,aes-adiantum-plain64", + .mode_num = BLK_ENCRYPTION_MODE_ADIANTUM, + .key_size = 32, + }, +}; + +/** + * struct dm_default_c - private data of a default-key target + * @dev: the underlying device + * @start: starting sector of the range of @dev which this target actually maps. + * For this purpose a "sector" is 512 bytes. + * @cipher_string: the name of the encryption algorithm being used + * @iv_offset: starting offset for IVs. IVs are generated as if the target were + * preceded by @iv_offset 512-byte sectors. 
+ * @sector_size: crypto sector size in bytes (usually 4096)
+ * @sector_bits: log2(sector_size)
+ * @key: the encryption key to use
+ * @max_dun: the maximum DUN that may be used (computed from other params)
+ */
+struct default_key_c {
+	struct dm_dev *dev;
+	sector_t start;
+	const char *cipher_string;
+	u64 iv_offset;
+	unsigned int sector_size;
+	unsigned int sector_bits;
+	struct blk_crypto_key key;
+	enum blk_crypto_key_type key_type;
+	u64 max_dun;
+};
+
+static const struct dm_default_key_cipher *
+lookup_cipher(const char *cipher_string)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dm_default_key_ciphers); i++) {
+		if (strcmp(cipher_string, dm_default_key_ciphers[i].name) == 0)
+			return &dm_default_key_ciphers[i];
+	}
+	return NULL;
+}
+
+static void default_key_dtr(struct dm_target *ti)
+{
+	struct default_key_c *dkc = ti->private;
+	int err;
+
+	if (dkc->dev) {
+		err = blk_crypto_evict_key(dkc->dev->bdev, &dkc->key);
+		if (err && err != -ENOKEY)
+			DMWARN("Failed to evict crypto key: %d", err);
+		dm_put_device(ti, dkc->dev);
+	}
+	kfree_sensitive(dkc->cipher_string);
+	kfree_sensitive(dkc);
+}
+
+static int default_key_ctr_optional(struct dm_target *ti,
+				    unsigned int argc, char **argv)
+{
+	struct default_key_c *dkc = ti->private;
+	struct dm_arg_set as;
+	static const struct dm_arg _args[] = {
+		{0, 4, "Invalid number of feature args"},
+	};
+	unsigned int opt_params;
+	const char *opt_string;
+	bool iv_large_sectors = false;
+	char dummy;
+	int err;
+
+	as.argc = argc;
+	as.argv = argv;
+
+	err = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+	if (err)
+		return err;
+
+	while (opt_params--) {
+		opt_string = dm_shift_arg(&as);
+		if (!opt_string) {
+			ti->error = "Not enough feature arguments";
+			return -EINVAL;
+		}
+		if (!strcmp(opt_string, "allow_discards")) {
+			ti->num_discard_bios = 1;
+		} else if (sscanf(opt_string, "sector_size:%u%c",
+				  &dkc->sector_size, &dummy) == 1) {
+			if (dkc->sector_size < SECTOR_SIZE ||
+			    dkc->sector_size > 4096 ||
+			    !is_power_of_2(dkc->sector_size)) {
+				ti->error = "Invalid sector_size";
+				return -EINVAL;
+			}
+		} else if (!strcmp(opt_string, "iv_large_sectors")) {
+			iv_large_sectors = true;
+		} else if (!strcmp(opt_string, "wrappedkey_v0")) {
+			dkc->key_type = BLK_CRYPTO_KEY_TYPE_HW_WRAPPED;
+		} else {
+			ti->error = "Invalid feature arguments";
+			return -EINVAL;
+		}
+	}
+
+	/* dm-default-key doesn't implement iv_large_sectors=false. */
+	if (dkc->sector_size != SECTOR_SIZE && !iv_large_sectors) {
+		ti->error = "iv_large_sectors must be specified";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Construct a default-key mapping:
+ *	<cipher> <key> <iv_offset> <dev_path> <start>
+ *
+ * This syntax matches dm-crypt's, but lots of unneeded functionality has been
+ * removed. Also, dm-default-key requires that the "iv_large_sectors" option be
+ * given whenever a non-default sector size is used.
+ */
+static int default_key_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct default_key_c *dkc;
+	const struct dm_default_key_cipher *cipher;
+	u8 raw_key[BLK_CRYPTO_MAX_ANY_KEY_SIZE];
+	unsigned int raw_key_size;
+	unsigned int dun_bytes;
+	unsigned long long tmpll;
+	char dummy;
+	int err;
+
+	if (argc < 5) {
+		ti->error = "Not enough arguments";
+		return -EINVAL;
+	}
+
+	dkc = kzalloc(sizeof(*dkc), GFP_KERNEL);
+	if (!dkc) {
+		ti->error = "Out of memory";
+		return -ENOMEM;
+	}
+	ti->private = dkc;
+	dkc->key_type = BLK_CRYPTO_KEY_TYPE_STANDARD;
+
+	/* <cipher> */
+	dkc->cipher_string = kstrdup(argv[0], GFP_KERNEL);
+	if (!dkc->cipher_string) {
+		ti->error = "Out of memory";
+		err = -ENOMEM;
+		goto bad;
+	}
+	cipher = lookup_cipher(dkc->cipher_string);
+	if (!cipher) {
+		ti->error = "Unsupported cipher";
+		err = -EINVAL;
+		goto bad;
+	}
+
+	/* <key> */
+	raw_key_size = strlen(argv[1]);
+	if (raw_key_size > 2 * BLK_CRYPTO_MAX_ANY_KEY_SIZE ||
+	    raw_key_size % 2) {
+		ti->error = "Invalid keysize";
+		err = -EINVAL;
+		goto bad;
+	}
+	raw_key_size /= 2;
+	if (hex2bin(raw_key, argv[1], raw_key_size) != 0) {
+		ti->error = "Malformed key string";
+		err = -EINVAL;
+		goto bad;
+	}
+
+	/* <iv_offset> */
+	if (sscanf(argv[2], "%llu%c", &dkc->iv_offset, &dummy) != 1) {
+		ti->error = "Invalid iv_offset sector";
+		err = -EINVAL;
+		goto bad;
+	}
+
+	/* <dev_path> */
+	err = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table),
+			    &dkc->dev);
+	if (err) {
+		ti->error = "Device lookup failed";
+		goto bad;
+	}
+
+	/* <start> */
+	if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1 ||
+	    tmpll != (sector_t)tmpll) {
+		ti->error = "Invalid start sector";
+		err = -EINVAL;
+		goto bad;
+	}
+	dkc->start = tmpll;
+
+	/* optional arguments */
+	dkc->sector_size = SECTOR_SIZE;
+	if (argc > 5) {
+		err = default_key_ctr_optional(ti, argc - 5, &argv[5]);
+		if (err)
+			goto bad;
+	}
+	dkc->sector_bits = ilog2(dkc->sector_size);
+	if (ti->len & ((dkc->sector_size >> SECTOR_SHIFT) - 1)) {
+		ti->error = "Device size is not a multiple of sector_size";
+		err = -EINVAL;
+		goto bad;
+	}
+
+	dkc->max_dun = (dkc->iv_offset + ti->len - 1) >>
+		       (dkc->sector_bits - SECTOR_SHIFT);
+	dun_bytes = DIV_ROUND_UP(fls64(dkc->max_dun), 8);
+
+	err = blk_crypto_init_key(&dkc->key, raw_key, raw_key_size,
+				  dkc->key_type, cipher->mode_num,
+				  dun_bytes, dkc->sector_size);
+	if (err) {
+		ti->error = "Error initializing blk-crypto key";
+		goto bad;
+	}
+
+	err = blk_crypto_start_using_key(dkc->dev->bdev, &dkc->key);
+	if (err) {
+		ti->error = "Error starting to use blk-crypto";
+		goto bad;
+	}
+
+	ti->num_flush_bios = 1;
+
+	err = 0;
+	goto out;
+
+bad:
+	default_key_dtr(ti);
+out:
+	memzero_explicit(raw_key, sizeof(raw_key));
+	return err;
+}
+
+static int default_key_map(struct dm_target *ti, struct bio *bio)
+{
+	const struct default_key_c *dkc = ti->private;
+	sector_t sector_in_target;
+	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { 0 };
+
+	bio_set_dev(bio, dkc->dev->bdev);
+
+	/*
+	 * If the bio is a device-level request which doesn't target a specific
+	 * sector, there's nothing more to do.
+	 */
+	if (bio_sectors(bio) == 0)
+		return DM_MAPIO_REMAPPED;
+
+	/* Map the bio's sector to the underlying device. (512-byte sectors) */
+	sector_in_target = dm_target_offset(ti, bio->bi_iter.bi_sector);
+	bio->bi_iter.bi_sector = dkc->start + sector_in_target;
+
+	/*
+	 * If the bio should skip dm-default-key (i.e. if it's for an encrypted
+	 * file's contents), or if it doesn't have any data (e.g. if it's a
+	 * DISCARD request), there's nothing more to do.
+ */ + if (bio_should_skip_dm_default_key(bio) || !bio_has_data(bio)) + return DM_MAPIO_REMAPPED; + + /* + * Else, dm-default-key needs to set this bio's encryption context. + * It must not already have one. + */ + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return DM_MAPIO_KILL; + + /* Calculate the DUN and enforce data-unit (crypto sector) alignment. */ + dun[0] = dkc->iv_offset + sector_in_target; /* 512-byte sectors */ + if (dun[0] & ((dkc->sector_size >> SECTOR_SHIFT) - 1)) + return DM_MAPIO_KILL; + dun[0] >>= dkc->sector_bits - SECTOR_SHIFT; /* crypto sectors */ + + /* + * This check isn't necessary as we should have calculated max_dun + * correctly, but be safe. + */ + if (WARN_ON_ONCE(dun[0] > dkc->max_dun)) + return DM_MAPIO_KILL; + + bio_crypt_set_ctx(bio, &dkc->key, dun, GFP_NOIO); + + return DM_MAPIO_REMAPPED; +} + +static void default_key_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + const struct default_key_c *dkc = ti->private; + unsigned int sz = 0; + int num_feature_args = 0; + + switch (type) { + case STATUSTYPE_INFO: + case STATUSTYPE_IMA: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + /* Omit the key for now. */ + DMEMIT("%s - %llu %s %llu", dkc->cipher_string, dkc->iv_offset, + dkc->dev->name, (unsigned long long)dkc->start); + + num_feature_args += !!ti->num_discard_bios; + if (dkc->sector_size != SECTOR_SIZE) + num_feature_args += 2; + if (dkc->key_type == BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) + num_feature_args += 1; + if (num_feature_args != 0) { + DMEMIT(" %d", num_feature_args); + if (ti->num_discard_bios) + DMEMIT(" allow_discards"); + if (dkc->sector_size != SECTOR_SIZE) { + DMEMIT(" sector_size:%u", dkc->sector_size); + DMEMIT(" iv_large_sectors"); + } + if (dkc->key_type == BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) + DMEMIT(" wrappedkey_v0"); + } + break; + } +} + +static int default_key_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev) +{ + const struct default_key_c *dkc = ti->private; + const struct dm_dev *dev = dkc->dev; + + *bdev = dev->bdev; + + /* Only pass ioctls through if the device sizes match exactly. 
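To make the DUN arithmetic in default_key_map() above concrete, here is a worked example under assumed parameters (sector_size = 4096, hence sector_bits = 12, and iv_offset = 0):

/*
 * Worked example (assumed: sector_size = 4096, iv_offset = 0):
 *
 *   bio at 512-byte sector 24:
 *     dun[0]    = 0 + 24                  = 24
 *     alignment = 24 & ((4096 >> 9) - 1)  = 24 & 7 = 0  -> aligned
 *     dun[0]  >>= 12 - 9                  -> 24 >> 3 = 3
 *
 *   i.e. the bio starts at crypto sector index 3 and gets DUN 3; a
 *   misaligned bio (say, one starting at sector 25) fails the mask
 *   check and is killed with DM_MAPIO_KILL rather than being
 *   encrypted with a wrong IV.
 */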
*/ + if (dkc->start != 0 || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + return 1; + return 0; +} + +static int default_key_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data) +{ + const struct default_key_c *dkc = ti->private; + + return fn(ti, dkc->dev, dkc->start, ti->len, data); +} + +static void default_key_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + const struct default_key_c *dkc = ti->private; + const unsigned int sector_size = dkc->sector_size; + + limits->logical_block_size = + max_t(unsigned int, limits->logical_block_size, sector_size); + limits->physical_block_size = + max_t(unsigned int, limits->physical_block_size, sector_size); + limits->io_min = max_t(unsigned int, limits->io_min, sector_size); +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int default_key_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct default_key_c *dkc = ti->private; + + return dm_report_zones(dkc->dev->bdev, dkc->start, + dkc->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); +} +#else +#define default_key_report_zones NULL +#endif + +static struct target_type default_key_target = { + .name = "default-key", + .version = {2, 1, 0}, + .features = DM_TARGET_PASSES_CRYPTO | DM_TARGET_ZONED_HM, + .report_zones = default_key_report_zones, + .module = THIS_MODULE, + .ctr = default_key_ctr, + .dtr = default_key_dtr, + .map = default_key_map, + .status = default_key_status, + .prepare_ioctl = default_key_prepare_ioctl, + .iterate_devices = default_key_iterate_devices, + .io_hints = default_key_io_hints, +}; + +static int __init dm_default_key_init(void) +{ + return dm_register_target(&default_key_target); +} + +static void __exit dm_default_key_exit(void) +{ + dm_unregister_target(&default_key_target); +} + +module_init(dm_default_key_init); +module_exit(dm_default_key_exit); + +MODULE_AUTHOR("Paul Lawrence "); +MODULE_AUTHOR("Paul Crowley "); +MODULE_AUTHOR("Eric Biggers "); +MODULE_DESCRIPTION(DM_NAME " target for encrypting filesystem metadata"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2111daaacaba..d9e65611aef7 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -169,7 +169,7 @@ static void free_devices(struct list_head *devices, struct mapped_device *md) } } -static void dm_table_destroy_keyslot_manager(struct dm_table *t); +static void dm_table_destroy_crypto_profile(struct dm_table *t); void dm_table_destroy(struct dm_table *t) { @@ -199,7 +199,7 @@ void dm_table_destroy(struct dm_table *t) dm_free_md_mempools(t->mempools); - dm_table_destroy_keyslot_manager(t); + dm_table_destroy_crypto_profile(t); kfree(t); } @@ -1186,8 +1186,8 @@ static int dm_table_register_integrity(struct dm_table *t) #ifdef CONFIG_BLK_INLINE_ENCRYPTION -struct dm_keyslot_manager { - struct blk_keyslot_manager ksm; +struct dm_crypto_profile { + struct blk_crypto_profile profile; struct mapped_device *md; }; @@ -1202,7 +1202,7 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, struct dm_keyslot_evict_args *args = data; int err; - err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key); + err = blk_crypto_evict_key(dev->bdev, args->key); if (!args->err) args->err = err; /* Always try to evict the key from all devices. 
*/ @@ -1213,13 +1213,11 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, * When an inline encryption key is evicted from a device-mapper device, evict * it from all the underlying devices. */ -static int dm_keyslot_evict(struct blk_keyslot_manager *ksm, +static int dm_keyslot_evict(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct dm_keyslot_manager *dksm = container_of(ksm, - struct dm_keyslot_manager, - ksm); - struct mapped_device *md = dksm->md; + struct mapped_device *md = + container_of(profile, struct dm_crypto_profile, profile)->md; struct dm_keyslot_evict_args args = { key }; struct dm_table *t; int srcu_idx; @@ -1239,150 +1237,212 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm, return args.err; } -static const struct blk_ksm_ll_ops dm_ksm_ll_ops = { - .keyslot_evict = dm_keyslot_evict, +struct dm_derive_sw_secret_args { + const u8 *eph_key; + size_t eph_key_size; + u8 *sw_secret; + int err; }; -static int device_intersect_crypto_modes(struct dm_target *ti, - struct dm_dev *dev, sector_t start, - sector_t len, void *data) +static int dm_derive_sw_secret_callback(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) { - struct blk_keyslot_manager *parent = data; - struct blk_keyslot_manager *child = bdev_get_queue(dev->bdev)->ksm; + struct dm_derive_sw_secret_args *args = data; - blk_ksm_intersect_modes(parent, child); + if (!args->err) + return 0; + + args->err = blk_crypto_derive_sw_secret(dev->bdev, + args->eph_key, + args->eph_key_size, + args->sw_secret); + /* Try another device in case this fails. */ return 0; } -void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm) +/* + * Retrieve the sw_secret from the underlying device. Given that only one + * sw_secret can exist for a particular wrapped key, retrieve it only from the + * first device that supports derive_sw_secret(). 
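Note the contrast with eviction: derive_sw_secret() stops at the first device that succeeds rather than visiting all of them. The same policy, modeled standalone (the array of capabilities is invented for the demo):

#include <errno.h>
#include <stdio.h>

static int try_device(int supports_derive)
{
	return supports_derive ? 0 : -EOPNOTSUPP;
}

int main(void)
{
	int supports[] = { 0, 0, 1, 1 };	/* only the last two support it */
	int err = -EOPNOTSUPP;			/* default when nobody does */
	int i;

	for (i = 0; i < 4 && err; i++)		/* stop on first success */
		err = try_device(supports[i]);

	printf("err = %d (secret taken from device %d)\n", err, i - 1);
	return 0;
}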
+ */ +static int dm_derive_sw_secret(struct blk_crypto_profile *profile, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) { - struct dm_keyslot_manager *dksm = container_of(ksm, - struct dm_keyslot_manager, - ksm); + struct mapped_device *md = + container_of(profile, struct dm_crypto_profile, profile)->md; + struct dm_derive_sw_secret_args args = { + .eph_key = eph_key, + .eph_key_size = eph_key_size, + .sw_secret = sw_secret, + .err = -EOPNOTSUPP, + }; + struct dm_table *t; + int srcu_idx; + int i; + struct dm_target *ti; - if (!ksm) - return; - - blk_ksm_destroy(ksm); - kfree(dksm); + t = dm_get_live_table(md, &srcu_idx); + if (!t) + return -EOPNOTSUPP; + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, dm_derive_sw_secret_callback, + &args); + if (!args.err) + break; + } + dm_put_live_table(md, srcu_idx); + return args.err; } -static void dm_table_destroy_keyslot_manager(struct dm_table *t) +static int +device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - dm_destroy_keyslot_manager(t->ksm); - t->ksm = NULL; + struct blk_crypto_profile *parent = data; + struct blk_crypto_profile *child = + bdev_get_queue(dev->bdev)->crypto_profile; + + blk_crypto_intersect_capabilities(parent, child); + return 0; +} + +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) +{ + struct dm_crypto_profile *dmcp = container_of(profile, + struct dm_crypto_profile, + profile); + + if (!profile) + return; + + blk_crypto_profile_destroy(profile); + kfree(dmcp); +} + +static void dm_table_destroy_crypto_profile(struct dm_table *t) +{ + dm_destroy_crypto_profile(t->crypto_profile); + t->crypto_profile = NULL; } /* - * Constructs and initializes t->ksm with a keyslot manager that - * represents the common set of crypto capabilities of the devices - * described by the dm_table. However, if the constructed keyslot - * manager does not support a superset of the crypto capabilities - * supported by the current keyslot manager of the mapped_device, - * it returns an error instead, since we don't support restricting - * crypto capabilities on table changes. Finally, if the constructed - * keyslot manager doesn't actually support any crypto modes at all, - * it just returns NULL. + * Constructs and initializes t->crypto_profile with a crypto profile that + * represents the common set of crypto capabilities of the devices described by + * the dm_table. However, if the constructed crypto profile doesn't support all + * crypto capabilities that are supported by the current mapped_device, it + * returns an error instead, since we don't support removing crypto capabilities + * on table changes. Finally, if the constructed crypto profile is "empty" (has + * no crypto capabilities at all), it just sets t->crypto_profile to NULL. 
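The construction described above boils down to bitwise intersection plus a superset check. A toy model using one 32-bit capability word (the kernel tracks per-mode DUN masks and key types, but the policy is the same):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t profile = 0xFFFFFFFFu;	/* starts fully permissive */
	uint32_t children[] = { 0x0000000F, 0x0000000D };
	uint32_t old = 0x00000005;	/* capabilities already exposed */
	int i;

	for (i = 0; i < 2; i++)		/* blk_crypto_intersect_capabilities() */
		profile &= children[i];

	/* The new table must keep every old capability (old is a subset). */
	if ((profile & old) != old) {
		printf("-EINVAL: new table is more restrictive\n");
		return 1;
	}
	if (profile == 0)
		printf("empty profile, represented as NULL\n");
	else
		printf("profile = 0x%08x\n", profile);	/* 0x0000000d */
	return 0;
}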
*/ -static int dm_table_construct_keyslot_manager(struct dm_table *t) +static int dm_table_construct_crypto_profile(struct dm_table *t) { - struct dm_keyslot_manager *dksm; - struct blk_keyslot_manager *ksm; + struct dm_crypto_profile *dmcp; + struct blk_crypto_profile *profile; struct dm_target *ti; unsigned int i; - bool ksm_is_empty = true; + bool empty_profile = true; - dksm = kmalloc(sizeof(*dksm), GFP_KERNEL); - if (!dksm) + dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL); + if (!dmcp) return -ENOMEM; - dksm->md = t->md; + dmcp->md = t->md; - ksm = &dksm->ksm; - blk_ksm_init_passthrough(ksm); - ksm->ksm_ll_ops = dm_ksm_ll_ops; - ksm->max_dun_bytes_supported = UINT_MAX; - memset(ksm->crypto_modes_supported, 0xFF, - sizeof(ksm->crypto_modes_supported)); + profile = &dmcp->profile; + blk_crypto_profile_init(profile, 0); + profile->ll_ops.keyslot_evict = dm_keyslot_evict; + profile->ll_ops.derive_sw_secret = dm_derive_sw_secret; + profile->max_dun_bytes_supported = UINT_MAX; + memset(profile->modes_supported, 0xFF, + sizeof(profile->modes_supported)); + profile->key_types_supported = ~0; for (i = 0; i < dm_table_get_num_targets(t); i++) { ti = dm_table_get_target(t, i); if (!dm_target_passes_crypto(ti->type)) { - blk_ksm_intersect_modes(ksm, NULL); + blk_crypto_intersect_capabilities(profile, NULL); break; } if (!ti->type->iterate_devices) continue; - ti->type->iterate_devices(ti, device_intersect_crypto_modes, - ksm); + ti->type->iterate_devices(ti, + device_intersect_crypto_capabilities, + profile); } - if (t->md->queue && !blk_ksm_is_superset(ksm, t->md->queue->ksm)) { + if (t->md->queue && + !blk_crypto_has_capabilities(profile, + t->md->queue->crypto_profile)) { DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); - dm_destroy_keyslot_manager(ksm); + dm_destroy_crypto_profile(profile); return -EINVAL; } /* - * If the new KSM doesn't actually support any crypto modes, we may as - * well represent it with a NULL ksm. + * If the new profile doesn't actually support any crypto capabilities, + * we may as well represent it with a NULL profile. */ - ksm_is_empty = true; - for (i = 0; i < ARRAY_SIZE(ksm->crypto_modes_supported); i++) { - if (ksm->crypto_modes_supported[i]) { - ksm_is_empty = false; + for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) { + if (profile->modes_supported[i]) { + empty_profile = false; break; } } - if (ksm_is_empty) { - dm_destroy_keyslot_manager(ksm); - ksm = NULL; + if (empty_profile) { + dm_destroy_crypto_profile(profile); + profile = NULL; } /* - * t->ksm is only set temporarily while the table is being set - * up, and it gets set to NULL after the capabilities have - * been transferred to the request_queue. + * t->crypto_profile is only set temporarily while the table is being + * set up, and it gets set to NULL after the profile has been + * transferred to the request_queue. */ - t->ksm = ksm; + t->crypto_profile = profile; return 0; } -static void dm_update_keyslot_manager(struct request_queue *q, - struct dm_table *t) +static void dm_update_crypto_profile(struct request_queue *q, + struct dm_table *t) { - if (!t->ksm) + if (!t->crypto_profile) return; - /* Make the ksm less restrictive */ - if (!q->ksm) { - blk_ksm_register(t->ksm, q); + /* Make the crypto profile less restrictive. 
*/ + if (!q->crypto_profile) { + blk_crypto_register(t->crypto_profile, q); + } else { + blk_crypto_update_capabilities(q->crypto_profile, + t->crypto_profile); + dm_destroy_crypto_profile(t->crypto_profile); + } - t->ksm = NULL; + t->crypto_profile = NULL; } #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static int dm_table_construct_keyslot_manager(struct dm_table *t) +static int dm_table_construct_crypto_profile(struct dm_table *t) { return 0; } -void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm) +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) { } -static void dm_table_destroy_keyslot_manager(struct dm_table *t) +static void dm_table_destroy_crypto_profile(struct dm_table *t) { } -static void dm_update_keyslot_manager(struct request_queue *q, - struct dm_table *t) +static void dm_update_crypto_profile(struct request_queue *q, + struct dm_table *t) { } @@ -1414,9 +1474,9 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_construct_keyslot_manager(t); + r = dm_table_construct_crypto_profile(t); if (r) { - DMERR("could not construct keyslot manager."); + DMERR("could not construct crypto profile."); return r; } @@ -2070,7 +2130,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, return r; } - dm_update_keyslot_manager(q, t); + dm_update_crypto_profile(q, t); disk_update_readahead(t->md->disk); return 0; diff --git a/drivers/md/dm-user.c b/drivers/md/dm-user.c new file mode 100644 index 000000000000..e393244a5e55 --- /dev/null +++ b/drivers/md/dm-user.c @@ -0,0 +1,1290 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2020 Google, Inc + * Copyright (C) 2020 Palmer Dabbelt + */ + +#include <linux/device-mapper.h> +#include <uapi/linux/dm-user.h> + +#include <linux/bio.h> +#include <linux/init.h> +#include <linux/mempool.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/poll.h> +#include <linux/uio.h> +#include <linux/wait.h> +#include <linux/workqueue.h> + +#define DM_MSG_PREFIX "user" + +#define MAX_OUTSTANDING_MESSAGES 128 + +static unsigned int daemon_timeout_msec = 4000; +module_param_named(dm_user_daemon_timeout_msec, daemon_timeout_msec, uint, + 0644); +MODULE_PARM_DESC(dm_user_daemon_timeout_msec, + "IO Timeout in msec if daemon does not process"); + +/* + * dm-user uses four structures: + * + * - "struct target", the outermost structure, corresponds to a single device + * mapper target. This contains the set of outstanding BIOs that have been + * provided by DM and are not actively being processed by the user, along + * with a misc device that userspace can open to communicate with the + * kernel. Each time userspace opens the misc device a new channel is + * created. + * - "struct channel", which represents a single active communication channel + * with userspace. Userspace may choose arbitrary read/write sizes to use + * when processing messages; channels form these into logical accesses. + * When userspace responds to a full message the channel completes the BIO + * and obtains a new message to process from the target. + * - "struct message", which wraps a BIO with the additional information + * required by the kernel to sort out what to do with BIOs when they return + * from userspace. + * - "struct dm_user_message", which is the exact message format that + * userspace sees. + * + * The hot path contains three distinct operations: + * + * - user_map(), which is provided a BIO from device mapper that is queued + * into the target. This allocates and enqueues a new message. + * - dev_read(), which dequeues a message and copies it to userspace (the + * message header layout is sketched just below).
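For reference while reading the hot path, this is the wire header as the code below uses it. The authoritative definition is the include/uapi/linux/dm-user.h added by this patch (not shown in this excerpt); treat this field list as a best-effort reconstruction:

#include <linux/types.h>

struct dm_user_message {
	__u64 seq;	/* assigned by msg_get_map(), echoed back by the user */
	__u64 type;	/* DM_USER_REQ_MAP_* out, DM_USER_RESP_* on the reply */
	__u64 flags;	/* DM_USER_REQ_MAP_FLAG_* */
	__u64 sector;	/* bio->bi_iter.bi_sector */
	__u64 len;	/* payload bytes, direction-dependent */
	__u8 buf[];	/* bio data for writes out, for read replies back */
};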
+ * - dev_write(), which looks up a message (keyed by sequence number) and + * completes the corresponding BIO. + * + * Lock ordering (outer to inner) + * + * 1) miscdevice's global lock. This is held around dev_open, so it has to be + * the outermost lock. + * 2) target->lock + * 3) channel->lock + */ + +struct message { + /* + * Messages themselves do not need a lock, they're protected by either + * the target or channel's lock, depending on which can reference them + * directly. + */ + struct dm_user_message msg; + struct bio *bio; + size_t posn_to_user; + size_t total_to_user; + size_t posn_from_user; + size_t total_from_user; + + struct list_head from_user; + struct list_head to_user; + + /* + * These are written back from the user. They live in the same spot in + * the message, but we need to either keep the old values around or + * call a bunch more BIO helpers. These are only valid after write has + * adopted the message. + */ + u64 return_type; + u64 return_flags; + + struct delayed_work work; + bool delayed; + struct target *t; +}; + +struct target { + /* + * A target has a single lock, which protects everything in the target + * (but does not protect the channels associated with a target). + */ + struct mutex lock; + + /* + * There is only one point at which anything blocks: userspace blocks + * reading a new message, which is woken up by device mapper providing + * a new BIO to process (or tearing down the target). The + * corresponding write side doesn't block, instead we treat userspace's + * response containing a message that has yet to be mapped as an + * invalid operation. + */ + struct wait_queue_head wq; + + /* + * Messages are delivered to userspace in order, but may be returned + * out of order. This allows userspace to schedule IO if it wants to. + */ + mempool_t message_pool; + u64 next_seq_to_map; + u64 next_seq_to_user; + struct list_head to_user; + + /* + * There is a misc device per target. The name is selected by + * userspace (via a DM create ioctl argument), and each ends up in + * /dev/dm-user/. It looks like a better way to do this may be to have + * a filesystem to manage these, but this was more expedient. The + * current mechanism is functional, but does result in an arbitrary + * number of dynamically created misc devices. + */ + struct miscdevice miscdev; + + /* + * Device mapper's target destructor triggers tearing this all down, + * but we can't actually free until every channel associated with this + * target has been destroyed. Channels each have a reference to their + * target, and there is an additional single reference that corresponds + * to both DM and the misc device (both of which are destroyed by DM). + * + * In the common case userspace will be asleep waiting for a new + * message when device mapper decides to destroy the target, which + * means no new messages will appear. The destroyed flag triggers a + * wakeup, which will end up removing the reference. + */ + struct kref references; + int dm_destroyed; + bool daemon_terminated; +}; + +struct channel { + struct target *target; + + /* + * A channel has a single lock, which prevents multiple reads (or + * multiple writes) from conflicting with each other. + */ + struct mutex lock; + + struct message *cur_to_user; + struct message *cur_from_user; + ssize_t to_user_error; + ssize_t from_user_error; + + /* + * Once a message has been forwarded to userspace on a channel it must + * be responded to on the same channel. 
This allows us to error out + the messages that have not yet been responded to by a channel when + that channel closes, which makes handling errors more reasonable for + fault-tolerant userspace daemons. It also happens to make avoiding + shared locks between user_map() and dev_read() a lot easier. + * + * This does preclude a multi-threaded work stealing userspace + * implementation (or at least, force a degree of head-of-line blocking + * on the response path). + */ + struct list_head from_user; + + /* + * Responses from userspace can arrive in arbitrarily small chunks. + * We need some place to buffer one up until we can find the + * corresponding kernel-side message to continue processing, so instead + * of allocating them we just keep one off to the side here. This can + * only ever be pointed to by cur_from_user, and will never have a BIO. + */ + struct message scratch_message_from_user; +}; + +static void message_kill(struct message *m, mempool_t *pool) +{ + m->bio->bi_status = BLK_STS_IOERR; + bio_endio(m->bio); + mempool_free(m, pool); +} + +static inline bool is_user_space_thread_present(struct target *t) +{ + lockdep_assert_held(&t->lock); + return (kref_read(&t->references) > 1); +} + +static void process_delayed_work(struct work_struct *work) +{ + struct delayed_work *del_work = to_delayed_work(work); + struct message *msg = container_of(del_work, struct message, work); + + struct target *t = msg->t; + + mutex_lock(&t->lock); + + /* + * There is at least one user-space thread to process the IO. + */ + if (is_user_space_thread_present(t)) { + mutex_unlock(&t->lock); + return; + } + + /* + * Terminate the IO with an error + */ + list_del(&msg->to_user); + pr_err("I/O error: sector %llu: no user-space daemon for %s target\n", + msg->bio->bi_iter.bi_sector, + t->miscdev.name); + message_kill(msg, &t->message_pool); + mutex_unlock(&t->lock); +} + +static void enqueue_delayed_work(struct message *m, bool is_delay) +{ + unsigned long delay = 0; + + m->delayed = true; + INIT_DELAYED_WORK(&m->work, process_delayed_work); + + /* + * The snapuserd daemon is the user-space process + * that services IO requests from dm-user + * when an OTA is applied. Per the current design, + * when a dm-user target is created, the daemon + * attaches to the target and starts processing + * the IOs. The daemon is terminated only when + * the dm-user target is destroyed. + * + * If for some reason the daemon crashes or terminates early + * without destroying the dm-user target, then + * there is no mechanism to restart the daemon + * and resume processing the IOs on the same target. + * Theoretically it is possible, but that infrastructure + * doesn't exist in the Android ecosystem. + * + * Thus, when the daemon terminates, there is no way the IOs + * issued on that target will be processed. Hence, + * we set the delay to 0 and fail the IOs immediately. + * + * On the other hand, when a new dm-user target is created, + * we wait for the daemon to get attached for the first time. + * This primarily happens when first-stage init spins up + * the daemon. At this point, since the snapshot device is mounted + * as the root filesystem, the dm-user target may receive IO requests + * even though the daemon is not fully launched. We don't want + * to fail those IO requests immediately. Thus, we queue these + * requests with a timeout so that the daemon has time to become + * ready to process them. Again, if the daemon fails to launch within + * the timeout period, then the IOs will be failed.
+ */ + if (is_delay) + delay = msecs_to_jiffies(daemon_timeout_msec); + + queue_delayed_work(system_wq, &m->work, delay); +} + +static inline struct target *target_from_target(struct dm_target *target) +{ + WARN_ON(target->private == NULL); + return target->private; +} + +static inline struct target *target_from_miscdev(struct miscdevice *miscdev) +{ + return container_of(miscdev, struct target, miscdev); +} + +static inline struct channel *channel_from_file(struct file *file) +{ + WARN_ON(file->private_data == NULL); + return file->private_data; +} + +static inline struct target *target_from_channel(struct channel *c) +{ + WARN_ON(c->target == NULL); + return c->target; +} + +static inline size_t bio_size(struct bio *bio) +{ + struct bio_vec bvec; + struct bvec_iter iter; + size_t out = 0; + + bio_for_each_segment (bvec, bio, iter) + out += bio_iter_len(bio, iter); + return out; +} + +static inline size_t bio_bytes_needed_to_user(struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_WRITE: + return sizeof(struct dm_user_message) + bio_size(bio); + case REQ_OP_READ: + case REQ_OP_FLUSH: + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: + return sizeof(struct dm_user_message); + + /* + * These ops are not passed to userspace under the assumption that + * they're not going to be particularly useful in that context. + */ + default: + return -EOPNOTSUPP; + } +} + +static inline size_t bio_bytes_needed_from_user(struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_READ: + return sizeof(struct dm_user_message) + bio_size(bio); + case REQ_OP_WRITE: + case REQ_OP_FLUSH: + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: + return sizeof(struct dm_user_message); + + /* + * These ops are not passed to userspace under the assumption that + * they're not going to be particularly useful in that context. + */ + default: + return -EOPNOTSUPP; + } +} + +static inline long bio_type_to_user_type(struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_READ: + return DM_USER_REQ_MAP_READ; + case REQ_OP_WRITE: + return DM_USER_REQ_MAP_WRITE; + case REQ_OP_FLUSH: + return DM_USER_REQ_MAP_FLUSH; + case REQ_OP_DISCARD: + return DM_USER_REQ_MAP_DISCARD; + case REQ_OP_SECURE_ERASE: + return DM_USER_REQ_MAP_SECURE_ERASE; + case REQ_OP_WRITE_SAME: + return DM_USER_REQ_MAP_WRITE_SAME; + case REQ_OP_WRITE_ZEROES: + return DM_USER_REQ_MAP_WRITE_ZEROES; + + /* + * These ops are not passed to userspace under the assumption that + * they're not going to be particularly useful in that context. 
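The sizing helpers above mean the two directions of a message differ only by the payload. For a hypothetical 4 KiB WRITE, and assuming the five-u64 header sketched earlier (40 bytes, no padding before the flexible array):

#include <stdio.h>
#include <stdint.h>

#define DM_USER_MSG_HDR 40u	/* 5 * sizeof(__u64), per the header sketch */

int main(void)
{
	uint32_t bio_bytes = 4096;	/* a 4 KiB WRITE */

	/* The data rides out with the request; only a header comes back. */
	printf("to_user   = %u\n", DM_USER_MSG_HDR + bio_bytes);	/* 4136 */
	printf("from_user = %u\n", DM_USER_MSG_HDR);			/* 40 */
	return 0;
}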
+ */ + default: + return -EOPNOTSUPP; + } +} + +static inline long bio_flags_to_user_flags(struct bio *bio) +{ + u64 out = 0; + typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK; + + if (opf & REQ_FAILFAST_DEV) { + opf &= ~REQ_FAILFAST_DEV; + out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV; + } + + if (opf & REQ_FAILFAST_TRANSPORT) { + opf &= ~REQ_FAILFAST_TRANSPORT; + out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT; + } + + if (opf & REQ_FAILFAST_DRIVER) { + opf &= ~REQ_FAILFAST_DRIVER; + out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER; + } + + if (opf & REQ_SYNC) { + opf &= ~REQ_SYNC; + out |= DM_USER_REQ_MAP_FLAG_SYNC; + } + + if (opf & REQ_META) { + opf &= ~REQ_META; + out |= DM_USER_REQ_MAP_FLAG_META; + } + + if (opf & REQ_PRIO) { + opf &= ~REQ_PRIO; + out |= DM_USER_REQ_MAP_FLAG_PRIO; + } + + if (opf & REQ_NOMERGE) { + opf &= ~REQ_NOMERGE; + out |= DM_USER_REQ_MAP_FLAG_NOMERGE; + } + + if (opf & REQ_IDLE) { + opf &= ~REQ_IDLE; + out |= DM_USER_REQ_MAP_FLAG_IDLE; + } + + if (opf & REQ_INTEGRITY) { + opf &= ~REQ_INTEGRITY; + out |= DM_USER_REQ_MAP_FLAG_INTEGRITY; + } + + if (opf & REQ_FUA) { + opf &= ~REQ_FUA; + out |= DM_USER_REQ_MAP_FLAG_FUA; + } + + if (opf & REQ_PREFLUSH) { + opf &= ~REQ_PREFLUSH; + out |= DM_USER_REQ_MAP_FLAG_PREFLUSH; + } + + if (opf & REQ_RAHEAD) { + opf &= ~REQ_RAHEAD; + out |= DM_USER_REQ_MAP_FLAG_RAHEAD; + } + + if (opf & REQ_BACKGROUND) { + opf &= ~REQ_BACKGROUND; + out |= DM_USER_REQ_MAP_FLAG_BACKGROUND; + } + + if (opf & REQ_NOWAIT) { + opf &= ~REQ_NOWAIT; + out |= DM_USER_REQ_MAP_FLAG_NOWAIT; + } + + if (opf & REQ_NOUNMAP) { + opf &= ~REQ_NOUNMAP; + out |= DM_USER_REQ_MAP_FLAG_NOUNMAP; + } + + if (unlikely(opf)) { + pr_warn("unsupported BIO type %x\n", opf); + return -EOPNOTSUPP; + } + WARN_ON(out < 0); + return out; +} + +/* + * Not quite what's in blk-map.c, but instead what I thought the functions in + * blk-map did. This one seems more generally useful and I think we could + * write the blk-map version in terms of this one. The differences are that + * this has a return value that counts, and blk-map uses the BIO _all iters. + * Neither advance the BIO iter but don't advance the IOV iter, which is a bit + * odd here. + */ +static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) +{ + struct bio_vec bvec; + struct bvec_iter biter; + ssize_t out = 0; + + bio_for_each_segment (bvec, bio, biter) { + ssize_t ret; + + ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset, + bvec.bv_len, iter); + + /* + * FIXME: I thought that IOV copies had a mechanism for + * terminating early, if for example a signal came in while + * sleeping waiting for a page to be mapped, but I don't see + * where that would happen. 
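bio_flags_to_user_flags() uses a clear-as-you-translate scheme: each recognized bit is moved one at a time, and any bit still set at the end makes the whole request unsupported. A standalone model with two invented flag bits (the real REQ_* and DM_USER_REQ_MAP_FLAG_* values differ):

#include <stdio.h>

#define IN_SYNC		(1u << 0)	/* made-up stand-in for a REQ_* bit */
#define IN_FUA		(1u << 1)
#define OUT_SYNC	(1u << 8)	/* ... and for the user-visible bit */
#define OUT_FUA		(1u << 9)

static long translate(unsigned int in)
{
	unsigned long out = 0;

	if (in & IN_SYNC) {
		in &= ~IN_SYNC;
		out |= OUT_SYNC;
	}
	if (in & IN_FUA) {
		in &= ~IN_FUA;
		out |= OUT_FUA;
	}
	if (in)			/* an untranslated flag survived: reject */
		return -1;
	return (long)out;
}

int main(void)
{
	printf("%ld\n", translate(IN_SYNC | IN_FUA));	/* 768 */
	printf("%ld\n", translate(1u << 5));		/* -1: unsupported */
	return 0;
}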
+ */ + WARN_ON(ret < 0); + out += ret; + + if (!iov_iter_count(iter)) + break; + + if (ret < bvec.bv_len) + return ret; + } + + return out; +} + +static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter) +{ + struct bio_vec bvec; + struct bvec_iter biter; + ssize_t out = 0; + + bio_for_each_segment (bvec, bio, biter) { + ssize_t ret; + + ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset, + bvec.bv_len, iter); + + /* as above */ + WARN_ON(ret < 0); + out += ret; + + if (!iov_iter_count(iter)) + break; + + if (ret < bvec.bv_len) + return ret; + } + + return out; +} + +static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to) +{ + ssize_t copied = 0; + + if (!iov_iter_count(to)) + return 0; + + if (msg->posn_to_user < sizeof(msg->msg)) { + copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user, + sizeof(msg->msg) - msg->posn_to_user, to); + } else { + copied = bio_copy_to_iter(msg->bio, to); + if (copied > 0) + bio_advance(msg->bio, copied); + } + + if (copied < 0) + return copied; + + msg->posn_to_user += copied; + return copied; +} + +static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from) +{ + ssize_t copied = 0; + + if (!iov_iter_count(from)) + return 0; + + if (msg->posn_from_user < sizeof(msg->msg)) { + copied = copy_from_iter( + (char *)(&msg->msg) + msg->posn_from_user, + sizeof(msg->msg) - msg->posn_from_user, from); + } else { + copied = bio_copy_from_iter(msg->bio, from); + if (copied > 0) + bio_advance(msg->bio, copied); + } + + if (copied < 0) + return copied; + + msg->posn_from_user += copied; + return copied; +} + +static struct message *msg_get_map(struct target *t) +{ + struct message *m; + + lockdep_assert_held(&t->lock); + + m = mempool_alloc(&t->message_pool, GFP_NOIO); + m->msg.seq = t->next_seq_to_map++; + INIT_LIST_HEAD(&m->to_user); + INIT_LIST_HEAD(&m->from_user); + return m; +} + +static struct message *msg_get_to_user(struct target *t) +{ + struct message *m; + + lockdep_assert_held(&t->lock); + + if (list_empty(&t->to_user)) + return NULL; + + m = list_first_entry(&t->to_user, struct message, to_user); + + list_del(&m->to_user); + + /* + * If the IO was queued to workqueue since there + * was no daemon to service the IO, then we + * will have to cancel the delayed work as the + * IO will be processed by this user-space thread. + * + * If the delayed work was already picked up for + * processing, then wait for it to complete. Note + * that the IO will not be terminated by the work + * queue thread. + */ + if (unlikely(m->delayed)) { + mutex_unlock(&t->lock); + cancel_delayed_work_sync(&m->work); + mutex_lock(&t->lock); + } + return m; +} + +static struct message *msg_get_from_user(struct channel *c, u64 seq) +{ + struct message *m; + struct list_head *cur, *tmp; + + lockdep_assert_held(&c->lock); + + list_for_each_safe (cur, tmp, &c->from_user) { + m = list_entry(cur, struct message, from_user); + if (m->msg.seq == seq) { + list_del(&m->from_user); + return m; + } + } + + return NULL; +} + +/* + * Returns 0 when there is no work left to do. This must be callable without + * holding the target lock, as it is part of the waitqueue's check expression. + * When called without the lock it may spuriously indicate there is remaining + * work, but when called with the lock it must be accurate. 
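msg_copy_to_iov() and msg_copy_from_iov() let one message be produced or consumed across arbitrarily many partial read()/write() calls, with posn_* recording how far a previous call got: bytes below the header size come from the header, everything after from the payload. A self-contained model of that resumable two-phase copy (buffer contents are made up, and the demo assumes the caller never asks past the end):

#include <stdio.h>

static size_t copy_some(char *dst, size_t want,
			const char *hdr, size_t hdrlen,
			const char *payload, size_t *posn)
{
	size_t done;

	for (done = 0; done < want; done++, (*posn)++) {
		if (*posn < hdrlen)
			dst[done] = hdr[*posn];
		else
			dst[done] = payload[*posn - hdrlen];
	}
	return done;
}

int main(void)
{
	static const char hdr[8] = { 'H', 'E', 'A', 'D', 'E', 'R', '0', '1' };
	static const char payload[4] = { 'D', 'A', 'T', 'A' };
	char out[12];
	size_t posn = 0;

	copy_some(out, 5, hdr, sizeof(hdr), payload, &posn);	 /* short read */
	copy_some(out + 5, 7, hdr, sizeof(hdr), payload, &posn); /* resumes */
	fwrite(out, 1, sizeof(out), stdout);	/* HEADER01DATA */
	putchar('\n');
	return 0;
}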
+ */ +static int target_poll(struct target *t) +{ + return !list_empty(&t->to_user) || t->dm_destroyed; +} + +static void target_release(struct kref *ref) +{ + struct target *t = container_of(ref, struct target, references); + struct list_head *cur, *tmp; + + /* + * There may be outstanding BIOs that have not yet been given to + * userspace. At this point there's nothing we can do about them, as + * there are and will never be any channels. + */ + list_for_each_safe (cur, tmp, &t->to_user) { + struct message *m = list_entry(cur, struct message, to_user); + + if (unlikely(m->delayed)) { + bool ret; + + mutex_unlock(&t->lock); + ret = cancel_delayed_work_sync(&m->work); + mutex_lock(&t->lock); + if (!ret) + continue; + } + message_kill(m, &t->message_pool); + } + + mempool_exit(&t->message_pool); + mutex_unlock(&t->lock); + mutex_destroy(&t->lock); + kfree(t); +} + +static void target_put(struct target *t) +{ + /* + * This both releases a reference to the target and the lock. We leave + * it up to the caller to hold the lock, as they probably needed it for + * something else. + */ + lockdep_assert_held(&t->lock); + + if (!kref_put(&t->references, target_release)) { + /* + * User-space thread is getting terminated. + * We need to scan the list for all those + * pending IO's which were not processed yet + * and put them back to work-queue for delayed + * processing. + */ + if (!is_user_space_thread_present(t)) { + struct list_head *cur, *tmp; + + list_for_each_safe(cur, tmp, &t->to_user) { + struct message *m = list_entry(cur, + struct message, + to_user); + if (!m->delayed) + enqueue_delayed_work(m, false); + } + /* + * Daemon attached to this target is terminated. + */ + t->daemon_terminated = true; + } + mutex_unlock(&t->lock); + } +} + +static struct channel *channel_alloc(struct target *t) +{ + struct channel *c; + + lockdep_assert_held(&t->lock); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (c == NULL) + return NULL; + + kref_get(&t->references); + c->target = t; + c->cur_from_user = &c->scratch_message_from_user; + mutex_init(&c->lock); + INIT_LIST_HEAD(&c->from_user); + return c; +} + +static void channel_free(struct channel *c) +{ + struct list_head *cur, *tmp; + + lockdep_assert_held(&c->lock); + + /* + * There may be outstanding BIOs that have been given to userspace but + * have not yet been completed. The channel has been shut down so + * there's no way to process the rest of those messages, so we just go + * ahead and error out the BIOs. Hopefully whatever's on the other end + * can handle the errors. One could imagine splitting the BIOs and + * completing as much as we got, but that seems like overkill here. + * + * Our only other options would be to let the BIO hang around (which + * seems way worse) or to resubmit it to userspace in the hope there's + * another channel. I don't really like the idea of submitting a + * message twice. 
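The reference scheme above is easiest to see with concrete events: one base reference owned jointly by DM and the miscdev, plus one per open channel, with the last put freeing the target. A toy model, with plain integers standing in for kref:

#include <stdio.h>

struct obj {
	int refs;	/* stand-in for struct kref */
};

static void obj_put(struct obj *o)
{
	if (--o->refs == 0)
		printf("released: last reference dropped\n");
}

int main(void)
{
	struct obj target = { .refs = 1 };	/* DM + miscdev reference */

	target.refs++;		/* dev_open() -> channel_alloc(): kref_get() */
	obj_put(&target);	/* user_dtr(): DM tears the target down */
	obj_put(&target);	/* dev_release() -> channel_free(): final put */
	return 0;
}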
+ */ + if (c->cur_to_user != NULL) + message_kill(c->cur_to_user, &c->target->message_pool); + if (c->cur_from_user != &c->scratch_message_from_user) + message_kill(c->cur_from_user, &c->target->message_pool); + list_for_each_safe (cur, tmp, &c->from_user) + message_kill(list_entry(cur, struct message, from_user), + &c->target->message_pool); + + mutex_lock(&c->target->lock); + target_put(c->target); + mutex_unlock(&c->lock); + mutex_destroy(&c->lock); + kfree(c); +} + +static int dev_open(struct inode *inode, struct file *file) +{ + struct channel *c; + struct target *t; + + /* + * This is called by miscdev, which sets private_data to point to the + * struct miscdevice that was opened. The rest of our file operations + * want to refer to the channel that's been opened, so we swap that + * pointer out with a fresh channel. + * + * This is called with the miscdev lock held, which is also held while + * registering/unregistering the miscdev. The miscdev must be + * registered for this to get called, which means there must be an + * outstanding reference to the target, which means it cannot be freed + * out from under us despite us not holding a reference yet. + */ + t = container_of(file->private_data, struct target, miscdev); + mutex_lock(&t->lock); + file->private_data = c = channel_alloc(t); + + if (c == NULL) { + mutex_unlock(&t->lock); + return -ENOMEM; + } + + mutex_unlock(&t->lock); + return 0; +} + +static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct channel *c = channel_from_file(iocb->ki_filp); + ssize_t total_processed = 0; + ssize_t processed; + + mutex_lock(&c->lock); + + if (unlikely(c->to_user_error)) { + total_processed = c->to_user_error; + goto cleanup_unlock; + } + + if (c->cur_to_user == NULL) { + struct target *t = target_from_channel(c); + + mutex_lock(&t->lock); + + while (!target_poll(t)) { + int e; + + mutex_unlock(&t->lock); + mutex_unlock(&c->lock); + e = wait_event_interruptible(t->wq, target_poll(t)); + mutex_lock(&c->lock); + mutex_lock(&t->lock); + + if (unlikely(e != 0)) { + /* + * We haven't processed any bytes in either the + * BIO or the IOV, so we can just terminate + * right now. Code elsewhere in the kernel + * handles restarting the syscall when + * appropriate. + */ + total_processed = e; + mutex_unlock(&t->lock); + goto cleanup_unlock; + } + } + + if (unlikely(t->dm_destroyed)) { + /* + * DM has destroyed this target, so just lock + * the user out. There's really nothing else + * we can do here. Note that we don't actually + * tear anything down until userspace has + * closed the FD, as there may still be + * outstanding BIOs. + * + * This is kind of a wacky error code to + * return. My goal was really just to try and + * find something that wasn't likely to be + * returned by anything else in the miscdev + * path. The message "block device required" + * seems like a somewhat reasonable thing to + * say when the target has disappeared out from + * under us, but "not block" isn't sensible. + */ + c->to_user_error = total_processed = -ENOTBLK; + mutex_unlock(&t->lock); + goto cleanup_unlock; + } + + /* + * Ensures that accesses to the message data are not ordered + * before the remote accesses that produce that message data. + * + * This pairs with the barrier in user_map(), via the + * conditional within the while loop above. Also see the lack + * of barrier in user_dtr(), which is why this can be after the + * destroyed check.
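The wait loop above is the classic drop-locks/sleep/retake/re-check pattern. For readers more at home in user space, here is the same shape with a pthread condition variable; the kernel version uses a waitqueue and must additionally juggle the channel lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int queued, destroyed;

static int poll_cond(void)	/* analogue of target_poll() */
{
	return queued > 0 || destroyed;
}

static void *consumer(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!poll_cond())
		pthread_cond_wait(&cond, &lock); /* drops lock while asleep */
	if (queued > 0)
		queued--;
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, consumer, NULL);
	pthread_mutex_lock(&lock);
	queued++;			/* user_map() queues a message */
	pthread_cond_signal(&cond);	/* wake_up_interruptible() analogue */
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	printf("consumed one message\n");
	return 0;
}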
+ */ + smp_rmb(); + + c->cur_to_user = msg_get_to_user(t); + WARN_ON(c->cur_to_user == NULL); + mutex_unlock(&t->lock); + } + + processed = msg_copy_to_iov(c->cur_to_user, to); + total_processed += processed; + + WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user); + if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) { + struct message *m = c->cur_to_user; + + c->cur_to_user = NULL; + list_add_tail(&m->from_user, &c->from_user); + } + +cleanup_unlock: + mutex_unlock(&c->lock); + return total_processed; +} + +static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct channel *c = channel_from_file(iocb->ki_filp); + ssize_t total_processed = 0; + ssize_t processed; + + mutex_lock(&c->lock); + + if (unlikely(c->from_user_error)) { + total_processed = c->from_user_error; + goto cleanup_unlock; + } + + /* + * cur_from_user can never be NULL. If there's no real message it must + * point to the scratch space. + */ + WARN_ON(c->cur_from_user == NULL); + if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) { + struct message *msg, *old; + + processed = msg_copy_from_iov(c->cur_from_user, from); + if (processed <= 0) { + pr_warn("msg_copy_from_iov() returned %zd\n", + processed); + c->from_user_error = -EINVAL; + goto cleanup_unlock; + } + total_processed += processed; + + /* + * In the unlikely event the user has provided us a very short + * write, not even big enough to fill a message, just succeed. + * We'll eventually build up enough bytes to do something. + */ + if (unlikely(c->cur_from_user->posn_from_user < + sizeof(struct dm_user_message))) + goto cleanup_unlock; + + old = c->cur_from_user; + mutex_lock(&c->target->lock); + msg = msg_get_from_user(c, c->cur_from_user->msg.seq); + if (msg == NULL) { + pr_info("user provided an invalid message seq of %llx\n", + old->msg.seq); + mutex_unlock(&c->target->lock); + c->from_user_error = -EINVAL; + goto cleanup_unlock; + } + mutex_unlock(&c->target->lock); + + WARN_ON(old->posn_from_user != sizeof(struct dm_user_message)); + msg->posn_from_user = sizeof(struct dm_user_message); + msg->return_type = old->msg.type; + msg->return_flags = old->msg.flags; + WARN_ON(msg->posn_from_user > msg->total_from_user); + c->cur_from_user = msg; + WARN_ON(old != &c->scratch_message_from_user); + } + + /* + * Userspace can signal an error for single requests by overwriting the + * type field. + */ + switch (c->cur_from_user->return_type) { + case DM_USER_RESP_SUCCESS: + c->cur_from_user->bio->bi_status = BLK_STS_OK; + break; + case DM_USER_RESP_ERROR: + case DM_USER_RESP_UNSUPPORTED: + default: + c->cur_from_user->bio->bi_status = BLK_STS_IOERR; + goto finish_bio; + } + + /* + * The op was a success as far as userspace is concerned, so process + * whatever data may come along with it. The user may provide the BIO + * data in multiple chunks, in which case we don't need to finish the + * BIO. + */ + processed = msg_copy_from_iov(c->cur_from_user, from); + total_processed += processed; + + if (c->cur_from_user->posn_from_user < + c->cur_from_user->total_from_user) + goto cleanup_unlock; + +finish_bio: + /* + * When we set up this message the BIO's size matched the + * message size; if that's not still the case, something + * has gone off the rails. + */ + WARN_ON(bio_size(c->cur_from_user->bio) != 0); + bio_endio(c->cur_from_user->bio); + + /* + * We don't actually need to take the target lock here, as all + * we're doing is freeing the message and mempools have their + * own lock.
Each channel has its own scratch message. + */ + WARN_ON(c->cur_from_user == &c->scratch_message_from_user); + mempool_free(c->cur_from_user, &c->target->message_pool); + c->scratch_message_from_user.posn_from_user = 0; + c->cur_from_user = &c->scratch_message_from_user; + +cleanup_unlock: + mutex_unlock(&c->lock); + return total_processed; +} + +static int dev_release(struct inode *inode, struct file *file) +{ + struct channel *c; + + c = channel_from_file(file); + mutex_lock(&c->lock); + channel_free(c); + + return 0; +} + +static const struct file_operations file_operations = { + .owner = THIS_MODULE, + .open = dev_open, + .llseek = no_llseek, + .read_iter = dev_read, + .write_iter = dev_write, + .release = dev_release, +}; + +static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct target *t; + int r; + + if (argc != 3) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto cleanup_none; + } + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) { + r = -ENOMEM; + goto cleanup_none; + } + ti->private = t; + + /* Enable more BIO types. */ + ti->num_discard_bios = 1; + ti->discards_supported = true; + ti->num_flush_bios = 1; + ti->flush_supported = true; + + /* + * We begin with a single reference to the target, which is miscdev's + * reference. This ensures that the target won't be freed + * until after the miscdev has been unregistered and all extant + * channels have been closed. + */ + kref_init(&t->references); + + t->daemon_terminated = false; + mutex_init(&t->lock); + init_waitqueue_head(&t->wq); + INIT_LIST_HEAD(&t->to_user); + mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES, + sizeof(struct message)); + + t->miscdev.minor = MISC_DYNAMIC_MINOR; + t->miscdev.fops = &file_operations; + t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]); + if (t->miscdev.name == NULL) { + r = -ENOMEM; + goto cleanup_message_pool; + } + + /* + * Once the miscdev is registered it can be opened and therefore + * concurrent references to the channel can happen. Holding the target + * lock during misc_register() could deadlock. If registration + * succeeds then we will not access the target again so we just stick a + * barrier here, which pairs with taking the target lock everywhere + * else the target is accessed. + * + * I forgot where we ended up on the RCpc/RCsc locks. IIUC, RCsc locks + * would mean that we could take the target lock earlier and release it + * here instead of the memory barrier. I'm not sure that's any better, + * though, and this isn't on a hot path so it probably doesn't matter + * either way. + */ + smp_mb(); + + r = misc_register(&t->miscdev); + if (r) { + DMERR("Unable to register miscdev %s for dm-user", + t->miscdev.name); + r = -ENOMEM; + goto cleanup_misc_name; + } + + return 0; + +cleanup_misc_name: + kfree(t->miscdev.name); +cleanup_message_pool: + mempool_exit(&t->message_pool); + kfree(t); +cleanup_none: + return r; +} + +static void user_dtr(struct dm_target *ti) +{ + struct target *t = target_from_target(ti); + + /* + * Removes the miscdev. This must be called without the target lock + * held to avoid a possible deadlock because our open implementation is + * called holding the miscdev lock and must later take the target lock. + * + * There is no race here because only DM can register/unregister the + * miscdev, and DM ensures that doesn't happen twice. The internal + * miscdev lock is sufficient to ensure there are no races between + * deregistering the miscdev and open.
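Putting dev_read() and dev_write() together, a daemon is just a read/write loop on the misc device. Below is a minimal, unhardened sketch of one that backs every read with zeroes and discards writes. The struct layout, the DM_USER_* values, and the target name "zero" are assumptions based on the uapi header this patch adds; a production daemon must also handle FLUSH/DISCARD, short reads and writes, and I/O errors:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <linux/types.h>

/* Best-effort mirror of include/uapi/linux/dm-user.h; values assumed. */
struct dm_user_message {
	__u64 seq, type, flags, sector, len;
	__u8 buf[];
};

#define DM_USER_REQ_MAP_READ		0
#define DM_USER_REQ_MAP_WRITE		1
#define DM_USER_RESP_SUCCESS		0
#define DM_USER_RESP_UNSUPPORTED	2

int main(void)
{
	/* __u64 storage keeps the cast-to-struct suitably aligned. */
	static __u64 storage[(sizeof(struct dm_user_message) + (1 << 20)) / 8];
	struct dm_user_message *msg = (struct dm_user_message *)storage;
	/* "zero" is a hypothetical name passed as argv[2] in the DM table. */
	int fd = open("/dev/dm-user/zero", O_RDWR);

	if (fd < 0)
		return 1;
	for (;;) {
		/* dev_read() hands out the header and payload separately. */
		if (read(fd, msg, sizeof(*msg)) != (ssize_t)sizeof(*msg))
			break;
		if (msg->type == DM_USER_REQ_MAP_WRITE &&
		    read(fd, msg->buf, msg->len) != (ssize_t)msg->len)
			break;

		switch (msg->type) {
		case DM_USER_REQ_MAP_READ:	/* back reads with zeroes */
			msg->type = DM_USER_RESP_SUCCESS;
			memset(msg->buf, 0, msg->len);
			write(fd, msg, sizeof(*msg) + msg->len);
			break;
		case DM_USER_REQ_MAP_WRITE:	/* accept, then discard */
			msg->type = DM_USER_RESP_SUCCESS;
			write(fd, msg, sizeof(*msg));
			break;
		default:			/* flush, discard, etc. */
			msg->type = DM_USER_RESP_UNSUPPORTED;
			write(fd, msg, sizeof(*msg));
			break;
		}
	}
	return 0;
}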
+ */ + misc_deregister(&t->miscdev); + + /* + * We are now free to take the target's lock and drop our reference to + * the target. There are almost certainly tasks sleeping in read on at + * least one of the channels associated with this target; this + * explicitly wakes them up and terminates the read. + */ + mutex_lock(&t->lock); + /* + * No barrier here, as wait/wake ensures that the flag visibility is + * correct WRT the wake/sleep state of the target tasks. + */ + t->dm_destroyed = true; + wake_up_all(&t->wq); + target_put(t); +} + +/* + * Consumes a BIO from device mapper, queueing it up for userspace. + */ +static int user_map(struct dm_target *ti, struct bio *bio) +{ + struct target *t; + struct message *entry; + + t = target_from_target(ti); + /* + * FIXME + * + * This seems like a bad idea. Specifically, here we're + * directly on the IO path when we take the target lock, which may also + * be taken from a user context. The user context doesn't actively + * trigger anything that may sleep while holding the lock, but this + * still seems like a bad idea. + * + * The obvious way to fix this would be to use a proper queue, which + * would result in no shared locks between the direct IO path and user + * tasks. I had a version that did this, but the head-of-line blocking + * from the circular buffer resulted in us needing a fairly large + * allocation in order to avoid situations in which the queue fills up + * and everything goes off the rails. + * + * I could jump through some hoops to avoid a shared lock while still + * allowing for a large queue, but I'm not actually sure that allowing + * for very large queues is the right thing to do here. Intuitively it + * seems better to keep the queues small in here (essentially sized to + * the user latency for performance reasons only) and rely on returning + * DM_MAPIO_REQUEUE regularly, as that would give the rest of the + * kernel more information. + * + * I'll spend some time trying to figure out what's going on with + * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix + * this I'm all ears. + */ + mutex_lock(&t->lock); + + /* + * FIXME + * + * The assumption here is that there's no benefit to returning + * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not + * sure that's actually true -- for example, I could imagine users + * expecting that submitted BIOs are unlikely to fail and therefore + * relying on submission failure to indicate an unsupported type. + * + * There are two ways I can think of to fix this: + * - Add DM arguments that are parsed during the constructor that + * allow various dm_target flags to be set that indicate the op + * types supported by this target. This may make sense for things + * like discard, where DM can already transform the BIOs to a form + * that's likely to be supported. + * - Some sort of pre-filter that allows userspace to hook in here + * and kill BIOs before marking them as submitted. My guess would + * be that a userspace round trip is a bad idea here, but a BPF + * call seems reasonable. + * + * My guess is that we'd likely want to do both. The first one is easy + * and gives DM the proper info, so it seems better. The BPF call + * seems overly complex for just this, but one could imagine wanting to + * sometimes return _MAPPED and a BPF filter would be the way to do + * that.
+ * + * For example, in Android we have an in-kernel DM device called + * "dm-bow" that takes advantage of some portion of the space that has + * been discarded on a device to provide opportunistic block-level + * backups. While one could imagine just implementing this entirely in + * userspace, that would come with an appreciable performance penalty. + * Instead one could keep a BPF program that forwards most accesses + * directly to the backing block device while informing a userspace + * daemon of any discarded space and of writes to blocks that are to be + * backed up. + */ + if (unlikely((bio_type_to_user_type(bio) < 0) || + (bio_flags_to_user_flags(bio) < 0))) { + mutex_unlock(&t->lock); + return DM_MAPIO_KILL; + } + + entry = msg_get_map(t); + if (unlikely(entry == NULL)) { + mutex_unlock(&t->lock); + return DM_MAPIO_REQUEUE; + } + + entry->msg.type = bio_type_to_user_type(bio); + entry->msg.flags = bio_flags_to_user_flags(bio); + entry->msg.sector = bio->bi_iter.bi_sector; + entry->msg.len = bio_size(bio); + entry->bio = bio; + entry->posn_to_user = 0; + entry->total_to_user = bio_bytes_needed_to_user(bio); + entry->posn_from_user = 0; + entry->total_from_user = bio_bytes_needed_from_user(bio); + entry->delayed = false; + entry->t = t; + /* Pairs with the barrier in dev_read() */ + smp_wmb(); + list_add_tail(&entry->to_user, &t->to_user); + + /* + * If there is no daemon to process the IOs, + * queue these messages into a workqueue with + * a timeout. + */ + if (!is_user_space_thread_present(t)) + enqueue_delayed_work(entry, !t->daemon_terminated); + + wake_up_interruptible(&t->wq); + mutex_unlock(&t->lock); + return DM_MAPIO_SUBMITTED; +} + +static struct target_type user_target = { + .name = "user", + .version = { 1, 0, 0 }, + .module = THIS_MODULE, + .ctr = user_ctr, + .dtr = user_dtr, + .map = user_map, +}; + +static int __init dm_user_init(void) +{ + int r; + + r = dm_register_target(&user_target); + if (r) { + DMERR("register failed %d", r); + goto error; + } + + return 0; + +error: + return r; +} + +static void __exit dm_user_exit(void) +{ + dm_unregister_target(&user_target); +} + +module_init(dm_user_init); +module_exit(dm_user_exit); +MODULE_AUTHOR("Palmer Dabbelt "); +MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9dd2c2da075d..de206ddaac20 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -29,7 +29,7 @@ #include #include #include -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> #define DM_MSG_PREFIX "core" @@ -1676,14 +1676,14 @@ static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); #ifdef CONFIG_BLK_INLINE_ENCRYPTION -static void dm_queue_destroy_keyslot_manager(struct request_queue *q) +static void dm_queue_destroy_crypto_profile(struct request_queue *q) { - dm_destroy_keyslot_manager(q->ksm); + dm_destroy_crypto_profile(q->crypto_profile); } #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q) +static inline void dm_queue_destroy_crypto_profile(struct request_queue *q) { } #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ @@ -1710,7 +1710,7 @@ static void cleanup_mapped_device(struct mapped_device *md) dm_sysfs_exit(md); del_gendisk(md->disk); } - dm_queue_destroy_keyslot_manager(md->queue); + dm_queue_destroy_crypto_profile(md->queue); blk_cleanup_disk(md->disk); } diff --git a/drivers/media/Kconfig b/drivers/media/Kconfig index f3f24c63536b..b65ae4053b97 100644 --- a/drivers/media/Kconfig
+++ b/drivers/media/Kconfig @@ -65,8 +65,7 @@ menu "Media device types" # Multimedia support - automatically enable V4L2 and DVB core # config MEDIA_CAMERA_SUPPORT - bool - prompt "Cameras and video grabbers" if MEDIA_SUPPORT_FILTER + bool "Cameras and video grabbers" default y if !MEDIA_SUPPORT_FILTER help Enable support for webcams and video grabbers. @@ -74,8 +73,7 @@ config MEDIA_CAMERA_SUPPORT Say Y when you have a webcam or a video capture grabber board. config MEDIA_ANALOG_TV_SUPPORT - bool - prompt "Analog TV" if MEDIA_SUPPORT_FILTER + bool "Analog TV" default y if !MEDIA_SUPPORT_FILTER help Enable analog TV support. @@ -88,8 +86,7 @@ config MEDIA_ANALOG_TV_SUPPORT will disable support for them. config MEDIA_DIGITAL_TV_SUPPORT - bool - prompt "Digital TV" if MEDIA_SUPPORT_FILTER + bool "Digital TV" default y if !MEDIA_SUPPORT_FILTER help Enable digital TV support. @@ -98,8 +95,7 @@ config MEDIA_DIGITAL_TV_SUPPORT hybrid digital TV and analog TV. config MEDIA_RADIO_SUPPORT - bool - prompt "AM/FM radio receivers/transmitters" if MEDIA_SUPPORT_FILTER + bool "AM/FM radio receivers/transmitters" default y if !MEDIA_SUPPORT_FILTER help Enable AM/FM radio support. @@ -114,8 +110,7 @@ config MEDIA_RADIO_SUPPORT disable support for them. config MEDIA_SDR_SUPPORT - bool - prompt "Software defined radio" if MEDIA_SUPPORT_FILTER + bool "Software defined radio" default y if !MEDIA_SUPPORT_FILTER help Enable software defined radio support. @@ -123,8 +118,7 @@ config MEDIA_SDR_SUPPORT Say Y when you have a software defined radio device. config MEDIA_PLATFORM_SUPPORT - bool - prompt "Platform-specific devices" if MEDIA_SUPPORT_FILTER + bool "Platform-specific devices" default y if !MEDIA_SUPPORT_FILTER help Enable support for complex cameras, codecs, and other hardware @@ -137,8 +131,7 @@ config MEDIA_PLATFORM_SUPPORT Say Y when you want to be able to see such devices. 
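The mechanical change in these Kconfig hunks is easier to see side by side. The old form only offers the prompt when MEDIA_SUPPORT_FILTER is set, so on unfiltered builds the symbol silently follows its default; the new form always offers the prompt, which lets config fragments (presumably Android's GKI defconfigs) set the symbols explicitly:

# Before: hidden prompt, value forced by the default on unfiltered builds.
config MEDIA_CAMERA_SUPPORT
	bool
	prompt "Cameras and video grabbers" if MEDIA_SUPPORT_FILTER
	default y if !MEDIA_SUPPORT_FILTER

# After: the prompt is unconditional; the default is unchanged.
config MEDIA_CAMERA_SUPPORT
	bool "Cameras and video grabbers"
	default y if !MEDIA_SUPPORT_FILTER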
config MEDIA_TEST_SUPPORT - bool - prompt "Test drivers" if MEDIA_SUPPORT_FILTER + bool "Test drivers" default y if !MEDIA_SUPPORT_FILTER help These drivers should not be used on production kernels, but @@ -179,7 +172,7 @@ config MEDIA_CONTROLLER # config DVB_CORE - tristate + tristate "DVB Core" depends on MEDIA_DIGITAL_TV_SUPPORT depends on (I2C || I2C=n) default MEDIA_DIGITAL_TV_SUPPORT diff --git a/drivers/media/usb/uvc/uvc_ctrl.c b/drivers/media/usb/uvc/uvc_ctrl.c index 5bb29fc49538..bfb80490e259 100644 --- a/drivers/media/usb/uvc/uvc_ctrl.c +++ b/drivers/media/usb/uvc/uvc_ctrl.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "uvcvideo.h" diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c index 72fff7264b54..1a9cc0080523 100644 --- a/drivers/media/usb/uvc/uvc_driver.c +++ b/drivers/media/usb/uvc/uvc_driver.c @@ -21,6 +21,7 @@ #include #include +#include #include "uvcvideo.h" @@ -35,193 +36,6 @@ static unsigned int uvc_quirks_param = -1; unsigned int uvc_dbg_param; unsigned int uvc_timeout_param = UVC_CTRL_STREAMING_TIMEOUT; -/* ------------------------------------------------------------------------ - * Video formats - */ - -static struct uvc_format_desc uvc_fmts[] = { - { - .name = "YUV 4:2:2 (YUYV)", - .guid = UVC_GUID_FORMAT_YUY2, - .fcc = V4L2_PIX_FMT_YUYV, - }, - { - .name = "YUV 4:2:2 (YUYV)", - .guid = UVC_GUID_FORMAT_YUY2_ISIGHT, - .fcc = V4L2_PIX_FMT_YUYV, - }, - { - .name = "YUV 4:2:0 (NV12)", - .guid = UVC_GUID_FORMAT_NV12, - .fcc = V4L2_PIX_FMT_NV12, - }, - { - .name = "MJPEG", - .guid = UVC_GUID_FORMAT_MJPEG, - .fcc = V4L2_PIX_FMT_MJPEG, - }, - { - .name = "YVU 4:2:0 (YV12)", - .guid = UVC_GUID_FORMAT_YV12, - .fcc = V4L2_PIX_FMT_YVU420, - }, - { - .name = "YUV 4:2:0 (I420)", - .guid = UVC_GUID_FORMAT_I420, - .fcc = V4L2_PIX_FMT_YUV420, - }, - { - .name = "YUV 4:2:0 (M420)", - .guid = UVC_GUID_FORMAT_M420, - .fcc = V4L2_PIX_FMT_M420, - }, - { - .name = "YUV 4:2:2 (UYVY)", - .guid = UVC_GUID_FORMAT_UYVY, - .fcc = V4L2_PIX_FMT_UYVY, - }, - { - .name = "Greyscale 8-bit (Y800)", - .guid = UVC_GUID_FORMAT_Y800, - .fcc = V4L2_PIX_FMT_GREY, - }, - { - .name = "Greyscale 8-bit (Y8 )", - .guid = UVC_GUID_FORMAT_Y8, - .fcc = V4L2_PIX_FMT_GREY, - }, - { - .name = "Greyscale 8-bit (D3DFMT_L8)", - .guid = UVC_GUID_FORMAT_D3DFMT_L8, - .fcc = V4L2_PIX_FMT_GREY, - }, - { - .name = "IR 8-bit (L8_IR)", - .guid = UVC_GUID_FORMAT_KSMEDIA_L8_IR, - .fcc = V4L2_PIX_FMT_GREY, - }, - { - .name = "Greyscale 10-bit (Y10 )", - .guid = UVC_GUID_FORMAT_Y10, - .fcc = V4L2_PIX_FMT_Y10, - }, - { - .name = "Greyscale 12-bit (Y12 )", - .guid = UVC_GUID_FORMAT_Y12, - .fcc = V4L2_PIX_FMT_Y12, - }, - { - .name = "Greyscale 16-bit (Y16 )", - .guid = UVC_GUID_FORMAT_Y16, - .fcc = V4L2_PIX_FMT_Y16, - }, - { - .name = "BGGR Bayer (BY8 )", - .guid = UVC_GUID_FORMAT_BY8, - .fcc = V4L2_PIX_FMT_SBGGR8, - }, - { - .name = "BGGR Bayer (BA81)", - .guid = UVC_GUID_FORMAT_BA81, - .fcc = V4L2_PIX_FMT_SBGGR8, - }, - { - .name = "GBRG Bayer (GBRG)", - .guid = UVC_GUID_FORMAT_GBRG, - .fcc = V4L2_PIX_FMT_SGBRG8, - }, - { - .name = "GRBG Bayer (GRBG)", - .guid = UVC_GUID_FORMAT_GRBG, - .fcc = V4L2_PIX_FMT_SGRBG8, - }, - { - .name = "RGGB Bayer (RGGB)", - .guid = UVC_GUID_FORMAT_RGGB, - .fcc = V4L2_PIX_FMT_SRGGB8, - }, - { - .name = "RGB565", - .guid = UVC_GUID_FORMAT_RGBP, - .fcc = V4L2_PIX_FMT_RGB565, - }, - { - .name = "BGR 8:8:8 (BGR3)", - .guid = UVC_GUID_FORMAT_BGR3, - .fcc = V4L2_PIX_FMT_BGR24, - }, - { - .name = "H.264", - .guid = UVC_GUID_FORMAT_H264, - .fcc = 
V4L2_PIX_FMT_H264, - }, - { - .name = "Greyscale 8 L/R (Y8I)", - .guid = UVC_GUID_FORMAT_Y8I, - .fcc = V4L2_PIX_FMT_Y8I, - }, - { - .name = "Greyscale 12 L/R (Y12I)", - .guid = UVC_GUID_FORMAT_Y12I, - .fcc = V4L2_PIX_FMT_Y12I, - }, - { - .name = "Depth data 16-bit (Z16)", - .guid = UVC_GUID_FORMAT_Z16, - .fcc = V4L2_PIX_FMT_Z16, - }, - { - .name = "Bayer 10-bit (SRGGB10P)", - .guid = UVC_GUID_FORMAT_RW10, - .fcc = V4L2_PIX_FMT_SRGGB10P, - }, - { - .name = "Bayer 16-bit (SBGGR16)", - .guid = UVC_GUID_FORMAT_BG16, - .fcc = V4L2_PIX_FMT_SBGGR16, - }, - { - .name = "Bayer 16-bit (SGBRG16)", - .guid = UVC_GUID_FORMAT_GB16, - .fcc = V4L2_PIX_FMT_SGBRG16, - }, - { - .name = "Bayer 16-bit (SRGGB16)", - .guid = UVC_GUID_FORMAT_RG16, - .fcc = V4L2_PIX_FMT_SRGGB16, - }, - { - .name = "Bayer 16-bit (SGRBG16)", - .guid = UVC_GUID_FORMAT_GR16, - .fcc = V4L2_PIX_FMT_SGRBG16, - }, - { - .name = "Depth data 16-bit (Z16)", - .guid = UVC_GUID_FORMAT_INVZ, - .fcc = V4L2_PIX_FMT_Z16, - }, - { - .name = "Greyscale 10-bit (Y10 )", - .guid = UVC_GUID_FORMAT_INVI, - .fcc = V4L2_PIX_FMT_Y10, - }, - { - .name = "IR:Depth 26-bit (INZI)", - .guid = UVC_GUID_FORMAT_INZI, - .fcc = V4L2_PIX_FMT_INZI, - }, - { - .name = "4-bit Depth Confidence (Packed)", - .guid = UVC_GUID_FORMAT_CNF4, - .fcc = V4L2_PIX_FMT_CNF4, - }, - { - .name = "HEVC", - .guid = UVC_GUID_FORMAT_HEVC, - .fcc = V4L2_PIX_FMT_HEVC, - }, -}; - /* ------------------------------------------------------------------------ * Utility functions */ @@ -241,19 +55,6 @@ struct usb_host_endpoint *uvc_find_endpoint(struct usb_host_interface *alts, return NULL; } -static struct uvc_format_desc *uvc_format_by_guid(const u8 guid[16]) -{ - unsigned int len = ARRAY_SIZE(uvc_fmts); - unsigned int i; - - for (i = 0; i < len; ++i) { - if (memcmp(guid, uvc_fmts[i].guid, 16) == 0) - return &uvc_fmts[i]; - } - - return NULL; -} - static enum v4l2_colorspace uvc_colorspace(const u8 primaries) { static const enum v4l2_colorspace colorprimaries[] = { @@ -325,86 +126,6 @@ static enum v4l2_ycbcr_encoding uvc_ycbcr_enc(const u8 matrix_coefficients) return V4L2_YCBCR_ENC_DEFAULT; /* Reserved */ } -/* Simplify a fraction using a simple continued fraction decomposition. The - * idea here is to convert fractions such as 333333/10000000 to 1/30 using - * 32 bit arithmetic only. The algorithm is not perfect and relies upon two - * arbitrary parameters to remove non-significative terms from the simple - * continued fraction decomposition. Using 8 and 333 for n_terms and threshold - * respectively seems to give nice results. - */ -void uvc_simplify_fraction(u32 *numerator, u32 *denominator, - unsigned int n_terms, unsigned int threshold) -{ - u32 *an; - u32 x, y, r; - unsigned int i, n; - - an = kmalloc_array(n_terms, sizeof(*an), GFP_KERNEL); - if (an == NULL) - return; - - /* Convert the fraction to a simple continued fraction. See - * https://mathforum.org/dr.math/faq/faq.fractions.html - * Stop if the current term is bigger than or equal to the given - * threshold. - */ - x = *numerator; - y = *denominator; - - for (n = 0; n < n_terms && y != 0; ++n) { - an[n] = x / y; - if (an[n] >= threshold) { - if (n < 2) - n++; - break; - } - - r = x - an[n] * y; - x = y; - y = r; - } - - /* Expand the simple continued fraction back to an integer fraction. */ - x = 0; - y = 1; - - for (i = n; i > 0; --i) { - r = y; - y = an[i-1] * y + x; - x = r; - } - - *numerator = y; - *denominator = x; - kfree(an); -} - -/* Convert a fraction to a frame interval in 100ns multiples. 
The idea here is - * to compute numerator / denominator * 10000000 using 32 bit fixed point - * arithmetic only. - */ -u32 uvc_fraction_to_interval(u32 numerator, u32 denominator) -{ - u32 multiplier; - - /* Saturate the result if the operation would overflow. */ - if (denominator == 0 || - numerator/denominator >= ((u32)-1)/10000000) - return (u32)-1; - - /* Divide both the denominator and the multiplier by two until - * numerator * multiplier doesn't overflow. If anyone knows a better - * algorithm please let me know. - */ - multiplier = 10000000; - while (numerator > ((u32)-1)/multiplier) { - multiplier /= 2; - denominator /= 2; - } - - return denominator ? numerator * multiplier / denominator : 0; -} - /* ------------------------------------------------------------------------ * Terminal and unit management */ @@ -3213,4 +2934,3 @@ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); MODULE_VERSION(DRIVER_VERSION); - diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c index 63842eb223a1..92f019a98cf6 100644 --- a/drivers/media/usb/uvc/uvc_v4l2.c +++ b/drivers/media/usb/uvc/uvc_v4l2.c @@ -374,7 +374,7 @@ static int uvc_v4l2_get_streamparm(struct uvc_streaming *stream, mutex_unlock(&stream->mutex); denominator = 10000000; - uvc_simplify_fraction(&numerator, &denominator, 8, 333); + v4l2_simplify_fraction(&numerator, &denominator, 8, 333); memset(parm, 0, sizeof(*parm)); parm->type = stream->type; @@ -415,7 +415,7 @@ static int uvc_v4l2_set_streamparm(struct uvc_streaming *stream, else timeperframe = parm->parm.output.timeperframe; - interval = uvc_fraction_to_interval(timeperframe.numerator, + interval = v4l2_fraction_to_interval(timeperframe.numerator, timeperframe.denominator); uvc_dbg(stream->dev, FORMAT, "Setting frame interval to %u/%u (%u)\n", timeperframe.numerator, timeperframe.denominator, interval); @@ -469,7 +469,7 @@ static int uvc_v4l2_set_streamparm(struct uvc_streaming *stream, /* Return the actual frame period. 
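The two helpers removed here move to the V4L2 core as v4l2_simplify_fraction() and v4l2_fraction_to_interval(), as the uvc_v4l2.c hunks around this point show. A quick worked example of the arithmetic, using plain 64-bit math instead of the overflow-avoiding halving loop in the removed kernel version:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 30 fps: timeperframe = 1/30 s -> frame interval in 100 ns units */
	uint32_t num = 1, den = 30;
	uint32_t interval = (uint32_t)((uint64_t)num * 10000000 / den);

	printf("interval = %u\n", interval);	/* 333333 */

	/* Going the other way, 333333/10000000 is what
	 * v4l2_simplify_fraction(&n, &d, 8, 333) reduces back to ~1/30. */
	return 0;
}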
*/ timeperframe.numerator = probe.dwFrameInterval; timeperframe.denominator = 10000000; - uvc_simplify_fraction(&timeperframe.numerator, + v4l2_simplify_fraction(&timeperframe.numerator, &timeperframe.denominator, 8, 333); if (parm->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) { @@ -1293,7 +1293,7 @@ static int uvc_ioctl_enum_frameintervals(struct file *file, void *fh, fival->discrete.numerator = frame->dwFrameInterval[index]; fival->discrete.denominator = 10000000; - uvc_simplify_fraction(&fival->discrete.numerator, + v4l2_simplify_fraction(&fival->discrete.numerator, &fival->discrete.denominator, 8, 333); } else { fival->type = V4L2_FRMIVAL_TYPE_STEPWISE; @@ -1303,11 +1303,11 @@ static int uvc_ioctl_enum_frameintervals(struct file *file, void *fh, fival->stepwise.max.denominator = 10000000; fival->stepwise.step.numerator = frame->dwFrameInterval[2]; fival->stepwise.step.denominator = 10000000; - uvc_simplify_fraction(&fival->stepwise.min.numerator, + v4l2_simplify_fraction(&fival->stepwise.min.numerator, &fival->stepwise.min.denominator, 8, 333); - uvc_simplify_fraction(&fival->stepwise.max.numerator, + v4l2_simplify_fraction(&fival->stepwise.max.numerator, &fival->stepwise.max.denominator, 8, 333); - uvc_simplify_fraction(&fival->stepwise.step.numerator, + v4l2_simplify_fraction(&fival->stepwise.step.numerator, &fival->stepwise.step.denominator, 8, 333); } diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h index c3ea6a53869f..9fb0ea2cd8e6 100644 --- a/drivers/media/usb/uvc/uvcvideo.h +++ b/drivers/media/usb/uvc/uvcvideo.h @@ -41,141 +41,6 @@ #define UVC_EXT_GPIO_UNIT 0x7ffe #define UVC_EXT_GPIO_UNIT_ID 0x100 -/* ------------------------------------------------------------------------ - * GUIDs - */ -#define UVC_GUID_UVC_CAMERA \ - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01} -#define UVC_GUID_UVC_OUTPUT \ - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02} -#define UVC_GUID_UVC_MEDIA_TRANSPORT_INPUT \ - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03} -#define UVC_GUID_UVC_PROCESSING \ - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01} -#define UVC_GUID_UVC_SELECTOR \ - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02} -#define UVC_GUID_EXT_GPIO_CONTROLLER \ - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03} - -#define UVC_GUID_FORMAT_MJPEG \ - { 'M', 'J', 'P', 'G', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_YUY2 \ - { 'Y', 'U', 'Y', '2', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_YUY2_ISIGHT \ - { 'Y', 'U', 'Y', '2', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0x00, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_NV12 \ - { 'N', 'V', '1', '2', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_YV12 \ - { 'Y', 'V', '1', '2', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_I420 \ - { 'I', '4', '2', '0', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_UYVY \ - { 'U', 'Y', 'V', 'Y', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_Y800 \ - 
{ 'Y', '8', '0', '0', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_Y8 \ - { 'Y', '8', ' ', ' ', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_Y10 \ - { 'Y', '1', '0', ' ', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_Y12 \ - { 'Y', '1', '2', ' ', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_Y16 \ - { 'Y', '1', '6', ' ', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_BY8 \ - { 'B', 'Y', '8', ' ', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_BA81 \ - { 'B', 'A', '8', '1', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_GBRG \ - { 'G', 'B', 'R', 'G', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_GRBG \ - { 'G', 'R', 'B', 'G', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_RGGB \ - { 'R', 'G', 'G', 'B', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_BG16 \ - { 'B', 'G', '1', '6', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_GB16 \ - { 'G', 'B', '1', '6', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_RG16 \ - { 'R', 'G', '1', '6', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_GR16 \ - { 'G', 'R', '1', '6', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_RGBP \ - { 'R', 'G', 'B', 'P', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_BGR3 \ - { 0x7d, 0xeb, 0x36, 0xe4, 0x4f, 0x52, 0xce, 0x11, \ - 0x9f, 0x53, 0x00, 0x20, 0xaf, 0x0b, 0xa7, 0x70} -#define UVC_GUID_FORMAT_M420 \ - { 'M', '4', '2', '0', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} - -#define UVC_GUID_FORMAT_H264 \ - { 'H', '2', '6', '4', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_Y8I \ - { 'Y', '8', 'I', ' ', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_Y12I \ - { 'Y', '1', '2', 'I', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_Z16 \ - { 'Z', '1', '6', ' ', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_RW10 \ - { 'R', 'W', '1', '0', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define UVC_GUID_FORMAT_INVZ \ - { 'I', 'N', 'V', 'Z', 0x90, 0x2d, 0x58, 0x4a, \ - 0x92, 0x0b, 0x77, 0x3f, 0x1f, 0x2c, 0x55, 0x6b} -#define UVC_GUID_FORMAT_INZI \ - { 'I', 'N', 'Z', 'I', 0x66, 0x1a, 0x42, 0xa2, \ - 0x90, 0x65, 0xd0, 0x18, 0x14, 0xa8, 0xef, 0x8a} -#define UVC_GUID_FORMAT_INVI \ - { 'I', 'N', 'V', 'I', 0xdb, 0x57, 0x49, 0x5e, \ - 0x8e, 0x3f, 0xf4, 0x79, 0x53, 0x2b, 0x94, 0x6f} -#define UVC_GUID_FORMAT_CNF4 \ - { 'C', ' ', ' ', ' ', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} - -#define UVC_GUID_FORMAT_D3DFMT_L8 \ - {0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -#define 
UVC_GUID_FORMAT_KSMEDIA_L8_IR \ - {0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} - -#define UVC_GUID_FORMAT_HEVC \ - { 'H', 'E', 'V', 'C', 0x00, 0x00, 0x10, 0x00, \ - 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} - - /* ------------------------------------------------------------------------ * Driver specific constants. */ @@ -280,13 +145,8 @@ struct uvc_control { struct uvc_fh *handle; /* File handle that last changed the control. */ }; -struct uvc_format_desc { - char *name; - u8 guid[16]; - u32 fcc; -}; - -/* The term 'entity' refers to both UVC units and UVC terminals. +/* + * The term 'entity' refers to both UVC units and UVC terminals. * * The type field is either the terminal type (wTerminalType in the terminal * descriptor), or the unit type (bDescriptorSubtype in the unit descriptor). @@ -905,9 +765,6 @@ int uvc_xu_ctrl_query(struct uvc_video_chain *chain, struct uvc_xu_control_query *xqry); /* Utility functions */ -void uvc_simplify_fraction(u32 *numerator, u32 *denominator, - unsigned int n_terms, unsigned int threshold); -u32 uvc_fraction_to_interval(u32 numerator, u32 denominator); struct usb_host_endpoint *uvc_find_endpoint(struct usb_host_interface *alts, u8 epaddr); diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 04af03285a20..00c99d3e0608 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -479,3 +479,89 @@ s64 v4l2_get_link_freq(struct v4l2_ctrl_handler *handler, unsigned int mul, return freq > 0 ? freq : -EINVAL; } EXPORT_SYMBOL_GPL(v4l2_get_link_freq); + +/* + * Simplify a fraction using a simple continued fraction decomposition. The + * idea here is to convert fractions such as 333333/10000000 to 1/30 using + * 32 bit arithmetic only. The algorithm is not perfect and relies upon two + * arbitrary parameters to remove non-significative terms from the simple + * continued fraction decomposition. Using 8 and 333 for n_terms and threshold + * respectively seems to give nice results. + */ +void v4l2_simplify_fraction(u32 *numerator, u32 *denominator, + unsigned int n_terms, unsigned int threshold) +{ + u32 *an; + u32 x, y, r; + unsigned int i, n; + + an = kmalloc_array(n_terms, sizeof(*an), GFP_KERNEL); + if (an == NULL) + return; + + /* + * Convert the fraction to a simple continued fraction. See + * https://en.wikipedia.org/wiki/Continued_fraction + * Stop if the current term is bigger than or equal to the given + * threshold. + */ + x = *numerator; + y = *denominator; + + for (n = 0; n < n_terms && y != 0; ++n) { + an[n] = x / y; + if (an[n] >= threshold) { + if (n < 2) + n++; + break; + } + + r = x - an[n] * y; + x = y; + y = r; + } + + /* Expand the simple continued fraction back to an integer fraction. */ + x = 0; + y = 1; + + for (i = n; i > 0; --i) { + r = y; + y = an[i-1] * y + x; + x = r; + } + + *numerator = y; + *denominator = x; + kfree(an); +} +EXPORT_SYMBOL_GPL(v4l2_simplify_fraction); + +/* + * Convert a fraction to a frame interval in 100ns multiples. The idea here is + * to compute numerator / denominator * 10000000 using 32 bit fixed point + * arithmetic only. + */ +u32 v4l2_fraction_to_interval(u32 numerator, u32 denominator) +{ + u32 multiplier; + + /* Saturate the result if the operation would overflow. 
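The overflow handling this comment describes can be pinned down with numbers. With numerator = 1000 and denominator = 600 (a 1000/600 s period), the full product 1000 * 10000000 would overflow 32 bits, so multiplier and denominator are halved twice, giving 1000 * 2500000 / 150 = 16666666 (about 1.67 s in 100 ns units); a ratio like 500/1 is caught by the first check and saturates. A standalone sketch mirroring the expressions (again a hypothetical userspace copy, not the kernel symbol):

#include <stdint.h>
#include <stdio.h>

static uint32_t fraction_to_interval(uint32_t numerator, uint32_t denominator)
{
	uint32_t multiplier = 10000000;

	/* Saturate when numerator / denominator * 1e7 cannot fit in 32 bits. */
	if (denominator == 0 ||
	    numerator / denominator >= UINT32_MAX / 10000000)
		return UINT32_MAX;

	/* Halve until numerator * multiplier fits in 32 bits. */
	while (numerator > UINT32_MAX / multiplier) {
		multiplier /= 2;
		denominator /= 2;
	}

	return denominator ? numerator * multiplier / denominator : 0;
}

int main(void)
{
	printf("%u\n", fraction_to_interval(1, 30));	 /* 333333 */
	printf("%u\n", fraction_to_interval(1000, 600)); /* 16666666 */
	printf("%u\n", fraction_to_interval(500, 1));	 /* 4294967295 */
	return 0;
}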
*/ + if (denominator == 0 || + numerator/denominator >= ((u32)-1)/10000000) + return (u32)-1; + + /* + * Divide both the denominator and the multiplier by two until + * numerator * multiplier doesn't overflow. If anyone knows a better + * algorithm please let me know. + */ + multiplier = 10000000; + while (numerator > ((u32)-1)/multiplier) { + multiplier /= 2; + denominator /= 2; + } + + return denominator ? numerator * multiplier / denominator : 0; +} +EXPORT_SYMBOL_GPL(v4l2_fraction_to_interval); diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c index b73358fe1163..5ea33b96a0a4 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c @@ -786,6 +786,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MIN_BUFFERS_FOR_OUTPUT: return "Min Number of Output Buffers"; case V4L2_CID_ALPHA_COMPONENT: return "Alpha Component"; case V4L2_CID_COLORFX_CBCR: return "Color Effects, CbCr"; + case V4L2_CID_COLORFX_RGB: return "Color Effects, RGB"; /* * Codec controls @@ -1395,11 +1396,12 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, *min = *max = *step = *def = 0; break; case V4L2_CID_BG_COLOR: + case V4L2_CID_COLORFX_RGB: *type = V4L2_CTRL_TYPE_INTEGER; *step = 1; *min = 0; - /* Max is calculated as RGB888 that is 2^24 */ - *max = 0xFFFFFF; + /* Max is calculated as RGB888 that is 2^24 - 1 */ + *max = 0xffffff; break; case V4L2_CID_FLASH_FAULT: case V4L2_CID_JPEG_ACTIVE_MARKER: diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 85285ba8e817..54d3f3e89c5e 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -1952,12 +1952,12 @@ mptsas_qcmd(struct Scsi_Host *shost, struct scsi_cmnd *SCpnt) * @sc: scsi command that the midlayer is about to time out * **/ -static enum blk_eh_timer_return mptsas_eh_timed_out(struct scsi_cmnd *sc) +static enum scsi_timeout_action mptsas_eh_timed_out(struct scsi_cmnd *sc) { MPT_SCSI_HOST *hd; MPT_ADAPTER *ioc; VirtDevice *vdevice; - enum blk_eh_timer_return rc = BLK_EH_DONE; + enum scsi_timeout_action rc = SCSI_EH_NOT_HANDLED; hd = shost_priv(sc->device->host); if (hd == NULL) { @@ -1980,7 +1980,7 @@ static enum blk_eh_timer_return mptsas_eh_timed_out(struct scsi_cmnd *sc) dtmprintk(ioc, printk(MYIOC_s_WARN_FMT ": %s: ioc is in reset," "SML need to reset the timer (sc=%p)\n", ioc->name, __func__, sc)); - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; } vdevice = sc->device->hostdata; if (vdevice && vdevice->vtarget && (vdevice->vtarget->inDMD @@ -1988,7 +1988,7 @@ static enum blk_eh_timer_return mptsas_eh_timed_out(struct scsi_cmnd *sc) dtmprintk(ioc, printk(MYIOC_s_WARN_FMT ": %s: target removed " "or in device removal delay (sc=%p)\n", ioc->name, __func__, sc)); - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; goto done; } diff --git a/drivers/mfd/syscon.c b/drivers/mfd/syscon.c index 191fdb87c424..8882c577e960 100644 --- a/drivers/mfd/syscon.c +++ b/drivers/mfd/syscon.c @@ -22,6 +22,7 @@ #include #include #include +#include static struct platform_driver syscon_driver; @@ -129,6 +130,7 @@ static struct syscon *of_syscon_register(struct device_node *np, bool check_clk) } } + trace_android_vh_regmap_update(&syscon_config, regmap); syscon->regmap = regmap; syscon->np = np; diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 606331451466..9eb38b58f226 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -468,6 +468,21 @@ config MISC_RTSX tristate 
default MISC_RTSX_PCI || MISC_RTSX_USB +config UID_SYS_STATS + bool "Per-UID statistics" + depends on PROFILING && TASK_XACCT && TASK_IO_ACCOUNTING + help + Per UID based cpu time statistics exported to /proc/uid_cputime + Per UID based io statistics exported to /proc/uid_io + Per UID based procstat control in /proc/uid_procstat + +config UID_SYS_STATS_DEBUG + bool "Per-TASK statistics" + depends on UID_SYS_STATS + default n + help + Per TASK based io statistics exported to /proc/uid_io + config HISI_HIKEY_USB tristate "USB GPIO Hub on HiSilicon Hikey 960/970 Platform" depends on (OF && GPIOLIB) || COMPILE_TEST @@ -478,6 +493,31 @@ config HISI_HIKEY_USB switching between the dual-role USB-C port and the USB-A host ports using only one USB controller. +config OPEN_DICE + tristate "Open Profile for DICE driver" + depends on OF_RESERVED_MEM + help + This driver exposes a DICE reserved memory region to userspace via + a character device. The memory region contains Compound Device + Identifiers (CDIs) generated by firmware as an output of DICE + measured boot flow. Userspace can use CDIs for remote attestation + and sealing. + + If unsure, say N. + +config VCPU_STALL_DETECTOR + tristate "Guest vCPU stall detector" + depends on OF && HAS_IOMEM + help + When this driver is bound inside a KVM guest, it will + periodically "pet" an MMIO stall detector device from each vCPU + and allow the host to detect vCPU stalls. + + To compile this driver as a module, choose M here: the module + will be called vcpu_stall_detector. + + If you do not intend to run this kernel as a guest, say N. + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index e0188c30ffa8..8d3cd0927cec 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -60,3 +60,6 @@ obj-$(CONFIG_UACCE) += uacce/ obj-$(CONFIG_XILINX_SDFEC) += xilinx_sdfec.o obj-$(CONFIG_HISI_HIKEY_USB) += hisi_hikey_usb.o obj-$(CONFIG_HI6421V600_IRQ) += hi6421v600-irq.o +obj-$(CONFIG_OPEN_DICE) += open-dice.o +obj-$(CONFIG_UID_SYS_STATS) += uid_sys_stats.o +obj-$(CONFIG_VCPU_STALL_DETECTOR) += vcpu_stall_detector.o \ No newline at end of file diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c new file mode 100644 index 000000000000..c61be3404c6f --- /dev/null +++ b/drivers/misc/open-dice.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 - Google LLC + * Author: David Brazdil + * + * Driver for Open Profile for DICE. + * + * This driver takes ownership of a reserved memory region containing data + * generated by the Open Profile for DICE measured boot protocol. The memory + * contents are not interpreted by the kernel but can be mapped into a userspace + * process via a misc device. Userspace can also request a wipe of the memory. 
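A fleshed-out version of the access pattern sketched just below in this header comment, with error handling (a sketch assuming the first probed instance registers as /dev/open-dice0):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	unsigned long size;
	void *data;
	int fd;

	fd = open("/dev/open-dice0", O_RDWR);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	/* The driver reports the size of the reserved region via read(). */
	if (read(fd, &size, sizeof(size)) != (ssize_t)sizeof(size)) {
		perror("read");
		close(fd);
		return EXIT_FAILURE;
	}

	/* Read-only private mapping; shared writable mappings are refused. */
	data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (data == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return EXIT_FAILURE;
	}

	/* ... hand the CDIs to the attestation/sealing machinery ... */

	/* Any write, payload ignored, asks the driver to wipe the region. */
	if (write(fd, NULL, 0) < 0)
		perror("write (wipe)");

	munmap(data, size);
	close(fd);
	return EXIT_SUCCESS;
}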
+ * + * Userspace can access the data with (w/o error handling): + * + * fd = open("/dev/open-dice0", O_RDWR); + * read(fd, &size, sizeof(unsigned long)); + * data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + * write(fd, NULL, 0); // wipe + * close(fd); + */ + +#include <linux/io.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/of_reserved_mem.h> +#include <linux/platform_device.h> + +#define DRIVER_NAME "open-dice" + +struct open_dice_drvdata { + struct mutex lock; + char name[16]; + struct reserved_mem *rmem; + struct miscdevice misc; +}; + +static inline struct open_dice_drvdata *to_open_dice_drvdata(struct file *filp) +{ + return container_of(filp->private_data, struct open_dice_drvdata, misc); +} + +static int open_dice_wipe(struct open_dice_drvdata *drvdata) +{ + void *kaddr; + + mutex_lock(&drvdata->lock); + kaddr = devm_memremap(drvdata->misc.this_device, drvdata->rmem->base, + drvdata->rmem->size, MEMREMAP_WC); + if (IS_ERR(kaddr)) { + mutex_unlock(&drvdata->lock); + return PTR_ERR(kaddr); + } + + memset(kaddr, 0, drvdata->rmem->size); + devm_memunmap(drvdata->misc.this_device, kaddr); + mutex_unlock(&drvdata->lock); + return 0; +} + +/* + * Copies the size of the reserved memory region to the user-provided buffer. + */ +static ssize_t open_dice_read(struct file *filp, char __user *ptr, size_t len, + loff_t *off) +{ + unsigned long val = to_open_dice_drvdata(filp)->rmem->size; + + return simple_read_from_buffer(ptr, len, off, &val, sizeof(val)); +} + +/* + * Triggers a wipe of the reserved memory region. The user-provided pointer + * is never dereferenced. + */ +static ssize_t open_dice_write(struct file *filp, const char __user *ptr, + size_t len, loff_t *off) +{ + if (open_dice_wipe(to_open_dice_drvdata(filp))) + return -EIO; + + /* Consume the input buffer. */ + return len; +} + +/* + * Creates a mapping of the reserved memory region in user address space. + */ +static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct open_dice_drvdata *drvdata = to_open_dice_drvdata(filp); + + /* Do not allow userspace to modify the underlying data. */ + if ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED)) + return -EPERM; + + /* Ensure userspace cannot acquire VM_WRITE + VM_SHARED later. */ + if (vma->vm_flags & VM_WRITE) + vma->vm_flags &= ~VM_MAYSHARE; + else if (vma->vm_flags & VM_SHARED) + vma->vm_flags &= ~VM_MAYWRITE; + + /* Create write-combine mapping so all clients observe a wipe.
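In userspace terms, the flag handling in this function boils down to three cases plus one follow-up guarantee; a compilable sketch (fd and size are hypothetical):

#include <stddef.h>
#include <sys/mman.h>

void open_dice_mapping_rules(int fd, size_t size)
{
	/* Allowed: private writable; writes land in CoW copies, not the CDIs. */
	(void)mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);

	/* Allowed: shared read-only; every reader observes a wipe. */
	(void)mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);

	/* Refused with EPERM: shared and writable could corrupt the CDIs. */
	(void)mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/*
	 * Because VM_MAYWRITE is cleared on the shared mapping, a later
	 * mprotect(PROT_READ | PROT_WRITE) on it fails with EACCES, so the
	 * forbidden combination cannot be reconstructed after mmap().
	 */
}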
*/ + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + vma->vm_flags |= VM_DONTCOPY | VM_DONTDUMP; + return vm_iomap_memory(vma, drvdata->rmem->base, drvdata->rmem->size); +} + +static const struct file_operations open_dice_fops = { + .owner = THIS_MODULE, + .read = open_dice_read, + .write = open_dice_write, + .mmap = open_dice_mmap, +}; + +static int __init open_dice_probe(struct platform_device *pdev) +{ + static unsigned int dev_idx; + struct device *dev = &pdev->dev; + struct reserved_mem *rmem; + struct open_dice_drvdata *drvdata; + int ret; + + rmem = of_reserved_mem_lookup(dev->of_node); + if (!rmem) { + dev_err(dev, "failed to lookup reserved memory\n"); + return -EINVAL; + } + + if (!rmem->size || (rmem->size > ULONG_MAX)) { + dev_err(dev, "invalid memory region size\n"); + return -EINVAL; + } + + if (!PAGE_ALIGNED(rmem->base) || !PAGE_ALIGNED(rmem->size)) { + dev_err(dev, "memory region must be page-aligned\n"); + return -EINVAL; + } + + drvdata = devm_kmalloc(dev, sizeof(*drvdata), GFP_KERNEL); + if (!drvdata) + return -ENOMEM; + + *drvdata = (struct open_dice_drvdata){ + .lock = __MUTEX_INITIALIZER(drvdata->lock), + .rmem = rmem, + .misc = (struct miscdevice){ + .parent = dev, + .name = drvdata->name, + .minor = MISC_DYNAMIC_MINOR, + .fops = &open_dice_fops, + .mode = 0600, + }, + }; + + /* Index overflow check not needed, misc_register() will fail. */ + snprintf(drvdata->name, sizeof(drvdata->name), DRIVER_NAME"%u", dev_idx++); + + ret = misc_register(&drvdata->misc); + if (ret) { + dev_err(dev, "failed to register misc device '%s': %d\n", + drvdata->name, ret); + return ret; + } + + platform_set_drvdata(pdev, drvdata); + return 0; +} + +static int open_dice_remove(struct platform_device *pdev) +{ + struct open_dice_drvdata *drvdata = platform_get_drvdata(pdev); + + misc_deregister(&drvdata->misc); + return 0; +} + +static const struct of_device_id open_dice_of_match[] = { + { .compatible = "google,open-dice" }, + {}, +}; + +static struct platform_driver open_dice_driver = { + .remove = open_dice_remove, + .driver = { + .name = DRIVER_NAME, + .of_match_table = open_dice_of_match, + }, +}; + +static int __init open_dice_init(void) +{ + int ret = platform_driver_probe(&open_dice_driver, open_dice_probe); + + /* DICE regions are optional. Succeed even with zero instances. */ + return (ret == -ENODEV) ? 0 : ret; +} + +static void __exit open_dice_exit(void) +{ + platform_driver_unregister(&open_dice_driver); +} + +module_init(open_dice_init); +module_exit(open_dice_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("David Brazdil "); diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c new file mode 100644 index 000000000000..985e7b02ab3c --- /dev/null +++ b/drivers/misc/uid_sys_stats.c @@ -0,0 +1,774 @@ +/* drivers/misc/uid_sys_stats.c + * + * Copyright (C) 2014 - 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
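Before the implementation details, the interface this file ends up exposing: three proc trees, registered in proc_uid_sys_stats_init() at the bottom. A minimal consumer sketch (paths and formats as created below; /proc/uid_io/stats follows the same open/read pattern):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f;

	/* One "uid: user_us sys_us" line per tracked uid. */
	f = fopen("/proc/uid_cputime/show_uid_stat", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}

	/* Move uid 1000 to the foreground bucket (state 0). */
	f = fopen("/proc/uid_procstat/set", "w");
	if (f) {
		fputs("1000 0", f);
		fclose(f);
	}

	/* Drop accounting state for uids 5000 through 5010. */
	f = fopen("/proc/uid_cputime/remove_uid_range", "w");
	if (f) {
		fputs("5000-5010", f);
		fclose(f);
	}
	return 0;
}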
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define UID_HASH_BITS 10 +DECLARE_HASHTABLE(hash_table, UID_HASH_BITS); + +static DEFINE_RT_MUTEX(uid_lock); +static struct proc_dir_entry *cpu_parent; +static struct proc_dir_entry *io_parent; +static struct proc_dir_entry *proc_parent; + +struct io_stats { + u64 read_bytes; + u64 write_bytes; + u64 rchar; + u64 wchar; + u64 fsync; +}; + +#define UID_STATE_FOREGROUND 0 +#define UID_STATE_BACKGROUND 1 +#define UID_STATE_BUCKET_SIZE 2 + +#define UID_STATE_TOTAL_CURR 2 +#define UID_STATE_TOTAL_LAST 3 +#define UID_STATE_DEAD_TASKS 4 +#define UID_STATE_SIZE 5 + +#define MAX_TASK_COMM_LEN 256 + +struct task_entry { + char comm[MAX_TASK_COMM_LEN]; + pid_t pid; + struct io_stats io[UID_STATE_SIZE]; + struct hlist_node hash; +}; + +struct uid_entry { + uid_t uid; + u64 utime; + u64 stime; + u64 active_utime; + u64 active_stime; + int state; + struct io_stats io[UID_STATE_SIZE]; + struct hlist_node hash; +#ifdef CONFIG_UID_SYS_STATS_DEBUG + DECLARE_HASHTABLE(task_entries, UID_HASH_BITS); +#endif +}; + +static u64 compute_write_bytes(struct task_io_accounting *ioac) +{ + if (ioac->write_bytes <= ioac->cancelled_write_bytes) + return 0; + + return ioac->write_bytes - ioac->cancelled_write_bytes; +} + +static void compute_io_bucket_stats(struct io_stats *io_bucket, + struct io_stats *io_curr, + struct io_stats *io_last, + struct io_stats *io_dead) +{ + /* tasks could switch to another uid group, but its io_last in the + * previous uid group could still be positive. + * therefore before each update, do an overflow check first + */ + int64_t delta; + + delta = io_curr->read_bytes + io_dead->read_bytes - + io_last->read_bytes; + io_bucket->read_bytes += delta > 0 ? delta : 0; + delta = io_curr->write_bytes + io_dead->write_bytes - + io_last->write_bytes; + io_bucket->write_bytes += delta > 0 ? delta : 0; + delta = io_curr->rchar + io_dead->rchar - io_last->rchar; + io_bucket->rchar += delta > 0 ? delta : 0; + delta = io_curr->wchar + io_dead->wchar - io_last->wchar; + io_bucket->wchar += delta > 0 ? delta : 0; + delta = io_curr->fsync + io_dead->fsync - io_last->fsync; + io_bucket->fsync += delta > 0 ? 
delta : 0; + + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; + + memset(io_dead, 0, sizeof(struct io_stats)); +} + +#ifdef CONFIG_UID_SYS_STATS_DEBUG +static void get_full_task_comm(struct task_entry *task_entry, + struct task_struct *task) +{ + int i = 0, offset = 0, len = 0; + /* save one byte for terminating null character */ + int unused_len = MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1; + char buf[MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1]; + struct mm_struct *mm = task->mm; + + /* fill the first TASK_COMM_LEN bytes with thread name */ + __get_task_comm(task_entry->comm, TASK_COMM_LEN, task); + i = strlen(task_entry->comm); + while (i < TASK_COMM_LEN) + task_entry->comm[i++] = ' '; + + /* next the executable file name */ + if (mm) { + mmap_write_lock(mm); + if (mm->exe_file) { + char *pathname = d_path(&mm->exe_file->f_path, buf, + unused_len); + + if (!IS_ERR(pathname)) { + len = strlcpy(task_entry->comm + i, pathname, + unused_len); + i += len; + task_entry->comm[i++] = ' '; + unused_len--; + } + } + mmap_write_unlock(mm); + } + unused_len -= len; + + /* fill the rest with command line argument + * replace each null or new line character + * between args in argv with whitespace */ + len = get_cmdline(task, buf, unused_len); + while (offset < len) { + if (buf[offset] != '\0' && buf[offset] != '\n') + task_entry->comm[i++] = buf[offset]; + else + task_entry->comm[i++] = ' '; + offset++; + } + + /* get rid of trailing whitespaces in case when arg is memset to + * zero before being reset in userspace + */ + while (task_entry->comm[i-1] == ' ') + i--; + task_entry->comm[i] = '\0'; +} + +static struct task_entry *find_task_entry(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct task_entry *task_entry; + + hash_for_each_possible(uid_entry->task_entries, task_entry, hash, + task->pid) { + if (task->pid == task_entry->pid) { + /* if thread name changed, update the entire command */ + int len = strnchr(task_entry->comm, ' ', TASK_COMM_LEN) + - task_entry->comm; + + if (strncmp(task_entry->comm, task->comm, len)) + get_full_task_comm(task_entry, task); + return task_entry; + } + } + return NULL; +} + +static struct task_entry *find_or_register_task(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct task_entry *task_entry; + pid_t pid = task->pid; + + task_entry = find_task_entry(uid_entry, task); + if (task_entry) + return task_entry; + + task_entry = kzalloc(sizeof(struct task_entry), GFP_ATOMIC); + if (!task_entry) + return NULL; + + get_full_task_comm(task_entry, task); + + task_entry->pid = pid; + hash_add(uid_entry->task_entries, &task_entry->hash, (unsigned int)pid); + + return task_entry; +} + +static void remove_uid_tasks(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + struct hlist_node *tmp_task; + + hash_for_each_safe(uid_entry->task_entries, bkt_task, + tmp_task, task_entry, hash) { + hash_del(&task_entry->hash); + kfree(task_entry); + } +} + +static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + memset(&task_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + } +} + +static void add_uid_tasks_io_stats(struct task_entry *task_entry, + struct task_io_accounting *ioac, int slot) +{ + struct io_stats 
*task_io_slot = &task_entry->io[slot]; + + task_io_slot->read_bytes += ioac->read_bytes; + task_io_slot->write_bytes += compute_write_bytes(ioac); + task_io_slot->rchar += ioac->rchar; + task_io_slot->wchar += ioac->wchar; + task_io_slot->fsync += ioac->syscfs; +} + +static void compute_io_uid_tasks(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + compute_io_bucket_stats(&task_entry->io[uid_entry->state], + &task_entry->io[UID_STATE_TOTAL_CURR], + &task_entry->io[UID_STATE_TOTAL_LAST], + &task_entry->io[UID_STATE_DEAD_TASKS]); + } +} + +static void show_io_uid_tasks(struct seq_file *m, struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + /* Separated by comma because space exists in task comm */ + seq_printf(m, "task,%s,%lu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", + task_entry->comm, + (unsigned long)task_entry->pid, + task_entry->io[UID_STATE_FOREGROUND].rchar, + task_entry->io[UID_STATE_FOREGROUND].wchar, + task_entry->io[UID_STATE_FOREGROUND].read_bytes, + task_entry->io[UID_STATE_FOREGROUND].write_bytes, + task_entry->io[UID_STATE_BACKGROUND].rchar, + task_entry->io[UID_STATE_BACKGROUND].wchar, + task_entry->io[UID_STATE_BACKGROUND].read_bytes, + task_entry->io[UID_STATE_BACKGROUND].write_bytes, + task_entry->io[UID_STATE_FOREGROUND].fsync, + task_entry->io[UID_STATE_BACKGROUND].fsync); + } +} +#else +static void remove_uid_tasks(struct uid_entry *uid_entry) {}; +static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) {}; +static void compute_io_uid_tasks(struct uid_entry *uid_entry) {}; +static void show_io_uid_tasks(struct seq_file *m, + struct uid_entry *uid_entry) {} +#endif + +static struct uid_entry *find_uid_entry(uid_t uid) +{ + struct uid_entry *uid_entry; + hash_for_each_possible(hash_table, uid_entry, hash, uid) { + if (uid_entry->uid == uid) + return uid_entry; + } + return NULL; +} + +static struct uid_entry *find_or_register_uid(uid_t uid) +{ + struct uid_entry *uid_entry; + + uid_entry = find_uid_entry(uid); + if (uid_entry) + return uid_entry; + + uid_entry = kzalloc(sizeof(struct uid_entry), GFP_ATOMIC); + if (!uid_entry) + return NULL; + + uid_entry->uid = uid; +#ifdef CONFIG_UID_SYS_STATS_DEBUG + hash_init(uid_entry->task_entries); +#endif + hash_add(hash_table, &uid_entry->hash, uid); + + return uid_entry; +} + +static int uid_cputime_show(struct seq_file *m, void *v) +{ + struct uid_entry *uid_entry = NULL; + struct task_struct *task, *temp; + struct user_namespace *user_ns = current_user_ns(); + u64 utime; + u64 stime; + unsigned long bkt; + uid_t uid; + + rt_mutex_lock(&uid_lock); + + hash_for_each(hash_table, bkt, uid_entry, hash) { + uid_entry->active_stime = 0; + uid_entry->active_utime = 0; + } + + rcu_read_lock(); + do_each_thread(temp, task) { + uid = from_kuid_munged(user_ns, task_uid(task)); + if (!uid_entry || uid_entry->uid != uid) + uid_entry = find_or_register_uid(uid); + if (!uid_entry) { + rcu_read_unlock(); + rt_mutex_unlock(&uid_lock); + pr_err("%s: failed to find the uid_entry for uid %d\n", + __func__, uid); + return -ENOMEM; + } + /* avoid double accounting of dying threads */ + if (!(task->flags & PF_EXITING)) { + task_cputime_adjusted(task, &utime, &stime); + uid_entry->active_utime += utime; + uid_entry->active_stime += stime; + } + } while_each_thread(temp, task); + rcu_read_unlock(); + + 
hash_for_each(hash_table, bkt, uid_entry, hash) { + u64 total_utime = uid_entry->utime + + uid_entry->active_utime; + u64 total_stime = uid_entry->stime + + uid_entry->active_stime; + seq_printf(m, "%d: %llu %llu\n", uid_entry->uid, + ktime_to_us(total_utime), ktime_to_us(total_stime)); + } + + rt_mutex_unlock(&uid_lock); + return 0; +} + +static int uid_cputime_open(struct inode *inode, struct file *file) +{ + return single_open(file, uid_cputime_show, PDE_DATA(inode)); +} + +static const struct proc_ops uid_cputime_fops = { + .proc_open = uid_cputime_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +static int uid_remove_open(struct inode *inode, struct file *file) +{ + return single_open(file, NULL, NULL); +} + +static ssize_t uid_remove_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + struct uid_entry *uid_entry; + struct hlist_node *tmp; + char uids[128]; + char *start_uid, *end_uid = NULL; + long int uid_start = 0, uid_end = 0; + + if (count >= sizeof(uids)) + count = sizeof(uids) - 1; + + if (copy_from_user(uids, buffer, count)) + return -EFAULT; + + uids[count] = '\0'; + end_uid = uids; + start_uid = strsep(&end_uid, "-"); + + if (!start_uid || !end_uid) + return -EINVAL; + + if (kstrtol(start_uid, 10, &uid_start) != 0 || + kstrtol(end_uid, 10, &uid_end) != 0) { + return -EINVAL; + } + + rt_mutex_lock(&uid_lock); + + for (; uid_start <= uid_end; uid_start++) { + hash_for_each_possible_safe(hash_table, uid_entry, tmp, + hash, (uid_t)uid_start) { + if (uid_start == uid_entry->uid) { + remove_uid_tasks(uid_entry); + hash_del(&uid_entry->hash); + kfree(uid_entry); + } + } + } + + rt_mutex_unlock(&uid_lock); + return count; +} + +static const struct proc_ops uid_remove_fops = { + .proc_open = uid_remove_open, + .proc_release = single_release, + .proc_write = uid_remove_write, +}; + +static void __add_uid_io_stats(struct uid_entry *uid_entry, + struct task_io_accounting *ioac, int slot) +{ + struct io_stats *io_slot = &uid_entry->io[slot]; + + io_slot->read_bytes += ioac->read_bytes; + io_slot->write_bytes += compute_write_bytes(ioac); + io_slot->rchar += ioac->rchar; + io_slot->wchar += ioac->wchar; + io_slot->fsync += ioac->syscfs; +} + +static void add_uid_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) +{ + struct task_entry *task_entry __maybe_unused; + + /* avoid double accounting of dying threads */ + if (slot != UID_STATE_DEAD_TASKS && (task->flags & PF_EXITING)) + return; + +#ifdef CONFIG_UID_SYS_STATS_DEBUG + task_entry = find_or_register_task(uid_entry, task); + add_uid_tasks_io_stats(task_entry, &task->ioac, slot); +#endif + __add_uid_io_stats(uid_entry, &task->ioac, slot); +} + +static void update_io_stats_all_locked(void) +{ + struct uid_entry *uid_entry = NULL; + struct task_struct *task, *temp; + struct user_namespace *user_ns = current_user_ns(); + unsigned long bkt; + uid_t uid; + + hash_for_each(hash_table, bkt, uid_entry, hash) { + memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + set_io_uid_tasks_zero(uid_entry); + } + + rcu_read_lock(); + do_each_thread(temp, task) { + uid = from_kuid_munged(user_ns, task_uid(task)); + if (!uid_entry || uid_entry->uid != uid) + uid_entry = find_or_register_uid(uid); + if (!uid_entry) + continue; + add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); + } while_each_thread(temp, task); + rcu_read_unlock(); + + hash_for_each(hash_table, bkt, uid_entry, hash) { + 
compute_io_bucket_stats(&uid_entry->io[uid_entry->state], + &uid_entry->io[UID_STATE_TOTAL_CURR], + &uid_entry->io[UID_STATE_TOTAL_LAST], + &uid_entry->io[UID_STATE_DEAD_TASKS]); + compute_io_uid_tasks(uid_entry); + } +} + +static void update_io_stats_uid_locked(struct uid_entry *uid_entry) +{ + struct task_struct *task, *temp; + struct user_namespace *user_ns = current_user_ns(); + + memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + set_io_uid_tasks_zero(uid_entry); + + rcu_read_lock(); + do_each_thread(temp, task) { + if (from_kuid_munged(user_ns, task_uid(task)) != uid_entry->uid) + continue; + add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); + } while_each_thread(temp, task); + rcu_read_unlock(); + + compute_io_bucket_stats(&uid_entry->io[uid_entry->state], + &uid_entry->io[UID_STATE_TOTAL_CURR], + &uid_entry->io[UID_STATE_TOTAL_LAST], + &uid_entry->io[UID_STATE_DEAD_TASKS]); + compute_io_uid_tasks(uid_entry); +} + + +static int uid_io_show(struct seq_file *m, void *v) +{ + struct uid_entry *uid_entry; + unsigned long bkt; + + rt_mutex_lock(&uid_lock); + + update_io_stats_all_locked(); + + hash_for_each(hash_table, bkt, uid_entry, hash) { + seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + uid_entry->uid, + uid_entry->io[UID_STATE_FOREGROUND].rchar, + uid_entry->io[UID_STATE_FOREGROUND].wchar, + uid_entry->io[UID_STATE_FOREGROUND].read_bytes, + uid_entry->io[UID_STATE_FOREGROUND].write_bytes, + uid_entry->io[UID_STATE_BACKGROUND].rchar, + uid_entry->io[UID_STATE_BACKGROUND].wchar, + uid_entry->io[UID_STATE_BACKGROUND].read_bytes, + uid_entry->io[UID_STATE_BACKGROUND].write_bytes, + uid_entry->io[UID_STATE_FOREGROUND].fsync, + uid_entry->io[UID_STATE_BACKGROUND].fsync); + + show_io_uid_tasks(m, uid_entry); + } + + rt_mutex_unlock(&uid_lock); + return 0; +} + +static int uid_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, uid_io_show, PDE_DATA(inode)); +} + +static const struct proc_ops uid_io_fops = { + .proc_open = uid_io_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +static int uid_procstat_open(struct inode *inode, struct file *file) +{ + return single_open(file, NULL, NULL); +} + +static ssize_t uid_procstat_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + struct uid_entry *uid_entry; + uid_t uid; + int argc, state; + char input[128]; + + if (count >= sizeof(input)) + return -EINVAL; + + if (copy_from_user(input, buffer, count)) + return -EFAULT; + + input[count] = '\0'; + + argc = sscanf(input, "%u %d", &uid, &state); + if (argc != 2) + return -EINVAL; + + if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND) + return -EINVAL; + + rt_mutex_lock(&uid_lock); + + uid_entry = find_or_register_uid(uid); + if (!uid_entry) { + rt_mutex_unlock(&uid_lock); + return -EINVAL; + } + + if (uid_entry->state == state) { + rt_mutex_unlock(&uid_lock); + return count; + } + + update_io_stats_uid_locked(uid_entry); + + uid_entry->state = state; + + rt_mutex_unlock(&uid_lock); + + return count; +} + +static const struct proc_ops uid_procstat_fops = { + .proc_open = uid_procstat_open, + .proc_release = single_release, + .proc_write = uid_procstat_write, +}; + +struct update_stats_work { + struct work_struct work; + uid_t uid; +#ifdef CONFIG_UID_SYS_STATS_DEBUG + struct task_struct *task; +#endif + struct task_io_accounting ioac; + u64 utime; + u64 stime; +}; + +static void update_stats_workfn(struct work_struct 
*work) +{ + struct update_stats_work *usw = + container_of(work, struct update_stats_work, work); + struct uid_entry *uid_entry; + struct task_entry *task_entry __maybe_unused; + + rt_mutex_lock(&uid_lock); + uid_entry = find_uid_entry(usw->uid); + if (!uid_entry) + goto exit; + + uid_entry->utime += usw->utime; + uid_entry->stime += usw->stime; + +#ifdef CONFIG_UID_SYS_STATS_DEBUG + task_entry = find_task_entry(uid_entry, usw->task); + if (!task_entry) + goto exit; + add_uid_tasks_io_stats(task_entry, &usw->ioac, + UID_STATE_DEAD_TASKS); +#endif + __add_uid_io_stats(uid_entry, &usw->ioac, UID_STATE_DEAD_TASKS); +exit: + rt_mutex_unlock(&uid_lock); +#ifdef CONFIG_UID_SYS_STATS_DEBUG + put_task_struct(usw->task); +#endif + kfree(usw); +} + +static int process_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + struct task_struct *task = v; + struct uid_entry *uid_entry; + u64 utime, stime; + uid_t uid; + + if (!task) + return NOTIFY_OK; + + uid = from_kuid_munged(current_user_ns(), task_uid(task)); + if (!rt_mutex_trylock(&uid_lock)) { + struct update_stats_work *usw; + + usw = kmalloc(sizeof(struct update_stats_work), GFP_KERNEL); + if (usw) { + INIT_WORK(&usw->work, update_stats_workfn); + usw->uid = uid; +#ifdef CONFIG_UID_SYS_STATS_DEBUG + usw->task = get_task_struct(task); +#endif + /* + * Copy task->ioac since task might be destroyed before + * the work is later performed. + */ + usw->ioac = task->ioac; + task_cputime_adjusted(task, &usw->utime, &usw->stime); + schedule_work(&usw->work); + } + return NOTIFY_OK; + } + + uid_entry = find_or_register_uid(uid); + if (!uid_entry) { + pr_err("%s: failed to find uid %d\n", __func__, uid); + goto exit; + } + + task_cputime_adjusted(task, &utime, &stime); + uid_entry->utime += utime; + uid_entry->stime += stime; + + add_uid_io_stats(uid_entry, task, UID_STATE_DEAD_TASKS); + +exit: + rt_mutex_unlock(&uid_lock); + return NOTIFY_OK; +} + +static struct notifier_block process_notifier_block = { + .notifier_call = process_notifier, +}; + +static int __init proc_uid_sys_stats_init(void) +{ + hash_init(hash_table); + + cpu_parent = proc_mkdir("uid_cputime", NULL); + if (!cpu_parent) { + pr_err("%s: failed to create uid_cputime proc entry\n", + __func__); + goto err; + } + + proc_create_data("remove_uid_range", 0222, cpu_parent, + &uid_remove_fops, NULL); + proc_create_data("show_uid_stat", 0444, cpu_parent, + &uid_cputime_fops, NULL); + + io_parent = proc_mkdir("uid_io", NULL); + if (!io_parent) { + pr_err("%s: failed to create uid_io proc entry\n", + __func__); + goto err; + } + + proc_create_data("stats", 0444, io_parent, + &uid_io_fops, NULL); + + proc_parent = proc_mkdir("uid_procstat", NULL); + if (!proc_parent) { + pr_err("%s: failed to create uid_procstat proc entry\n", + __func__); + goto err; + } + + proc_create_data("set", 0222, proc_parent, + &uid_procstat_fops, NULL); + + profile_event_register(PROFILE_TASK_EXIT, &process_notifier_block); + + return 0; + +err: + remove_proc_subtree("uid_cputime", NULL); + remove_proc_subtree("uid_io", NULL); + remove_proc_subtree("uid_procstat", NULL); + return -ENOMEM; +} + +early_initcall(proc_uid_sys_stats_init); diff --git a/drivers/misc/vcpu_stall_detector.c b/drivers/misc/vcpu_stall_detector.c new file mode 100644 index 000000000000..53b5506080e1 --- /dev/null +++ b/drivers/misc/vcpu_stall_detector.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// VCPU stall detector. 
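The notifier above deliberately never sleeps on a contended lock: when rt_mutex_trylock() fails it snapshots the dying task's counters and defers the accounting to a workqueue. Reduced to its essentials the pattern looks like this (a sketch with hypothetical names, not the driver's exact code):

#include <linux/cred.h>
#include <linux/notifier.h>
#include <linux/rtmutex.h>
#include <linux/sched.h>
#include <linux/sched/cputime.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static DEFINE_RT_MUTEX(stats_lock);

struct deferred_update {
	struct work_struct work;
	struct task_io_accounting ioac;	/* snapshot: the task may be gone */
	u64 utime, stime;
	uid_t uid;
};

static void deferred_workfn(struct work_struct *work)
{
	struct deferred_update *du =
		container_of(work, struct deferred_update, work);

	rt_mutex_lock(&stats_lock);	/* worker context: sleeping is fine */
	/* ... fold du->ioac, du->utime, du->stime into the uid entry ... */
	rt_mutex_unlock(&stats_lock);
	kfree(du);
}

static int on_task_exit(struct task_struct *task)
{
	struct deferred_update *du;

	if (rt_mutex_trylock(&stats_lock)) {
		/* ... account against task directly, lock held ... */
		rt_mutex_unlock(&stats_lock);
		return NOTIFY_OK;
	}

	/* Contended: copy what is needed now, account from a worker later. */
	du = kmalloc(sizeof(*du), GFP_KERNEL);
	if (!du)
		return NOTIFY_OK;	/* this task's stats are simply lost */

	du->uid = from_kuid_munged(current_user_ns(), task_uid(task));
	du->ioac = task->ioac;
	task_cputime_adjusted(task, &du->utime, &du->stime);
	INIT_WORK(&du->work, deferred_workfn);
	schedule_work(&du->work);
	return NOTIFY_OK;
}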
+// Copyright (C) Google, 2022 + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VCPU_STALL_REG_STATUS (0x00) +#define VCPU_STALL_REG_LOAD_CNT (0x04) +#define VCPU_STALL_REG_CURRENT_CNT (0x08) +#define VCPU_STALL_REG_CLOCK_FREQ_HZ (0x0C) +#define VCPU_STALL_REG_LEN (0x10) + +#define VCPU_STALL_DEFAULT_CLOCK_HZ (10) +#define VCPU_STALL_MAX_CLOCK_HZ (100) +#define VCPU_STALL_DEFAULT_TIMEOUT_SEC (8) +#define VCPU_STALL_MAX_TIMEOUT_SEC (600) + +struct vcpu_stall_detect_config { + u32 clock_freq_hz; + u32 stall_timeout_sec; + + void __iomem *membase; + struct platform_device *dev; + enum cpuhp_state hp_online; +}; + +struct vcpu_stall_priv { + struct hrtimer vcpu_hrtimer; + bool is_initialized; +}; + +/* The vcpu stall configuration structure which applies to all the CPUs */ +static struct vcpu_stall_detect_config vcpu_stall_config; + +#define vcpu_stall_reg_write(vcpu, reg, value) \ + writel_relaxed((value), \ + (void __iomem *)(vcpu_stall_config.membase + \ + (vcpu) * VCPU_STALL_REG_LEN + (reg))) + + +static struct vcpu_stall_priv __percpu *vcpu_stall_detectors; + +static enum hrtimer_restart +vcpu_stall_detect_timer_fn(struct hrtimer *hrtimer) +{ + u32 ticks, ping_timeout_ms; + + /* Reload the stall detector counter register every + * `ping_timeout_ms` to prevent the virtual device + * from decrementing it to 0. The virtual device decrements this + * register at 'clock_freq_hz' frequency. + */ + ticks = vcpu_stall_config.clock_freq_hz * + vcpu_stall_config.stall_timeout_sec; + vcpu_stall_reg_write(smp_processor_id(), + VCPU_STALL_REG_LOAD_CNT, ticks); + + ping_timeout_ms = vcpu_stall_config.stall_timeout_sec * + MSEC_PER_SEC / 2; + hrtimer_forward_now(hrtimer, + ms_to_ktime(ping_timeout_ms)); + + return HRTIMER_RESTART; +} + +static int start_stall_detector_cpu(unsigned int cpu) +{ + u32 ticks, ping_timeout_ms; + struct vcpu_stall_priv *vcpu_stall_detector = + this_cpu_ptr(vcpu_stall_detectors); + struct hrtimer *vcpu_hrtimer = &vcpu_stall_detector->vcpu_hrtimer; + + vcpu_stall_reg_write(cpu, VCPU_STALL_REG_CLOCK_FREQ_HZ, + vcpu_stall_config.clock_freq_hz); + + /* Compute the number of ticks required for the stall detector + * counter register based on the internal clock frequency and the + * timeout value given from the device tree. + */ + ticks = vcpu_stall_config.clock_freq_hz * + vcpu_stall_config.stall_timeout_sec; + vcpu_stall_reg_write(cpu, VCPU_STALL_REG_LOAD_CNT, ticks); + + /* Enable the internal clock and start the stall detector */ + vcpu_stall_reg_write(cpu, VCPU_STALL_REG_STATUS, 1); + + /* Pet the stall detector at half of its expiration timeout + * to prevent spurious resets. 
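With the defaults above, the arithmetic is: load count = 10 Hz * 8 s = 80 ticks, and the guest re-arms every 8 * 1000 / 2 = 4000 ms, so under normal scheduling the counter never drops below half of its load value. The same expressions as a standalone sketch:

#include <stdio.h>

#define MSEC_PER_SEC 1000

int main(void)
{
	unsigned int clock_freq_hz = 10;	/* VCPU_STALL_DEFAULT_CLOCK_HZ */
	unsigned int stall_timeout_sec = 8;	/* VCPU_STALL_DEFAULT_TIMEOUT_SEC */

	/* Value the host decrements at clock_freq_hz. */
	unsigned int ticks = clock_freq_hz * stall_timeout_sec;

	/* Re-arm period: half of the timeout, in milliseconds. */
	unsigned int ping_timeout_ms = stall_timeout_sec * MSEC_PER_SEC / 2;

	printf("load count: %u ticks\n", ticks);	/* 80 */
	printf("ping every: %u ms\n", ping_timeout_ms);	/* 4000 */
	return 0;
}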
+ */ + ping_timeout_ms = vcpu_stall_config.stall_timeout_sec * + MSEC_PER_SEC / 2; + + hrtimer_init(vcpu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + vcpu_hrtimer->function = vcpu_stall_detect_timer_fn; + vcpu_stall_detector->is_initialized = true; + + hrtimer_start(vcpu_hrtimer, ms_to_ktime(ping_timeout_ms), + HRTIMER_MODE_REL_PINNED); + + return 0; +} + +static int stop_stall_detector_cpu(unsigned int cpu) +{ + struct vcpu_stall_priv *vcpu_stall_detector = + per_cpu_ptr(vcpu_stall_detectors, cpu); + + if (!vcpu_stall_detector->is_initialized) + return 0; + + /* Disable the stall detector for the current CPU */ + hrtimer_cancel(&vcpu_stall_detector->vcpu_hrtimer); + vcpu_stall_reg_write(cpu, VCPU_STALL_REG_STATUS, 0); + vcpu_stall_detector->is_initialized = false; + + return 0; +} + +static int vcpu_stall_detect_probe(struct platform_device *pdev) +{ + int ret; + struct resource *r; + void __iomem *membase; + u32 clock_freq_hz = VCPU_STALL_DEFAULT_CLOCK_HZ; + u32 stall_timeout_sec = VCPU_STALL_DEFAULT_TIMEOUT_SEC; + struct device_node *np = pdev->dev.of_node; + + vcpu_stall_detectors = devm_alloc_percpu(&pdev->dev, + typeof(struct vcpu_stall_priv)); + if (!vcpu_stall_detectors) + return -ENOMEM; + + membase = devm_platform_get_and_ioremap_resource(pdev, 0, &r); + if (IS_ERR(membase)) { + dev_err(&pdev->dev, "Failed to get memory resource\n"); + return PTR_ERR(membase); + } + + if (!of_property_read_u32(np, "clock-frequency", &clock_freq_hz)) { + if (!(clock_freq_hz > 0 && + clock_freq_hz < VCPU_STALL_MAX_CLOCK_HZ)) { + dev_warn(&pdev->dev, "clk out of range\n"); + clock_freq_hz = VCPU_STALL_DEFAULT_CLOCK_HZ; + } + } + + if (!of_property_read_u32(np, "timeout-sec", &stall_timeout_sec)) { + if (!(stall_timeout_sec > 0 && + stall_timeout_sec < VCPU_STALL_MAX_TIMEOUT_SEC)) { + dev_warn(&pdev->dev, "stall timeout out of range\n"); + stall_timeout_sec = VCPU_STALL_DEFAULT_TIMEOUT_SEC; + } + } + + vcpu_stall_config = (struct vcpu_stall_detect_config) { + .membase = membase, + .clock_freq_hz = clock_freq_hz, + .stall_timeout_sec = stall_timeout_sec + }; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "virt/vcpu_stall_detector:online", + start_stall_detector_cpu, + stop_stall_detector_cpu); + if (ret < 0) { + dev_err(&pdev->dev, "failed to install cpu hotplug"); + goto err; + } + + vcpu_stall_config.hp_online = ret; + return 0; +err: + return ret; +} + +static int vcpu_stall_detect_remove(struct platform_device *pdev) +{ + int cpu; + + cpuhp_remove_state(vcpu_stall_config.hp_online); + + for_each_possible_cpu(cpu) + stop_stall_detector_cpu(cpu); + + return 0; +} + +static const struct of_device_id vcpu_stall_detect_of_match[] = { + { .compatible = "qemu,vcpu-stall-detector", }, + {} +}; + +MODULE_DEVICE_TABLE(of, vcpu_stall_detect_of_match); + +static struct platform_driver vcpu_stall_detect_driver = { + .probe = vcpu_stall_detect_probe, + .remove = vcpu_stall_detect_remove, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = vcpu_stall_detect_of_match, + }, +}; + +module_platform_driver(vcpu_stall_detect_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sebastian Ene "); +MODULE_DESCRIPTION("VCPU stall detector"); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 66694d9f671f..08f2bc7abf28 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -49,6 +49,8 @@ #include +#include + #include "queue.h" #include "block.h" #include "core.h" @@ -1015,16 +1017,25 @@ static int mmc_blk_reset(struct mmc_blk_data *md, struct mmc_host *host, * in that case. 
*/ main_md->part_curr = err ? MMC_BLK_PART_INVALID : main_md->part_type; - if (err) - return err; /* Ensure we switch back to the correct partition */ - if (mmc_blk_part_switch(host->card, md->part_type)) - /* - * We have failed to get back into the correct - * partition, so we need to abort the whole request. - */ - return -ENODEV; - return 0; + if (err) { + struct mmc_blk_data *main_md = + dev_get_drvdata(&host->card->dev); + int part_err; + + main_md->part_curr = main_md->part_type; + part_err = mmc_blk_part_switch(host->card, md->part_type); + if (part_err) { + /* + * We have failed to get back into the correct + * partition, so we need to abort the whole request. + */ + return -ENODEV; + } + + trace_android_vh_mmc_blk_reset(host, err); + } + return err; } static inline void mmc_blk_reset_success(struct mmc_blk_data *md, int type) @@ -1499,7 +1510,7 @@ void mmc_blk_cqe_recovery(struct mmc_queue *mq) pr_debug("%s: CQE recovery start\n", mmc_hostname(host)); err = mmc_cqe_recovery(host); - if (err) + if (err || host->cqe_recovery_reset_always) mmc_blk_reset(mq->blkdata, host, MMC_BLK_CQE_RECOVERY); mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY); @@ -1858,6 +1869,7 @@ static void mmc_blk_mq_rw_recovery(struct mmc_queue *mq, struct request *req) err && mmc_blk_reset(md, card->host, type)) { pr_err("%s: recovery failed!\n", req->rq_disk->disk_name); mqrq->retries = MMC_NO_RETRIES; + trace_android_vh_mmc_blk_mq_rw_recovery(card); return; } diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index e7f35575aa1e..7d057ef00e7e 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -917,6 +917,7 @@ void mmc_set_clock(struct mmc_host *host, unsigned int hz) host->ios.clock = hz; mmc_set_ios(host); } +EXPORT_SYMBOL_GPL(mmc_set_clock); int mmc_execute_tuning(struct mmc_card *card) { @@ -943,12 +944,15 @@ int mmc_execute_tuning(struct mmc_card *card) } /* Only print error when we don't check for card removal */ - if (!host->detect_change) + if (!host->detect_change) { pr_err("%s: tuning execution failed: %d\n", mmc_hostname(host), err); + mmc_debugfs_err_stats_inc(host, MMC_ERR_TUNING); + } return err; } +EXPORT_SYMBOL_GPL(mmc_execute_tuning); /* * Change the bus mode (open drain/push-pull) of a host. @@ -958,6 +962,7 @@ void mmc_set_bus_mode(struct mmc_host *host, unsigned int mode) host->ios.bus_mode = mode; mmc_set_ios(host); } +EXPORT_SYMBOL_GPL(mmc_set_bus_mode); /* * Change data bus width of a host. @@ -967,6 +972,7 @@ void mmc_set_bus_width(struct mmc_host *host, unsigned int width) host->ios.bus_width = width; mmc_set_ios(host); } +EXPORT_SYMBOL_GPL(mmc_set_bus_width); /* * Set initial state after a power cycle or a hw_reset. @@ -1000,6 +1006,7 @@ void mmc_set_initial_state(struct mmc_host *host) mmc_crypto_set_initial_state(host); } +EXPORT_SYMBOL_GPL(mmc_set_initial_state); /** * mmc_vdd_to_ocrbitnum - Convert a voltage to the OCR bit number @@ -1269,6 +1276,7 @@ void mmc_set_timing(struct mmc_host *host, unsigned int timing) host->ios.timing = timing; mmc_set_ios(host); } +EXPORT_SYMBOL_GPL(mmc_set_timing); /* * Select appropriate driver type for host. @@ -2254,6 +2262,12 @@ void mmc_rescan(struct work_struct *work) if (freqs[i] <= host->f_min) break; } + + /* + * Ignore the command timeout errors observed during + * the card init as those are excepted. 
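These counters are fed from the core and from host controller drivers through mmc_debugfs_err_stats_inc(), as in the mmc_execute_tuning() hunk above. A hedged sketch of how a controller's interrupt handler might record errors; the interrupt-status bits here are purely hypothetical:

#include <linux/bits.h>
#include <linux/mmc/host.h>

/* Hypothetical controller interrupt-status layout, for illustration only. */
#define CTL_INT_CMD_TIMEOUT	BIT(0)
#define CTL_INT_CMD_CRC		BIT(1)
#define CTL_INT_DATA_TIMEOUT	BIT(2)
#define CTL_INT_DATA_CRC	BIT(3)

static void record_errors(struct mmc_host *mmc, u32 intmask)
{
	if (intmask & CTL_INT_CMD_TIMEOUT)
		mmc_debugfs_err_stats_inc(mmc, MMC_ERR_CMD_TIMEOUT);
	if (intmask & CTL_INT_CMD_CRC)
		mmc_debugfs_err_stats_inc(mmc, MMC_ERR_CMD_CRC);
	if (intmask & CTL_INT_DATA_TIMEOUT)
		mmc_debugfs_err_stats_inc(mmc, MMC_ERR_DAT_TIMEOUT);
	if (intmask & CTL_INT_DATA_CRC)
		mmc_debugfs_err_stats_inc(mmc, MMC_ERR_DAT_CRC);
}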
+ */ + host->err_stats[MMC_ERR_CMD_TIMEOUT] = 0; mmc_release_host(host); out: diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index f5f3f623ea49..a086152e0fb5 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -42,7 +42,7 @@ struct device_node *mmc_of_find_child_device(struct mmc_host *host, void mmc_init_erase(struct mmc_card *card); void mmc_set_chip_select(struct mmc_host *host, int mode); -void mmc_set_clock(struct mmc_host *host, unsigned int hz); +extern void mmc_set_clock(struct mmc_host *host, unsigned int hz); void mmc_set_bus_mode(struct mmc_host *host, unsigned int mode); void mmc_set_bus_width(struct mmc_host *host, unsigned int width); u32 mmc_select_voltage(struct mmc_host *host, u32 ocr); diff --git a/drivers/mmc/core/crypto.c b/drivers/mmc/core/crypto.c index 67557808cada..fec4fbf16a5b 100644 --- a/drivers/mmc/core/crypto.c +++ b/drivers/mmc/core/crypto.c @@ -16,13 +16,13 @@ void mmc_crypto_set_initial_state(struct mmc_host *host) { /* Reset might clear all keys, so reprogram all the keys. */ if (host->caps2 & MMC_CAP2_CRYPTO) - blk_ksm_reprogram_all_keys(&host->ksm); + blk_crypto_reprogram_all_keys(&host->crypto_profile); } void mmc_crypto_setup_queue(struct request_queue *q, struct mmc_host *host) { if (host->caps2 & MMC_CAP2_CRYPTO) - blk_ksm_register(&host->ksm, q); + blk_crypto_register(&host->crypto_profile, q); } EXPORT_SYMBOL_GPL(mmc_crypto_setup_queue); @@ -30,12 +30,15 @@ void mmc_crypto_prepare_req(struct mmc_queue_req *mqrq) { struct request *req = mmc_queue_req_to_req(mqrq); struct mmc_request *mrq = &mqrq->brq.mrq; + struct blk_crypto_keyslot *keyslot; if (!req->crypt_ctx) return; mrq->crypto_ctx = req->crypt_ctx; - if (req->crypt_keyslot) - mrq->crypto_key_slot = blk_ksm_get_slot_idx(req->crypt_keyslot); + + keyslot = req->crypt_keyslot; + if (keyslot) + mrq->crypto_key_slot = blk_crypto_keyslot_index(keyslot); } EXPORT_SYMBOL_GPL(mmc_crypto_prepare_req); diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 3fdbc801e64a..9ed69c339ad5 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -223,6 +223,77 @@ static int mmc_clock_opt_set(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(mmc_clock_fops, mmc_clock_opt_get, mmc_clock_opt_set, "%llu\n"); +static int mmc_err_state_get(void *data, u64 *val) +{ + struct mmc_host *host = data; + int i; + + *val = 0; + for (i = 0; i < ARRAY_SIZE(host->err_stats); i++) { + if (host->err_stats[i]) { + *val = 1; + break; + } + } + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(mmc_err_state, mmc_err_state_get, NULL, "%llu\n"); + +static int mmc_err_stats_show(struct seq_file *file, void *data) +{ + struct mmc_host *host = (struct mmc_host *)file->private; + static const char *desc[MMC_ERR_MAX] = { + [MMC_ERR_CMD_TIMEOUT] = "Command Timeout Occurred", + [MMC_ERR_CMD_CRC] = "Command CRC Errors Occurred", + [MMC_ERR_DAT_TIMEOUT] = "Data Timeout Occurred", + [MMC_ERR_DAT_CRC] = "Data CRC Errors Occurred", + [MMC_ERR_AUTO_CMD] = "Auto-Cmd Error Occurred", + [MMC_ERR_ADMA] = "ADMA Error Occurred", + [MMC_ERR_TUNING] = "Tuning Error Occurred", + [MMC_ERR_CMDQ_RED] = "CMDQ RED Errors", + [MMC_ERR_CMDQ_GCE] = "CMDQ GCE Errors", + [MMC_ERR_CMDQ_ICCE] = "CMDQ ICCE Errors", + [MMC_ERR_REQ_TIMEOUT] = "Request Timedout", + [MMC_ERR_CMDQ_REQ_TIMEOUT] = "CMDQ Request Timedout", + [MMC_ERR_ICE_CFG] = "ICE Config Errors", + [MMC_ERR_CTRL_TIMEOUT] = "Controller Timedout errors", + [MMC_ERR_UNEXPECTED_IRQ] = "Unexpected IRQ errors", + }; + int i; + + for (i = 0; i < 
ARRAY_SIZE(desc); i++) { + if (desc[i]) + seq_printf(file, "# %s:\t %d\n", + desc[i], host->err_stats[i]); + } + + return 0; +} + +static int mmc_err_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, mmc_err_stats_show, inode->i_private); +} + +static ssize_t mmc_err_stats_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct mmc_host *host = filp->f_mapping->host->i_private; + + pr_debug("%s: Resetting MMC error statistics\n", __func__); + memset(host->err_stats, 0, sizeof(host->err_stats)); + + return cnt; +} + +static const struct file_operations mmc_err_stats_fops = { + .open = mmc_err_stats_open, + .read = seq_read, + .write = mmc_err_stats_write, +}; + void mmc_add_host_debugfs(struct mmc_host *host) { struct dentry *root; @@ -236,6 +307,11 @@ void mmc_add_host_debugfs(struct mmc_host *host) debugfs_create_file_unsafe("clock", S_IRUSR | S_IWUSR, root, host, &mmc_clock_fops); + debugfs_create_file("err_state", 0600, root, host, + &mmc_err_state); + debugfs_create_file("err_stats", 0600, root, host, + &mmc_err_stats_fops); + #ifdef CONFIG_FAIL_MMC_REQUEST if (fail_request) setup_fault_attr(&fail_default_attr, fail_request); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index d739e2b631fe..98b0ee8b0a96 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -166,6 +166,7 @@ void mmc_retune_hold(struct mmc_host *host) host->retune_now = 1; host->hold_retune += 1; } +EXPORT_SYMBOL(mmc_retune_hold); void mmc_retune_release(struct mmc_host *host) { diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index d805f8450719..627a821f671b 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "core.h" #include "card.h" @@ -975,7 +976,7 @@ static int mmc_select_powerclass(struct mmc_card *card) /* * Set the bus speed for the selected speed mode. */ -static void mmc_set_bus_speed(struct mmc_card *card) +void mmc_set_bus_speed(struct mmc_card *card) { unsigned int max_dtr = (unsigned int)-1; @@ -995,7 +996,7 @@ static void mmc_set_bus_speed(struct mmc_card *card) * If the bus width is changed successfully, return the selected width value. * Zero is returned instead of error value if the wide width is not supported. */ -static int mmc_select_bus_width(struct mmc_card *card) +int mmc_select_bus_width(struct mmc_card *card) { static unsigned ext_csd_bits[] = { EXT_CSD_BUS_WIDTH_8, @@ -1060,11 +1061,12 @@ static int mmc_select_bus_width(struct mmc_card *card) return err; } +EXPORT_SYMBOL_GPL(mmc_select_bus_width); /* * Switch to the high-speed mode */ -static int mmc_select_hs(struct mmc_card *card) +int mmc_select_hs(struct mmc_card *card) { int err; @@ -1078,11 +1080,12 @@ static int mmc_select_hs(struct mmc_card *card) return err; } +EXPORT_SYMBOL_GPL(mmc_select_hs); /* * Activate wide bus and DDR if supported. 
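Both new nodes land in the per-host debugfs directory next to the existing clock attribute. A minimal userspace sketch, assuming debugfs is mounted at /sys/kernel/debug and the host is mmc0:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* err_state reads 1 if any error counter is non-zero, else 0. */
	fd = open("/sys/kernel/debug/mmc0/err_state", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("err_state: %s", buf);
		}
		close(fd);
	}

	/* err_stats prints one labelled counter per line... */
	fd = open("/sys/kernel/debug/mmc0/err_stats", O_RDWR);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}
		/* ...and any write resets every counter to zero. */
		if (write(fd, "0", 1) < 0)
			perror("write");
		close(fd);
	}
	return 0;
}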
*/ -static int mmc_select_hs_ddr(struct mmc_card *card) +int mmc_select_hs_ddr(struct mmc_card *card) { struct mmc_host *host = card->host; u32 bus_width, ext_csd_bits; @@ -1151,8 +1154,9 @@ static int mmc_select_hs_ddr(struct mmc_card *card) return err; } +EXPORT_SYMBOL_GPL(mmc_select_hs_ddr); -static int mmc_select_hs400(struct mmc_card *card) +int mmc_select_hs400(struct mmc_card *card) { struct mmc_host *host = card->host; unsigned int max_dtr; @@ -1224,6 +1228,14 @@ static int mmc_select_hs400(struct mmc_card *card) mmc_set_timing(host, MMC_TIMING_MMC_HS400); mmc_set_bus_speed(card); + if (host->ops->execute_hs400_tuning) { + mmc_retune_disable(host); + err = host->ops->execute_hs400_tuning(host, card); + mmc_retune_enable(host); + if (err) + goto out_err; + } + if (host->ops->hs400_complete) host->ops->hs400_complete(host); @@ -1238,6 +1250,7 @@ out_err: __func__, err); return err; } +EXPORT_SYMBOL_GPL(mmc_select_hs400); int mmc_hs200_to_hs400(struct mmc_card *card) { @@ -1523,7 +1536,7 @@ err: /* * Activate High Speed, HS200 or HS400ES mode if supported. */ -static int mmc_select_timing(struct mmc_card *card) +int mmc_select_timing(struct mmc_card *card) { int err = 0; @@ -1548,12 +1561,13 @@ bus_speed: mmc_set_bus_speed(card); return 0; } +EXPORT_SYMBOL_GPL(mmc_select_timing); /* * Execute tuning sequence to seek the proper bus operating * conditions for HS200 and HS400, which sends CMD21 to the device. */ -static int mmc_hs200_tuning(struct mmc_card *card) +int mmc_hs200_tuning(struct mmc_card *card) { struct mmc_host *host = card->host; @@ -1568,6 +1582,7 @@ static int mmc_hs200_tuning(struct mmc_card *card) return mmc_execute_tuning(card); } +EXPORT_SYMBOL_GPL(mmc_hs200_tuning); /* * Handle the detection and initialisation of a card. @@ -2092,9 +2107,10 @@ static int _mmc_suspend(struct mmc_host *host, bool is_suspend) ((host->caps2 & MMC_CAP2_FULL_PWR_CYCLE) || !is_suspend || (host->caps2 & MMC_CAP2_FULL_PWR_CYCLE_IN_SUSPEND))) err = mmc_poweroff_notify(host->card, notify_type); - else if (mmc_can_sleep(host->card)) + else if (mmc_can_sleep(host->card)) { + trace_android_rvh_mmc_cache_card_properties(host); err = mmc_sleep(host); - else if (!mmc_host_is_spi(host)) + } else if (!mmc_host_is_spi(host)) err = mmc_deselect_cards(host); if (!err) { @@ -2129,6 +2145,7 @@ static int mmc_suspend(struct mmc_host *host) static int _mmc_resume(struct mmc_host *host) { int err = 0; + bool partial_init = false; mmc_claim_host(host); @@ -2136,7 +2153,9 @@ static int _mmc_resume(struct mmc_host *host) goto out; mmc_power_up(host, host->card->ocr); - err = mmc_init_card(host, host->card->ocr, host->card); + trace_android_rvh_partial_init(host, &partial_init); + if (!partial_init) + err = mmc_init_card(host, host->card->ocr, host->card); mmc_card_clr_suspended(host->card); out: diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 0c54858e89c0..ba028ecf513f 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -110,6 +110,7 @@ int mmc_select_card(struct mmc_card *card) return _mmc_select_card(card->host, card); } +EXPORT_SYMBOL_GPL(mmc_select_card); int mmc_deselect_cards(struct mmc_host *host) { @@ -545,6 +546,7 @@ bool mmc_prepare_busy_cmd(struct mmc_host *host, struct mmc_command *cmd, cmd->busy_timeout = timeout_ms; return true; } +EXPORT_SYMBOL_GPL(mmc_prepare_busy_cmd); /** * __mmc_switch - modify EXT_CSD register diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h index ae25ffc2e870..e5e94567a9a9 100644 --- 
a/drivers/mmc/core/mmc_ops.h +++ b/drivers/mmc/core/mmc_ops.h @@ -38,7 +38,6 @@ int mmc_spi_read_ocr(struct mmc_host *host, int highcap, u32 *ocrp); int mmc_spi_set_crc(struct mmc_host *host, int use_crc); int mmc_bus_test(struct mmc_card *card, u8 bus_width); int mmc_can_ext_csd(struct mmc_card *card); -int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd); int mmc_switch_status(struct mmc_card *card, bool crc_err_fatal); bool mmc_prepare_busy_cmd(struct mmc_host *host, struct mmc_command *cmd, unsigned int timeout_ms); diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index b15c034b42fb..9bfd09650b8c 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -20,6 +20,7 @@ #include "card.h" #include "crypto.h" #include "host.h" +#include #define MMC_DMA_MAP_MERGE_SEGMENTS 512 @@ -68,6 +69,7 @@ enum mmc_issue_type mmc_issue_type(struct mmc_queue *mq, struct request *req) return MMC_ISSUE_SYNC; } +EXPORT_SYMBOL_GPL(mmc_issue_type); static void __mmc_cqe_recovery_notifier(struct mmc_queue *mq) { @@ -234,7 +236,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, enum mmc_issue_type issue_type; enum mmc_issued issued; bool get_card, cqe_retune_ok; - int ret; + int ret = 0; if (mmc_card_removed(mq->card)) { req->rq_flags |= RQF_QUIET; @@ -263,7 +265,8 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, * For MMC host software queue, we only allow 2 requests in * flight to avoid a long latency. */ - if (host->hsq_enabled && mq->in_flight[issue_type] > 2) { + trace_android_vh_mmc_check_status(bd, &ret); + if (!ret && host->hsq_enabled && mq->in_flight[issue_type] > 2) { spin_unlock_irq(&mq->lock); return BLK_STS_RESOURCE; } diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 592166e53dce..6aba9111ff82 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -18,6 +18,8 @@ #include #include +#include + #include "core.h" #include "card.h" #include "host.h" @@ -472,6 +474,8 @@ static void sd_update_bus_speed_mode(struct mmc_card *card) SD_MODE_UHS_SDR12)) { card->sd_bus_speed = UHS_SDR12_BUS_SPEED; } + + trace_android_vh_sd_update_bus_speed_mode(card); } static int sd_set_bus_speed_mode(struct mmc_card *card, u8 *status) @@ -1870,5 +1874,7 @@ err: pr_err("%s: error %d whilst initialising SD card\n", mmc_hostname(host), err); + trace_android_vh_mmc_attach_sd(host, ocr, err); + return err; } diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index 5447c47157aa..2c8aba16aef1 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -15,6 +15,8 @@ #include #include +#include + #include "core.h" #include "card.h" #include "host.h" @@ -1094,6 +1096,8 @@ out: mmc_release_host(host); host->pm_flags &= ~MMC_PM_KEEP_POWER; + trace_android_vh_mmc_sdio_pm_flag_set(host); + return err; } diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index 05e907451df9..bda96142c726 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -14,6 +14,8 @@ #include #include +#include + #include "slot-gpio.h" struct mmc_gpio { @@ -30,6 +32,11 @@ static irqreturn_t mmc_gpio_cd_irqt(int irq, void *dev_id) /* Schedule a card detection after a debounce timeout */ struct mmc_host *host = dev_id; struct mmc_gpio *ctx = host->slot.handler_priv; + bool allow = true; + + trace_android_vh_mmc_gpio_cd_irqt(host, &allow); + if (!allow) + return IRQ_HANDLED; host->trigger_card_event = true; mmc_detect_change(host, msecs_to_jiffies(ctx->cd_debounce_delay_ms)); diff --git 
a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c index 31f841231609..7f25cca86ab8 100644 --- a/drivers/mmc/host/cqhci-core.c +++ b/drivers/mmc/host/cqhci-core.c @@ -822,8 +822,15 @@ irqreturn_t cqhci_irq(struct mmc_host *mmc, u32 intmask, int cmd_error, pr_debug("%s: cqhci: IRQ status: 0x%08x\n", mmc_hostname(mmc), status); if ((status & (CQHCI_IS_RED | CQHCI_IS_GCE | CQHCI_IS_ICCE)) || - cmd_error || data_error) + cmd_error || data_error) { + if (status & CQHCI_IS_RED) + mmc_debugfs_err_stats_inc(mmc, MMC_ERR_CMDQ_RED); + if (status & CQHCI_IS_GCE) + mmc_debugfs_err_stats_inc(mmc, MMC_ERR_CMDQ_GCE); + if (status & CQHCI_IS_ICCE) + mmc_debugfs_err_stats_inc(mmc, MMC_ERR_CMDQ_ICCE); cqhci_error_irq(mmc, status, cmd_error, data_error); + } if (status & CQHCI_IS_TCC) { /* read TCN and complete the request */ diff --git a/drivers/mmc/host/cqhci-crypto.c b/drivers/mmc/host/cqhci-crypto.c index 6419cfbb4ab7..6652982410ec 100644 --- a/drivers/mmc/host/cqhci-crypto.c +++ b/drivers/mmc/host/cqhci-crypto.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include "cqhci-crypto.h" @@ -23,9 +23,10 @@ static const struct cqhci_crypto_alg_entry { }; static inline struct cqhci_host * -cqhci_host_from_ksm(struct blk_keyslot_manager *ksm) +cqhci_host_from_crypto_profile(struct blk_crypto_profile *profile) { - struct mmc_host *mmc = container_of(ksm, struct mmc_host, ksm); + struct mmc_host *mmc = + container_of(profile, struct mmc_host, crypto_profile); return mmc->cqe_private; } @@ -57,12 +58,12 @@ static int cqhci_crypto_program_key(struct cqhci_host *cq_host, return 0; } -static int cqhci_crypto_keyslot_program(struct blk_keyslot_manager *ksm, +static int cqhci_crypto_keyslot_program(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm); + struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile); const union cqhci_crypto_cap_entry *ccap_array = cq_host->crypto_cap_array; const struct cqhci_crypto_alg_entry *alg = @@ -115,11 +116,11 @@ static int cqhci_crypto_clear_keyslot(struct cqhci_host *cq_host, int slot) return cqhci_crypto_program_key(cq_host, &cfg, slot); } -static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, +static int cqhci_crypto_keyslot_evict(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm); + struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile); return cqhci_crypto_clear_keyslot(cq_host, slot); } @@ -132,7 +133,7 @@ static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, * "enabled" when these are called, i.e. CQHCI_ENABLE might not be set in the * CQHCI_CFG register. But the hardware allows that. */ -static const struct blk_ksm_ll_ops cqhci_ksm_ops = { +static const struct blk_crypto_ll_ops cqhci_crypto_ops = { .keyslot_program = cqhci_crypto_keyslot_program, .keyslot_evict = cqhci_crypto_keyslot_evict, }; @@ -157,8 +158,8 @@ cqhci_find_blk_crypto_mode(union cqhci_crypto_cap_entry cap) * * If the driver previously set MMC_CAP2_CRYPTO and the CQE declares * CQHCI_CAP_CS, initialize the crypto support. This involves reading the - * crypto capability registers, initializing the keyslot manager, clearing all - * keyslots, and enabling 128-bit task descriptors. + * crypto capability registers, initializing the blk_crypto_profile, clearing + * all keyslots, and enabling 128-bit task descriptors. 
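 *
 * As a rough sketch (names and values taken from the code below, error
 * handling elided), the profile setup amounts to:
 *
 *	struct blk_crypto_profile *profile = &mmc->crypto_profile;
 *
 *	err = devm_blk_crypto_profile_init(dev, profile, num_keyslots);
 *	profile->ll_ops = cqhci_crypto_ops;
 *	profile->dev = dev;
 *	profile->max_dun_bytes_supported = 4;
 *	profile->modes_supported[blk_mode_num] |= sdus_mask * 512;
 *
 * mmc_crypto_setup_queue() then exposes the profile to the block layer
 * via blk_crypto_register().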
* * Return: 0 if crypto was initialized or isn't supported; whether * MMC_CAP2_CRYPTO remains set indicates which one of those cases it is. @@ -168,7 +169,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host) { struct mmc_host *mmc = cq_host->mmc; struct device *dev = mmc_dev(mmc); - struct blk_keyslot_manager *ksm = &mmc->ksm; + struct blk_crypto_profile *profile = &mmc->crypto_profile; unsigned int num_keyslots; unsigned int cap_idx; enum blk_crypto_mode_num blk_mode_num; @@ -199,15 +200,17 @@ int cqhci_crypto_init(struct cqhci_host *cq_host) */ num_keyslots = cq_host->crypto_capabilities.config_count + 1; - err = devm_blk_ksm_init(dev, ksm, num_keyslots); + err = devm_blk_crypto_profile_init(dev, profile, num_keyslots); if (err) goto out; - ksm->ksm_ll_ops = cqhci_ksm_ops; - ksm->dev = dev; + profile->ll_ops = cqhci_crypto_ops; + profile->dev = dev; /* Unfortunately, CQHCI crypto only supports 32 DUN bits. */ - ksm->max_dun_bytes_supported = 4; + profile->max_dun_bytes_supported = 4; + + profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_STANDARD; /* * Cache all the crypto capabilities and advertise the supported crypto @@ -223,7 +226,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host) cq_host->crypto_cap_array[cap_idx]); if (blk_mode_num == BLK_ENCRYPTION_MODE_INVALID) continue; - ksm->crypto_modes_supported[blk_mode_num] |= + profile->modes_supported[blk_mode_num] |= cq_host->crypto_cap_array[cap_idx].sdus_mask * 512; } diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 9871c19d2b4e..c2d34c5ede48 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -259,6 +259,7 @@ #define MSDC_PAD_TUNE_RD_SEL (0x1 << 13) /* RW */ #define MSDC_PAD_TUNE_CMD_SEL (0x1 << 21) /* RW */ +#define PAD_DS_TUNE_DLY_SEL (0x1 << 0) /* RW */ #define PAD_DS_TUNE_DLY1 (0x1f << 2) /* RW */ #define PAD_DS_TUNE_DLY2 (0x1f << 7) /* RW */ #define PAD_DS_TUNE_DLY3 (0x1f << 12) /* RW */ @@ -302,6 +303,11 @@ #define PAD_CMD_RD_RXDLY_SEL (0x1 << 11) /* RW */ #define PAD_CMD_TX_DLY (0x1f << 12) /* RW */ +/* EMMC50_PAD_DS_TUNE mask */ +#define PAD_DS_DLY_SEL (0x1 << 16) /* RW */ +#define PAD_DS_DLY1 (0x1f << 10) /* RW */ +#define PAD_DS_DLY3 (0x1f << 0) /* RW */ + #define REQ_CMD_EIO (0x1 << 0) #define REQ_CMD_TMO (0x1 << 1) #define REQ_DAT_ERR (0x1 << 2) @@ -449,11 +455,13 @@ struct msdc_host { bool vqmmc_enabled; u32 latch_ck; u32 hs400_ds_delay; + u32 hs400_ds_dly3; u32 hs200_cmd_int_delay; /* cmd internal delay for HS200/SDR104 */ u32 hs400_cmd_int_delay; /* cmd internal delay for HS400 */ bool hs400_cmd_resp_sel_rising; /* cmd response sample selection for HS400 */ bool hs400_mode; /* current eMMC will run at hs400 mode */ + bool hs400_tuning; /* hs400 mode online tuning */ bool internal_cd; /* Use internal card-detect logic */ bool cqhci; /* support eMMC hw cmdq */ struct msdc_save_para save_para; /* used when gate HCLK */ @@ -1191,7 +1199,8 @@ static bool msdc_cmd_done(struct msdc_host *host, int events, if (!sbc_error && !(events & MSDC_INT_CMDRDY)) { if (events & MSDC_INT_CMDTMO || (cmd->opcode != MMC_SEND_TUNING_BLOCK && - cmd->opcode != MMC_SEND_TUNING_BLOCK_HS200)) + cmd->opcode != MMC_SEND_TUNING_BLOCK_HS200 && + !host->hs400_tuning)) /* * should not clear fifo/interrupt as the tune data * may have alreay come when cmd19/cmd21 gets response @@ -1286,7 +1295,8 @@ static void msdc_cmd_next(struct msdc_host *host, if ((cmd->error && !(cmd->error == -EILSEQ && (cmd->opcode == MMC_SEND_TUNING_BLOCK || - cmd->opcode == MMC_SEND_TUNING_BLOCK_HS200))) || + cmd->opcode == 
MMC_SEND_TUNING_BLOCK_HS200 ||
+	       host->hs400_tuning))) ||
 	    (mrq->sbc && mrq->sbc->error))
 		msdc_request_done(host, mrq);
 	else if (cmd == mrq->sbc)
@@ -2259,6 +2269,69 @@ static int msdc_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_ios *ios)
 	return 0;
 }
+static int msdc_execute_hs400_tuning(struct mmc_host *mmc, struct mmc_card *card)
+{
+	struct msdc_host *host = mmc_priv(mmc);
+	struct msdc_delay_phase dly1_delay;
+	u32 val, result_dly1 = 0;
+	u8 *ext_csd;
+	int i, ret;
+
+	if (host->top_base) {
+		sdr_set_bits(host->top_base + EMMC50_PAD_DS_TUNE,
+			     PAD_DS_DLY_SEL);
+		if (host->hs400_ds_dly3)
+			sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE,
+				      PAD_DS_DLY3, host->hs400_ds_dly3);
+	} else {
+		sdr_set_bits(host->base + PAD_DS_TUNE, PAD_DS_TUNE_DLY_SEL);
+		if (host->hs400_ds_dly3)
+			sdr_set_field(host->base + PAD_DS_TUNE,
+				      PAD_DS_TUNE_DLY3, host->hs400_ds_dly3);
+	}
+
+	host->hs400_tuning = true;
+	for (i = 0; i < PAD_DELAY_MAX; i++) {
+		if (host->top_base)
+			sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE,
+				      PAD_DS_DLY1, i);
+		else
+			sdr_set_field(host->base + PAD_DS_TUNE,
+				      PAD_DS_TUNE_DLY1, i);
+		ret = mmc_get_ext_csd(card, &ext_csd);
+		if (!ret) {
+			result_dly1 |= (1 << i);
+			kfree(ext_csd);
+		}
+	}
+	host->hs400_tuning = false;
+
+	dly1_delay = get_best_delay(host, result_dly1);
+	if (dly1_delay.maxlen == 0) {
+		dev_err(host->dev, "Failed to get DLY1 delay!\n");
+		goto fail;
+	}
+	if (host->top_base)
+		sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE,
+			      PAD_DS_DLY1, dly1_delay.final_phase);
+	else
+		sdr_set_field(host->base + PAD_DS_TUNE,
+			      PAD_DS_TUNE_DLY1, dly1_delay.final_phase);
+
+	if (host->top_base)
+		val = readl(host->top_base + EMMC50_PAD_DS_TUNE);
+	else
+		val = readl(host->base + PAD_DS_TUNE);
+
+	dev_info(host->dev, "Final PAD_DS_TUNE: 0x%x\n", val);
+
+	return 0;
+
+fail:
+	dev_err(host->dev, "Failed to tune DS pin delay!\n");
+	return -EIO;
+}
+
 static void msdc_hw_reset(struct mmc_host *mmc)
 {
 	struct msdc_host *host = mmc_priv(mmc);
@@ -2395,6 +2468,7 @@ static const struct mmc_host_ops mt_msdc_ops = {
 	.card_busy = msdc_card_busy,
 	.execute_tuning = msdc_execute_tuning,
 	.prepare_hs400_tuning = msdc_prepare_hs400_tuning,
+	.execute_hs400_tuning = msdc_execute_hs400_tuning,
 	.hw_reset = msdc_hw_reset,
 };
@@ -2414,6 +2488,9 @@ static void msdc_of_property_parse(struct platform_device *pdev,
 	of_property_read_u32(pdev->dev.of_node, "hs400-ds-delay",
 			     &host->hs400_ds_delay);
+	of_property_read_u32(pdev->dev.of_node, "mediatek,hs400-ds-dly3",
+			     &host->hs400_ds_dly3);
+
 	of_property_read_u32(pdev->dev.of_node, "mediatek,hs200-cmd-int-delay",
 			     &host->hs200_cmd_int_delay);
diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c
index 83d38e44fc25..a86a70ad877d 100644
--- a/drivers/mmc/host/sdhci-msm.c
+++ b/drivers/mmc/host/sdhci-msm.c
@@ -2757,6 +2757,9 @@ static int sdhci_msm_probe(struct platform_device *pdev)
 	msm_host->mmc->caps |= MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_NEED_RSP_BUSY;
+	/* Force a HW reset during CQE recovery */
+	msm_host->mmc->cqe_recovery_reset_always = true;
+
 	/* Set the timeout value to max possible */
 	host->max_timeout_count = 0xF;
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index ee50e1565244..5a0b0bb8a623 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -32,6 +32,8 @@
 #include
 #include
+#include
+
 #include "sdhci.h"
 #define DRIVER_NAME "sdhci"
@@ -224,6 +226,7 @@ void sdhci_reset(struct sdhci_host *host, u8 mask)
 		if (timedout) {
 			pr_err("%s: Reset 0x%x never completed.\n",
mmc_hostname(host->mmc), (int)mask); + sdhci_err_stats_inc(host, CTRL_TIMEOUT); sdhci_dumpregs(host); return; } @@ -1720,6 +1723,7 @@ static bool sdhci_send_command_retry(struct sdhci_host *host, if (!timeout--) { pr_err("%s: Controller never released inhibit bit(s).\n", mmc_hostname(host->mmc)); + sdhci_err_stats_inc(host, CTRL_TIMEOUT); sdhci_dumpregs(host); cmd->error = -EIO; return false; @@ -1969,6 +1973,7 @@ void sdhci_enable_clk(struct sdhci_host *host, u16 clk) if (timedout) { pr_err("%s: Internal clock never stabilised.\n", mmc_hostname(host->mmc)); + sdhci_err_stats_inc(host, CTRL_TIMEOUT); sdhci_dumpregs(host); return; } @@ -1991,6 +1996,7 @@ void sdhci_enable_clk(struct sdhci_host *host, u16 clk) if (timedout) { pr_err("%s: PLL clock never stabilised.\n", mmc_hostname(host->mmc)); + sdhci_err_stats_inc(host, CTRL_TIMEOUT); sdhci_dumpregs(host); return; } @@ -2462,6 +2468,7 @@ static int sdhci_get_cd(struct mmc_host *mmc) { struct sdhci_host *host = mmc_priv(mmc); int gpio_cd = mmc_gpio_get_cd(mmc); + bool allow = true; if (host->flags & SDHCI_DEVICE_DEAD) return 0; @@ -2470,6 +2477,10 @@ static int sdhci_get_cd(struct mmc_host *mmc) if (!mmc_card_is_removable(mmc)) return 1; + trace_android_vh_sdhci_get_cd(host, &allow); + if (!allow) + return 0; + /* * Try slot gpio detect, if defined it take precedence * over build in controller functionality @@ -3190,6 +3201,7 @@ static void sdhci_timeout_timer(struct timer_list *t) if (host->cmd && !sdhci_data_line_cmd(host->cmd)) { pr_debug("%s: Timeout waiting for hardware cmd interrupt.\n", mmc_hostname(host->mmc)); + sdhci_err_stats_inc(host, REQ_TIMEOUT); sdhci_dumpregs(host); host->cmd->error = -ETIMEDOUT; @@ -3212,6 +3224,7 @@ static void sdhci_timeout_data_timer(struct timer_list *t) (host->cmd && sdhci_data_line_cmd(host->cmd))) { pr_debug("%s: Timeout waiting for hardware interrupt.\n", mmc_hostname(host->mmc)); + sdhci_err_stats_inc(host, REQ_TIMEOUT); sdhci_dumpregs(host); if (host->data) { @@ -3263,17 +3276,21 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p) return; pr_err("%s: Got command interrupt 0x%08x even though no command operation was in progress.\n", mmc_hostname(host->mmc), (unsigned)intmask); + sdhci_err_stats_inc(host, UNEXPECTED_IRQ); sdhci_dumpregs(host); return; } if (intmask & (SDHCI_INT_TIMEOUT | SDHCI_INT_CRC | SDHCI_INT_END_BIT | SDHCI_INT_INDEX)) { - if (intmask & SDHCI_INT_TIMEOUT) + if (intmask & SDHCI_INT_TIMEOUT) { host->cmd->error = -ETIMEDOUT; - else + sdhci_err_stats_inc(host, CMD_TIMEOUT); + } else { host->cmd->error = -EILSEQ; - + if (!mmc_op_tuning(host->cmd->opcode)) + sdhci_err_stats_inc(host, CMD_CRC); + } /* Treat data command CRC error the same as data CRC error */ if (host->cmd->data && (intmask & (SDHCI_INT_CRC | SDHCI_INT_TIMEOUT)) == @@ -3294,6 +3311,8 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p) int err = (auto_cmd_status & SDHCI_AUTO_CMD_TIMEOUT) ? 
-ETIMEDOUT : -EILSEQ; + sdhci_err_stats_inc(host, AUTO_CMD); + if (mrq->sbc && (host->flags & SDHCI_AUTO_CMD23)) { mrq->sbc->error = err; @@ -3371,6 +3390,7 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) if (intmask & SDHCI_INT_DATA_TIMEOUT) { host->data_cmd = NULL; data_cmd->error = -ETIMEDOUT; + sdhci_err_stats_inc(host, CMD_TIMEOUT); __sdhci_finish_mrq(host, data_cmd->mrq); return; } @@ -3399,23 +3419,30 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) pr_err("%s: Got data interrupt 0x%08x even though no data operation was in progress.\n", mmc_hostname(host->mmc), (unsigned)intmask); + sdhci_err_stats_inc(host, UNEXPECTED_IRQ); sdhci_dumpregs(host); return; } - if (intmask & SDHCI_INT_DATA_TIMEOUT) + if (intmask & SDHCI_INT_DATA_TIMEOUT) { host->data->error = -ETIMEDOUT; - else if (intmask & SDHCI_INT_DATA_END_BIT) + sdhci_err_stats_inc(host, DAT_TIMEOUT); + } else if (intmask & SDHCI_INT_DATA_END_BIT) { host->data->error = -EILSEQ; - else if ((intmask & SDHCI_INT_DATA_CRC) && + if (!mmc_op_tuning(SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND)))) + sdhci_err_stats_inc(host, DAT_CRC); + } else if ((intmask & SDHCI_INT_DATA_CRC) && SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND)) - != MMC_BUS_TEST_R) + != MMC_BUS_TEST_R) { host->data->error = -EILSEQ; - else if (intmask & SDHCI_INT_ADMA_ERROR) { + if (!mmc_op_tuning(SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND)))) + sdhci_err_stats_inc(host, DAT_CRC); + } else if (intmask & SDHCI_INT_ADMA_ERROR) { pr_err("%s: ADMA error: 0x%08x\n", mmc_hostname(host->mmc), intmask); sdhci_adma_show_error(host); + sdhci_err_stats_inc(host, ADMA); host->data->error = -EIO; if (host->ops->adma_workaround) host->ops->adma_workaround(host, intmask); @@ -3613,6 +3640,7 @@ out: if (unexpected) { pr_err("%s: Unexpected interrupt 0x%08x.\n", mmc_hostname(host->mmc), unexpected); + sdhci_err_stats_inc(host, UNEXPECTED_IRQ); sdhci_dumpregs(host); } @@ -3936,20 +3964,27 @@ bool sdhci_cqe_irq(struct sdhci_host *host, u32 intmask, int *cmd_error, if (!host->cqe_on) return false; - if (intmask & (SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC)) + if (intmask & (SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC)) { *cmd_error = -EILSEQ; - else if (intmask & SDHCI_INT_TIMEOUT) + if (!mmc_op_tuning(SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND)))) + sdhci_err_stats_inc(host, CMD_CRC); + } else if (intmask & SDHCI_INT_TIMEOUT) { *cmd_error = -ETIMEDOUT; - else + sdhci_err_stats_inc(host, CMD_TIMEOUT); + } else *cmd_error = 0; - if (intmask & (SDHCI_INT_DATA_END_BIT | SDHCI_INT_DATA_CRC)) + if (intmask & (SDHCI_INT_DATA_END_BIT | SDHCI_INT_DATA_CRC)) { *data_error = -EILSEQ; - else if (intmask & SDHCI_INT_DATA_TIMEOUT) + if (!mmc_op_tuning(SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND)))) + sdhci_err_stats_inc(host, DAT_CRC); + } else if (intmask & SDHCI_INT_DATA_TIMEOUT) { *data_error = -ETIMEDOUT; - else if (intmask & SDHCI_INT_ADMA_ERROR) + sdhci_err_stats_inc(host, DAT_TIMEOUT); + } else if (intmask & SDHCI_INT_ADMA_ERROR) { *data_error = -EIO; - else + sdhci_err_stats_inc(host, ADMA); + } else *data_error = 0; /* Clear selected interrupts. 
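 * Note the mmc_op_tuning() checks above: CRC errors raised while a
 * tuning command is in flight are expected and are deliberately not
 * counted in the debugfs error statistics.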
*/ @@ -3965,6 +4000,7 @@ bool sdhci_cqe_irq(struct sdhci_host *host, u32 intmask, int *cmd_error, sdhci_writel(host, intmask, SDHCI_INT_STATUS); pr_err("%s: CQE: Unexpected interrupt 0x%08x.\n", mmc_hostname(host->mmc), intmask); + sdhci_err_stats_inc(host, UNEXPECTED_IRQ); sdhci_dumpregs(host); } diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 6a5cc05576cd..978f998cbe98 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -356,6 +357,9 @@ struct sdhci_adma2_64_desc { */ #define MMC_CMD_TRANSFER_TIME (10 * NSEC_PER_MSEC) /* max 10 ms */ +#define sdhci_err_stats_inc(host, err_name) \ + mmc_debugfs_err_stats_inc((host)->mmc, MMC_ERR_##err_name) + enum sdhci_cookie { COOKIE_UNMAPPED, COOKIE_PRE_MAPPED, /* mapped by sdhci_pre_req() */ @@ -613,6 +617,8 @@ struct sdhci_host { u64 data_timeout; + ANDROID_KABI_RESERVE(1); + unsigned long private[] ____cacheline_aligned; }; @@ -660,6 +666,8 @@ struct sdhci_ops { void (*request_done)(struct sdhci_host *host, struct mmc_request *mrq); void (*dump_vendor_regs)(struct sdhci_host *host); + + ANDROID_KABI_RESERVE(1); }; #ifdef CONFIG_MMC_SDHCI_IO_ACCESSORS diff --git a/drivers/mtd/nand/raw/nandsim.c b/drivers/mtd/nand/raw/nandsim.c index 0750121ac371..d1c7dbc5cc56 100644 --- a/drivers/mtd/nand/raw/nandsim.c +++ b/drivers/mtd/nand/raw/nandsim.c @@ -2452,5 +2452,6 @@ static void __exit ns_cleanup_module(void) module_exit(ns_cleanup_module); MODULE_LICENSE ("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); MODULE_AUTHOR ("Artem B. Bityuckiy"); MODULE_DESCRIPTION ("The NAND flash simulator"); diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index a32050fecabf..9b1f020d3217 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -1467,3 +1467,4 @@ MODULE_VERSION(__stringify(UBI_VERSION)); MODULE_DESCRIPTION("UBI - Unsorted Block Images"); MODULE_AUTHOR("Artem Bityutskiy"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); diff --git a/drivers/net/wireless/ath/ar5523/ar5523.c b/drivers/net/wireless/ath/ar5523/ar5523.c index efe38b2c1df7..2b0f0a83c0ba 100644 --- a/drivers/net/wireless/ath/ar5523/ar5523.c +++ b/drivers/net/wireless/ath/ar5523/ar5523.c @@ -1166,7 +1166,7 @@ static int ar5523_get_wlan_mode(struct ar5523 *ar, ar5523_info(ar, "STA not found!\n"); return WLAN_MODE_11b; } - sta_rate_set = sta->supp_rates[ar->hw->conf.chandef.chan->band]; + sta_rate_set = sta->deflink.supp_rates[ar->hw->conf.chandef.chan->band]; for (bit = 0; bit < band->n_bitrates; bit++) { if (sta_rate_set & 1) { @@ -1204,7 +1204,7 @@ static void ar5523_create_rateset(struct ar5523 *ar, ar5523_info(ar, "STA not found. 
Cannot set rates\n"); sta_rate_set = bss_conf->basic_rates; } else - sta_rate_set = sta->supp_rates[ar->hw->conf.chandef.chan->band]; + sta_rate_set = sta->deflink.supp_rates[ar->hw->conf.chandef.chan->band]; ar5523_dbg(ar, "sta rate_set = %08x\n", sta_rate_set); diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 8208434d7d2b..99a0166c0358 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -2258,7 +2258,7 @@ static void ath10k_peer_assoc_h_rates(struct ath10k *ar, band = def.chan->band; sband = ar->hw->wiphy->bands[band]; - ratemask = sta->supp_rates[band]; + ratemask = sta->deflink.supp_rates[band]; ratemask &= arvif->bitrate_mask.control[band].legacy; rates = sband->bitrates; @@ -2303,7 +2303,7 @@ static void ath10k_peer_assoc_h_ht(struct ath10k *ar, struct ieee80211_sta *sta, struct wmi_peer_assoc_complete_arg *arg) { - const struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + const struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; struct ath10k_vif *arvif = (void *)vif->drv_priv; struct cfg80211_chan_def def; enum nl80211_band band; @@ -2342,7 +2342,7 @@ static void ath10k_peer_assoc_h_ht(struct ath10k *ar, if (ht_cap->cap & IEEE80211_HT_CAP_LDPC_CODING) arg->peer_flags |= ar->wmi.peer_flags->ldbc; - if (sta->bandwidth >= IEEE80211_STA_RX_BW_40) { + if (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) { arg->peer_flags |= ar->wmi.peer_flags->bw40; arg->peer_rate_caps |= WMI_RC_CW40_FLAG; } @@ -2395,7 +2395,8 @@ static void ath10k_peer_assoc_h_ht(struct ath10k *ar, arg->peer_ht_rates.rates[i] = i; } else { arg->peer_ht_rates.num_rates = n; - arg->peer_num_spatial_streams = min(sta->rx_nss, max_nss); + arg->peer_num_spatial_streams = min(sta->deflink.rx_nss, + max_nss); } ath10k_dbg(ar, ATH10K_DBG_MAC, "mac ht peer %pM mcs cnt %d nss %d\n", @@ -2552,7 +2553,7 @@ static void ath10k_peer_assoc_h_vht(struct ath10k *ar, struct ieee80211_sta *sta, struct wmi_peer_assoc_complete_arg *arg) { - const struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; + const struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; struct ath10k_vif *arvif = (void *)vif->drv_priv; struct ath10k_hw_params *hw = &ar->hw_params; struct cfg80211_chan_def def; @@ -2594,10 +2595,10 @@ static void ath10k_peer_assoc_h_vht(struct ath10k *ar, (1U << (IEEE80211_HT_MAX_AMPDU_FACTOR + ampdu_factor)) - 1); - if (sta->bandwidth == IEEE80211_STA_RX_BW_80) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) arg->peer_flags |= ar->wmi.peer_flags->bw80; - if (sta->bandwidth == IEEE80211_STA_RX_BW_160) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) arg->peer_flags |= ar->wmi.peer_flags->bw160; /* Calculate peer NSS capability from VHT capabilities if STA @@ -2611,7 +2612,7 @@ static void ath10k_peer_assoc_h_vht(struct ath10k *ar, vht_mcs_mask[i]) max_nss = i + 1; } - arg->peer_num_spatial_streams = min(sta->rx_nss, max_nss); + arg->peer_num_spatial_streams = min(sta->deflink.rx_nss, max_nss); arg->peer_vht_rates.rx_max_rate = __le16_to_cpu(vht_cap->vht_mcs.rx_highest); arg->peer_vht_rates.rx_mcs_set = @@ -2691,15 +2692,15 @@ static void ath10k_peer_assoc_h_qos(struct ath10k *ar, static bool ath10k_mac_sta_has_ofdm_only(struct ieee80211_sta *sta) { - return sta->supp_rates[NL80211_BAND_2GHZ] >> + return sta->deflink.supp_rates[NL80211_BAND_2GHZ] >> ATH10K_MAC_FIRST_OFDM_RATE_IDX; } static enum wmi_phy_mode ath10k_mac_get_phymode_vht(struct ath10k *ar, struct ieee80211_sta *sta) { - if (sta->bandwidth == 
IEEE80211_STA_RX_BW_160) { - switch (sta->vht_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) { + switch (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: return MODE_11AC_VHT160; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: @@ -2710,13 +2711,13 @@ static enum wmi_phy_mode ath10k_mac_get_phymode_vht(struct ath10k *ar, } } - if (sta->bandwidth == IEEE80211_STA_RX_BW_80) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) return MODE_11AC_VHT80; - if (sta->bandwidth == IEEE80211_STA_RX_BW_40) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) return MODE_11AC_VHT40; - if (sta->bandwidth == IEEE80211_STA_RX_BW_20) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) return MODE_11AC_VHT20; return MODE_UNKNOWN; @@ -2743,15 +2744,15 @@ static void ath10k_peer_assoc_h_phymode(struct ath10k *ar, switch (band) { case NL80211_BAND_2GHZ: - if (sta->vht_cap.vht_supported && + if (sta->deflink.vht_cap.vht_supported && !ath10k_peer_assoc_h_vht_masked(vht_mcs_mask)) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_40) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11AC_VHT40; else phymode = MODE_11AC_VHT20; - } else if (sta->ht_cap.ht_supported && + } else if (sta->deflink.ht_cap.ht_supported && !ath10k_peer_assoc_h_ht_masked(ht_mcs_mask)) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_40) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11NG_HT40; else phymode = MODE_11NG_HT20; @@ -2766,12 +2767,12 @@ static void ath10k_peer_assoc_h_phymode(struct ath10k *ar, /* * Check VHT first. */ - if (sta->vht_cap.vht_supported && + if (sta->deflink.vht_cap.vht_supported && !ath10k_peer_assoc_h_vht_masked(vht_mcs_mask)) { phymode = ath10k_mac_get_phymode_vht(ar, sta); - } else if (sta->ht_cap.ht_supported && + } else if (sta->deflink.ht_cap.ht_supported && !ath10k_peer_assoc_h_ht_masked(ht_mcs_mask)) { - if (sta->bandwidth >= IEEE80211_STA_RX_BW_40) + if (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) phymode = MODE_11NA_HT40; else phymode = MODE_11NA_HT20; @@ -3086,8 +3087,8 @@ static void ath10k_bss_assoc(struct ieee80211_hw *hw, /* ap_sta must be accessed only within rcu section which must be left * before calling ath10k_setup_peer_smps() which might sleep. 
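 * The per-link capabilities are therefore copied by value while the RCU
 * read lock is held; roughly (sketch of the surrounding pattern):
 *
 *	rcu_read_lock();
 *	ap_sta = ieee80211_find_sta(vif, bss_conf->bssid);
 *	ht_cap = ap_sta->deflink.ht_cap;	/* struct copies */
 *	vht_cap = ap_sta->deflink.vht_cap;
 *	rcu_read_unlock();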
*/ - ht_cap = ap_sta->ht_cap; - vht_cap = ap_sta->vht_cap; + ht_cap = ap_sta->deflink.ht_cap; + vht_cap = ap_sta->deflink.vht_cap; ret = ath10k_peer_assoc_prepare(ar, vif, ap_sta, &peer_arg); if (ret) { @@ -3285,7 +3286,7 @@ static int ath10k_station_assoc(struct ath10k *ar, */ if (!reassoc) { ret = ath10k_setup_peer_smps(ar, arvif, sta->addr, - &sta->ht_cap); + &sta->deflink.ht_cap); if (ret) { ath10k_warn(ar, "failed to setup peer SMPS for vdev %d: %d\n", arvif->vdev_id, ret); @@ -6809,10 +6810,10 @@ static int ath10k_sta_set_txpwr(struct ieee80211_hw *hw, int ret = 0; s16 txpwr; - if (sta->txpwr.type == NL80211_TX_POWER_AUTOMATIC) { + if (sta->deflink.txpwr.type == NL80211_TX_POWER_AUTOMATIC) { txpwr = 0; } else { - txpwr = sta->txpwr.power; + txpwr = sta->deflink.txpwr.power; if (!txpwr) return -EINVAL; } @@ -6932,26 +6933,26 @@ static int ath10k_mac_validate_rate_mask(struct ath10k *ar, struct ieee80211_sta *sta, u32 rate_ctrl_flag, u8 nss) { - if (nss > sta->rx_nss) { + if (nss > sta->deflink.rx_nss) { ath10k_warn(ar, "Invalid nss field, configured %u limit %u\n", - nss, sta->rx_nss); + nss, sta->deflink.rx_nss); return -EINVAL; } if (ATH10K_HW_PREAMBLE(rate_ctrl_flag) == WMI_RATE_PREAMBLE_VHT) { - if (!sta->vht_cap.vht_supported) { + if (!sta->deflink.vht_cap.vht_supported) { ath10k_warn(ar, "Invalid VHT rate for sta %pM\n", sta->addr); return -EINVAL; } } else if (ATH10K_HW_PREAMBLE(rate_ctrl_flag) == WMI_RATE_PREAMBLE_HT) { - if (!sta->ht_cap.ht_supported || sta->vht_cap.vht_supported) { + if (!sta->deflink.ht_cap.ht_supported || sta->deflink.vht_cap.vht_supported) { ath10k_warn(ar, "Invalid HT rate for sta %pM\n", sta->addr); return -EINVAL; } } else { - if (sta->ht_cap.ht_supported || sta->vht_cap.vht_supported) + if (sta->deflink.ht_cap.ht_supported || sta->deflink.vht_cap.vht_supported) return -EINVAL; } @@ -8291,7 +8292,7 @@ static bool ath10k_mac_set_vht_bitrate_mask_fixup(struct ath10k *ar, u8 rate = arvif->vht_pfr; /* skip non vht and multiple rate peers */ - if (!sta->vht_cap.vht_supported || arvif->vht_num_rates != 1) + if (!sta->deflink.vht_cap.vht_supported || arvif->vht_num_rates != 1) return false; err = ath10k_wmi_peer_set_param(ar, arvif->vdev_id, sta->addr, @@ -8332,7 +8333,7 @@ static void ath10k_mac_clr_bitrate_mask_iter(void *data, int err; /* clear vht peers only */ - if (arsta->arvif != arvif || !sta->vht_cap.vht_supported) + if (arsta->arvif != arvif || !sta->deflink.vht_cap.vht_supported) return; err = ath10k_wmi_peer_set_param(ar, arvif->vdev_id, sta->addr, @@ -8476,13 +8477,14 @@ static void ath10k_sta_rc_update(struct ieee80211_hw *hw, ath10k_dbg(ar, ATH10K_DBG_STA, "mac sta rc update for %pM changed %08x bw %d nss %d smps %d\n", - sta->addr, changed, sta->bandwidth, sta->rx_nss, + sta->addr, changed, sta->deflink.bandwidth, + sta->deflink.rx_nss, sta->smps_mode); if (changed & IEEE80211_RC_BW_CHANGED) { bw = WMI_PEER_CHWIDTH_20MHZ; - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_20: bw = WMI_PEER_CHWIDTH_20MHZ; break; @@ -8497,7 +8499,7 @@ static void ath10k_sta_rc_update(struct ieee80211_hw *hw, break; default: ath10k_warn(ar, "Invalid bandwidth %d in rc update for %pM\n", - sta->bandwidth, sta->addr); + sta->deflink.bandwidth, sta->addr); bw = WMI_PEER_CHWIDTH_20MHZ; break; } @@ -8506,7 +8508,7 @@ static void ath10k_sta_rc_update(struct ieee80211_hw *hw, } if (changed & IEEE80211_RC_NSS_CHANGED) - arsta->nss = sta->rx_nss; + arsta->nss = sta->deflink.rx_nss; if (changed & IEEE80211_RC_SMPS_CHANGED) { smps = 
WMI_PEER_SMPS_PS_NONE; diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index ae6e14fe03c7..84891bbf415a 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1011,7 +1011,7 @@ static void ath11k_peer_assoc_h_rates(struct ath11k *ar, band = def.chan->band; sband = ar->hw->wiphy->bands[band]; - ratemask = sta->supp_rates[band]; + ratemask = sta->deflink.supp_rates[band]; ratemask &= arvif->bitrate_mask.control[band].legacy; rates = sband->bitrates; @@ -1056,7 +1056,7 @@ static void ath11k_peer_assoc_h_ht(struct ath11k *ar, struct ieee80211_sta *sta, struct peer_assoc_params *arg) { - const struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + const struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; struct ath11k_vif *arvif = (void *)vif->drv_priv; struct cfg80211_chan_def def; enum nl80211_band band; @@ -1093,7 +1093,7 @@ static void ath11k_peer_assoc_h_ht(struct ath11k *ar, if (ht_cap->cap & IEEE80211_HT_CAP_LDPC_CODING) arg->ldpc_flag = true; - if (sta->bandwidth >= IEEE80211_STA_RX_BW_40) { + if (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) { arg->bw_40 = true; arg->peer_rate_caps |= WMI_HOST_RC_CW40_FLAG; } @@ -1143,7 +1143,7 @@ static void ath11k_peer_assoc_h_ht(struct ath11k *ar, arg->peer_ht_rates.rates[i] = i; } else { arg->peer_ht_rates.num_rates = n; - arg->peer_nss = min(sta->rx_nss, max_nss); + arg->peer_nss = min(sta->deflink.rx_nss, max_nss); } ath11k_dbg(ar->ab, ATH11K_DBG_MAC, "mac ht peer %pM mcs cnt %d nss %d\n", @@ -1217,7 +1217,7 @@ static void ath11k_peer_assoc_h_vht(struct ath11k *ar, struct ieee80211_sta *sta, struct peer_assoc_params *arg) { - const struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; + const struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; struct ath11k_vif *arvif = (void *)vif->drv_priv; struct cfg80211_chan_def def; enum nl80211_band band; @@ -1261,10 +1261,10 @@ static void ath11k_peer_assoc_h_vht(struct ath11k *ar, (1U << (IEEE80211_HT_MAX_AMPDU_FACTOR + ampdu_factor)) - 1); - if (sta->bandwidth == IEEE80211_STA_RX_BW_80) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) arg->bw_80 = true; - if (sta->bandwidth == IEEE80211_STA_RX_BW_160) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) arg->bw_160 = true; /* Calculate peer NSS capability from VHT capabilities if STA @@ -1278,7 +1278,7 @@ static void ath11k_peer_assoc_h_vht(struct ath11k *ar, vht_mcs_mask[i]) max_nss = i + 1; } - arg->peer_nss = min(sta->rx_nss, max_nss); + arg->peer_nss = min(sta->deflink.rx_nss, max_nss); arg->rx_max_rate = __le16_to_cpu(vht_cap->vht_mcs.rx_highest); arg->rx_mcs_set = __le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); arg->tx_max_rate = __le16_to_cpu(vht_cap->vht_mcs.tx_highest); @@ -1310,7 +1310,7 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, struct ieee80211_sta *sta, struct peer_assoc_params *arg) { - const struct ieee80211_sta_he_cap *he_cap = &sta->he_cap; + const struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; u8 ampdu_factor; u16 v; @@ -1349,10 +1349,10 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK); if (ampdu_factor) { - if (sta->vht_cap.vht_supported) + if (sta->deflink.vht_cap.vht_supported) arg->peer_max_mpdu = (1 << (IEEE80211_HE_VHT_MAX_AMPDU_FACTOR + ampdu_factor)) - 1; - else if (sta->ht_cap.ht_supported) + else if (sta->deflink.ht_cap.ht_supported) arg->peer_max_mpdu = (1 << (IEEE80211_HE_HT_MAX_AMPDU_FACTOR + ampdu_factor)) - 1; } @@ -1393,7 
+1393,7 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, if (he_cap->he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_REQ) arg->twt_requester = true; - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_160: if (he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) { @@ -1429,7 +1429,7 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, static void ath11k_peer_assoc_h_smps(struct ieee80211_sta *sta, struct peer_assoc_params *arg) { - const struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + const struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; int smps; if (!ht_cap->ht_supported) @@ -1558,15 +1558,15 @@ err: static bool ath11k_mac_sta_has_ofdm_only(struct ieee80211_sta *sta) { - return sta->supp_rates[NL80211_BAND_2GHZ] >> + return sta->deflink.supp_rates[NL80211_BAND_2GHZ] >> ATH11K_MAC_FIRST_OFDM_RATE_IDX; } static enum wmi_phy_mode ath11k_mac_get_phymode_vht(struct ath11k *ar, struct ieee80211_sta *sta) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_160) { - switch (sta->vht_cap.cap & + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) { + switch (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: return MODE_11AC_VHT160; @@ -1578,13 +1578,13 @@ static enum wmi_phy_mode ath11k_mac_get_phymode_vht(struct ath11k *ar, } } - if (sta->bandwidth == IEEE80211_STA_RX_BW_80) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) return MODE_11AC_VHT80; - if (sta->bandwidth == IEEE80211_STA_RX_BW_40) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) return MODE_11AC_VHT40; - if (sta->bandwidth == IEEE80211_STA_RX_BW_20) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) return MODE_11AC_VHT20; return MODE_UNKNOWN; @@ -1593,24 +1593,24 @@ static enum wmi_phy_mode ath11k_mac_get_phymode_vht(struct ath11k *ar, static enum wmi_phy_mode ath11k_mac_get_phymode_he(struct ath11k *ar, struct ieee80211_sta *sta) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_160) { - if (sta->he_cap.he_cap_elem.phy_cap_info[0] & + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) { + if (sta->deflink.he_cap.he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) return MODE_11AX_HE160; - else if (sta->he_cap.he_cap_elem.phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) + else if (sta->deflink.he_cap.he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) return MODE_11AX_HE80_80; /* not sure if this is a valid case? 
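 * (this path is taken when the peer reports 160 MHz bandwidth while
 * advertising neither the 160 MHz nor the 80+80 MHz PHY capability bit)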
*/ return MODE_11AX_HE160; } - if (sta->bandwidth == IEEE80211_STA_RX_BW_80) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) return MODE_11AX_HE80; - if (sta->bandwidth == IEEE80211_STA_RX_BW_40) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) return MODE_11AX_HE40; - if (sta->bandwidth == IEEE80211_STA_RX_BW_20) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) return MODE_11AX_HE20; return MODE_UNKNOWN; @@ -1637,22 +1637,22 @@ static void ath11k_peer_assoc_h_phymode(struct ath11k *ar, switch (band) { case NL80211_BAND_2GHZ: - if (sta->he_cap.has_he) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_80) + if (sta->deflink.he_cap.has_he) { + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) phymode = MODE_11AX_HE80_2G; - else if (sta->bandwidth == IEEE80211_STA_RX_BW_40) + else if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11AX_HE40_2G; else phymode = MODE_11AX_HE20_2G; - } else if (sta->vht_cap.vht_supported && - !ath11k_peer_assoc_h_vht_masked(vht_mcs_mask)) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_40) + } else if (sta->deflink.vht_cap.vht_supported && + !ath11k_peer_assoc_h_vht_masked(vht_mcs_mask)) { + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11AC_VHT40; else phymode = MODE_11AC_VHT20; - } else if (sta->ht_cap.ht_supported && + } else if (sta->deflink.ht_cap.ht_supported && !ath11k_peer_assoc_h_ht_masked(ht_mcs_mask)) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_40) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11NG_HT40; else phymode = MODE_11NG_HT20; @@ -1665,14 +1665,14 @@ static void ath11k_peer_assoc_h_phymode(struct ath11k *ar, case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: /* Check HE first */ - if (sta->he_cap.has_he) { + if (sta->deflink.he_cap.has_he) { phymode = ath11k_mac_get_phymode_he(ar, sta); - } else if (sta->vht_cap.vht_supported && - !ath11k_peer_assoc_h_vht_masked(vht_mcs_mask)) { + } else if (sta->deflink.vht_cap.vht_supported && + !ath11k_peer_assoc_h_vht_masked(vht_mcs_mask)) { phymode = ath11k_mac_get_phymode_vht(ar, sta); - } else if (sta->ht_cap.ht_supported && + } else if (sta->deflink.ht_cap.ht_supported && !ath11k_peer_assoc_h_ht_masked(ht_mcs_mask)) { - if (sta->bandwidth >= IEEE80211_STA_RX_BW_40) + if (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) phymode = MODE_11NA_HT40; else phymode = MODE_11NA_HT20; @@ -1780,7 +1780,7 @@ static void ath11k_bss_assoc(struct ieee80211_hw *hw, } ret = ath11k_setup_peer_smps(ar, arvif, bss_conf->bssid, - &ap_sta->ht_cap); + &ap_sta->deflink.ht_cap); if (ret) { ath11k_warn(ar->ab, "failed to setup peer SMPS for vdev %d: %d\n", arvif->vdev_id, ret); @@ -2986,7 +2986,7 @@ static int ath11k_station_assoc(struct ath11k *ar, * fixed param. * Note that all other rates and NSS will be disabled for this peer. */ - if (sta->vht_cap.vht_supported && num_vht_rates == 1) { + if (sta->deflink.vht_cap.vht_supported && num_vht_rates == 1) { ret = ath11k_mac_set_peer_vht_fixed_rate(arvif, sta, mask, band); if (ret) @@ -3000,7 +3000,7 @@ static int ath11k_station_assoc(struct ath11k *ar, return 0; ret = ath11k_setup_peer_smps(ar, arvif, sta->addr, - &sta->ht_cap); + &sta->deflink.ht_cap); if (ret) { ath11k_warn(ar->ab, "failed to setup peer SMPS for vdev %d: %d\n", arvif->vdev_id, ret); @@ -3141,7 +3141,7 @@ static void ath11k_sta_rc_update_wk(struct work_struct *wk) * TODO: Check RATEMASK_CMDID to support auto rates selection * across HT/VHT and for multiple VHT MCS support. 
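 * A single fixed VHT rate is pushed to firmware through the peer
 * fixed-rate parameter (ath11k_mac_set_peer_vht_fixed_rate() below),
 * which is why only the num_vht_rates == 1 case is special-cased here.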
*/ - if (sta->vht_cap.vht_supported && num_vht_rates == 1) { + if (sta->deflink.vht_cap.vht_supported && num_vht_rates == 1) { ath11k_mac_set_peer_vht_fixed_rate(arvif, sta, mask, band); } else { @@ -3380,10 +3380,10 @@ static int ath11k_mac_op_sta_set_txpwr(struct ieee80211_hw *hw, int ret = 0; s16 txpwr; - if (sta->txpwr.type == NL80211_TX_POWER_AUTOMATIC) { + if (sta->deflink.txpwr.type == NL80211_TX_POWER_AUTOMATIC) { txpwr = 0; } else { - txpwr = sta->txpwr.power; + txpwr = sta->deflink.txpwr.power; if (!txpwr) return -EINVAL; } @@ -3431,7 +3431,8 @@ static void ath11k_mac_op_sta_rc_update(struct ieee80211_hw *hw, ath11k_dbg(ar->ab, ATH11K_DBG_MAC, "mac sta rc update for %pM changed %08x bw %d nss %d smps %d\n", - sta->addr, changed, sta->bandwidth, sta->rx_nss, + sta->addr, changed, sta->deflink.bandwidth, + sta->deflink.rx_nss, sta->smps_mode); spin_lock_bh(&ar->data_lock); @@ -3439,7 +3440,7 @@ static void ath11k_mac_op_sta_rc_update(struct ieee80211_hw *hw, if (changed & IEEE80211_RC_BW_CHANGED) { bw = WMI_PEER_CHWIDTH_20MHZ; - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_20: bw = WMI_PEER_CHWIDTH_20MHZ; break; @@ -3454,7 +3455,7 @@ static void ath11k_mac_op_sta_rc_update(struct ieee80211_hw *hw, break; default: ath11k_warn(ar->ab, "Invalid bandwidth %d in rc update for %pM\n", - sta->bandwidth, sta->addr); + sta->deflink.bandwidth, sta->addr); bw = WMI_PEER_CHWIDTH_20MHZ; break; } @@ -3463,7 +3464,7 @@ static void ath11k_mac_op_sta_rc_update(struct ieee80211_hw *hw, } if (changed & IEEE80211_RC_NSS_CHANGED) - arsta->nss = sta->rx_nss; + arsta->nss = sta->deflink.rx_nss; if (changed & IEEE80211_RC_SMPS_CHANGED) { smps = WMI_PEER_SMPS_PS_NONE; diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index fefdc6753acd..e1f10db90f30 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -807,7 +807,7 @@ void ath6kl_cfg80211_connect_event(struct ath6kl_vif *vif, u16 channel, cfg80211_put_bss(ar->wiphy, bss); } else if (vif->sme_state == SME_CONNECTED) { struct cfg80211_roam_info roam_info = { - .bss = bss, + .links[0].bss = bss, .req_ie = assoc_req_ie, .req_ie_len = assoc_req_len, .resp_ie = assoc_resp_ie, @@ -1119,12 +1119,12 @@ void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq, NL80211_CHAN_HT20 : NL80211_CHAN_NO_HT); mutex_lock(&vif->wdev.mtx); - cfg80211_ch_switch_notify(vif->ndev, &chandef); + cfg80211_ch_switch_notify(vif->ndev, &chandef, 0); mutex_unlock(&vif->wdev.mtx); } static int ath6kl_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { @@ -1249,7 +1249,7 @@ static int ath6kl_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, } static int ath6kl_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { struct ath6kl *ar = ath6kl_priv(ndev); @@ -1279,7 +1279,7 @@ static int ath6kl_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, } static int ath6kl_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback) (void *cookie, struct key_params *)) @@ -1314,7 +1314,7 @@ static int ath6kl_cfg80211_get_key(struct wiphy *wiphy, struct 
net_device *ndev, } static int ath6kl_cfg80211_set_default_key(struct wiphy *wiphy, - struct net_device *ndev, + struct net_device *ndev, int link_id, u8 key_index, bool unicast, bool multicast) { @@ -2967,7 +2967,8 @@ static int ath6kl_change_beacon(struct wiphy *wiphy, struct net_device *dev, return ath6kl_set_ies(vif, beacon); } -static int ath6kl_stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int ath6kl_stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { struct ath6kl *ar = ath6kl_priv(dev); struct ath6kl_vif *vif = netdev_priv(dev); @@ -3368,6 +3369,7 @@ static int ath6kl_cfg80211_sscan_stop(struct wiphy *wiphy, static int ath6kl_cfg80211_set_bitrate(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, const u8 *addr, const struct cfg80211_bitrate_mask *mask) { diff --git a/drivers/net/wireless/ath/ath9k/debug_sta.c b/drivers/net/wireless/ath/ath9k/debug_sta.c index d95cabddce33..1e2a30019fb6 100644 --- a/drivers/net/wireless/ath/ath9k/debug_sta.c +++ b/drivers/net/wireless/ath/ath9k/debug_sta.c @@ -36,7 +36,7 @@ static ssize_t read_file_node_aggr(struct file *file, char __user *user_buf, if (buf == NULL) return -ENOMEM; - if (!an->sta->ht_cap.ht_supported) { + if (!an->sta->deflink.ht_cap.ht_supported) { len = scnprintf(buf, size, "%s\n", "HT not supported"); goto exit; @@ -186,7 +186,7 @@ static ssize_t read_file_node_recv(struct file *file, char __user *user_buf, band = ah->curchan->chan->band; rstats = &an->rx_rate_stats; - if (!sta->ht_cap.ht_supported) + if (!sta->deflink.ht_cap.ht_supported) goto legacy; len += scnprintf(buf + len, size - len, diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 72ef319feeda..cfee732a89b1 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -491,7 +491,7 @@ static int ath9k_htc_add_station(struct ath9k_htc_priv *priv, ista->index = sta_idx; tsta.is_vif_sta = 0; maxampdu = 1 << (IEEE80211_HT_MAX_AMPDU_FACTOR + - sta->ht_cap.ampdu_factor); + sta->deflink.ht_cap.ampdu_factor); tsta.maxampdu = cpu_to_be16(maxampdu); } else { memcpy(&tsta.macaddr, vif->addr, ETH_ALEN); @@ -602,7 +602,7 @@ static void ath9k_htc_setup_rate(struct ath9k_htc_priv *priv, sband = priv->hw->wiphy->bands[priv->hw->conf.chandef.chan->band]; for (i = 0, j = 0; i < sband->n_bitrates; i++) { - if (sta->supp_rates[sband->band] & BIT(i)) { + if (sta->deflink.supp_rates[sband->band] & BIT(i)) { trate->rates.legacy_rates.rs_rates[j] = (sband->bitrates[i].bitrate * 2) / 10; j++; @@ -610,9 +610,9 @@ static void ath9k_htc_setup_rate(struct ath9k_htc_priv *priv, } trate->rates.legacy_rates.rs_nrates = j; - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { for (i = 0, j = 0; i < 77; i++) { - if (sta->ht_cap.mcs.rx_mask[i/8] & (1<<(i%8))) + if (sta->deflink.ht_cap.mcs.rx_mask[i/8] & (1<<(i%8))) trate->rates.ht_rates.rs_rates[j++] = i; if (j == ATH_HTC_RATE_MAX) break; @@ -620,18 +620,18 @@ static void ath9k_htc_setup_rate(struct ath9k_htc_priv *priv, trate->rates.ht_rates.rs_nrates = j; caps = WLAN_RC_HT_FLAG; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_RX_STBC) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_RX_STBC) caps |= ATH_RC_TX_STBC_FLAG; - if (sta->ht_cap.mcs.rx_mask[1]) + if (sta->deflink.ht_cap.mcs.rx_mask[1]) caps |= WLAN_RC_DS_FLAG; - if ((sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) && - (conf_is_ht40(&priv->hw->conf))) + if ((sta->deflink.ht_cap.cap & 
IEEE80211_HT_CAP_SUP_WIDTH_20_40) && + (conf_is_ht40(&priv->hw->conf))) caps |= WLAN_RC_40_FLAG; if (conf_is_ht40(&priv->hw->conf) && - (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40)) + (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40)) caps |= WLAN_RC_SGI_FLAG; else if (conf_is_ht20(&priv->hw->conf) && - (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20)) + (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20)) caps |= WLAN_RC_SGI_FLAG; } diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index e2791d45f5f5..77144647f4fc 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -2048,7 +2048,7 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw, case IEEE80211_AMPDU_TX_OPERATIONAL: atid = ath_node_to_tid(an, tid); atid->baw_size = IEEE80211_MIN_AMPDU_BUF << - sta->ht_cap.ampdu_factor; + sta->deflink.ht_cap.ampdu_factor; break; default: ath_err(ath9k_hw_common(sc->sc_ah), "Unknown AMPDU action\n"); diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c index 6555abf02f18..8d5be39cc261 100644 --- a/drivers/net/wireless/ath/ath9k/xmit.c +++ b/drivers/net/wireless/ath/ath9k/xmit.c @@ -1533,10 +1533,10 @@ int ath_tx_aggr_start(struct ath_softc *sc, struct ieee80211_sta *sta, * in HT IBSS when a beacon with HT-info is received after the station * has already been added. */ - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { an->maxampdu = (1 << (IEEE80211_HT_MAX_AMPDU_FACTOR + - sta->ht_cap.ampdu_factor)) - 1; - density = ath9k_parse_mpdudensity(sta->ht_cap.ampdu_density); + sta->deflink.ht_cap.ampdu_factor)) - 1; + density = ath9k_parse_mpdudensity(sta->deflink.ht_cap.ampdu_density); an->mpdudensity = density; } diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c index a87476383c54..5072c8cd7102 100644 --- a/drivers/net/wireless/ath/carl9170/main.c +++ b/drivers/net/wireless/ath/carl9170/main.c @@ -1307,8 +1307,8 @@ static int carl9170_op_sta_add(struct ieee80211_hw *hw, atomic_set(&sta_info->pending_frames, 0); - if (sta->ht_cap.ht_supported) { - if (sta->ht_cap.ampdu_density > 6) { + if (sta->deflink.ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ampdu_density > 6) { /* * HW does support 16us AMPDU density. * No HT-Xmit for station. 
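The wireless hunks above and below all follow the same mechanical rename:
with the mac80211 multi-link (MLO) rework, per-link station fields such as
supp_rates, ht_cap, vht_cap, he_cap, bandwidth, rx_nss and txpwr moved from
struct ieee80211_sta into an embedded struct ieee80211_link_sta, and
single-link drivers reach them through the default link. A minimal sketch
of the new accessor shape (illustrative helper, not part of the patch):

	#include <net/mac80211.h>

	/* HT support is now a property of the (default) link, not the STA. */
	static bool sta_has_ht(struct ieee80211_sta *sta)
	{
		return sta->deflink.ht_cap.ht_supported;
	}

For genuine multi-link stations the per-link data sits behind
sta->link[link_id]; deflink covers the non-MLO case, which is why these
driver conversions are behaviour-neutral.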
@@ -1320,7 +1320,7 @@ static int carl9170_op_sta_add(struct ieee80211_hw *hw, for (i = 0; i < ARRAY_SIZE(sta_info->agg); i++) RCU_INIT_POINTER(sta_info->agg[i], NULL); - sta_info->ampdu_max_len = 1 << (3 + sta->ht_cap.ampdu_factor); + sta_info->ampdu_max_len = 1 << (3 + sta->deflink.ht_cap.ampdu_factor); sta_info->ht_sta = true; } @@ -1336,7 +1336,7 @@ static int carl9170_op_sta_remove(struct ieee80211_hw *hw, unsigned int i; bool cleanup = false; - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { sta_info->ht_sta = false; diff --git a/drivers/net/wireless/ath/carl9170/tx.c b/drivers/net/wireless/ath/carl9170/tx.c index f9e1306ac74f..03f4b552db6a 100644 --- a/drivers/net/wireless/ath/carl9170/tx.c +++ b/drivers/net/wireless/ath/carl9170/tx.c @@ -1044,8 +1044,9 @@ static int carl9170_tx_prepare(struct ar9170 *ar, if (unlikely(!sta || !cvif)) goto err_out; - factor = min_t(unsigned int, 1u, sta->ht_cap.ampdu_factor); - density = sta->ht_cap.ampdu_density; + factor = min_t(unsigned int, 1u, + sta->deflink.ht_cap.ampdu_factor); + density = sta->deflink.ht_cap.ampdu_density; if (density) { /* diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index d51a78330135..b1967951928d 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -744,7 +744,7 @@ static void wcn36xx_update_allowed_rates(struct ieee80211_sta *sta, int i, size; u16 *rates_table; struct wcn36xx_sta *sta_priv = wcn36xx_sta_to_priv(sta); - u32 rates = sta->supp_rates[band]; + u32 rates = sta->deflink.supp_rates[band]; memset(&sta_priv->supported_rates, 0, sizeof(sta_priv->supported_rates)); @@ -770,20 +770,20 @@ static void wcn36xx_update_allowed_rates(struct ieee80211_sta *sta, } } - if (sta->ht_cap.ht_supported) { - BUILD_BUG_ON(sizeof(sta->ht_cap.mcs.rx_mask) > - sizeof(sta_priv->supported_rates.supported_mcs_set)); + if (sta->deflink.ht_cap.ht_supported) { + BUILD_BUG_ON(sizeof(sta->deflink.ht_cap.mcs.rx_mask) > + sizeof(sta_priv->supported_rates.supported_mcs_set)); memcpy(sta_priv->supported_rates.supported_mcs_set, - sta->ht_cap.mcs.rx_mask, - sizeof(sta->ht_cap.mcs.rx_mask)); + sta->deflink.ht_cap.mcs.rx_mask, + sizeof(sta->deflink.ht_cap.mcs.rx_mask)); } - if (sta->vht_cap.vht_supported) { + if (sta->deflink.vht_cap.vht_supported) { sta_priv->supported_rates.op_rate_mode = STA_11ac; sta_priv->supported_rates.vht_rx_mcs_map = - sta->vht_cap.vht_mcs.rx_mcs_map; + sta->deflink.vht_cap.vht_mcs.rx_mcs_map; sta_priv->supported_rates.vht_tx_mcs_map = - sta->vht_cap.vht_mcs.tx_mcs_map; + sta->deflink.vht_cap.vht_mcs.tx_mcs_map; } } diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c index c056fae1d641..14b5fc9db177 100644 --- a/drivers/net/wireless/ath/wcn36xx/smd.c +++ b/drivers/net/wireless/ath/wcn36xx/smd.c @@ -208,9 +208,9 @@ static void wcn36xx_smd_set_bss_nw_type(struct wcn36xx *wcn, { if (NL80211_BAND_5GHZ == WCN36XX_BAND(wcn)) bss_params->nw_type = WCN36XX_HAL_11A_NW_TYPE; - else if (sta && sta->ht_cap.ht_supported) + else if (sta && sta->deflink.ht_cap.ht_supported) bss_params->nw_type = WCN36XX_HAL_11N_NW_TYPE; - else if (sta && (sta->supp_rates[NL80211_BAND_2GHZ] & 0x7f)) + else if (sta && (sta->deflink.supp_rates[NL80211_BAND_2GHZ] & 0x7f)) bss_params->nw_type = WCN36XX_HAL_11G_NW_TYPE; else bss_params->nw_type = WCN36XX_HAL_11B_NW_TYPE; @@ -225,9 +225,9 @@ static void wcn36xx_smd_set_bss_ht_params(struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct 
wcn36xx_hal_config_bss_params *bss_params) { - if (sta && sta->ht_cap.ht_supported) { - unsigned long caps = sta->ht_cap.cap; - bss_params->ht = sta->ht_cap.ht_supported; + if (sta && sta->deflink.ht_cap.ht_supported) { + unsigned long caps = sta->deflink.ht_cap.cap; + bss_params->ht = sta->deflink.ht_cap.ht_supported; bss_params->tx_channel_width_set = is_cap_supported(caps, IEEE80211_HT_CAP_SUP_WIDTH_20_40); bss_params->lsig_tx_op_protection_full_support = @@ -250,23 +250,23 @@ wcn36xx_smd_set_bss_vht_params(struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct wcn36xx_hal_config_bss_params_v1 *bss) { - if (sta && sta->vht_cap.vht_supported) + if (sta && sta->deflink.vht_cap.vht_supported) bss->vht_capable = 1; } static void wcn36xx_smd_set_sta_ht_params(struct ieee80211_sta *sta, struct wcn36xx_hal_config_sta_params *sta_params) { - if (sta->ht_cap.ht_supported) { - unsigned long caps = sta->ht_cap.cap; - sta_params->ht_capable = sta->ht_cap.ht_supported; + if (sta->deflink.ht_cap.ht_supported) { + unsigned long caps = sta->deflink.ht_cap.cap; + sta_params->ht_capable = sta->deflink.ht_cap.ht_supported; sta_params->tx_channel_width_set = is_cap_supported(caps, IEEE80211_HT_CAP_SUP_WIDTH_20_40); sta_params->lsig_txop_protection = is_cap_supported(caps, IEEE80211_HT_CAP_LSIG_TXOP_PROT); - sta_params->max_ampdu_size = sta->ht_cap.ampdu_factor; - sta_params->max_ampdu_density = sta->ht_cap.ampdu_density; + sta_params->max_ampdu_size = sta->deflink.ht_cap.ampdu_factor; + sta_params->max_ampdu_density = sta->deflink.ht_cap.ampdu_density; sta_params->max_amsdu_size = is_cap_supported(caps, IEEE80211_HT_CAP_MAX_AMSDU); sta_params->sgi_20Mhz = is_cap_supported(caps, @@ -286,10 +286,10 @@ static void wcn36xx_smd_set_sta_vht_params(struct wcn36xx *wcn, struct ieee80211_sta *sta, struct wcn36xx_hal_config_sta_params_v1 *sta_params) { - if (sta->vht_cap.vht_supported) { - unsigned long caps = sta->vht_cap.cap; + if (sta->deflink.vht_cap.vht_supported) { + unsigned long caps = sta->deflink.vht_cap.cap; - sta_params->vht_capable = sta->vht_cap.vht_supported; + sta_params->vht_capable = sta->deflink.vht_cap.vht_supported; sta_params->vht_ldpc_enabled = is_cap_supported(caps, IEEE80211_VHT_CAP_RXLDPC); if (get_feat_caps(wcn->fw_feat_caps, MU_MIMO)) { @@ -307,9 +307,10 @@ static void wcn36xx_smd_set_sta_vht_params(struct wcn36xx *wcn, static void wcn36xx_smd_set_sta_ht_ldpc_params(struct ieee80211_sta *sta, struct wcn36xx_hal_config_sta_params_v1 *sta_params) { - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { sta_params->ht_ldpc_enabled = - is_cap_supported(sta->ht_cap.cap, IEEE80211_HT_CAP_LDPC_CODING); + is_cap_supported(sta->deflink.ht_cap.cap, + IEEE80211_HT_CAP_LDPC_CODING); } } diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 1ff2679963f0..232c03d8c1bc 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1618,7 +1618,7 @@ static void wil_del_rx_key(u8 key_index, enum wmi_key_usage key_usage, } static int wil_cfg80211_add_key(struct wiphy *wiphy, - struct net_device *ndev, + struct net_device *ndev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) @@ -1695,7 +1695,7 @@ static int wil_cfg80211_add_key(struct wiphy *wiphy, } static int wil_cfg80211_del_key(struct wiphy *wiphy, - struct net_device *ndev, + struct net_device *ndev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { @@ -1722,7 +1722,7 
@@ static int wil_cfg80211_del_key(struct wiphy *wiphy, /* Need to be present or wiphy_new() will WARN */ static int wil_cfg80211_set_default_key(struct wiphy *wiphy, - struct net_device *ndev, + struct net_device *ndev, int link_id, u8 key_index, bool unicast, bool multicast) { @@ -2071,8 +2071,8 @@ void wil_cfg80211_ap_recovery(struct wil6210_priv *wil) key_params.key = vif->gtk; key_params.key_len = vif->gtk_len; key_params.seq_len = IEEE80211_GCMP_PN_LEN; - rc = wil_cfg80211_add_key(wiphy, ndev, vif->gtk_index, false, - NULL, &key_params); + rc = wil_cfg80211_add_key(wiphy, ndev, -1, vif->gtk_index, + false, NULL, &key_params); if (rc) wil_err(wil, "vif %d recovery add key failed (%d)\n", i, rc); @@ -2097,8 +2097,8 @@ static int wil_cfg80211_change_beacon(struct wiphy *wiphy, bcon->tail_len)) privacy = 1; - memcpy(vif->ssid, wdev->ssid, wdev->ssid_len); - vif->ssid_len = wdev->ssid_len; + memcpy(vif->ssid, wdev->u.ap.ssid, wdev->u.ap.ssid_len); + vif->ssid_len = wdev->u.ap.ssid_len; /* in case privacy has changed, need to restart the AP */ if (vif->privacy != privacy) { @@ -2107,7 +2107,7 @@ static int wil_cfg80211_change_beacon(struct wiphy *wiphy, rc = _wil_cfg80211_start_ap(wiphy, ndev, vif->ssid, vif->ssid_len, privacy, - wdev->beacon_interval, + wdev->links[0].ap.beacon_interval, vif->channel, vif->wmi_edmg_channel, bcon, vif->hidden_ssid, @@ -2185,7 +2185,8 @@ static int wil_cfg80211_start_ap(struct wiphy *wiphy, } static int wil_cfg80211_stop_ap(struct wiphy *wiphy, - struct net_device *ndev) + struct net_device *ndev, + unsigned int link_id) { struct wil6210_priv *wil = wiphy_to_wil(wiphy); struct wil6210_vif *vif = ndev_to_vif(ndev); diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c index ac7787e1a7f6..04d1aa0e2d35 100644 --- a/drivers/net/wireless/ath/wil6210/debugfs.c +++ b/drivers/net/wireless/ath/wil6210/debugfs.c @@ -1385,19 +1385,6 @@ static int temp_show(struct seq_file *s, void *data) } DEFINE_SHOW_ATTRIBUTE(temp); -/*---------freq------------*/ -static int freq_show(struct seq_file *s, void *data) -{ - struct wil6210_priv *wil = s->private; - struct wireless_dev *wdev = wil->main_ndev->ieee80211_ptr; - u32 freq = wdev->chandef.chan ? 
wdev->chandef.chan->center_freq : 0; - - seq_printf(s, "Freq = %d\n", freq); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(freq); - /*---------link------------*/ static int link_show(struct seq_file *s, void *data) { @@ -2374,7 +2361,6 @@ static const struct { {"pmcdata", 0444, &fops_pmcdata}, {"pmcring", 0444, &fops_pmcring}, {"temp", 0444, &temp_fops}, - {"freq", 0444, &freq_fops}, {"link", 0444, &link_fops}, {"info", 0444, &info_fops}, {"recovery", 0644, &fops_recovery}, diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index 2dc8406736f4..ed03b75446ff 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -1822,8 +1822,8 @@ wmi_evt_reassoc_status(struct wil6210_vif *vif, int id, void *d, int len) freq = ieee80211_channel_to_frequency(ch, NL80211_BAND_60GHZ); memset(&info, 0, sizeof(info)); - info.channel = ieee80211_get_channel(wiphy, freq); - info.bss = vif->bss; + info.links[0].channel = ieee80211_get_channel(wiphy, freq); + info.links[0].bss = vif->bss; info.req_ie = assoc_req_ie; info.req_ie_len = assoc_req_ie_len; info.resp_ie = assoc_resp_ie; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 94ab63dfc1c6..48f746ba8585 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -2361,7 +2361,8 @@ done: static s32 brcmf_cfg80211_config_default_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_idx, bool unicast, bool multicast) + int link_id, u8 key_idx, bool unicast, + bool multicast) { struct brcmf_if *ifp = netdev_priv(ndev); struct brcmf_pub *drvr = ifp->drvr; @@ -2395,7 +2396,8 @@ done: static s32 brcmf_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_idx, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr) { struct brcmf_if *ifp = netdev_priv(ndev); struct brcmf_wsec_key *key; @@ -2432,8 +2434,8 @@ brcmf_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, static s32 brcmf_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_idx, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_if *ifp = netdev_priv(ndev); @@ -2457,8 +2459,8 @@ brcmf_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, } if (params->key_len == 0) - return brcmf_cfg80211_del_key(wiphy, ndev, key_idx, pairwise, - mac_addr); + return brcmf_cfg80211_del_key(wiphy, ndev, -1, key_idx, + pairwise, mac_addr); if (params->key_len > sizeof(key->data)) { bphy_err(drvr, "Too long key length (%u)\n", params->key_len); @@ -2553,8 +2555,9 @@ done: } static s32 -brcmf_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, u8 key_idx, - bool pairwise, const u8 *mac_addr, void *cookie, +brcmf_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params *params)) { @@ -2610,7 +2613,8 @@ done: static s32 brcmf_cfg80211_config_default_mgmt_key(struct wiphy *wiphy, - struct net_device *ndev, u8 key_idx) + struct net_device *ndev, int link_id, + u8 key_idx) { struct brcmf_if *ifp = netdev_priv(ndev); @@ -4945,7 +4949,8 @@ exit: return err; } -static int 
brcmf_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *ndev) +static int brcmf_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *ndev, + unsigned int link_id) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_if *ifp = netdev_priv(ndev); @@ -5282,6 +5287,7 @@ exit: static int brcmf_cfg80211_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); @@ -5995,8 +6001,8 @@ brcmf_bss_roaming_done(struct brcmf_cfg80211_info *cfg, done: kfree(buf); - roam_info.channel = notify_channel; - roam_info.bssid = profile->bssid; + roam_info.links[0].channel = notify_channel; + roam_info.links[0].bssid = profile->bssid; roam_info.req_ie = conn_info->req_ie; roam_info.req_ie_len = conn_info->req_ie_len; roam_info.resp_ie = conn_info->resp_ie; @@ -6039,7 +6045,7 @@ brcmf_bss_connect_done(struct brcmf_cfg80211_info *cfg, } else { conn_params.status = WLAN_STATUS_AUTH_TIMEOUT; } - conn_params.bssid = profile->bssid; + conn_params.links[0].bssid = profile->bssid; conn_params.req_ie = conn_info->req_ie; conn_params.req_ie_len = conn_info->req_ie_len; conn_params.resp_ie = conn_info->resp_ie; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c index eadac0f5590f..8c741b98d8e5 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c @@ -868,7 +868,7 @@ brcms_ops_ampdu_action(struct ieee80211_hw *hw, spin_lock_bh(&wl->lock); brcms_c_ampdu_tx_operational(wl->wlc, tid, buf_size, (1 << (IEEE80211_HT_MAX_AMPDU_FACTOR + - sta->ht_cap.ampdu_factor)) - 1); + sta->deflink.ht_cap.ampdu_factor)) - 1); spin_unlock_bh(&wl->lock); /* Power save wakeup */ break; diff --git a/drivers/net/wireless/intel/iwlegacy/3945-rs.c b/drivers/net/wireless/intel/iwlegacy/3945-rs.c index b2478cbe558e..0eaad980c85c 100644 --- a/drivers/net/wireless/intel/iwlegacy/3945-rs.c +++ b/drivers/net/wireless/intel/iwlegacy/3945-rs.c @@ -354,13 +354,13 @@ il3945_rs_rate_init(struct il_priv *il, struct ieee80211_sta *sta, u8 sta_id) * after assoc.. 
*/ for (i = sband->n_bitrates - 1; i >= 0; i--) { - if (sta->supp_rates[sband->band] & (1 << i)) { + if (sta->deflink.supp_rates[sband->band] & (1 << i)) { rs_sta->last_txrate_idx = i; break; } } - il->_3945.sta_supp_rates = sta->supp_rates[sband->band]; + il->_3945.sta_supp_rates = sta->deflink.supp_rates[sband->band]; /* For 5 GHz band it start at IL_FIRST_OFDM_RATE */ if (sband->band == NL80211_BAND_5GHZ) { rs_sta->last_txrate_idx += IL_FIRST_OFDM_RATE; @@ -631,7 +631,7 @@ il3945_rs_get_rate(void *il_r, struct ieee80211_sta *sta, void *il_sta, il_sta = NULL; } - rate_mask = sta->supp_rates[sband->band]; + rate_mask = sta->deflink.supp_rates[sband->band]; /* get user max rate if set */ max_rate_idx = fls(txrc->rate_idx_mask) - 1; diff --git a/drivers/net/wireless/intel/iwlegacy/4965-rs.c b/drivers/net/wireless/intel/iwlegacy/4965-rs.c index 150805aec407..d8a5dbf89a02 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-rs.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-rs.c @@ -627,7 +627,7 @@ il4965_rs_toggle_antenna(u32 valid_ant, u32 *rate_n_flags, static bool il4965_rs_use_green(struct il_priv *il, struct ieee80211_sta *sta) { - return (sta->ht_cap.cap & IEEE80211_HT_CAP_GRN_FLD) && + return (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_GRN_FLD) && !il->ht.non_gf_sta_present; } @@ -970,7 +970,7 @@ il4965_rs_tx_status(void *il_r, struct ieee80211_supported_band *sband, lq_sta->last_rate_n_flags = tx_rate; done: /* See if there's a better rate or modulation mode to try. */ - if (sta->supp_rates[sband->band]) + if (sta->deflink.supp_rates[sband->band]) il4965_rs_rate_scale_perform(il, skb, sta, lq_sta); } @@ -1164,7 +1164,7 @@ il4965_rs_switch_to_mimo2(struct il_priv *il, struct il_lq_sta *lq_sta, s32 rate; s8 is_green = lq_sta->is_green; - if (!conf_is_ht(conf) || !sta->ht_cap.ht_supported) + if (!conf_is_ht(conf) || !sta->deflink.ht_cap.ht_supported) return -1; if (sta->smps_mode == IEEE80211_SMPS_STATIC) @@ -1182,7 +1182,7 @@ il4965_rs_switch_to_mimo2(struct il_priv *il, struct il_lq_sta *lq_sta, tbl->max_search = IL_MAX_SEARCH; rate_mask = lq_sta->active_mimo2_rate; - if (il_is_ht40_tx_allowed(il, &sta->ht_cap)) + if (il_is_ht40_tx_allowed(il, &sta->deflink.ht_cap)) tbl->is_ht40 = 1; else tbl->is_ht40 = 0; @@ -1217,7 +1217,7 @@ il4965_rs_switch_to_siso(struct il_priv *il, struct il_lq_sta *lq_sta, u8 is_green = lq_sta->is_green; s32 rate; - if (!conf_is_ht(conf) || !sta->ht_cap.ht_supported) + if (!conf_is_ht(conf) || !sta->deflink.ht_cap.ht_supported) return -1; D_RATE("LQ: try to switch to SISO\n"); @@ -1228,7 +1228,7 @@ il4965_rs_switch_to_siso(struct il_priv *il, struct il_lq_sta *lq_sta, tbl->max_search = IL_MAX_SEARCH; rate_mask = lq_sta->active_siso_rate; - if (il_is_ht40_tx_allowed(il, &sta->ht_cap)) + if (il_is_ht40_tx_allowed(il, &sta->deflink.ht_cap)) tbl->is_ht40 = 1; else tbl->is_ht40 = 0; @@ -1384,7 +1384,7 @@ il4965_rs_move_siso_to_other(struct il_priv *il, struct il_lq_sta *lq_sta, struct il_scale_tbl_info *search_tbl = &(lq_sta->lq_info[(1 - lq_sta->active_tbl)]); struct il_rate_scale_data *win = &(tbl->win[idx]); - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; u32 sz = (sizeof(struct il_scale_tbl_info) - (sizeof(struct il_rate_scale_data) * RATE_COUNT)); @@ -1507,7 +1507,7 @@ il4965_rs_move_mimo2_to_other(struct il_priv *il, struct il_lq_sta *lq_sta, struct il_scale_tbl_info *search_tbl = &(lq_sta->lq_info[(1 - lq_sta->active_tbl)]); struct il_rate_scale_data *win = &(tbl->win[idx]); - struct 
ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; u32 sz = (sizeof(struct il_scale_tbl_info) - (sizeof(struct il_rate_scale_data) * RATE_COUNT)); @@ -1760,7 +1760,7 @@ il4965_rs_rate_scale_perform(struct il_priv *il, struct sk_buff *skb, (info->flags & IEEE80211_TX_CTL_NO_ACK)) return; - lq_sta->supp_rates = sta->supp_rates[lq_sta->band]; + lq_sta->supp_rates = sta->deflink.supp_rates[lq_sta->band]; tid = il4965_rs_tl_add_packet(lq_sta, hdr); if (tid != MAX_TID_COUNT && (lq_sta->tx_agg_tid_en & (1 << tid))) { @@ -2271,7 +2271,7 @@ il4965_rs_rate_init(struct il_priv *il, struct ieee80211_sta *sta, u8 sta_id) int i, j; struct ieee80211_hw *hw = il->hw; struct ieee80211_conf *conf = &il->hw->conf; - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; struct il_station_priv *sta_priv; struct il_lq_sta *lq_sta; struct ieee80211_supported_band *sband; @@ -2288,7 +2288,7 @@ il4965_rs_rate_init(struct il_priv *il, struct ieee80211_sta *sta, u8 sta_id) win[i]); lq_sta->flush_timer = 0; - lq_sta->supp_rates = sta->supp_rates[sband->band]; + lq_sta->supp_rates = sta->deflink.supp_rates[sband->band]; for (j = 0; j < LQ_SIZE; j++) for (i = 0; i < RATE_COUNT; i++) il4965_rs_rate_scale_clear_win(&lq_sta->lq_info[j]. diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c index 683b632981ed..8299d89e7505 100644 --- a/drivers/net/wireless/intel/iwlegacy/common.c +++ b/drivers/net/wireless/intel/iwlegacy/common.c @@ -1863,7 +1863,7 @@ EXPORT_SYMBOL(il_send_add_sta); static void il_set_ht_add_station(struct il_priv *il, u8 idx, struct ieee80211_sta *sta) { - struct ieee80211_sta_ht_cap *sta_ht_inf = &sta->ht_cap; + struct ieee80211_sta_ht_cap *sta_ht_inf = &sta->deflink.ht_cap; __le32 sta_flags; if (!sta || !sta_ht_inf->ht_supported) @@ -1900,7 +1900,7 @@ il_set_ht_add_station(struct il_priv *il, u8 idx, struct ieee80211_sta *sta) cpu_to_le32((u32) sta_ht_inf-> ampdu_density << STA_FLG_AGG_MPDU_DENSITY_POS); - if (il_is_ht40_tx_allowed(il, &sta->ht_cap)) + if (il_is_ht40_tx_allowed(il, &sta->deflink.ht_cap)) sta_flags |= STA_FLG_HT40_EN_MSK; else sta_flags &= ~STA_FLG_HT40_EN_MSK; @@ -5222,7 +5222,7 @@ il_ht_conf(struct il_priv *il, struct ieee80211_vif *vif) rcu_read_lock(); sta = ieee80211_find_sta(vif, bss_conf->bssid); if (sta) { - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; int maxstreams; maxstreams = diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rs.c b/drivers/net/wireless/intel/iwlwifi/dvm/rs.c index 548540dd0c0f..2bb1e97cfd1f 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/rs.c @@ -1044,7 +1044,7 @@ static void rs_tx_status(void *priv_r, struct ieee80211_supported_band *sband, lq_sta->last_rate_n_flags = tx_rate; done: /* See if there's a better rate or modulation mode to try. 
*/ - if (sta && sta->supp_rates[sband->band]) + if (sta && sta->deflink.supp_rates[sband->band]) rs_rate_scale_perform(priv, skb, sta, lq_sta); if (priv->lib->bt_params && priv->lib->bt_params->advanced_bt_coexist) @@ -1244,7 +1244,7 @@ static int rs_switch_to_mimo2(struct iwl_priv *priv, struct iwl_station_priv *sta_priv = (void *)sta->drv_priv; struct iwl_rxon_context *ctx = sta_priv->ctx; - if (!conf_is_ht(conf) || !sta->ht_cap.ht_supported) + if (!conf_is_ht(conf) || !sta->deflink.ht_cap.ht_supported) return -1; if (sta->smps_mode == IEEE80211_SMPS_STATIC) @@ -1299,7 +1299,7 @@ static int rs_switch_to_mimo3(struct iwl_priv *priv, struct iwl_station_priv *sta_priv = (void *)sta->drv_priv; struct iwl_rxon_context *ctx = sta_priv->ctx; - if (!conf_is_ht(conf) || !sta->ht_cap.ht_supported) + if (!conf_is_ht(conf) || !sta->deflink.ht_cap.ht_supported) return -1; if (sta->smps_mode == IEEE80211_SMPS_STATIC) @@ -1355,7 +1355,7 @@ static int rs_switch_to_siso(struct iwl_priv *priv, struct iwl_station_priv *sta_priv = (void *)sta->drv_priv; struct iwl_rxon_context *ctx = sta_priv->ctx; - if (!conf_is_ht(conf) || !sta->ht_cap.ht_supported) + if (!conf_is_ht(conf) || !sta->deflink.ht_cap.ht_supported) return -1; IWL_DEBUG_RATE(priv, "LQ: try to switch to SISO\n"); @@ -1575,7 +1575,7 @@ static void rs_move_siso_to_other(struct iwl_priv *priv, struct iwl_scale_tbl_info *search_tbl = &(lq_sta->lq_info[(1 - lq_sta->active_tbl)]); struct iwl_rate_scale_data *window = &(tbl->win[index]); - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; u32 sz = (sizeof(struct iwl_scale_tbl_info) - (sizeof(struct iwl_rate_scale_data) * IWL_RATE_COUNT)); u8 start_action; @@ -1745,7 +1745,7 @@ static void rs_move_mimo2_to_other(struct iwl_priv *priv, struct iwl_scale_tbl_info *search_tbl = &(lq_sta->lq_info[(1 - lq_sta->active_tbl)]); struct iwl_rate_scale_data *window = &(tbl->win[index]); - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; u32 sz = (sizeof(struct iwl_scale_tbl_info) - (sizeof(struct iwl_rate_scale_data) * IWL_RATE_COUNT)); u8 start_action; @@ -1913,7 +1913,7 @@ static void rs_move_mimo3_to_other(struct iwl_priv *priv, struct iwl_scale_tbl_info *search_tbl = &(lq_sta->lq_info[(1 - lq_sta->active_tbl)]); struct iwl_rate_scale_data *window = &(tbl->win[index]); - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; u32 sz = (sizeof(struct iwl_scale_tbl_info) - (sizeof(struct iwl_rate_scale_data) * IWL_RATE_COUNT)); u8 start_action; @@ -2217,7 +2217,7 @@ static void rs_rate_scale_perform(struct iwl_priv *priv, info->flags & IEEE80211_TX_CTL_NO_ACK) return; - lq_sta->supp_rates = sta->supp_rates[lq_sta->band]; + lq_sta->supp_rates = sta->deflink.supp_rates[lq_sta->band]; tid = rs_tl_add_packet(lq_sta, hdr); if ((tid != IWL_MAX_TID_COUNT) && @@ -2768,7 +2768,7 @@ void iwl_rs_rate_init(struct iwl_priv *priv, struct ieee80211_sta *sta, u8 sta_i int i, j; struct ieee80211_hw *hw = priv->hw; struct ieee80211_conf *conf = &priv->hw->conf; - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; struct iwl_station_priv *sta_priv; struct iwl_lq_sta *lq_sta; struct ieee80211_supported_band *sband; @@ -2786,7 +2786,7 @@ void iwl_rs_rate_init(struct iwl_priv *priv, struct ieee80211_sta *sta, u8 sta_i rs_rate_scale_clear_window(&lq_sta->lq_info[j].win[i]); 
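(Illustration, not part of the patch.) The iwlegacy/dvm rate-scaling hunks all share one idiom: read the per-band supported-rates bitmap through deflink, then walk its set bits to build a hardware rate mask. A sketch of that idiom, assuming a hypothetical helper name:

#include <linux/bitops.h>
#include <net/mac80211.h>

static u32 build_legacy_rate_mask(struct ieee80211_sta *sta,
				  struct ieee80211_supported_band *sband)
{
	/* must be unsigned long for for_each_set_bit() */
	unsigned long supp = sta->deflink.supp_rates[sband->band];
	u32 mask = 0;
	int i;

	for_each_set_bit(i, &supp, BITS_PER_LONG)
		mask |= BIT(sband->bitrates[i].hw_value);

	return mask;
}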
lq_sta->flush_timer = 0; - lq_sta->supp_rates = sta->supp_rates[sband->band]; + lq_sta->supp_rates = sta->deflink.supp_rates[sband->band]; IWL_DEBUG_RATE(priv, "LQ: *** rate scale station global init for station %d ***\n", sta_id); @@ -2803,7 +2803,7 @@ void iwl_rs_rate_init(struct iwl_priv *priv, struct ieee80211_sta *sta, u8 sta_i /* * active legacy rates as per supported rates bitmap */ - supp = sta->supp_rates[sband->band]; + supp = sta->deflink.supp_rates[sband->band]; lq_sta->active_legacy_rate = 0; for_each_set_bit(i, &supp, BITS_PER_LONG) lq_sta->active_legacy_rate |= BIT(sband->bitrates[i].hw_value); diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c index 12a3d464ae64..b4c0fcc021be 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c @@ -1285,7 +1285,7 @@ static void iwlagn_check_needed_chains(struct iwl_priv *priv, break; } - ht_cap = &sta->ht_cap; + ht_cap = &sta->deflink.ht_cap; need_multiple = true; diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/sta.c b/drivers/net/wireless/intel/iwlwifi/dvm/sta.c index ddc14059b07d..cc20d41df5ae 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/sta.c @@ -144,7 +144,7 @@ bool iwl_is_ht40_tx_allowed(struct iwl_priv *priv, if (!sta) return true; - return sta->bandwidth >= IEEE80211_STA_RX_BW_40; + return sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40; } static void iwl_sta_calc_ht_flags(struct iwl_priv *priv, @@ -152,7 +152,7 @@ static void iwl_sta_calc_ht_flags(struct iwl_priv *priv, struct iwl_rxon_context *ctx, __le32 *flags, __le32 *mask) { - struct ieee80211_sta_ht_cap *sta_ht_inf = &sta->ht_cap; + struct ieee80211_sta_ht_cap *sta_ht_inf = &sta->deflink.ht_cap; *mask = STA_FLG_RTS_MIMO_PROT_MSK | STA_FLG_MIMO_DIS_MSK | diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 00ca17f3b263..4fcddbcf5508 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -917,7 +917,7 @@ iwl_mvm_get_wowlan_config(struct iwl_mvm *mvm, /* TODO: wowlan_config_cmd->wowlan_ba_teardown_tids */ wowlan_config_cmd->is_11n_connection = - ap_sta->ht_cap.ht_supported; + ap_sta->deflink.ht_cap.ht_supported; wowlan_config_cmd->flags = ENABLE_L3_FILTERING | ENABLE_NBNS_FILTERING | ENABLE_DHCP_FILTERING; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 56c7a68a6491..8d1af8376a0e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -2069,7 +2069,7 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, return; } - if (!sta->he_cap.has_he) { + if (!sta->deflink.he_cap.has_he) { rcu_read_unlock(); return; } @@ -2081,17 +2081,17 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, flags |= STA_CTXT_HE_RU_2MHZ_BLOCK; /* HTC flags */ - if (sta->he_cap.he_cap_elem.mac_cap_info[0] & + if (sta->deflink.he_cap.he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_HTC_HE) sta_ctxt_cmd.htc_flags |= cpu_to_le32(IWL_HE_HTC_SUPPORT); - if ((sta->he_cap.he_cap_elem.mac_cap_info[1] & + if ((sta->deflink.he_cap.he_cap_elem.mac_cap_info[1] & IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION) || - (sta->he_cap.he_cap_elem.mac_cap_info[2] & + (sta->deflink.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION)) { u8 link_adap = - 
((sta->he_cap.he_cap_elem.mac_cap_info[2] & + ((sta->deflink.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION) << 1) + - (sta->he_cap.he_cap_elem.mac_cap_info[1] & + (sta->deflink.he_cap.he_cap_elem.mac_cap_info[1] & IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION); if (link_adap == 2) @@ -2101,12 +2101,12 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, sta_ctxt_cmd.htc_flags |= cpu_to_le32(IWL_HE_HTC_LINK_ADAP_BOTH); } - if (sta->he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BSR) + if (sta->deflink.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BSR) sta_ctxt_cmd.htc_flags |= cpu_to_le32(IWL_HE_HTC_BSR_SUPP); - if (sta->he_cap.he_cap_elem.mac_cap_info[3] & + if (sta->deflink.he_cap.he_cap_elem.mac_cap_info[3] & IEEE80211_HE_MAC_CAP3_OMI_CONTROL) sta_ctxt_cmd.htc_flags |= cpu_to_le32(IWL_HE_HTC_OMI_SUPP); - if (sta->he_cap.he_cap_elem.mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_BQR) + if (sta->deflink.he_cap.he_cap_elem.mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_BQR) sta_ctxt_cmd.htc_flags |= cpu_to_le32(IWL_HE_HTC_BQR_SUPP); /* @@ -2116,15 +2116,15 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, memset(&sta_ctxt_cmd.pkt_ext, 7, sizeof(sta_ctxt_cmd.pkt_ext)); /* If PPE Thresholds exist, parse them into a FW-familiar format. */ - if (sta->he_cap.he_cap_elem.phy_cap_info[6] & + if (sta->deflink.he_cap.he_cap_elem.phy_cap_info[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { - u8 nss = (sta->he_cap.ppe_thres[0] & + u8 nss = (sta->deflink.he_cap.ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) + 1; u8 ru_index_bitmap = - (sta->he_cap.ppe_thres[0] & + (sta->deflink.he_cap.ppe_thres[0] & IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK) >> IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS; - u8 *ppe = &sta->he_cap.ppe_thres[0]; + u8 *ppe = &sta->deflink.he_cap.ppe_thres[0]; u8 ppe_pos_bit = 7; /* Starting after PPE header */ /* @@ -2162,14 +2162,14 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, } flags |= STA_CTXT_HE_PACKET_EXT; - } else if ((sta->he_cap.he_cap_elem.phy_cap_info[9] & + } else if ((sta->deflink.he_cap.he_cap_elem.phy_cap_info[9] & IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_MASK) != IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_RESERVED) { int low_th = -1; int high_th = -1; /* Take the PPE thresholds from the nominal padding info */ - switch (sta->he_cap.he_cap_elem.phy_cap_info[9] & + switch (sta->deflink.he_cap.he_cap_elem.phy_cap_info[9] & IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_MASK) { case IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_0US: low_th = IWL_HE_PKT_EXT_NONE; @@ -2206,11 +2206,11 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, } } - if (sta->he_cap.he_cap_elem.mac_cap_info[2] & + if (sta->deflink.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP) flags |= STA_CTXT_HE_32BIT_BA_BITMAP; - if (sta->he_cap.he_cap_elem.mac_cap_info[2] & + if (sta->deflink.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_ACK_EN) flags |= STA_CTXT_HE_ACK_ENABLED; @@ -3171,7 +3171,7 @@ static int iwl_mvm_mac_sta_state(struct ieee80211_hw *hw, } if (vif->type == NL80211_IFTYPE_STATION) - vif->bss_conf.he_support = sta->he_cap.has_he; + vif->bss_conf.he_support = sta->deflink.he_cap.has_he; if (sta->tdls && (vif->p2p || @@ -3203,17 +3203,17 @@ static int iwl_mvm_mac_sta_state(struct ieee80211_hw *hw, } else if (old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC) { if (vif->type == NL80211_IFTYPE_AP) { - vif->bss_conf.he_support = sta->he_cap.has_he; + vif->bss_conf.he_support = sta->deflink.he_cap.has_he; 
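(Illustration, not part of the patch.) The iwlwifi mvm HE hunks above test individual HE MAC capability bits, now reached through sta->deflink.he_cap. The recurring pattern, condensed into a hedged sketch (helper name is mine):

#include <linux/ieee80211.h>
#include <net/mac80211.h>

/* True when the peer advertised HTC (HE control field) support. */
static bool sta_has_htc_he(struct ieee80211_sta *sta)
{
	const struct ieee80211_sta_he_cap *he = &sta->deflink.he_cap;

	return he->has_he &&
	       (he->he_cap_elem.mac_cap_info[0] &
		IEEE80211_HE_MAC_CAP0_HTC_HE);
}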
mvmvif->ap_assoc_sta_count++; iwl_mvm_mac_ctxt_changed(mvm, vif, false, NULL); if (vif->bss_conf.he_support && !iwlwifi_mod_params.disable_11ax) iwl_mvm_cfg_he_sta(mvm, vif, mvm_sta->sta_id); } else if (vif->type == NL80211_IFTYPE_STATION) { - vif->bss_conf.he_support = sta->he_cap.has_he; + vif->bss_conf.he_support = sta->deflink.he_cap.has_he; mvmvif->he_ru_2mhz_block = false; - if (sta->he_cap.has_he) + if (sta->deflink.he_cap.has_he) iwl_mvm_check_he_obss_narrow_bw_ru(hw, vif); iwl_mvm_mac_ctxt_changed(mvm, vif, false, NULL); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c index 2d58cb969918..f6d09ca1d3ed 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c @@ -11,7 +11,7 @@ static u8 rs_fw_bw_from_sta_bw(struct ieee80211_sta *sta) { - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_160: return IWL_TLC_MNG_CH_WIDTH_160MHZ; case IEEE80211_STA_RX_BW_80: @@ -42,9 +42,9 @@ static u8 rs_fw_set_active_chains(u8 chains) static u8 rs_fw_sgi_cw_support(struct ieee80211_sta *sta) { - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; - struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; - struct ieee80211_sta_he_cap *he_cap = &sta->he_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; + struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; + struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; u8 supp = 0; if (he_cap->has_he) @@ -66,9 +66,9 @@ static u16 rs_fw_get_config_flags(struct iwl_mvm *mvm, struct ieee80211_sta *sta, struct ieee80211_supported_band *sband) { - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; - struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; - struct ieee80211_sta_he_cap *he_cap = &sta->he_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; + struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; + struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; bool vht_ena = vht_cap->vht_supported; u16 flags = 0; @@ -137,7 +137,7 @@ rs_fw_vht_set_enabled_rates(const struct ieee80211_sta *sta, { u16 supp; int i, highest_mcs; - u8 max_nss = sta->rx_nss; + u8 max_nss = sta->deflink.rx_nss; struct ieee80211_vht_cap ieee_vht_cap = { .vht_cap_info = cpu_to_le32(vht_cap->cap), .supp_mcs = vht_cap->vht_mcs, @@ -155,7 +155,7 @@ rs_fw_vht_set_enabled_rates(const struct ieee80211_sta *sta, continue; supp = BIT(highest_mcs + 1) - 1; - if (sta->bandwidth == IEEE80211_STA_RX_BW_20) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) supp &= ~BIT(IWL_TLC_MNG_HT_RATE_MCS9); cmd->ht_rates[i][IWL_TLC_HT_BW_NONE_160] = cpu_to_le16(supp); @@ -164,7 +164,7 @@ rs_fw_vht_set_enabled_rates(const struct ieee80211_sta *sta, * configuration is supported - only for MCS 0 since we already * decoded the MCS bits anyway ourselves. 
*/ - if (sta->bandwidth == IEEE80211_STA_RX_BW_160 && + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160 && ieee80211_get_vht_max_nss(&ieee_vht_cap, IEEE80211_VHT_CHANWIDTH_160MHZ, 0, true, nss) >= nss) @@ -195,7 +195,7 @@ rs_fw_he_set_enabled_rates(const struct ieee80211_sta *sta, struct ieee80211_supported_band *sband, struct iwl_tlc_config_cmd *cmd) { - const struct ieee80211_sta_he_cap *he_cap = &sta->he_cap; + const struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; u16 mcs_160 = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160); u16 mcs_80 = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80); u16 tx_mcs_80 = @@ -203,7 +203,7 @@ rs_fw_he_set_enabled_rates(const struct ieee80211_sta *sta, u16 tx_mcs_160 = le16_to_cpu(sband->iftype_data->he_cap.he_mcs_nss_supp.tx_mcs_160); int i; - u8 nss = sta->rx_nss; + u8 nss = sta->deflink.rx_nss; /* the station support only a single receive chain */ if (sta->smps_mode == IEEE80211_SMPS_STATIC) @@ -246,12 +246,12 @@ static void rs_fw_set_supp_rates(struct ieee80211_sta *sta, int i; u16 supp = 0; unsigned long tmp; /* must be unsigned long for for_each_set_bit */ - const struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; - const struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; - const struct ieee80211_sta_he_cap *he_cap = &sta->he_cap; + const struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; + const struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; + const struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; /* non HT rates */ - tmp = sta->supp_rates[sband->band]; + tmp = sta->deflink.supp_rates[sband->band]; for_each_set_bit(i, &tmp, BITS_PER_LONG) supp |= BIT(sband->bitrates[i].hw_value); @@ -362,11 +362,11 @@ out: u16 rs_fw_get_max_amsdu_len(struct ieee80211_sta *sta) { struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta); - const struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; - const struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; + const struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; + const struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; if (mvmsta->vif->bss_conf.chandef.chan->band == NL80211_BAND_6GHZ) { - switch (le16_get_bits(sta->he_6ghz_capa.capa, + switch (le16_get_bits(sta->deflink.he_6ghz_capa.capa, IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN)) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: return IEEE80211_MAX_MPDU_LEN_VHT_11454; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index b97708cb869d..f20a04564a4b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -140,7 +140,7 @@ static bool rs_mimo_allow(struct iwl_mvm *mvm, struct ieee80211_sta *sta, struct rs_rate *rate, const struct rs_tx_column *next_col) { - if (!sta->ht_cap.ht_supported) + if (!sta->deflink.ht_cap.ht_supported) return false; if (sta->smps_mode == IEEE80211_SMPS_STATIC) @@ -162,7 +162,7 @@ static bool rs_siso_allow(struct iwl_mvm *mvm, struct ieee80211_sta *sta, struct rs_rate *rate, const struct rs_tx_column *next_col) { - if (!sta->ht_cap.ht_supported) + if (!sta->deflink.ht_cap.ht_supported) return false; return true; @@ -172,8 +172,8 @@ static bool rs_sgi_allow(struct iwl_mvm *mvm, struct ieee80211_sta *sta, struct rs_rate *rate, const struct rs_tx_column *next_col) { - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; - struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; + struct ieee80211_sta_vht_cap 
*vht_cap = &sta->deflink.vht_cap; if (is_ht20(rate) && (ht_cap->cap & IEEE80211_HT_CAP_SGI_20)) @@ -1416,13 +1416,13 @@ static s32 rs_get_best_rate(struct iwl_mvm *mvm, static u32 rs_bw_from_sta_bw(struct ieee80211_sta *sta) { - struct ieee80211_sta_vht_cap *sta_vht_cap = &sta->vht_cap; + struct ieee80211_sta_vht_cap *sta_vht_cap = &sta->deflink.vht_cap; struct ieee80211_vht_cap vht_cap = { .vht_cap_info = cpu_to_le32(sta_vht_cap->cap), .supp_mcs = sta_vht_cap->vht_mcs, }; - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_160: /* * Don't use 160 MHz if VHT extended NSS support @@ -1435,7 +1435,7 @@ static u32 rs_bw_from_sta_bw(struct ieee80211_sta *sta) if (ieee80211_get_vht_max_nss(&vht_cap, IEEE80211_VHT_CHANWIDTH_160MHZ, 0, true, - sta->rx_nss) < sta->rx_nss) + sta->deflink.rx_nss) < sta->deflink.rx_nss) return RATE_MCS_CHAN_WIDTH_80; return RATE_MCS_CHAN_WIDTH_160; case IEEE80211_STA_RX_BW_80: @@ -2584,7 +2584,7 @@ static void rs_get_initial_rate(struct iwl_mvm *mvm, * In case of VHT/HT when the rssi is low fallback to the case of * legacy rates. */ - if (sta->vht_cap.vht_supported && + if (sta->deflink.vht_cap.vht_supported && best_rssi > IWL_RS_LOW_RSSI_THRESHOLD) { /* * In AP mode, when a new station associates, rs is initialized @@ -2610,14 +2610,15 @@ static void rs_get_initial_rate(struct iwl_mvm *mvm, nentries = ARRAY_SIZE(rs_optimal_rates_vht_20mhz); break; default: - IWL_ERR(mvm, "Invalid BW %d\n", sta->bandwidth); + IWL_ERR(mvm, "Invalid BW %d\n", + sta->deflink.bandwidth); goto out; } active_rate = lq_sta->active_siso_rate; rate->type = LQ_VHT_SISO; rate->bw = bw; - } else if (sta->ht_cap.ht_supported && + } else if (sta->deflink.ht_cap.ht_supported && best_rssi > IWL_RS_LOW_RSSI_THRESHOLD) { initial_rates = rs_optimal_rates_ht; nentries = ARRAY_SIZE(rs_optimal_rates_ht); @@ -2809,14 +2810,14 @@ static void rs_vht_set_enabled_rates(struct ieee80211_sta *sta, /* VHT MCS9 isn't valid for 20Mhz for NSS=1,2 */ if (i == IWL_RATE_MCS_9_INDEX && - sta->bandwidth == IEEE80211_STA_RX_BW_20) + sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) continue; lq_sta->active_siso_rate |= BIT(i); } } - if (sta->rx_nss < 2) + if (sta->deflink.rx_nss < 2) return; highest_mcs = rs_vht_highest_rx_mcs_index(vht_cap, 2); @@ -2827,7 +2828,7 @@ static void rs_vht_set_enabled_rates(struct ieee80211_sta *sta, /* VHT MCS9 isn't valid for 20Mhz for NSS=1,2 */ if (i == IWL_RATE_MCS_9_INDEX && - sta->bandwidth == IEEE80211_STA_RX_BW_20) + sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) continue; lq_sta->active_mimo2_rate |= BIT(i); @@ -2964,8 +2965,8 @@ static void rs_drv_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta, { int i, j; struct ieee80211_hw *hw = mvm->hw; - struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap; - struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; + struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta); struct iwl_lq_sta *lq_sta = &mvmsta->lq_sta.rs_drv; struct ieee80211_supported_band *sband; @@ -3001,7 +3002,7 @@ static void rs_drv_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta, /* * active legacy rates as per supported rates bitmap */ - supp = sta->supp_rates[sband->band]; + supp = sta->deflink.supp_rates[sband->band]; lq_sta->active_legacy_rate = 0; for_each_set_bit(i, &supp, BITS_PER_LONG) lq_sta->active_legacy_rate |= BIT(sband->bitrates[i].hw_value); @@ -3294,7 +3295,7 @@ static void 
__iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, IWL_DEBUG_RATE(mvm, "reduced txpower: %d\n", reduced_txp); done: /* See if there's a better rate or modulation mode to try. */ - if (sta->supp_rates[info->band]) + if (sta->deflink.supp_rates[info->band]) rs_rate_scale_perform(mvm, sta, lq_sta, tid, ndp); } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index efccdd3f3377..6724d9a4618e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -215,6 +215,9 @@ static int iwl_mvm_create_skb(struct iwl_mvm *mvm, struct sk_buff *skb, shdr->type != htons(ETH_P_PAE) && shdr->type != htons(ETH_P_TDLS)))) skb->ip_summed = CHECKSUM_NONE; + else + /* mac80211 assumes full CSUM including SNAP header */ + skb_postpush_rcsum(skb, shdr, sizeof(*shdr)); } fraglen = len - headlen; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sf.c b/drivers/net/wireless/intel/iwlwifi/mvm/sf.c index 655da8856c75..693752d8f65b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sf.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sf.c @@ -106,10 +106,10 @@ static void iwl_mvm_fill_sf_command(struct iwl_mvm *mvm, * capabilities of the AP station, and choose the watermark accordingly. */ if (sta) { - if (sta->ht_cap.ht_supported || - sta->vht_cap.vht_supported || - sta->he_cap.has_he) { - switch (sta->rx_nss) { + if (sta->deflink.ht_cap.ht_supported || + sta->deflink.vht_cap.vht_supported || + sta->deflink.he_cap.has_he) { + switch (sta->deflink.rx_nss) { case 1: watermark = SF_W_MARK_SISO; break; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 1bb456daff9e..6c691079798a 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -86,7 +86,7 @@ int iwl_mvm_sta_send_to_fw(struct iwl_mvm *mvm, struct ieee80211_sta *sta, } } - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_160: add_sta_cmd.station_flags |= cpu_to_le32(STA_FLG_FAT_EN_160MHZ); fallthrough; @@ -97,13 +97,13 @@ int iwl_mvm_sta_send_to_fw(struct iwl_mvm *mvm, struct ieee80211_sta *sta, add_sta_cmd.station_flags |= cpu_to_le32(STA_FLG_FAT_EN_40MHZ); fallthrough; case IEEE80211_STA_RX_BW_20: - if (sta->ht_cap.ht_supported) + if (sta->deflink.ht_cap.ht_supported) add_sta_cmd.station_flags |= cpu_to_le32(STA_FLG_FAT_EN_20MHZ); break; } - switch (sta->rx_nss) { + switch (sta->deflink.rx_nss) { case 1: add_sta_cmd.station_flags |= cpu_to_le32(STA_FLG_MIMO_EN_SISO); break; @@ -133,12 +133,12 @@ int iwl_mvm_sta_send_to_fw(struct iwl_mvm *mvm, struct ieee80211_sta *sta, break; } - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { add_sta_cmd.station_flags_msk |= cpu_to_le32(STA_FLG_MAX_AGG_SIZE_MSK | STA_FLG_AGG_MPDU_DENS_MSK); - mpdu_dens = sta->ht_cap.ampdu_density; + mpdu_dens = sta->deflink.ht_cap.ampdu_density; } if (mvm_sta->vif->bss_conf.chandef.chan->band == NL80211_BAND_6GHZ) { @@ -146,18 +146,18 @@ int iwl_mvm_sta_send_to_fw(struct iwl_mvm *mvm, struct ieee80211_sta *sta, cpu_to_le32(STA_FLG_MAX_AGG_SIZE_MSK | STA_FLG_AGG_MPDU_DENS_MSK); - mpdu_dens = le16_get_bits(sta->he_6ghz_capa.capa, + mpdu_dens = le16_get_bits(sta->deflink.he_6ghz_capa.capa, IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START); - agg_size = le16_get_bits(sta->he_6ghz_capa.capa, - IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP); + agg_size = le16_get_bits(sta->deflink.he_6ghz_capa.capa, + IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP); 
} else - if (sta->vht_cap.vht_supported) { - agg_size = sta->vht_cap.cap & + if (sta->deflink.vht_cap.vht_supported) { + agg_size = sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; agg_size >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; - } else if (sta->ht_cap.ht_supported) { - agg_size = sta->ht_cap.ampdu_factor; + } else if (sta->deflink.ht_cap.ht_supported) { + agg_size = sta->deflink.ht_cap.ampdu_factor; } /* D6.0 10.12.2 A-MPDU length limit rules @@ -168,8 +168,8 @@ int iwl_mvm_sta_send_to_fw(struct iwl_mvm *mvm, struct ieee80211_sta *sta, * Maximum AMPDU Length Exponent Extension field in its HE * Capabilities element */ - if (sta->he_cap.has_he) - agg_size += u8_get_bits(sta->he_cap.he_cap_elem.mac_cap_info[3], + if (sta->deflink.he_cap.has_he) + agg_size += u8_get_bits(sta->deflink.he_cap.he_cap_elem.mac_cap_info[3], IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK); /* Limit to max A-MPDU supported by FW */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index e354918c2480..36508196232d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -743,7 +743,7 @@ unsigned int iwl_mvm_max_amsdu_size(struct iwl_mvm *mvm, int lmac = iwl_mvm_get_lmac_id(mvm->fw, band); /* For HE redirect to trigger based fifos */ - if (sta->he_cap.has_he && !WARN_ON(!iwl_mvm_has_new_tx_api(mvm))) + if (sta->deflink.he_cap.has_he && !WARN_ON(!iwl_mvm_has_new_tx_api(mvm))) ac += 4; txf = iwl_mvm_mac_ac_to_tx_fifo(mvm, ac); @@ -884,7 +884,7 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb, * section 8.7.3 NOTE 3). */ if (info->flags & IEEE80211_TX_CTL_AMPDU && - !sta->vht_cap.vht_supported) + !sta->deflink.vht_cap.vht_supported) max_amsdu_len = min_t(unsigned int, max_amsdu_len, 4095); /* Sub frame header + SNAP + IP header + TCP header + MSS */ @@ -1032,7 +1032,7 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb, if (WARN_ON_ONCE(mvmsta->sta_id == IWL_MVM_INVALID_STA)) return -1; - if (unlikely(ieee80211_is_any_nullfunc(fc)) && sta->he_cap.has_he) + if (unlikely(ieee80211_is_any_nullfunc(fc)) && sta->deflink.he_cap.has_he) return -1; if (unlikely(ieee80211_is_probe_resp(fc))) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index c3c3b5aa87b0..48d9bde5a45c 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -652,6 +652,7 @@ struct mac80211_hwsim_data { ARRAY_SIZE(hwsim_channels_6ghz)]; struct ieee80211_channel *channel; + enum nl80211_chan_width bw; u64 beacon_int /* beacon interval in us */; unsigned int rx_filter; bool started, idle, scanning; @@ -803,6 +804,40 @@ extern int hwsim_tx_virtio(struct mac80211_hwsim_data *data, #define hwsim_virtio_enabled false #endif +static int hwsim_get_chanwidth(enum nl80211_chan_width bw) +{ + switch (bw) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + return 20; + case NL80211_CHAN_WIDTH_40: + return 40; + case NL80211_CHAN_WIDTH_80: + return 80; + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + return 160; + case NL80211_CHAN_WIDTH_320: + return 320; + case NL80211_CHAN_WIDTH_5: + return 5; + case NL80211_CHAN_WIDTH_10: + return 10; + case NL80211_CHAN_WIDTH_1: + return 1; + case NL80211_CHAN_WIDTH_2: + return 2; + case NL80211_CHAN_WIDTH_4: + return 4; + case NL80211_CHAN_WIDTH_8: + return 8; + case NL80211_CHAN_WIDTH_16: + return 16; + } + + return INT_MAX; +} + 
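(Illustration, not part of the patch.) hwsim_get_chanwidth(), added just above, linearizes enum nl80211_chan_width into a MHz count so channel widths can be compared numerically; the TX path and sta_rc_update handler below use it to warn when a selected rate or station bandwidth exceeds the configured channel width. The comparison reduces to this wrapper (name is mine):

static bool hwsim_rate_width_ok(enum nl80211_chan_width rate_bw,
				enum nl80211_chan_width conf_bw)
{
	/* 80+80 counts as 160 MHz; anything unhandled maps to INT_MAX */
	return hwsim_get_chanwidth(rate_bw) <= hwsim_get_chanwidth(conf_bw);
}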
static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw, struct sk_buff *skb, struct ieee80211_channel *chan); @@ -1599,7 +1634,8 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; bool ack; - u32 _portid; + enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT; + u32 _portid, i; if (WARN_ON(skb->len < 10)) { /* Should not happen; just a sanity check for addr1 use */ @@ -1609,14 +1645,17 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, if (!data->use_chanctx) { channel = data->channel; + confbw = data->bw; } else if (txi->hw_queue == 4) { channel = data->tmp_chan; } else { chanctx_conf = rcu_dereference(txi->control.vif->chanctx_conf); - if (chanctx_conf) + if (chanctx_conf) { channel = chanctx_conf->def.chan; - else + confbw = chanctx_conf->def.width; + } else { channel = NULL; + } } if (WARN(!channel, "TX w/o channel - queue = %d\n", txi->hw_queue)) { @@ -1640,6 +1679,25 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, txi->control.rates, ARRAY_SIZE(txi->control.rates)); + for (i = 0; i < ARRAY_SIZE(txi->control.rates); i++) { + u16 rflags = txi->control.rates[i].flags; + /* initialize to data->bw for 5/10 MHz handling */ + enum nl80211_chan_width bw = data->bw; + + if (txi->control.rates[i].idx == -1) + break; + + if (rflags & IEEE80211_TX_RC_40_MHZ_WIDTH) + bw = NL80211_CHAN_WIDTH_40; + else if (rflags & IEEE80211_TX_RC_80_MHZ_WIDTH) + bw = NL80211_CHAN_WIDTH_80; + else if (rflags & IEEE80211_TX_RC_160_MHZ_WIDTH) + bw = NL80211_CHAN_WIDTH_160; + + if (WARN_ON(hwsim_get_chanwidth(bw) > hwsim_get_chanwidth(confbw))) + return; + } + if (skb->len >= 24 + 8 && ieee80211_is_probe_resp(hdr->frame_control)) { /* fake header transmission time */ @@ -1939,6 +1997,7 @@ static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed) } data->channel = conf->chandef.chan; + data->bw = conf->chandef.width; for (idx = 0; idx < ARRAY_SIZE(data->survey_data); idx++) { if (data->survey_data[idx].channel && @@ -1950,6 +2009,7 @@ static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed) } } else { data->channel = conf->chandef.chan; + data->bw = conf->chandef.width; } mutex_unlock(&data->mutex); @@ -2081,12 +2141,51 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw, wiphy_dbg(hw->wiphy, " TX Power: %d dBm\n", info->txpower); } +static void +mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + u32 changed) +{ + struct mac80211_hwsim_data *data = hw->priv; + u32 bw = U32_MAX; + enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT; + + switch (sta->deflink.bandwidth) { +#define C(_bw) case IEEE80211_STA_RX_BW_##_bw: bw = _bw; break + C(20); + C(40); + C(80); + C(160); +#undef C + } + + if (!data->use_chanctx) { + confbw = data->bw; + } else { + struct ieee80211_chanctx_conf *chanctx_conf; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(vif->chanctx_conf); + + if (!WARN_ON(!chanctx_conf)) + confbw = chanctx_conf->def.width; + rcu_read_unlock(); + } + + WARN(bw > hwsim_get_chanwidth(confbw), + "intf %pM: bad STA %pM bandwidth %d MHz (%d) > channel config %d MHz (%d)\n", + vif->addr, sta->addr, bw, sta->deflink.bandwidth, + hwsim_get_chanwidth(data->bw), data->bw); +} + static int mac80211_hwsim_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta) { hwsim_check_magic(vif); hwsim_set_sta_magic(sta); + mac80211_hwsim_sta_rc_update(hw, vif, sta, 0); return 
0; } @@ -2665,6 +2764,7 @@ static int mac80211_hwsim_tx_last_beacon(struct ieee80211_hw *hw) .sta_add = mac80211_hwsim_sta_add, \ .sta_remove = mac80211_hwsim_sta_remove, \ .sta_notify = mac80211_hwsim_sta_notify, \ + .sta_rc_update = mac80211_hwsim_sta_rc_update, \ .set_tim = mac80211_hwsim_set_tim, \ .conf_tx = mac80211_hwsim_conf_tx, \ .get_survey = mac80211_hwsim_get_survey, \ diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index 4e3de684928b..b9e674343b17 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -1437,7 +1437,7 @@ static int lbs_cfg_disconnect(struct wiphy *wiphy, struct net_device *dev, } static int lbs_cfg_set_default_key(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast) { @@ -1457,8 +1457,8 @@ static int lbs_cfg_set_default_key(struct wiphy *wiphy, static int lbs_cfg_add_key(struct wiphy *wiphy, struct net_device *netdev, - u8 idx, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 idx, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct lbs_private *priv = wiphy_priv(wiphy); u16 key_info; @@ -1518,7 +1518,8 @@ static int lbs_cfg_add_key(struct wiphy *wiphy, struct net_device *netdev, static int lbs_cfg_del_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { lbs_deb_assoc("del_key: key_idx %d, mac_addr %pM\n", diff --git a/drivers/net/wireless/marvell/libertas/mesh.c b/drivers/net/wireless/marvell/libertas/mesh.c index 6cbba84989b8..d57a9264ee29 100644 --- a/drivers/net/wireless/marvell/libertas/mesh.c +++ b/drivers/net/wireless/marvell/libertas/mesh.c @@ -109,9 +109,9 @@ static int lbs_mesh_config(struct lbs_private *priv, uint16_t action, if (priv->mesh_dev) { mesh_wdev = priv->mesh_dev->ieee80211_ptr; - ie->val.mesh_id_len = mesh_wdev->mesh_id_up_len; - memcpy(ie->val.mesh_id, mesh_wdev->ssid, - mesh_wdev->mesh_id_up_len); + ie->val.mesh_id_len = mesh_wdev->u.mesh.id_up_len; + memcpy(ie->val.mesh_id, mesh_wdev->u.mesh.id, + mesh_wdev->u.mesh.id_up_len); } ie->len = sizeof(struct mrvl_meshie_val) - @@ -986,8 +986,8 @@ static int lbs_add_mesh(struct lbs_private *priv) mesh_wdev->wiphy = priv->wdev->wiphy; if (priv->mesh_tlv) { - sprintf(mesh_wdev->ssid, "mesh"); - mesh_wdev->mesh_id_up_len = 4; + sprintf(mesh_wdev->u.mesh.id, "mesh"); + mesh_wdev->u.mesh.id_up_len = 4; } mesh_wdev->netdev = mesh_dev; diff --git a/drivers/net/wireless/marvell/mwifiex/11h.c b/drivers/net/wireless/marvell/mwifiex/11h.c index 3fa25cd64cda..4ca8d0135708 100644 --- a/drivers/net/wireless/marvell/mwifiex/11h.c +++ b/drivers/net/wireless/marvell/mwifiex/11h.c @@ -304,6 +304,6 @@ void mwifiex_dfs_chan_sw_work_queue(struct work_struct *work) mwifiex_dbg(priv->adapter, MSG, "indicating channel switch completion to kernel\n"); mutex_lock(&priv->wdev.mtx); - cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef); + cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef, 0); mutex_unlock(&priv->wdev.mtx); } diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 97f0f39364d6..8c4f24da563d 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -154,7 +154,8 @@ static void *mwifiex_cfg80211_get_adapter(struct 
wiphy *wiphy) */ static int mwifiex_cfg80211_del_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); static const u8 bc_mac[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; @@ -443,7 +444,7 @@ mwifiex_cfg80211_set_power_mgmt(struct wiphy *wiphy, */ static int mwifiex_cfg80211_set_default_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool unicast, + int link_id, u8 key_index, bool unicast, bool multicast) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); @@ -468,8 +469,8 @@ mwifiex_cfg80211_set_default_key(struct wiphy *wiphy, struct net_device *netdev, */ static int mwifiex_cfg80211_add_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); struct mwifiex_wep_key *wep_key; @@ -506,6 +507,7 @@ mwifiex_cfg80211_add_key(struct wiphy *wiphy, struct net_device *netdev, static int mwifiex_cfg80211_set_default_mgmt_key(struct wiphy *wiphy, struct net_device *netdev, + int link_id, u8 key_index) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); @@ -1680,10 +1682,12 @@ mwifiex_mgmt_stypes[NUM_NL80211_IFTYPES] = { * Function configures data rates to firmware using bitrate mask * provided by cfg80211. */ -static int mwifiex_cfg80211_set_bitrate_mask(struct wiphy *wiphy, - struct net_device *dev, - const u8 *peer, - const struct cfg80211_bitrate_mask *mask) +static int +mwifiex_cfg80211_set_bitrate_mask(struct wiphy *wiphy, + struct net_device *dev, + unsigned int link_id, + const u8 *peer, + const struct cfg80211_bitrate_mask *mask) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); u16 bitmap_rates[MAX_BITMAP_RATES_SIZE]; @@ -1925,7 +1929,8 @@ mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) /* cfg80211 operation handler for stop ap. * Function stops BSS running at uAP interface. 
*/ -static int mwifiex_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int mwifiex_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); @@ -2348,7 +2353,7 @@ mwifiex_cfg80211_connect(struct wiphy *wiphy, struct net_device *dev, return -EINVAL; } - if (priv->wdev.current_bss) { + if (priv->wdev.connected) { mwifiex_dbg(adapter, ERROR, "%s: already connected\n", dev->name); return -EALREADY; @@ -2576,7 +2581,7 @@ mwifiex_cfg80211_scan(struct wiphy *wiphy, return -EBUSY; } - if (!priv->wdev.current_bss && priv->scan_block) + if (!priv->wdev.connected && priv->scan_block) priv->scan_block = false; if (!mwifiex_stop_bg_scan(priv)) @@ -3987,6 +3992,7 @@ mwifiex_cfg80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, static int mwifiex_cfg80211_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); diff --git a/drivers/net/wireless/marvell/mwifiex/sta_cmd.c b/drivers/net/wireless/marvell/mwifiex/sta_cmd.c index 48ea00da1fc9..6b874697746e 100644 --- a/drivers/net/wireless/marvell/mwifiex/sta_cmd.c +++ b/drivers/net/wireless/marvell/mwifiex/sta_cmd.c @@ -1786,29 +1786,31 @@ mwifiex_cmd_tdls_oper(struct mwifiex_private *priv, wmm_qos_info->qos_info = 0; config_len += sizeof(struct mwifiex_ie_types_qos_info); - if (params->ht_capa) { + if (params->link_sta_params.ht_capa) { ht_capab = (struct mwifiex_ie_types_htcap *)(pos + config_len); ht_capab->header.type = cpu_to_le16(WLAN_EID_HT_CAPABILITY); ht_capab->header.len = cpu_to_le16(sizeof(struct ieee80211_ht_cap)); - memcpy(&ht_capab->ht_cap, params->ht_capa, + memcpy(&ht_capab->ht_cap, params->link_sta_params.ht_capa, sizeof(struct ieee80211_ht_cap)); config_len += sizeof(struct mwifiex_ie_types_htcap); } - if (params->supported_rates && params->supported_rates_len) { + if (params->link_sta_params.supported_rates && + params->link_sta_params.supported_rates_len) { tlv_rates = (struct host_cmd_tlv_rates *)(pos + config_len); tlv_rates->header.type = cpu_to_le16(WLAN_EID_SUPP_RATES); tlv_rates->header.len = - cpu_to_le16(params->supported_rates_len); - memcpy(tlv_rates->rates, params->supported_rates, - params->supported_rates_len); + cpu_to_le16(params->link_sta_params.supported_rates_len); + memcpy(tlv_rates->rates, + params->link_sta_params.supported_rates, + params->link_sta_params.supported_rates_len); config_len += sizeof(struct host_cmd_tlv_rates) + - params->supported_rates_len; + params->link_sta_params.supported_rates_len; } if (params->ext_capab && params->ext_capab_len) { @@ -1822,14 +1824,14 @@ mwifiex_cmd_tdls_oper(struct mwifiex_private *priv, config_len += sizeof(struct mwifiex_ie_types_extcap) + params->ext_capab_len; } - if (params->vht_capa) { + if (params->link_sta_params.vht_capa) { vht_capab = (struct mwifiex_ie_types_vhtcap *)(pos + config_len); vht_capab->header.type = cpu_to_le16(WLAN_EID_VHT_CAPABILITY); vht_capab->header.len = cpu_to_le16(sizeof(struct ieee80211_vht_cap)); - memcpy(&vht_capab->vht_cap, params->vht_capa, + memcpy(&vht_capab->vht_cap, params->link_sta_params.vht_capa, sizeof(struct ieee80211_vht_cap)); config_len += sizeof(struct mwifiex_ie_types_vhtcap); } diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c index 529e325498cd..5a6f0b7e23fe 100644 --- a/drivers/net/wireless/marvell/mwl8k.c +++ 
b/drivers/net/wireless/marvell/mwl8k.c @@ -1985,7 +1985,7 @@ mwl8k_txq_xmit(struct ieee80211_hw *hw, txpriority = index; - if (priv->ap_fw && sta && sta->ht_cap.ht_supported && !eapol_frame && + if (priv->ap_fw && sta && sta->deflink.ht_cap.ht_supported && !eapol_frame && ieee80211_is_data_qos(wh->frame_control)) { tid = qos & 0xf; mwl8k_tx_count_packet(sta, tid); @@ -4027,9 +4027,9 @@ mwl8k_create_ba(struct ieee80211_hw *hw, struct mwl8k_ampdu_stream *stream, cmd->create_params.reset_seq_no_flag = 1; cmd->create_params.param_info = - (stream->sta->ht_cap.ampdu_factor & + (stream->sta->deflink.ht_cap.ampdu_factor & IEEE80211_HT_AMPDU_PARM_FACTOR) | - ((stream->sta->ht_cap.ampdu_density << 2) & + ((stream->sta->deflink.ht_cap.ampdu_density << 2) & IEEE80211_HT_AMPDU_PARM_DENSITY); cmd->create_params.flags = @@ -4113,18 +4113,18 @@ static int mwl8k_cmd_set_new_stn_add(struct ieee80211_hw *hw, cmd->stn_id = cpu_to_le16(sta->aid); cmd->action = cpu_to_le16(MWL8K_STA_ACTION_ADD); if (hw->conf.chandef.chan->band == NL80211_BAND_2GHZ) - rates = sta->supp_rates[NL80211_BAND_2GHZ]; + rates = sta->deflink.supp_rates[NL80211_BAND_2GHZ]; else - rates = sta->supp_rates[NL80211_BAND_5GHZ] << 5; + rates = sta->deflink.supp_rates[NL80211_BAND_5GHZ] << 5; cmd->legacy_rates = cpu_to_le32(rates); - if (sta->ht_cap.ht_supported) { - cmd->ht_rates[0] = sta->ht_cap.mcs.rx_mask[0]; - cmd->ht_rates[1] = sta->ht_cap.mcs.rx_mask[1]; - cmd->ht_rates[2] = sta->ht_cap.mcs.rx_mask[2]; - cmd->ht_rates[3] = sta->ht_cap.mcs.rx_mask[3]; - cmd->ht_capabilities_info = cpu_to_le16(sta->ht_cap.cap); - cmd->mac_ht_param_info = (sta->ht_cap.ampdu_factor & 3) | - ((sta->ht_cap.ampdu_density & 7) << 2); + if (sta->deflink.ht_cap.ht_supported) { + cmd->ht_rates[0] = sta->deflink.ht_cap.mcs.rx_mask[0]; + cmd->ht_rates[1] = sta->deflink.ht_cap.mcs.rx_mask[1]; + cmd->ht_rates[2] = sta->deflink.ht_cap.mcs.rx_mask[2]; + cmd->ht_rates[3] = sta->deflink.ht_cap.mcs.rx_mask[3]; + cmd->ht_capabilities_info = cpu_to_le16(sta->deflink.ht_cap.cap); + cmd->mac_ht_param_info = (sta->deflink.ht_cap.ampdu_factor & 3) | + ((sta->deflink.ht_cap.ampdu_density & 7) << 2); cmd->is_qos_sta = 1; } @@ -4543,16 +4543,16 @@ static int mwl8k_cmd_update_stadb_add(struct ieee80211_hw *hw, p = &cmd->peer_info; p->peer_type = MWL8K_PEER_TYPE_ACCESSPOINT; p->basic_caps = cpu_to_le16(vif->bss_conf.assoc_capability); - p->ht_support = sta->ht_cap.ht_supported; - p->ht_caps = cpu_to_le16(sta->ht_cap.cap); - p->extended_ht_caps = (sta->ht_cap.ampdu_factor & 3) | - ((sta->ht_cap.ampdu_density & 7) << 2); + p->ht_support = sta->deflink.ht_cap.ht_supported; + p->ht_caps = cpu_to_le16(sta->deflink.ht_cap.cap); + p->extended_ht_caps = (sta->deflink.ht_cap.ampdu_factor & 3) | + ((sta->deflink.ht_cap.ampdu_density & 7) << 2); if (hw->conf.chandef.chan->band == NL80211_BAND_2GHZ) - rates = sta->supp_rates[NL80211_BAND_2GHZ]; + rates = sta->deflink.supp_rates[NL80211_BAND_2GHZ]; else - rates = sta->supp_rates[NL80211_BAND_5GHZ] << 5; + rates = sta->deflink.supp_rates[NL80211_BAND_5GHZ] << 5; legacy_rate_mask_to_array(p->legacy_rates, rates); - memcpy(p->ht_rates, &sta->ht_cap.mcs, 16); + memcpy(p->ht_rates, &sta->deflink.ht_cap.mcs, 16); p->interop = 1; p->amsdu_enabled = 0; @@ -5029,12 +5029,12 @@ mwl8k_bss_info_changed_sta(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } if (hw->conf.chandef.chan->band == NL80211_BAND_2GHZ) { - ap_legacy_rates = ap->supp_rates[NL80211_BAND_2GHZ]; + ap_legacy_rates = ap->deflink.supp_rates[NL80211_BAND_2GHZ]; } else { ap_legacy_rates = - 
ap->supp_rates[NL80211_BAND_5GHZ] << 5; + ap->deflink.supp_rates[NL80211_BAND_5GHZ] << 5; } - memcpy(ap_mcs_rates, &ap->ht_cap.mcs, 16); + memcpy(ap_mcs_rates, &ap->deflink.ht_cap.mcs, 16); rcu_read_unlock(); @@ -5345,7 +5345,7 @@ static int mwl8k_sta_add(struct ieee80211_hw *hw, ret = mwl8k_cmd_update_stadb_add(hw, vif, sta); if (ret >= 0) { MWL8K_STA(sta)->peer_id = ret; - if (sta->ht_cap.ht_supported) + if (sta->deflink.ht_cap.ht_supported) MWL8K_STA(sta)->is_ampdu_allowed = true; ret = 0; } diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c index 65f1f2bb8083..3400d8e1c496 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c @@ -325,19 +325,21 @@ void mt7603_wtbl_update_cap(struct mt7603_dev *dev, struct ieee80211_sta *sta) addr = mt7603_wtbl1_addr(idx); - ampdu_density = sta->ht_cap.ampdu_density; + ampdu_density = sta->deflink.ht_cap.ampdu_density; if (ampdu_density < IEEE80211_HT_MPDU_DENSITY_4) ampdu_density = IEEE80211_HT_MPDU_DENSITY_4; val = mt76_rr(dev, addr + 2 * 4); val &= MT_WTBL1_W2_KEY_TYPE | MT_WTBL1_W2_ADMISSION_CONTROL; - val |= FIELD_PREP(MT_WTBL1_W2_AMPDU_FACTOR, sta->ht_cap.ampdu_factor) | - FIELD_PREP(MT_WTBL1_W2_MPDU_DENSITY, sta->ht_cap.ampdu_density) | + val |= FIELD_PREP(MT_WTBL1_W2_AMPDU_FACTOR, + sta->deflink.ht_cap.ampdu_factor) | + FIELD_PREP(MT_WTBL1_W2_MPDU_DENSITY, + sta->deflink.ht_cap.ampdu_density) | MT_WTBL1_W2_TXS_BAF_REPORT; - if (sta->ht_cap.cap) + if (sta->deflink.ht_cap.cap) val |= MT_WTBL1_W2_HT; - if (sta->vht_cap.cap) + if (sta->deflink.vht_cap.cap) val |= MT_WTBL1_W2_VHT; mt76_wr(dev, addr + 2 * 4, val); @@ -346,9 +348,9 @@ void mt7603_wtbl_update_cap(struct mt7603_dev *dev, struct ieee80211_sta *sta) val = mt76_rr(dev, addr + 9 * 4); val &= ~(MT_WTBL2_W9_SHORT_GI_20 | MT_WTBL2_W9_SHORT_GI_40 | MT_WTBL2_W9_SHORT_GI_80); - if (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) val |= MT_WTBL2_W9_SHORT_GI_20; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) val |= MT_WTBL2_W9_SHORT_GI_40; mt76_wr(dev, addr + 9 * 4, val); } diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c index 017bd59c4ea8..7dff4406bde4 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c @@ -531,7 +531,7 @@ mt76_connac_mcu_sta_amsdu_tlv(struct sk_buff *skb, struct ieee80211_sta *sta, static void mt76_connac_mcu_sta_he_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) { - struct ieee80211_sta_he_cap *he_cap = &sta->he_cap; + struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; struct ieee80211_he_cap_elem *elem = &he_cap->he_cap_elem; struct sta_rec_he *he; struct tlv *tlv; @@ -619,7 +619,7 @@ mt76_connac_mcu_sta_he_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) he->he_cap = cpu_to_le32(cap); - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_160: if (elem->phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) @@ -671,9 +671,9 @@ mt76_connac_get_phy_mode_v2(struct mt76_phy *mphy, struct ieee80211_vif *vif, u8 mode = 0; if (sta) { - ht_cap = &sta->ht_cap; - vht_cap = &sta->vht_cap; - he_cap = &sta->he_cap; + ht_cap = &sta->deflink.ht_cap; + vht_cap = &sta->deflink.vht_cap; + he_cap = &sta->deflink.he_cap; } else { struct 
ieee80211_supported_band *sband; @@ -722,25 +722,25 @@ void mt76_connac_mcu_sta_tlv(struct mt76_phy *mphy, struct sk_buff *skb, u16 supp_rates; /* starec ht */ - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { struct sta_rec_ht *ht; tlv = mt76_connac_mcu_add_tlv(skb, STA_REC_HT, sizeof(*ht)); ht = (struct sta_rec_ht *)tlv; - ht->ht_cap = cpu_to_le16(sta->ht_cap.cap); + ht->ht_cap = cpu_to_le16(sta->deflink.ht_cap.cap); } /* starec vht */ - if (sta->vht_cap.vht_supported) { + if (sta->deflink.vht_cap.vht_supported) { struct sta_rec_vht *vht; int len; len = is_mt7921(dev) ? sizeof(*vht) : sizeof(*vht) - 4; tlv = mt76_connac_mcu_add_tlv(skb, STA_REC_VHT, len); vht = (struct sta_rec_vht *)tlv; - vht->vht_cap = cpu_to_le32(sta->vht_cap.cap); - vht->vht_rx_mcs_map = sta->vht_cap.vht_mcs.rx_mcs_map; - vht->vht_tx_mcs_map = sta->vht_cap.vht_mcs.tx_mcs_map; + vht->vht_cap = cpu_to_le32(sta->deflink.vht_cap.cap); + vht->vht_rx_mcs_map = sta->deflink.vht_cap.vht_mcs.rx_mcs_map; + vht->vht_tx_mcs_map = sta->deflink.vht_cap.vht_mcs.tx_mcs_map; } /* starec uapsd */ @@ -749,11 +749,11 @@ void mt76_connac_mcu_sta_tlv(struct mt76_phy *mphy, struct sk_buff *skb, if (!is_mt7921(dev)) return; - if (sta->ht_cap.ht_supported) + if (sta->deflink.ht_cap.ht_supported) mt76_connac_mcu_sta_amsdu_tlv(skb, sta, vif); /* starec he */ - if (sta->he_cap.has_he) + if (sta->deflink.he_cap.has_he) mt76_connac_mcu_sta_he_tlv(skb, sta); tlv = mt76_connac_mcu_add_tlv(skb, STA_REC_PHY, sizeof(*phy)); @@ -762,14 +762,14 @@ void mt76_connac_mcu_sta_tlv(struct mt76_phy *mphy, struct sk_buff *skb, phy->basic_rate = cpu_to_le16((u16)vif->bss_conf.basic_rates); phy->rcpi = rcpi; phy->ampdu = FIELD_PREP(IEEE80211_HT_AMPDU_PARM_FACTOR, - sta->ht_cap.ampdu_factor) | + sta->deflink.ht_cap.ampdu_factor) | FIELD_PREP(IEEE80211_HT_AMPDU_PARM_DENSITY, - sta->ht_cap.ampdu_density); + sta->deflink.ht_cap.ampdu_density); tlv = mt76_connac_mcu_add_tlv(skb, STA_REC_RA, sizeof(*ra_info)); ra_info = (struct sta_rec_ra_info *)tlv; - supp_rates = sta->supp_rates[band]; + supp_rates = sta->deflink.supp_rates[band]; if (band == NL80211_BAND_2GHZ) supp_rates = FIELD_PREP(RA_LEGACY_OFDM, supp_rates >> 4) | FIELD_PREP(RA_LEGACY_CCK, supp_rates & 0xf); @@ -778,17 +778,18 @@ void mt76_connac_mcu_sta_tlv(struct mt76_phy *mphy, struct sk_buff *skb, ra_info->legacy = cpu_to_le16(supp_rates); - if (sta->ht_cap.ht_supported) - memcpy(ra_info->rx_mcs_bitmask, sta->ht_cap.mcs.rx_mask, + if (sta->deflink.ht_cap.ht_supported) + memcpy(ra_info->rx_mcs_bitmask, + sta->deflink.ht_cap.mcs.rx_mask, HT_MCS_MASK_NUM); tlv = mt76_connac_mcu_add_tlv(skb, STA_REC_STATE, sizeof(*state)); state = (struct sta_rec_state *)tlv; state->state = sta_state; - if (sta->vht_cap.vht_supported) { - state->vht_opmode = sta->bandwidth; - state->vht_opmode |= (sta->rx_nss - 1) << + if (sta->deflink.vht_cap.vht_supported) { + state->vht_opmode = sta->deflink.bandwidth; + state->vht_opmode |= (sta->deflink.rx_nss - 1) << IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; } } @@ -817,17 +818,17 @@ void mt76_connac_mcu_wtbl_ht_tlv(struct mt76_dev *dev, struct sk_buff *skb, struct tlv *tlv; u32 flags = 0; - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { tlv = mt76_connac_mcu_add_nested_tlv(skb, WTBL_HT, sizeof(*ht), wtbl_tlv, sta_wtbl); ht = (struct wtbl_ht *)tlv; - ht->ldpc = !!(sta->ht_cap.cap & IEEE80211_HT_CAP_LDPC_CODING); - ht->af = sta->ht_cap.ampdu_factor; - ht->mm = sta->ht_cap.ampdu_density; + ht->ldpc = !!(sta->deflink.ht_cap.cap & 
IEEE80211_HT_CAP_LDPC_CODING); + ht->af = sta->deflink.ht_cap.ampdu_factor; + ht->mm = sta->deflink.ht_cap.ampdu_density; ht->ht = true; } - if (sta->vht_cap.vht_supported) { + if (sta->deflink.vht_cap.vht_supported) { struct wtbl_vht *vht; u8 af; @@ -835,18 +836,18 @@ void mt76_connac_mcu_wtbl_ht_tlv(struct mt76_dev *dev, struct sk_buff *skb, sizeof(*vht), wtbl_tlv, sta_wtbl); vht = (struct wtbl_vht *)tlv; - vht->ldpc = !!(sta->vht_cap.cap & IEEE80211_VHT_CAP_RXLDPC); + vht->ldpc = !!(sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_RXLDPC); vht->vht = true; af = FIELD_GET(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK, - sta->vht_cap.cap); + sta->deflink.vht_cap.cap); if (ht) ht->af = max(ht->af, af); } mt76_connac_mcu_wtbl_smps_tlv(skb, sta, sta_wtbl, wtbl_tlv); - if (!is_mt7921(dev) && sta->ht_cap.ht_supported) { + if (!is_mt7921(dev) && sta->deflink.ht_cap.ht_supported) { /* sgi */ u32 msk = MT_WTBL_W5_SHORT_GI_20 | MT_WTBL_W5_SHORT_GI_40 | MT_WTBL_W5_SHORT_GI_80 | MT_WTBL_W5_SHORT_GI_160; @@ -856,15 +857,15 @@ void mt76_connac_mcu_wtbl_ht_tlv(struct mt76_dev *dev, struct sk_buff *skb, sizeof(*raw), wtbl_tlv, sta_wtbl); - if (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) flags |= MT_WTBL_W5_SHORT_GI_20; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) flags |= MT_WTBL_W5_SHORT_GI_40; - if (sta->vht_cap.vht_supported) { - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80) + if (sta->deflink.vht_cap.vht_supported) { + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80) flags |= MT_WTBL_W5_SHORT_GI_80; - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_160) + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_160) flags |= MT_WTBL_W5_SHORT_GI_160; } raw = (struct wtbl_raw *)tlv; @@ -1134,9 +1135,9 @@ mt76_connac_get_phy_mode(struct mt76_phy *phy, struct ieee80211_vif *vif, return 0x38; if (sta) { - ht_cap = &sta->ht_cap; - vht_cap = &sta->vht_cap; - he_cap = &sta->he_cap; + ht_cap = &sta->deflink.ht_cap; + vht_cap = &sta->deflink.vht_cap; + he_cap = &sta->deflink.he_cap; } else { struct ieee80211_supported_band *sband; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c index 07b21b208582..f69fa9f23e16 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c @@ -412,9 +412,9 @@ void mt76x02_mac_write_txwi(struct mt76x02_dev *dev, struct mt76x02_txwi *txwi, txwi->ack_ctl |= MT_TXWI_ACK_CTL_NSEQ; if ((info->flags & IEEE80211_TX_CTL_AMPDU) && sta) { u8 ba_size = IEEE80211_MIN_AMPDU_BUF; - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; - ba_size <<= sta->ht_cap.ampdu_factor; + ba_size <<= sta->deflink.ht_cap.ampdu_factor; ba_size = min_t(int, 63, ba_size - 1); if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) ba_size = 0; diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index a8a0e6af51f8..7e797a506431 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -1082,7 +1082,7 @@ mt7915_tx_check_aggr(struct ieee80211_sta *sta, __le32 *txwi) u16 fc, tid; u32 val; - if (!sta || !sta->ht_cap.ht_supported) + if (!sta || !sta->deflink.ht_cap.ht_supported) return; tid = FIELD_GET(MT_TXD1_TID, le32_to_cpu(txwi[1])); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c 
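The mwl8k and mt76 conversions above are all one mechanical change: struct ieee80211_sta no longer exposes ht_cap, vht_cap, he_cap, supp_rates, bandwidth and rx_nss directly; they live in the per-link data at sta->deflink, the default (and, for non-MLO drivers, only) link. A small before/after sketch, assuming nothing beyond mac80211 itself; the sketch_ name is illustrative:

#include <net/mac80211.h>

static bool sketch_sta_has_sgi20(struct ieee80211_sta *sta)
{
        /* was: sta->ht_cap.ht_supported && (sta->ht_cap.cap & ...) */
        return sta->deflink.ht_cap.ht_supported &&
               (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20);
}

Keeping the field layout identical inside deflink is what makes these driver patches safe to apply as a near search-and-replace.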
b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index e9d854e3293e..cdfa7d22f6ad 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -157,9 +157,9 @@ mt7915_get_phy_mode(struct ieee80211_vif *vif, struct ieee80211_sta *sta) u8 mode = 0; if (sta) { - ht_cap = &sta->ht_cap; - vht_cap = &sta->vht_cap; - he_cap = &sta->he_cap; + ht_cap = &sta->deflink.ht_cap; + vht_cap = &sta->deflink.vht_cap; + he_cap = &sta->deflink.he_cap; } else { struct ieee80211_supported_band *sband; @@ -215,18 +215,18 @@ mt7915_mcu_set_sta_he_mcs(struct ieee80211_sta *sta, __le16 *he_mcs, { struct mt7915_sta *msta = (struct mt7915_sta *)sta->drv_priv; struct cfg80211_chan_def *chandef = &msta->vif->phy->mt76->chandef; - int nss, max_nss = sta->rx_nss > 3 ? 4 : sta->rx_nss; + int nss, max_nss = sta->deflink.rx_nss > 3 ? 4 : sta->deflink.rx_nss; u16 mcs_map; switch (chandef->width) { case NL80211_CHAN_WIDTH_80P80: - mcs_map = le16_to_cpu(sta->he_cap.he_mcs_nss_supp.rx_mcs_80p80); + mcs_map = le16_to_cpu(sta->deflink.he_cap.he_mcs_nss_supp.rx_mcs_80p80); break; case NL80211_CHAN_WIDTH_160: - mcs_map = le16_to_cpu(sta->he_cap.he_mcs_nss_supp.rx_mcs_160); + mcs_map = le16_to_cpu(sta->deflink.he_cap.he_mcs_nss_supp.rx_mcs_160); break; default: - mcs_map = le16_to_cpu(sta->he_cap.he_mcs_nss_supp.rx_mcs_80); + mcs_map = le16_to_cpu(sta->deflink.he_cap.he_mcs_nss_supp.rx_mcs_80); break; } @@ -267,7 +267,7 @@ mt7915_mcu_set_sta_he_mcs(struct ieee80211_sta *sta, __le16 *he_mcs, mcs_map |= mcs << (nss * 2); /* only support 2ss on 160MHz */ - if (nss > 1 && (sta->bandwidth == IEEE80211_STA_RX_BW_160)) + if (nss > 1 && (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160)) break; } @@ -278,8 +278,8 @@ static void mt7915_mcu_set_sta_vht_mcs(struct ieee80211_sta *sta, __le16 *vht_mcs, const u16 *mask) { - u16 mcs_map = le16_to_cpu(sta->vht_cap.vht_mcs.rx_mcs_map); - int nss, max_nss = sta->rx_nss > 3 ? 4 : sta->rx_nss; + u16 mcs_map = le16_to_cpu(sta->deflink.vht_cap.vht_mcs.rx_mcs_map); + int nss, max_nss = sta->deflink.rx_nss > 3 ? 4 : sta->deflink.rx_nss; u16 mcs; for (nss = 0; nss < max_nss; nss++, mcs_map >>= 2) { @@ -300,7 +300,7 @@ mt7915_mcu_set_sta_vht_mcs(struct ieee80211_sta *sta, __le16 *vht_mcs, vht_mcs[nss] = cpu_to_le16(mcs & mask[nss]); /* only support 2ss on 160MHz */ - if (nss > 1 && (sta->bandwidth == IEEE80211_STA_RX_BW_160)) + if (nss > 1 && (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160)) break; } } @@ -309,10 +309,10 @@ static void mt7915_mcu_set_sta_ht_mcs(struct ieee80211_sta *sta, u8 *ht_mcs, const u8 *mask) { - int nss, max_nss = sta->rx_nss > 3 ? 4 : sta->rx_nss; + int nss, max_nss = sta->deflink.rx_nss > 3 ? 
4 : sta->deflink.rx_nss; for (nss = 0; nss < max_nss; nss++) - ht_mcs[nss] = sta->ht_cap.mcs.rx_mask[nss] & mask[nss]; + ht_mcs[nss] = sta->deflink.ht_cap.mcs.rx_mask[nss] & mask[nss]; } static int @@ -1477,7 +1477,7 @@ static void mt7915_mcu_sta_he_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) { struct mt7915_sta *msta = (struct mt7915_sta *)sta->drv_priv; - struct ieee80211_sta_he_cap *he_cap = &sta->he_cap; + struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; struct ieee80211_he_cap_elem *elem = &he_cap->he_cap_elem; enum nl80211_band band = msta->vif->phy->mt76->chandef.chan->band; const u16 *mcs_mask = msta->vif->bitrate_mask.control[band].he_mcs; @@ -1567,7 +1567,7 @@ mt7915_mcu_sta_he_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) he->he_cap = cpu_to_le32(cap); - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_160: if (elem->phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) @@ -1647,7 +1647,7 @@ mt7915_mcu_sta_uapsd_tlv(struct sk_buff *skb, struct ieee80211_sta *sta, static void mt7915_mcu_sta_muru_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) { - struct ieee80211_sta_he_cap *he_cap = &sta->he_cap; + struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; struct ieee80211_he_cap_elem *elem = &he_cap->he_cap_elem; struct sta_rec_muru *muru; struct tlv *tlv; @@ -1680,7 +1680,7 @@ mt7915_mcu_sta_muru_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) muru->ofdma_ul.rx_t_frame_11ac = 0; muru->mimo_dl.vht_mu_bfee = - !!(sta->vht_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); + !!(sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); muru->mimo_dl.partial_bw_dl_mimo = HE_PHY(CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO, elem->phy_cap_info[6]); @@ -1699,9 +1699,9 @@ mt7915_mcu_sta_vht_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) tlv = mt7915_mcu_add_tlv(skb, STA_REC_VHT, sizeof(*vht)); vht = (struct sta_rec_vht *)tlv; - vht->vht_cap = cpu_to_le32(sta->vht_cap.cap); - vht->vht_rx_mcs_map = sta->vht_cap.vht_mcs.rx_mcs_map; - vht->vht_tx_mcs_map = sta->vht_cap.vht_mcs.tx_mcs_map; + vht->vht_cap = cpu_to_le32(sta->deflink.vht_cap.cap); + vht->vht_rx_mcs_map = sta->deflink.vht_cap.vht_mcs.rx_mcs_map; + vht->vht_tx_mcs_map = sta->deflink.vht_cap.vht_mcs.tx_mcs_map; } static void @@ -1742,19 +1742,19 @@ mt7915_mcu_sta_tlv(struct mt7915_dev *dev, struct sk_buff *skb, struct tlv *tlv; /* starec ht */ - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { struct sta_rec_ht *ht; tlv = mt7915_mcu_add_tlv(skb, STA_REC_HT, sizeof(*ht)); ht = (struct sta_rec_ht *)tlv; - ht->ht_cap = cpu_to_le16(sta->ht_cap.cap); + ht->ht_cap = cpu_to_le16(sta->deflink.ht_cap.cap); if (mt7915_hw_amsdu_supported(vif)) mt7915_mcu_sta_amsdu_tlv(skb, sta); } /* starec he */ - if (sta->he_cap.has_he) + if (sta->deflink.he_cap.has_he) mt7915_mcu_sta_he_tlv(skb, sta); /* starec uapsd */ @@ -1784,29 +1784,29 @@ mt7915_mcu_wtbl_ht_tlv(struct sk_buff *skb, struct ieee80211_sta *sta, struct tlv *tlv; /* wtbl ht */ - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { tlv = mt7915_mcu_add_nested_tlv(skb, WTBL_HT, sizeof(*ht), wtbl_tlv, sta_wtbl); ht = (struct wtbl_ht *)tlv; - ht->ldpc = !!(sta->ht_cap.cap & IEEE80211_HT_CAP_LDPC_CODING); - ht->af = sta->ht_cap.ampdu_factor; - ht->mm = sta->ht_cap.ampdu_density; + ht->ldpc = !!(sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_LDPC_CODING); + ht->af = sta->deflink.ht_cap.ampdu_factor; + ht->mm = sta->deflink.ht_cap.ampdu_density; ht->ht = 
true; } /* wtbl vht */ - if (sta->vht_cap.vht_supported) { + if (sta->deflink.vht_cap.vht_supported) { struct wtbl_vht *vht; u8 af; tlv = mt7915_mcu_add_nested_tlv(skb, WTBL_VHT, sizeof(*vht), wtbl_tlv, sta_wtbl); vht = (struct wtbl_vht *)tlv; - vht->ldpc = !!(sta->vht_cap.cap & IEEE80211_VHT_CAP_RXLDPC); + vht->ldpc = !!(sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_RXLDPC); vht->vht = true; af = FIELD_GET(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK, - sta->vht_cap.cap); + sta->deflink.vht_cap.cap); if (ht) ht->af = max_t(u8, ht->af, af); } @@ -1906,7 +1906,7 @@ static void mt7915_mcu_sta_bfer_ht(struct ieee80211_sta *sta, struct mt7915_phy *phy, struct sta_rec_bf *bf) { - struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; + struct ieee80211_mcs_info *mcs = &sta->deflink.ht_cap.mcs; u8 n = 0; bf->tx_mode = MT_PHY_TYPE_HT; @@ -1932,7 +1932,7 @@ static void mt7915_mcu_sta_bfer_vht(struct ieee80211_sta *sta, struct mt7915_phy *phy, struct sta_rec_bf *bf, bool explicit) { - struct ieee80211_sta_vht_cap *pc = &sta->vht_cap; + struct ieee80211_sta_vht_cap *pc = &sta->deflink.vht_cap; struct ieee80211_sta_vht_cap *vc = &phy->mt76->sband_5g.sband.vht_cap; u16 mcs_map = le16_to_cpu(pc->vht_mcs.rx_mcs_map); u8 nss_mcs = mt7915_mcu_get_sta_nss(mcs_map); @@ -1952,7 +1952,7 @@ mt7915_mcu_sta_bfer_vht(struct ieee80211_sta *sta, struct mt7915_phy *phy, bf->nc = min_t(u8, nss_mcs, bf->nr); bf->ibf_ncol = bf->nc; - if (sta->bandwidth == IEEE80211_STA_RX_BW_160) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) bf->nr = 1; } else { bf->bf_cap = MT_IBF; @@ -1960,7 +1960,7 @@ mt7915_mcu_sta_bfer_vht(struct ieee80211_sta *sta, struct mt7915_phy *phy, bf->nc = min_t(u8, nss_mcs, bf->nr); bf->ibf_ncol = nss_mcs; - if (sta->bandwidth == IEEE80211_STA_RX_BW_160) + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) bf->ibf_nrow = 1; } } @@ -1969,7 +1969,7 @@ static void mt7915_mcu_sta_bfer_he(struct ieee80211_sta *sta, struct ieee80211_vif *vif, struct mt7915_phy *phy, struct sta_rec_bf *bf) { - struct ieee80211_sta_he_cap *pc = &sta->he_cap; + struct ieee80211_sta_he_cap *pc = &sta->deflink.he_cap; struct ieee80211_he_cap_elem *pe = &pc->he_cap_elem; const struct ieee80211_sta_he_cap *vc = mt7915_get_he_phy_cap(phy, vif); const struct ieee80211_he_cap_elem *ve = &vc->he_cap_elem; @@ -1991,7 +1991,7 @@ mt7915_mcu_sta_bfer_he(struct ieee80211_sta *sta, struct ieee80211_vif *vif, bf->nc = min_t(u8, nss_mcs, bf->nr); bf->ibf_ncol = bf->nc; - if (sta->bandwidth != IEEE80211_STA_RX_BW_160) + if (sta->deflink.bandwidth != IEEE80211_STA_RX_BW_160) return; /* go over for 160MHz and 80p80 */ @@ -2051,20 +2051,20 @@ mt7915_mcu_sta_bfer_tlv(struct sk_buff *skb, struct ieee80211_sta *sta, * vht: support eBF and iBF * ht: iBF only, since mac80211 lacks of eBF support */ - if (sta->he_cap.has_he && explicit) + if (sta->deflink.he_cap.has_he && explicit) mt7915_mcu_sta_bfer_he(sta, vif, phy, bf); - else if (sta->vht_cap.vht_supported) + else if (sta->deflink.vht_cap.vht_supported) mt7915_mcu_sta_bfer_vht(sta, phy, bf, explicit); - else if (sta->ht_cap.ht_supported) + else if (sta->deflink.ht_cap.ht_supported) mt7915_mcu_sta_bfer_ht(sta, phy, bf); else return; - bf->bw = sta->bandwidth; - bf->ibf_dbw = sta->bandwidth; + bf->bw = sta->deflink.bandwidth; + bf->ibf_dbw = sta->deflink.bandwidth; bf->ibf_nrow = tx_ant; - if (!explicit && sta->bandwidth <= IEEE80211_STA_RX_BW_40 && !bf->nc) + if (!explicit && sta->deflink.bandwidth <= IEEE80211_STA_RX_BW_40 && !bf->nc) bf->ibf_timeout = 0x48; else bf->ibf_timeout = 
0x18; @@ -2074,7 +2074,7 @@ mt7915_mcu_sta_bfer_tlv(struct sk_buff *skb, struct ieee80211_sta *sta, else bf->mem_20m = matrix[bf->nr][bf->nc]; - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_160: case IEEE80211_STA_RX_BW_80: bf->mem_total = bf->mem_20m * 2; @@ -2100,13 +2100,13 @@ mt7915_mcu_sta_bfee_tlv(struct sk_buff *skb, struct ieee80211_sta *sta, tlv = mt7915_mcu_add_tlv(skb, STA_REC_BFEE, sizeof(*bfee)); bfee = (struct sta_rec_bfee *)tlv; - if (sta->he_cap.has_he) { - struct ieee80211_he_cap_elem *pe = &sta->he_cap.he_cap_elem; + if (sta->deflink.he_cap.has_he) { + struct ieee80211_he_cap_elem *pe = &sta->deflink.he_cap.he_cap_elem; nr = HE_PHY(CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK, pe->phy_cap_info[5]); - } else if (sta->vht_cap.vht_supported) { - struct ieee80211_sta_vht_cap *pc = &sta->vht_cap; + } else if (sta->deflink.vht_cap.vht_supported) { + struct ieee80211_sta_vht_cap *pc = &sta->deflink.vht_cap; nr = FIELD_GET(IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK, pc->cap); @@ -2133,12 +2133,12 @@ mt7915_mcu_add_txbf(struct mt7915_dev *dev, struct ieee80211_vif *vif, phy = mvif->band_idx ? mt7915_ext_phy(dev) : &dev->phy; - if (sta->he_cap.has_he) { + if (sta->deflink.he_cap.has_he) { struct ieee80211_he_cap_elem *pe; const struct ieee80211_he_cap_elem *ve; const struct ieee80211_sta_he_cap *vc; - pe = &sta->he_cap.he_cap_elem; + pe = &sta->deflink.he_cap.he_cap_elem; vc = mt7915_get_he_phy_cap(phy, vif); ve = &vc->he_cap_elem; @@ -2146,11 +2146,11 @@ mt7915_mcu_add_txbf(struct mt7915_dev *dev, struct ieee80211_vif *vif, HE_PHY(CAP4_SU_BEAMFORMEE, ve->phy_cap_info[4])); ebf = !!(HE_PHY(CAP3_SU_BEAMFORMER, ve->phy_cap_info[3]) && HE_PHY(CAP4_SU_BEAMFORMEE, pe->phy_cap_info[4])); - } else if (sta->vht_cap.vht_supported) { + } else if (sta->deflink.vht_cap.vht_supported) { struct ieee80211_sta_vht_cap *pc; struct ieee80211_sta_vht_cap *vc; - pc = &sta->vht_cap; + pc = &sta->deflink.vht_cap; vc = &phy->mt76->sband_5g.sband.vht_cap; ebfee = !!((pc->cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE) && @@ -2206,7 +2206,7 @@ mt7915_mcu_sta_rate_ctrl_tlv(struct sk_buff *skb, struct mt7915_dev *dev, enum nl80211_band band = chandef->chan->band; struct sta_rec_ra *ra; struct tlv *tlv; - u32 supp_rate = sta->supp_rates[band]; + u32 supp_rate = sta->deflink.supp_rates[band]; u32 cap = sta->wme ? 
STA_CAP_WMM : 0; tlv = mt7915_mcu_add_tlv(skb, STA_REC_RA, sizeof(*ra)); @@ -2216,8 +2216,8 @@ mt7915_mcu_sta_rate_ctrl_tlv(struct sk_buff *skb, struct mt7915_dev *dev, ra->auto_rate = true; ra->phy_mode = mt7915_get_phy_mode(vif, sta); ra->channel = chandef->chan->hw_value; - ra->bw = sta->bandwidth; - ra->phy.bw = sta->bandwidth; + ra->bw = sta->deflink.bandwidth; + ra->phy.bw = sta->deflink.bandwidth; if (supp_rate) { supp_rate &= mask->control[band].legacy; @@ -2237,54 +2237,54 @@ mt7915_mcu_sta_rate_ctrl_tlv(struct sk_buff *skb, struct mt7915_dev *dev, } } - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { const u8 *mcs_mask = mask->control[band].ht_mcs; ra->supp_mode |= MODE_HT; - ra->af = sta->ht_cap.ampdu_factor; - ra->ht_gf = !!(sta->ht_cap.cap & IEEE80211_HT_CAP_GRN_FLD); + ra->af = sta->deflink.ht_cap.ampdu_factor; + ra->ht_gf = !!(sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_GRN_FLD); cap |= STA_CAP_HT; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) cap |= STA_CAP_SGI_20; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) cap |= STA_CAP_SGI_40; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_TX_STBC) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_TX_STBC) cap |= STA_CAP_TX_STBC; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_RX_STBC) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_RX_STBC) cap |= STA_CAP_RX_STBC; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_LDPC_CODING) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_LDPC_CODING) cap |= STA_CAP_LDPC; mt7915_mcu_set_sta_ht_mcs(sta, ra->ht_mcs, mcs_mask); ra->supp_ht_mcs = *(__le32 *)ra->ht_mcs; } - if (sta->vht_cap.vht_supported) { + if (sta->deflink.vht_cap.vht_supported) { const u16 *mcs_mask = mask->control[band].vht_mcs; u8 af; ra->supp_mode |= MODE_VHT; af = FIELD_GET(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK, - sta->vht_cap.cap); + sta->deflink.vht_cap.cap); ra->af = max_t(u8, ra->af, af); cap |= STA_CAP_VHT; - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80) + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80) cap |= STA_CAP_VHT_SGI_80; - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_160) + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_160) cap |= STA_CAP_VHT_SGI_160; - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_TXSTBC) + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_TXSTBC) cap |= STA_CAP_VHT_TX_STBC; - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_RXSTBC_1) + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_RXSTBC_1) cap |= STA_CAP_VHT_RX_STBC; - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_RXLDPC) + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_RXLDPC) cap |= STA_CAP_VHT_LDPC; mt7915_mcu_set_sta_vht_mcs(sta, ra->supp_vht_mcs, mcs_mask); } - if (sta->he_cap.has_he) { + if (sta->deflink.he_cap.has_he) { ra->supp_mode |= MODE_HE; cap |= STA_CAP_HE; } @@ -2318,7 +2318,7 @@ int mt7915_mcu_add_he(struct mt7915_dev *dev, struct ieee80211_vif *vif, struct sk_buff *skb; int len; - if (!sta->he_cap.has_he) + if (!sta->deflink.he_cap.has_he) return 0; len = sizeof(struct sta_req_hdr) + sizeof(struct sta_rec_he); @@ -2368,7 +2368,7 @@ mt7915_mcu_add_mu(struct mt7915_dev *dev, struct ieee80211_vif *vif, struct sk_buff *skb; int ret; - if (!sta->vht_cap.vht_supported && !sta->he_cap.has_he) + if (!sta->deflink.vht_cap.vht_supported && !sta->deflink.he_cap.has_he) return 0; ret = mt7915_mcu_add_group(dev, vif, sta); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c 
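The mt7915_mcu_set_sta_vht_mcs() and ..._he_mcs() hunks above both walk an MCS map in which every spatial stream owns two bits. A sketch of that decoding loop in isolation, including the same clamp to four streams the driver applies; sketch_walk_vht_mcs_map is an illustrative name:

#include <linux/ieee80211.h>

static void sketch_walk_vht_mcs_map(u16 mcs_map, u8 rx_nss)
{
        int nss, max_nss = rx_nss > 3 ? 4 : rx_nss;

        for (nss = 0; nss < max_nss; nss++, mcs_map >>= 2) {
                u8 mcs = mcs_map & 0x3;

                if (mcs == IEEE80211_VHT_MCS_NOT_SUPPORTED)
                        continue;
                /* 0 => MCS 0-7, 1 => MCS 0-8, 2 => MCS 0-9 for this
                 * stream; the driver additionally masks the result
                 * against the user-configured bitrate mask. */
        }
}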
b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c index 6cf0c9b1b8b9..880be2529bc0 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c @@ -967,7 +967,7 @@ mt7921_tx_check_aggr(struct ieee80211_sta *sta, __le32 *txwi) u16 fc, tid; u32 val; - if (!sta || !sta->ht_cap.ht_supported) + if (!sta || !sta->deflink.ht_cap.ht_supported) return; tid = FIELD_GET(MT_TXD1_TID, le32_to_cpu(txwi[1])); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c index 9b490ff36bd6..cd0aa0a2dd9b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c @@ -538,10 +538,10 @@ mt7921_mcu_tx_done_event(struct mt7921_dev *dev, struct sk_buff *skb) /* peer config based on IEEE SPEC */ memset(&peer, 0x0, sizeof(peer)); peer.bw = event->bw; - peer.g2 = !!(sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20); - peer.g4 = !!(sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40); - peer.g8 = !!(sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80); - peer.g16 = !!(sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_160); + peer.g2 = !!(sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20); + peer.g4 = !!(sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40); + peer.g8 = !!(sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80); + peer.g16 = !!(sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_160); mt7921_mcu_tx_rate_parse(mphy->mt76, &peer, &msta->stats.tx_rate, le16_to_cpu(event->tx_rate)); diff --git a/drivers/net/wireless/mediatek/mt7601u/mac.c b/drivers/net/wireless/mediatek/mt7601u/mac.c index d2ee1aaa3c81..ca9cf628eb10 100644 --- a/drivers/net/wireless/mediatek/mt7601u/mac.c +++ b/drivers/net/wireless/mediatek/mt7601u/mac.c @@ -385,7 +385,7 @@ void mt7601u_mac_set_ampdu_factor(struct mt7601u_dev *dev) msta = container_of(wcid, struct mt76_sta, wcid); sta = container_of(msta, struct ieee80211_sta, drv_priv); - min_factor = min(min_factor, sta->ht_cap.ampdu_factor); + min_factor = min(min_factor, sta->deflink.ht_cap.ampdu_factor); } rcu_read_unlock(); diff --git a/drivers/net/wireless/mediatek/mt7601u/tx.c b/drivers/net/wireless/mediatek/mt7601u/tx.c index f3dff8319a4c..f1fa0442a57f 100644 --- a/drivers/net/wireless/mediatek/mt7601u/tx.c +++ b/drivers/net/wireless/mediatek/mt7601u/tx.c @@ -163,7 +163,7 @@ mt7601u_push_txwi(struct mt7601u_dev *dev, struct sk_buff *skb, if ((info->flags & IEEE80211_TX_CTL_AMPDU) && sta) { u8 ba_size = IEEE80211_MIN_AMPDU_BUF; - ba_size <<= sta->ht_cap.ampdu_factor; + ba_size <<= sta->deflink.ht_cap.ampdu_factor; ba_size = min_t(int, 63, ba_size); if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) ba_size = 0; @@ -172,7 +172,7 @@ mt7601u_push_txwi(struct mt7601u_dev *dev, struct sk_buff *skb, txwi->flags = cpu_to_le16(MT_TXWI_FLAGS_AMPDU | FIELD_PREP(MT_TXWI_FLAGS_MPDU_DENSITY, - sta->ht_cap.ampdu_density)); + sta->deflink.ht_cap.ampdu_density)); if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) txwi->flags = 0; } diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index 1688144d7847..18387d015697 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -539,8 +539,9 @@ static int wilc_wfi_cfg_copy_wpa_info(struct wilc_wfi_key *key_info, return 0; } -static int add_key(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr, struct key_params *params) +static int 
add_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr, + struct key_params *params) { int ret = 0, keylen = params->key_len; @@ -649,7 +650,7 @@ static int add_key(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, return ret; } -static int del_key(struct wiphy *wiphy, struct net_device *netdev, +static int del_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) @@ -686,8 +687,9 @@ static int del_key(struct wiphy *wiphy, struct net_device *netdev, return 0; } -static int get_key(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr, void *cookie, +static int get_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr, + void *cookie, void (*callback)(void *cookie, struct key_params *)) { struct wilc_vif *vif = netdev_priv(netdev); @@ -714,7 +716,8 @@ static int get_key(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, } static int set_default_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool unicast, bool multicast) + int link_id, u8 key_index, bool unicast, + bool multicast) { struct wilc_vif *vif = netdev_priv(netdev); @@ -1396,7 +1399,8 @@ static int change_beacon(struct wiphy *wiphy, struct net_device *dev, return wilc_add_beacon(vif, 0, 0, beacon); } -static int stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { int ret; struct wilc_vif *vif = netdev_priv(dev); diff --git a/drivers/net/wireless/microchip/wilc1000/hif.c b/drivers/net/wireless/microchip/wilc1000/hif.c index 3e5cc947b9b9..d211ae224f32 100644 --- a/drivers/net/wireless/microchip/wilc1000/hif.c +++ b/drivers/net/wireless/microchip/wilc1000/hif.c @@ -806,15 +806,15 @@ static void wilc_hif_pack_sta_param(u8 *cur_byte, const u8 *mac, put_unaligned_le16(params->aid, cur_byte); cur_byte += 2; - *cur_byte++ = params->supported_rates_len; - if (params->supported_rates_len > 0) - memcpy(cur_byte, params->supported_rates, - params->supported_rates_len); - cur_byte += params->supported_rates_len; + *cur_byte++ = params->link_sta_params.supported_rates_len; + if (params->link_sta_params.supported_rates_len > 0) + memcpy(cur_byte, params->link_sta_params.supported_rates, + params->link_sta_params.supported_rates_len); + cur_byte += params->link_sta_params.supported_rates_len; - if (params->ht_capa) { + if (params->link_sta_params.ht_capa) { *cur_byte++ = true; - memcpy(cur_byte, params->ht_capa, + memcpy(cur_byte, params->link_sta_params.ht_capa, sizeof(struct ieee80211_ht_cap)); } else { *cur_byte++ = false; @@ -1820,7 +1820,8 @@ int wilc_add_station(struct wilc_vif *vif, const u8 *mac, wid.id = WID_ADD_STA; wid.type = WID_BIN; - wid.size = WILC_ADD_STA_LENGTH + params->supported_rates_len; + wid.size = WILC_ADD_STA_LENGTH + + params->link_sta_params.supported_rates_len; wid.val = kmalloc(wid.size, GFP_KERNEL); if (!wid.val) return -ENOMEM; @@ -1905,7 +1906,8 @@ int wilc_edit_station(struct wilc_vif *vif, const u8 *mac, wid.id = WID_EDIT_STA; wid.type = WID_BIN; - wid.size = WILC_ADD_STA_LENGTH + params->supported_rates_len; + wid.size = WILC_ADD_STA_LENGTH + + params->link_sta_params.supported_rates_len; wid.val = kmalloc(wid.size, GFP_KERNEL); if (!wid.val) return -ENOMEM; diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c 
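The wilc1000 and mwifiex TDLS hunks above reflect the companion cfg80211 change to the deflink move: the per-link attributes of struct station_parameters (HT/VHT capabilities, supported rates) were gathered into an embedded struct link_station_parameters. A sketch of the accessor change, with an illustrative name:

#include <linux/string.h>
#include <net/cfg80211.h>

static u8 sketch_copy_sta_rates(u8 *dst,
                                const struct station_parameters *params)
{
        /* was: params->supported_rates / params->supported_rates_len */
        u8 len = params->link_sta_params.supported_rates_len;

        if (len)
                memcpy(dst, params->link_sta_params.supported_rates, len);
        return len;
}

Note that ext_capab stayed directly in station_parameters, which is why the mwifiex hunk leaves params->ext_capab untouched.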
b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 84b15a655eab..832dd00991f3 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -352,7 +352,8 @@ static int qtnf_start_ap(struct wiphy *wiphy, struct net_device *dev, return ret; } -static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -500,7 +501,7 @@ qtnf_dump_station(struct wiphy *wiphy, struct net_device *dev, switch (vif->wdev.iftype) { case NL80211_IFTYPE_STATION: - if (idx != 0 || !vif->wdev.current_bss) + if (idx != 0 || !vif->wdev.connected) return -ENOENT; ether_addr_copy(mac, vif->bssid); @@ -531,8 +532,8 @@ qtnf_dump_station(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_add_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -547,7 +548,8 @@ static int qtnf_add_key(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_del_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -568,7 +570,8 @@ static int qtnf_del_key(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_set_default_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool unicast, bool multicast) + int link_id, u8 key_index, bool unicast, + bool multicast) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -584,7 +587,7 @@ static int qtnf_set_default_key(struct wiphy *wiphy, struct net_device *dev, static int qtnf_set_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index) + int link_id, u8 key_index) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -729,7 +732,7 @@ qtnf_disconnect(struct wiphy *wiphy, struct net_device *dev, pr_err("VIF%u.%u: failed to disconnect\n", mac->macid, vif->vifid); - if (vif->wdev.current_bss) { + if (vif->wdev.connected) { netif_carrier_off(vif->netdev); cfg80211_disconnected(vif->netdev, reason_code, NULL, 0, true, GFP_KERNEL); @@ -745,10 +748,11 @@ qtnf_dump_survey(struct wiphy *wiphy, struct net_device *dev, struct qtnf_wmac *mac = wiphy_priv(wiphy); struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_supported_band *sband; - const struct cfg80211_chan_def *chandef = &wdev->chandef; + const struct cfg80211_chan_def *chandef = wdev_chandef(wdev, 0); struct ieee80211_channel *chan; int ret; + sband = wiphy->bands[NL80211_BAND_2GHZ]; if (sband && idx >= sband->n_channels) { idx -= sband->n_channels; @@ -765,7 +769,7 @@ qtnf_dump_survey(struct wiphy *wiphy, struct net_device *dev, survey->channel = chan; survey->filled = 0x0; - if (chan == chandef->chan) + if (chandef && chan == chandef->chan) survey->filled = SURVEY_INFO_IN_USE; ret = qtnf_cmd_get_chan_stats(mac, chan->center_freq, survey); @@ -778,7 +782,7 @@ qtnf_dump_survey(struct wiphy *wiphy, struct net_device *dev, static int qtnf_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, - struct cfg80211_chan_def *chandef) + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct net_device *ndev = wdev->netdev; 
struct qtnf_vif *vif; diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c index c68563c83098..0fad53693292 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/commands.c +++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c @@ -241,6 +241,7 @@ int qtnf_cmd_send_start_ap(struct qtnf_vif *vif, struct qlink_auth_encr *aen; int ret; int i; + int n; if (!qtnf_cmd_start_ap_can_fit(vif, s)) return -E2BIG; @@ -280,8 +281,9 @@ int qtnf_cmd_send_start_ap(struct qtnf_vif *vif, for (i = 0; i < QLINK_MAX_NR_CIPHER_SUITES; i++) aen->ciphers_pairwise[i] = cpu_to_le32(s->crypto.ciphers_pairwise[i]); - aen->n_akm_suites = cpu_to_le32(s->crypto.n_akm_suites); - for (i = 0; i < QLINK_MAX_NR_AKM_SUITES; i++) + n = min(QLINK_MAX_NR_AKM_SUITES, s->crypto.n_akm_suites); + aen->n_akm_suites = cpu_to_le32(n); + for (i = 0; i < n; i++) aen->akm_suites[i] = cpu_to_le32(s->crypto.akm_suites[i]); aen->control_port = s->crypto.control_port; aen->control_port_no_encrypt = s->crypto.control_port_no_encrypt; @@ -2005,7 +2007,7 @@ int qtnf_cmd_send_scan(struct qtnf_wmac *mac) dwell_active = scan_req->duration; dwell_passive = scan_req->duration; } else if (wdev->iftype == NL80211_IFTYPE_STATION && - wdev->current_bss) { + wdev->connected) { /* let device select dwell based on traffic conditions */ dwell_active = QTNF_SCAN_TIME_AUTO; dwell_passive = QTNF_SCAN_TIME_AUTO; @@ -2076,6 +2078,7 @@ int qtnf_cmd_send_connect(struct qtnf_vif *vif, struct qlink_auth_encr *aen; int ret; int i; + int n; u32 connect_flags = 0; cmd_skb = qtnf_cmd_alloc_new_cmdskb(vif->mac->macid, vif->vifid, @@ -2132,9 +2135,10 @@ int qtnf_cmd_send_connect(struct qtnf_vif *vif, aen->ciphers_pairwise[i] = cpu_to_le32(sme->crypto.ciphers_pairwise[i]); - aen->n_akm_suites = cpu_to_le32(sme->crypto.n_akm_suites); + n = min(QLINK_MAX_NR_AKM_SUITES, sme->crypto.n_akm_suites); + aen->n_akm_suites = cpu_to_le32(n); - for (i = 0; i < QLINK_MAX_NR_AKM_SUITES; i++) + for (i = 0; i < n; i++) aen->akm_suites[i] = cpu_to_le32(sme->crypto.akm_suites[i]); aen->control_port = sme->crypto.control_port; diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c index 8dc80574d08d..4fafe370101a 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/event.c +++ b/drivers/net/wireless/quantenna/qtnfmac/event.c @@ -189,7 +189,7 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif, vif->mac->macid, vif->vifid, join_info->bssid, chandef.chan->hw_value); - if (!vif->wdev.ssid_len) { + if (!vif->wdev.u.client.ssid_len) { pr_warn("VIF%u.%u: SSID unknown for BSS:%pM\n", vif->mac->macid, vif->vifid, join_info->bssid); @@ -197,7 +197,7 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif, goto done; } - ie = kzalloc(2 + vif->wdev.ssid_len, GFP_KERNEL); + ie = kzalloc(2 + vif->wdev.u.client.ssid_len, GFP_KERNEL); if (!ie) { pr_warn("VIF%u.%u: IE alloc failed for BSS:%pM\n", vif->mac->macid, vif->vifid, @@ -207,14 +207,15 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif, } ie[0] = WLAN_EID_SSID; - ie[1] = vif->wdev.ssid_len; - memcpy(ie + 2, vif->wdev.ssid, vif->wdev.ssid_len); + ie[1] = vif->wdev.u.client.ssid_len; + memcpy(ie + 2, vif->wdev.u.client.ssid, + vif->wdev.u.client.ssid_len); bss = cfg80211_inform_bss(wiphy, chandef.chan, CFG80211_BSS_FTYPE_UNKNOWN, join_info->bssid, 0, WLAN_CAPABILITY_ESS, 100, - ie, 2 + vif->wdev.ssid_len, + ie, 2 + vif->wdev.u.client.ssid_len, 0, GFP_KERNEL); if (!bss) { pr_warn("VIF%u.%u: can't connect to unknown BSS: %pM\n", @@ -470,14 +471,14 
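The two qtnfmac command hunks above are a bounds fix rather than an API chase: cfg80211 can now hand the driver more AKM suites than the fixed qlink wire format carries, so the count is clamped with min() before the copy and the clamped value is what gets advertised. The pattern in isolation; SKETCH_MAX_SUITES and sketch_pack_akm_suites are illustrative, not qlink symbols:

#include <asm/byteorder.h>
#include <linux/minmax.h>
#include <linux/types.h>

#define SKETCH_MAX_SUITES 2

static int sketch_pack_akm_suites(__le32 *dst, const u32 *suites,
                                  int n_suites)
{
        int i, n = min(SKETCH_MAX_SUITES, n_suites);

        for (i = 0; i < n; i++)
                dst[i] = cpu_to_le32(suites[i]);

        /* The caller stores n, not the original n_suites, as the count
         * sent to the firmware, so count and payload stay consistent. */
        return n;
}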
@@ qtnf_event_handle_freq_change(struct qtnf_wmac *mac, continue; if (vif->wdev.iftype == NL80211_IFTYPE_STATION && - !vif->wdev.current_bss) + !vif->wdev.connected) continue; if (!vif->netdev) continue; mutex_lock(&vif->wdev.mtx); - cfg80211_ch_switch_notify(vif->netdev, &chandef); + cfg80211_ch_switch_notify(vif->netdev, &chandef, 0); mutex_unlock(&vif->wdev.mtx); } diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c index 34788bfb34b7..cf3ef44f18f2 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c @@ -1801,8 +1801,8 @@ int rt2800_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif, * do not have a choice if some connected STA is not capable to * receive the same amount of data like the others. */ - if (sta->ht_cap.ht_supported) { - drv_data->ampdu_factor_cnt[sta->ht_cap.ampdu_factor & 3]++; + if (sta->deflink.ht_cap.ht_supported) { + drv_data->ampdu_factor_cnt[sta->deflink.ht_cap.ampdu_factor & 3]++; rt2800_set_max_psdu_len(rt2x00dev); } @@ -1847,8 +1847,8 @@ int rt2800_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct rt2x00_sta *sta_priv = sta_to_rt2x00_sta(sta); int wcid = sta_priv->wcid; - if (sta->ht_cap.ht_supported) { - drv_data->ampdu_factor_cnt[sta->ht_cap.ampdu_factor & 3]--; + if (sta->deflink.ht_cap.ht_supported) { + drv_data->ampdu_factor_cnt[sta->deflink.ht_cap.ampdu_factor & 3]--; rt2800_set_max_psdu_len(rt2x00dev); } diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c b/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c index fb1d31b2d52a..aa6b2f3d2eff 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c @@ -303,7 +303,7 @@ static void rt2x00queue_create_tx_descriptor_ht(struct rt2x00_dev *rt2x00dev, if (sta) { sta_priv = sta_to_rt2x00_sta(sta); txdesc->u.ht.wcid = sta_priv->wcid; - density = sta->ht_cap.ampdu_density; + density = sta->deflink.ht_cap.ampdu_density; } /* diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c index 7370d92a3bda..b78190856e1d 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c @@ -4484,21 +4484,21 @@ rtl8xxxu_wireless_mode(struct ieee80211_hw *hw, struct ieee80211_sta *sta) u16 network_type = WIRELESS_MODE_UNKNOWN; if (hw->conf.chandef.chan->band == NL80211_BAND_5GHZ) { - if (sta->vht_cap.vht_supported) + if (sta->deflink.vht_cap.vht_supported) network_type = WIRELESS_MODE_AC; - else if (sta->ht_cap.ht_supported) + else if (sta->deflink.ht_cap.ht_supported) network_type = WIRELESS_MODE_N_5G; network_type |= WIRELESS_MODE_A; } else { - if (sta->vht_cap.vht_supported) + if (sta->deflink.vht_cap.vht_supported) network_type = WIRELESS_MODE_AC; - else if (sta->ht_cap.ht_supported) + else if (sta->deflink.ht_cap.ht_supported) network_type = WIRELESS_MODE_N_24G; - if (sta->supp_rates[0] <= 0xf) + if (sta->deflink.supp_rates[0] <= 0xf) network_type |= WIRELESS_MODE_B; - else if (sta->supp_rates[0] & 0xf) + else if (sta->deflink.supp_rates[0] & 0xf) network_type |= (WIRELESS_MODE_B | WIRELESS_MODE_G); else network_type |= WIRELESS_MODE_G; @@ -4582,16 +4582,16 @@ rtl8xxxu_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, goto error; } - if (sta->ht_cap.ht_supported) + if (sta->deflink.ht_cap.ht_supported) dev_info(dev, "%s: HT supported\n", __func__); - if 
(sta->vht_cap.vht_supported) + if (sta->deflink.vht_cap.vht_supported) dev_info(dev, "%s: VHT supported\n", __func__); /* TODO: Set bits 28-31 for rate adaptive id */ - ramask = (sta->supp_rates[0] & 0xfff) | - sta->ht_cap.mcs.rx_mask[0] << 12 | - sta->ht_cap.mcs.rx_mask[1] << 20; - if (sta->ht_cap.cap & + ramask = (sta->deflink.supp_rates[0] & 0xfff) | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12 | + sta->deflink.ht_cap.mcs.rx_mask[1] << 20; + if (sta->deflink.ht_cap.cap & (IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_SGI_20)) sgi = 1; rcu_read_unlock(); @@ -5066,12 +5066,12 @@ static void rtl8xxxu_tx(struct ieee80211_hw *hw, /* (tx_info->flags & IEEE80211_TX_CTL_AMPDU) && */ ampdu_enable = false; if (ieee80211_is_data_qos(hdr->frame_control) && sta) { - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { u32 ampdu, val32; u8 *qc = ieee80211_get_qos_ctl(hdr); u8 tid = qc[0] & IEEE80211_QOS_CTL_TID_MASK; - ampdu = (u32)sta->ht_cap.ampdu_density; + ampdu = (u32)sta->deflink.ht_cap.ampdu_density; val32 = ampdu << TXDESC_AMPDU_DENSITY_SHIFT; tx_desc->txdw2 |= cpu_to_le32(val32); @@ -5086,7 +5086,7 @@ static void rtl8xxxu_tx(struct ieee80211_hw *hw, if (rate_flag & IEEE80211_TX_RC_SHORT_GI || (ieee80211_is_data_qos(hdr->frame_control) && - sta && sta->ht_cap.cap & + sta && sta->deflink.ht_cap.cap & (IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_SGI_20))) sgi = true; @@ -6166,8 +6166,8 @@ rtl8xxxu_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif, switch (action) { case IEEE80211_AMPDU_TX_START: dev_dbg(dev, "%s: IEEE80211_AMPDU_TX_START\n", __func__); - ampdu_factor = sta->ht_cap.ampdu_factor; - ampdu_density = sta->ht_cap.ampdu_density; + ampdu_factor = sta->deflink.ht_cap.ampdu_factor; + ampdu_density = sta->deflink.ht_cap.ampdu_density; rtl8xxxu_set_ampdu_factor(priv, ampdu_factor); rtl8xxxu_set_ampdu_min_space(priv, ampdu_density); dev_dbg(dev, @@ -6259,10 +6259,10 @@ static void rtl8xxxu_refresh_rate_mask(struct rtl8xxxu_priv *priv, u32 rate_bitmap = 0; rcu_read_lock(); - rate_bitmap = (sta->supp_rates[0] & 0xfff) | - (sta->ht_cap.mcs.rx_mask[0] << 12) | - (sta->ht_cap.mcs.rx_mask[1] << 20); - if (sta->ht_cap.cap & + rate_bitmap = (sta->deflink.supp_rates[0] & 0xfff) | + (sta->deflink.ht_cap.mcs.rx_mask[0] << 12) | + (sta->deflink.ht_cap.mcs.rx_mask[1] << 20); + if (sta->deflink.ht_cap.cap & (IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_SGI_20)) sgi = 1; rcu_read_unlock(); diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index ffd150ec181f..21695958b3ad 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -629,11 +629,12 @@ static void _rtl_query_shortgi(struct ieee80211_hw *hw, if (sta == NULL) return; - sgi_40 = sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40; - sgi_20 = sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20; - sgi_80 = sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80; + sgi_40 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40; + sgi_20 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20; + sgi_80 = sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80; - if ((!sta->ht_cap.ht_supported) && (!sta->vht_cap.vht_supported)) + if (!sta->deflink.ht_cap.ht_supported && + !sta->deflink.vht_cap.vht_supported) return; if (!sgi_40 && !sgi_20) @@ -645,8 +646,8 @@ static void _rtl_query_shortgi(struct ieee80211_hw *hw, } else if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC || mac->opmode == NL80211_IFTYPE_MESH_POINT) { - bw_40 = 
sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40; - bw_80 = sta->vht_cap.vht_supported; + bw_40 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40; + bw_80 = sta->deflink.vht_cap.vht_supported; } if (bw_80) { @@ -864,11 +865,11 @@ static void _rtl_query_bandwidth_mode(struct ieee80211_hw *hw, if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC || mac->opmode == NL80211_IFTYPE_MESH_POINT) { - if (!(sta->ht_cap.ht_supported) || - !(sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) + if (!(sta->deflink.ht_cap.ht_supported) || + !(sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) return; } else if (mac->opmode == NL80211_IFTYPE_STATION) { - if (!mac->bw_40 || !(sta->ht_cap.ht_supported)) + if (!mac->bw_40 || !(sta->deflink.ht_cap.ht_supported)) return; } if (tcb_desc->multicast || tcb_desc->broadcast) @@ -884,11 +885,11 @@ static void _rtl_query_bandwidth_mode(struct ieee80211_hw *hw, if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC || mac->opmode == NL80211_IFTYPE_MESH_POINT) { - if (!(sta->vht_cap.vht_supported)) + if (!(sta->deflink.vht_cap.vht_supported)) return; } else if (mac->opmode == NL80211_IFTYPE_STATION) { if (!mac->bw_80 || - !(sta->vht_cap.vht_supported)) + !(sta->deflink.vht_cap.vht_supported)) return; } if (tcb_desc->hw_rate <= @@ -904,7 +905,7 @@ static u8 _rtl_get_vht_highest_n_rate(struct ieee80211_hw *hw, struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_phy *rtlphy = &(rtlpriv->phy); u8 hw_rate; - u16 tx_mcs_map = le16_to_cpu(sta->vht_cap.vht_mcs.tx_mcs_map); + u16 tx_mcs_map = le16_to_cpu(sta->deflink.vht_cap.vht_mcs.tx_mcs_map); if ((get_rf_type(rtlphy) == RF_2T2R) && (tx_mcs_map & 0x000c) != 0x000c) { @@ -944,7 +945,7 @@ static u8 _rtl_get_highest_n_rate(struct ieee80211_hw *hw, u8 hw_rate; if (get_rf_type(rtlphy) == RF_2T2R && - sta->ht_cap.mcs.rx_mask[1] != 0) + sta->deflink.ht_cap.mcs.rx_mask[1] != 0) hw_rate = rtlpriv->cfg->maps[RTL_RC_HT_RATEMCS15]; else hw_rate = rtlpriv->cfg->maps[RTL_RC_HT_RATEMCS7]; @@ -1271,11 +1272,11 @@ void rtl_get_tcb_desc(struct ieee80211_hw *hw, *and N rate will all be controlled by FW *when tcb_desc->use_driver_rate = false */ - if (sta && sta->vht_cap.vht_supported) { + if (sta && sta->deflink.vht_cap.vht_supported) { tcb_desc->hw_rate = _rtl_get_vht_highest_n_rate(hw, sta); } else { - if (sta && sta->ht_cap.ht_supported) { + if (sta && sta->deflink.ht_cap.ht_supported) { tcb_desc->hw_rate = _rtl_get_highest_n_rate(hw, sta); } else { diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index 8efe2f5e5b9f..99a1d91ced5a 100644 --- a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -903,18 +903,18 @@ static int rtl_op_sta_add(struct ieee80211_hw *hw, spin_unlock_bh(&rtlpriv->locks.entry_list_lock); if (rtlhal->current_bandtype == BAND_ON_2_4G) { sta_entry->wireless_mode = WIRELESS_MODE_G; - if (sta->supp_rates[0] <= 0xf) + if (sta->deflink.supp_rates[0] <= 0xf) sta_entry->wireless_mode = WIRELESS_MODE_B; - if (sta->ht_cap.ht_supported) + if (sta->deflink.ht_cap.ht_supported) sta_entry->wireless_mode = WIRELESS_MODE_N_24G; if (vif->type == NL80211_IFTYPE_ADHOC) sta_entry->wireless_mode = WIRELESS_MODE_G; } else if (rtlhal->current_bandtype == BAND_ON_5G) { sta_entry->wireless_mode = WIRELESS_MODE_A; - if (sta->ht_cap.ht_supported) + if (sta->deflink.ht_cap.ht_supported) sta_entry->wireless_mode = WIRELESS_MODE_N_5G; - if (sta->vht_cap.vht_supported) + if 
(sta->deflink.vht_cap.vht_supported) sta_entry->wireless_mode = WIRELESS_MODE_AC_5G; if (vif->type == NL80211_IFTYPE_ADHOC) @@ -922,7 +922,7 @@ static int rtl_op_sta_add(struct ieee80211_hw *hw, } /*disable cck rate for p2p*/ if (mac->p2p) - sta->supp_rates[0] &= 0xfffffff0; + sta->deflink.supp_rates[0] &= 0xfffffff0; memcpy(sta_entry->mac_addr, sta->addr, ETH_ALEN); rtl_dbg(rtlpriv, COMP_MAC80211, DBG_DMESG, @@ -1126,7 +1126,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, rtl_dbg(rtlpriv, COMP_EASY_CONCURRENT, DBG_LOUD, "send PS STATIC frame\n"); if (rtlpriv->dm.supp_phymode_switch) { - if (sta->ht_cap.ht_supported) + if (sta->deflink.ht_cap.ht_supported) rtl_send_smps_action(hw, sta, IEEE80211_SMPS_STATIC); } @@ -1134,20 +1134,20 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, if (rtlhal->current_bandtype == BAND_ON_5G) { mac->mode = WIRELESS_MODE_A; } else { - if (sta->supp_rates[0] <= 0xf) + if (sta->deflink.supp_rates[0] <= 0xf) mac->mode = WIRELESS_MODE_B; else mac->mode = WIRELESS_MODE_G; } - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { if (rtlhal->current_bandtype == BAND_ON_2_4G) mac->mode = WIRELESS_MODE_N_24G; else mac->mode = WIRELESS_MODE_N_5G; } - if (sta->vht_cap.vht_supported) { + if (sta->deflink.vht_cap.vht_supported) { if (rtlhal->current_bandtype == BAND_ON_5G) mac->mode = WIRELESS_MODE_AC_5G; else @@ -1256,14 +1256,14 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, rcu_read_lock(); sta = ieee80211_find_sta(vif, (u8 *)bss_conf->bssid); if (sta) { - if (sta->ht_cap.ampdu_density > + if (sta->deflink.ht_cap.ampdu_density > mac->current_ampdu_density) mac->current_ampdu_density = - sta->ht_cap.ampdu_density; - if (sta->ht_cap.ampdu_factor < + sta->deflink.ht_cap.ampdu_density; + if (sta->deflink.ht_cap.ampdu_factor < mac->current_ampdu_factor) mac->current_ampdu_factor = - sta->ht_cap.ampdu_factor; + sta->deflink.ht_cap.ampdu_factor; } rcu_read_unlock(); @@ -1298,20 +1298,20 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, if (rtlhal->current_bandtype == BAND_ON_5G) { mac->mode = WIRELESS_MODE_A; } else { - if (sta->supp_rates[0] <= 0xf) + if (sta->deflink.supp_rates[0] <= 0xf) mac->mode = WIRELESS_MODE_B; else mac->mode = WIRELESS_MODE_G; } - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { if (rtlhal->current_bandtype == BAND_ON_2_4G) mac->mode = WIRELESS_MODE_N_24G; else mac->mode = WIRELESS_MODE_N_5G; } - if (sta->vht_cap.vht_supported) { + if (sta->deflink.vht_cap.vht_supported) { if (rtlhal->current_bandtype == BAND_ON_5G) mac->mode = WIRELESS_MODE_AC_5G; else @@ -1327,7 +1327,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, sta_entry->wireless_mode = mac->mode; } - if (sta->ht_cap.ht_supported) { + if (sta->deflink.ht_cap.ht_supported) { mac->ht_enable = true; /* @@ -1338,16 +1338,16 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, * */ } - if (sta->vht_cap.vht_supported) + if (sta->deflink.vht_cap.vht_supported) mac->vht_enable = true; if (changed & BSS_CHANGED_BASIC_RATES) { /* for 5G must << RATE_6M_INDEX = 4, * because 5G have no cck rate*/ if (rtlhal->current_bandtype == BAND_ON_5G) - basic_rates = sta->supp_rates[1] << 4; + basic_rates = sta->deflink.supp_rates[1] << 4; else - basic_rates = sta->supp_rates[0]; + basic_rates = sta->deflink.supp_rates[0]; mac->basic_rates = basic_rates; rtlpriv->cfg->ops->set_hw_reg(hw, HW_VAR_BASIC_RATE, diff --git a/drivers/net/wireless/realtek/rtlwifi/rc.c 
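A recurring expression in the Realtek hunks above and below (rtl8xxxu and the per-chip rtlwifi hw.c files) builds one rate bitmap out of the legacy and HT rate sets, now read through deflink: twelve legacy-rate bits at the bottom, then HT MCS 0-7 and MCS 8-15 stacked above them. In isolation, with an illustrative name:

#include <net/mac80211.h>

static u32 sketch_build_ra_bitmap(struct ieee80211_sta *sta)
{
        /* bits 0-11: CCK/OFDM rates, bits 12-19: MCS 0-7,
         * bits 20-27: MCS 8-15 */
        return (sta->deflink.supp_rates[0] & 0xfff) |
               (sta->deflink.ht_cap.mcs.rx_mask[0] << 12) |
               (sta->deflink.ht_cap.mcs.rx_mask[1] << 20);
}

On 5 GHz the drivers instead start from supp_rates[1] shifted left by four, since that band has no CCK rates, as the rtl88ee hunk notes.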
b/drivers/net/wireless/realtek/rtlwifi/rc.c index 4b5ea0ec9109..a164364109ba 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rc.c +++ b/drivers/net/wireless/realtek/rtlwifi/rc.c @@ -66,7 +66,7 @@ static u8 _rtl_rc_get_highest_rix(struct rtl_priv *rtlpriv, else return N_MODE_MCS15_RIX; } else if (wireless_mode == WIRELESS_MODE_AC_24G) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_20) { + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) { ieee80211_rate_set_vht(&rate, AC_MODE_MCS8_RIX, nss); @@ -88,7 +88,7 @@ static u8 _rtl_rc_get_highest_rix(struct rtl_priv *rtlpriv, else return N_MODE_MCS15_RIX; } else if (wireless_mode == WIRELESS_MODE_AC_5G) { - if (sta->bandwidth == IEEE80211_STA_RX_BW_20) { + if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) { ieee80211_rate_set_vht(&rate, AC_MODE_MCS8_RIX, nss); @@ -121,9 +121,9 @@ static void _rtl_rc_rate_set_series(struct rtl_priv *rtlpriv, u8 sgi_20 = 0, sgi_40 = 0, sgi_80 = 0; if (sta) { - sgi_20 = sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20; - sgi_40 = sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40; - sgi_80 = sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80; + sgi_20 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20; + sgi_40 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40; + sgi_80 = sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80; sta_entry = (struct rtl_sta_info *)sta->drv_priv; wireless_mode = sta_entry->wireless_mode; } @@ -135,10 +135,10 @@ static void _rtl_rc_rate_set_series(struct rtl_priv *rtlpriv, rate->flags |= IEEE80211_TX_RC_USE_SHORT_PREAMBLE; if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC) { - if (sta && (sta->ht_cap.cap & + if (sta && (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) rate->flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; - if (sta && sta->vht_cap.vht_supported) + if (sta && sta->deflink.vht_cap.vht_supported) rate->flags |= IEEE80211_TX_RC_80_MHZ_WIDTH; } else { if (mac->bw_80) @@ -149,11 +149,11 @@ static void _rtl_rc_rate_set_series(struct rtl_priv *rtlpriv, if (sgi_20 || sgi_40 || sgi_80) rate->flags |= IEEE80211_TX_RC_SHORT_GI; - if (sta && sta->ht_cap.ht_supported && + if (sta && sta->deflink.ht_cap.ht_supported && (wireless_mode == WIRELESS_MODE_N_5G || wireless_mode == WIRELESS_MODE_N_24G)) rate->flags |= IEEE80211_TX_RC_MCS; - if (sta && sta->vht_cap.vht_supported && + if (sta && sta->deflink.vht_cap.vht_supported && (wireless_mode == WIRELESS_MODE_AC_5G || wireless_mode == WIRELESS_MODE_AC_24G || wireless_mode == WIRELESS_MODE_AC_ONLY)) @@ -229,7 +229,7 @@ static void rtl_tx_status(void *ppriv, if (sta) { /* Check if aggregation has to be enabled for this tid */ sta_entry = (struct rtl_sta_info *)sta->drv_priv; - if (sta->ht_cap.ht_supported && + if (sta->deflink.ht_cap.ht_supported && !(skb->protocol == cpu_to_be16(ETH_P_PAE))) { if (ieee80211_is_data_qos(fc)) { u8 tid = rtl_get_tid(skb); diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c index bf686a916acb..58c2ab3d44be 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c @@ -1975,21 +1975,21 @@ static void rtl88ee_update_hal_rate_table(struct ieee80211_hw *hw, u16 shortgi_rate; u32 tmp_ratr_value; u8 curtxbw_40mhz = mac->bw_40; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 
+ u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = mac->mode; u32 ratr_mask; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_value = sta->supp_rates[1] << 4; + ratr_value = sta->deflink.supp_rates[1] << 4; else - ratr_value = sta->supp_rates[0]; + ratr_value = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_value = 0xfff; - ratr_value |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_value |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: if (ratr_value & 0x0000000c) @@ -2061,11 +2061,11 @@ static void rtl88ee_update_hal_rate_mask(struct ieee80211_hw *hw, struct rtl_sta_info *sta_entry = NULL; u32 ratr_bitmap; u8 ratr_index; - u8 curtxbw_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) + u8 curtxbw_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ? 1 : 0; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = 0; bool b_shortgi = false; @@ -2083,13 +2083,13 @@ static void rtl88ee_update_hal_rate_mask(struct ieee80211_hw *hw, macid = sta->aid + 1; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_bitmap = sta->supp_rates[1] << 4; + ratr_bitmap = sta->deflink.supp_rates[1] << 4; else - ratr_bitmap = sta->supp_rates[0]; + ratr_bitmap = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_bitmap = 0xfff; - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: ratr_index = RATR_INX_WIRELESS_B; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c index c948dafa0c80..257816563fa0 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c @@ -504,7 +504,7 @@ void rtl88ee_tx_fill_desc(struct ieee80211_hw *hw, } else if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC) { if (sta) - bw_40 = sta->ht_cap.cap & + bw_40 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40; } seq_number = (le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_SEQ) >> 4; @@ -591,7 +591,7 @@ void rtl88ee_tx_fill_desc(struct ieee80211_hw *hw, set_tx_desc_linip(pdesc, 0); set_tx_desc_pkt_size(pdesc, (u16)skb_len); if (sta) { - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; set_tx_desc_ampdu_density(pdesc, ampdu_density); } if (info->control.hw_key) { diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/hw.c index bb5a0c4aec93..b9c62640d2cb 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/hw.c @@ -1765,22 +1765,22 @@ static void rtl92ce_update_hal_rate_table(struct ieee80211_hw *hw, u16 shortgi_rate; u32 tmp_ratr_value; u8 curtxbw_40mhz = mac->bw_40; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 
+ u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = mac->mode; u32 ratr_mask; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_value = sta->supp_rates[1] << 4; + ratr_value = sta->deflink.supp_rates[1] << 4; else - ratr_value = sta->supp_rates[0]; + ratr_value = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_value = 0xfff; - ratr_value |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_value |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: if (ratr_value & 0x0000000c) @@ -1853,11 +1853,11 @@ static void rtl92ce_update_hal_rate_mask(struct ieee80211_hw *hw, struct rtl_sta_info *sta_entry = NULL; u32 ratr_bitmap; u8 ratr_index; - u8 curtxbw_40mhz = (sta->ht_cap.cap & + u8 curtxbw_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ? 1 : 0; - u8 curshortgi_40mhz = (sta->ht_cap.cap & + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = 0; bool shortgi = false; @@ -1874,13 +1874,13 @@ static void rtl92ce_update_hal_rate_mask(struct ieee80211_hw *hw, macid = sta->aid + 1; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_bitmap = sta->supp_rates[1] << 4; + ratr_bitmap = sta->deflink.supp_rates[1] << 4; else - ratr_bitmap = sta->supp_rates[0]; + ratr_bitmap = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_bitmap = 0xfff; - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: ratr_index = RATR_INX_WIRELESS_B; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/trx.c index 4165175cf5c0..fdbd3758cde1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/trx.c @@ -379,7 +379,7 @@ void rtl92ce_tx_fill_desc(struct ieee80211_hw *hw, mac->opmode == NL80211_IFTYPE_ADHOC || mac->opmode == NL80211_IFTYPE_MESH_POINT) { if (sta) - bw_40 = sta->bandwidth >= IEEE80211_STA_RX_BW_40; + bw_40 = sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40; } seq_number = (le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_SEQ) >> 4; @@ -441,7 +441,7 @@ void rtl92ce_tx_fill_desc(struct ieee80211_hw *hw, set_tx_desc_pkt_size(pdesc, (u16)skb->len); if (sta) { - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; set_tx_desc_ampdu_density(pdesc, ampdu_density); } diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c index eaba66113328..9bd02c3c3adc 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c @@ -1918,21 +1918,21 @@ static void rtl92cu_update_hal_rate_table(struct ieee80211_hw *hw, u16 shortgi_rate; u32 tmp_ratr_value; u8 curtxbw_40mhz = mac->bw_40; - u8 curshortgi_40mhz = (sta->ht_cap.cap & 
IEEE80211_HT_CAP_SGI_40) ? + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = mac->mode; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_value = sta->supp_rates[1] << 4; + ratr_value = sta->deflink.supp_rates[1] << 4; else - ratr_value = sta->supp_rates[0]; + ratr_value = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_value = 0xfff; - ratr_value |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_value |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: if (ratr_value & 0x0000000c) @@ -2003,11 +2003,11 @@ static void rtl92cu_update_hal_rate_mask(struct ieee80211_hw *hw, struct rtl_sta_info *sta_entry = NULL; u32 ratr_bitmap; u8 ratr_index; - u8 curtxbw_40mhz = (sta->bandwidth >= IEEE80211_STA_RX_BW_40) ? 1 : 0; + u8 curtxbw_40mhz = (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) ? 1 : 0; u8 curshortgi_40mhz = curtxbw_40mhz && - (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = 0; bool shortgi = false; @@ -2025,13 +2025,13 @@ static void rtl92cu_update_hal_rate_mask(struct ieee80211_hw *hw, macid = sta->aid + 1; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_bitmap = sta->supp_rates[1] << 4; + ratr_bitmap = sta->deflink.supp_rates[1] << 4; else - ratr_bitmap = sta->supp_rates[0]; + ratr_bitmap = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_bitmap = 0xfff; - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: ratr_index = RATR_INX_WIRELESS_B; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/trx.c index 87f959d5d861..ae3c4f97637e 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/trx.c @@ -540,7 +540,7 @@ void rtl92cu_tx_fill_desc(struct ieee80211_hw *hw, rcu_read_lock(); sta = ieee80211_find_sta(mac->vif, mac->bssid); if (sta) { - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; set_tx_desc_ampdu_density(txdesc, ampdu_density); } diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192de/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192de/hw.c index f849291cc587..2aecb2583f75 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192de/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192de/hw.c @@ -1802,18 +1802,18 @@ static void rtl92de_update_hal_rate_table(struct ieee80211_hw *hw, u16 shortgi_rate; u32 tmp_ratr_value; u8 curtxbw_40mhz = mac->bw_40; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 
1 : 0; enum wireless_mode wirelessmode = mac->mode; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_value = sta->supp_rates[1] << 4; + ratr_value = sta->deflink.supp_rates[1] << 4; else - ratr_value = sta->supp_rates[0]; - ratr_value |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_value = sta->deflink.supp_rates[0]; + ratr_value |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_A: ratr_value &= 0x00000FF0; @@ -1880,10 +1880,10 @@ static void rtl92de_update_hal_rate_mask(struct ieee80211_hw *hw, struct rtl_sta_info *sta_entry = NULL; u32 ratr_bitmap; u8 ratr_index; - u8 curtxbw_40mhz = (sta->bandwidth >= IEEE80211_STA_RX_BW_40) ? 1 : 0; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curtxbw_40mhz = (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) ? 1 : 0; + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = 0; bool shortgi = false; @@ -1901,11 +1901,11 @@ static void rtl92de_update_hal_rate_mask(struct ieee80211_hw *hw, macid = sta->aid + 1; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_bitmap = sta->supp_rates[1] << 4; + ratr_bitmap = sta->deflink.supp_rates[1] << 4; else - ratr_bitmap = sta->supp_rates[0]; - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap = sta->deflink.supp_rates[0]; + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: ratr_index = RATR_INX_WIRELESS_B; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192de/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192de/trx.c index c02813fba934..807b66c16e11 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192de/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192de/trx.c @@ -498,7 +498,7 @@ void rtl92de_tx_fill_desc(struct ieee80211_hw *hw, } else if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC) { if (sta) - bw_40 = sta->bandwidth >= IEEE80211_STA_RX_BW_40; + bw_40 = sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40; } seq_number = (le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_SEQ) >> 4; rtl_get_tcb_desc(hw, info, sta, skb, ptcb_desc); @@ -586,7 +586,7 @@ void rtl92de_tx_fill_desc(struct ieee80211_hw *hw, set_tx_desc_linip(pdesc, 0); set_tx_desc_pkt_size(pdesc, (u16)skb_len); if (sta) { - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; set_tx_desc_ampdu_density(pdesc, ampdu_density); } if (info->control.hw_key) { diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c index 88fa2e593fef..eb0b42339d2e 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c @@ -2256,11 +2256,11 @@ static void rtl92ee_update_hal_rate_mask(struct ieee80211_hw *hw, struct rtl_sta_info *sta_entry = NULL; u32 ratr_bitmap; u8 ratr_index; - u8 curtxbw_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) + u8 curtxbw_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ? 1 : 0; - u8 b_curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 
+ u8 b_curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 b_curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 b_curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = 0; bool b_shortgi = false; @@ -2276,12 +2276,12 @@ static void rtl92ee_update_hal_rate_mask(struct ieee80211_hw *hw, mac->opmode == NL80211_IFTYPE_ADHOC) macid = sta->aid + 1; - ratr_bitmap = sta->supp_rates[0]; + ratr_bitmap = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_bitmap = 0xfff; - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c index eef7a041e80d..272750612abf 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c @@ -665,7 +665,7 @@ void rtl92ee_tx_fill_desc(struct ieee80211_hw *hw, } else if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC) { if (sta) - bw_40 = sta->ht_cap.cap & + bw_40 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40; } seq_number = (le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_SEQ) >> 4; @@ -759,7 +759,7 @@ void rtl92ee_tx_fill_desc(struct ieee80211_hw *hw, set_tx_desc_linip(pdesc, 0); if (sta) { - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; set_tx_desc_ampdu_density(pdesc, ampdu_density); } diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c index 91199262aaca..4ca299c9de77 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c @@ -2017,20 +2017,20 @@ static void rtl92se_update_hal_rate_table(struct ieee80211_hw *hw, u16 shortgi_rate = 0; u32 tmp_ratr_value = 0; u8 curtxbw_40mhz = mac->bw_40; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = mac->mode; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_value = sta->supp_rates[1] << 4; + ratr_value = sta->deflink.supp_rates[1] << 4; else - ratr_value = sta->supp_rates[0]; + ratr_value = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_value = 0xfff; - ratr_value |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_value |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: ratr_value &= 0x0000000D; @@ -2115,10 +2115,10 @@ static void rtl92se_update_hal_rate_mask(struct ieee80211_hw *hw, struct rtl_sta_info *sta_entry = NULL; u32 ratr_bitmap; u8 ratr_index = 0; - u8 curtxbw_40mhz = (sta->bandwidth >= IEEE80211_STA_RX_BW_40) ? 1 : 0; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curtxbw_40mhz = (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) ? 1 : 0; + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 
1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = 0; bool shortgi = false; @@ -2139,13 +2139,13 @@ static void rtl92se_update_hal_rate_mask(struct ieee80211_hw *hw, macid = sta->aid + 1; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_bitmap = sta->supp_rates[1] << 4; + ratr_bitmap = sta->deflink.supp_rates[1] << 4; else - ratr_bitmap = sta->supp_rates[0]; + ratr_bitmap = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_bitmap = 0xfff; - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: band |= WIRELESS_11B; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192se/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192se/trx.c index e474b4ec17f3..a5853a170b58 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192se/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192se/trx.c @@ -342,7 +342,7 @@ void rtl92se_tx_fill_desc(struct ieee80211_hw *hw, } else if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC) { if (sta) - bw_40 = sta->bandwidth >= IEEE80211_STA_RX_BW_40; + bw_40 = sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40; } seq_number = (le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_SEQ) >> 4; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c index c98f2216734f..965d98b9b09f 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c @@ -1841,21 +1841,21 @@ static void rtl8723e_update_hal_rate_table(struct ieee80211_hw *hw, u16 shortgi_rate; u32 tmp_ratr_value; u8 curtxbw_40mhz = mac->bw_40; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = mac->mode; u32 ratr_mask; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_value = sta->supp_rates[1] << 4; + ratr_value = sta->deflink.supp_rates[1] << 4; else - ratr_value = sta->supp_rates[0]; + ratr_value = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_value = 0xfff; - ratr_value |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_value |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: if (ratr_value & 0x0000000c) @@ -1928,11 +1928,11 @@ static void rtl8723e_update_hal_rate_mask(struct ieee80211_hw *hw, struct rtl_sta_info *sta_entry = NULL; u32 ratr_bitmap; u8 ratr_index; - u8 curtxbw_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) + u8 curtxbw_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ? 1 : 0; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 
1 : 0; enum wireless_mode wirelessmode = 0; bool shortgi = false; @@ -1949,13 +1949,13 @@ static void rtl8723e_update_hal_rate_mask(struct ieee80211_hw *hw, macid = sta->aid + 1; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_bitmap = sta->supp_rates[1] << 4; + ratr_bitmap = sta->deflink.supp_rates[1] << 4; else - ratr_bitmap = sta->supp_rates[0]; + ratr_bitmap = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_bitmap = 0xfff; - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: ratr_index = RATR_INX_WIRELESS_B; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c index 340b3d68a54e..3797a1e2af82 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c @@ -376,7 +376,7 @@ void rtl8723e_tx_fill_desc(struct ieee80211_hw *hw, } else if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC) { if (sta) - bw_40 = sta->ht_cap.cap & + bw_40 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40; } @@ -442,7 +442,7 @@ void rtl8723e_tx_fill_desc(struct ieee80211_hw *hw, set_tx_desc_pkt_size(pdesc, (u16)skb->len); if (sta) { - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; set_tx_desc_ampdu_density(pdesc, ampdu_density); } diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c index 0748aedce2ad..189cc6437600 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c @@ -2315,11 +2315,11 @@ static void rtl8723be_update_hal_rate_mask(struct ieee80211_hw *hw, struct rtl_sta_info *sta_entry = NULL; u32 ratr_bitmap; u8 ratr_index; - u8 curtxbw_40mhz = (sta->ht_cap.cap & + u8 curtxbw_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ? 1 : 0; - u8 curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 
1 : 0; enum wireless_mode wirelessmode = 0; bool shortgi = false; @@ -2335,13 +2335,13 @@ static void rtl8723be_update_hal_rate_mask(struct ieee80211_hw *hw, mac->opmode == NL80211_IFTYPE_ADHOC) macid = sta->aid + 1; - ratr_bitmap = sta->supp_rates[0]; + ratr_bitmap = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_bitmap = 0xfff; - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: ratr_index = RATR_INX_WIRELESS_B; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c index 5a7cd270575a..c08f57e487e5 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c @@ -429,7 +429,7 @@ void rtl8723be_tx_fill_desc(struct ieee80211_hw *hw, } else if (mac->opmode == NL80211_IFTYPE_AP || mac->opmode == NL80211_IFTYPE_ADHOC) { if (sta) - bw_40 = sta->ht_cap.cap & + bw_40 = sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40; } seq_number = (le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_SEQ) >> 4; @@ -516,7 +516,7 @@ void rtl8723be_tx_fill_desc(struct ieee80211_hw *hw, set_tx_desc_linip(pdesc, 0); set_tx_desc_pkt_size(pdesc, (u16)skb_len); if (sta) { - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; set_tx_desc_ampdu_density(pdesc, ampdu_density); } if (info->control.hw_key) { diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c index 33ffc24d3675..7e0f62d59fe1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c @@ -3300,20 +3300,20 @@ static void rtl8821ae_update_hal_rate_table(struct ieee80211_hw *hw, u16 shortgi_rate; u32 tmp_ratr_value; u8 curtxbw_40mhz = mac->bw_40; - u8 b_curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 b_curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 b_curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 b_curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; enum wireless_mode wirelessmode = mac->mode; if (rtlhal->current_bandtype == BAND_ON_5G) - ratr_value = sta->supp_rates[1] << 4; + ratr_value = sta->deflink.supp_rates[1] << 4; else - ratr_value = sta->supp_rates[0]; + ratr_value = sta->deflink.supp_rates[0]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_value = 0xfff; - ratr_value |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_value |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); switch (wirelessmode) { case WIRELESS_MODE_B: if (ratr_value & 0x0000000c) @@ -3484,12 +3484,12 @@ static bool _rtl8821ae_get_ra_shortgi(struct ieee80211_hw *hw, struct ieee80211_ u8 mac_id) { bool b_short_gi = false; - u8 b_curshortgi_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? + u8 b_curshortgi_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40) ? 1 : 0; - u8 b_curshortgi_20mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? + u8 b_curshortgi_20mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) ? 1 : 0; u8 b_curshortgi_80mhz = 0; - b_curshortgi_80mhz = (sta->vht_cap.cap & + b_curshortgi_80mhz = (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80) ? 
1 : 0; if (mac_id == MAC_ID_STATIC_FOR_BROADCAST_MULTICAST) @@ -3512,7 +3512,7 @@ static void rtl8821ae_update_hal_rate_mask(struct ieee80211_hw *hw, u32 ratr_bitmap; u8 ratr_index; enum wireless_mode wirelessmode = 0; - u8 curtxbw_40mhz = (sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) + u8 curtxbw_40mhz = (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ? 1 : 0; bool b_shortgi = false; u8 rate_mask[7]; @@ -3534,22 +3534,22 @@ static void rtl8821ae_update_hal_rate_mask(struct ieee80211_hw *hw, if (wirelessmode == WIRELESS_MODE_N_5G || wirelessmode == WIRELESS_MODE_AC_5G || wirelessmode == WIRELESS_MODE_A) - ratr_bitmap = sta->supp_rates[NL80211_BAND_5GHZ] << 4; + ratr_bitmap = sta->deflink.supp_rates[NL80211_BAND_5GHZ] << 4; else - ratr_bitmap = sta->supp_rates[NL80211_BAND_2GHZ]; + ratr_bitmap = sta->deflink.supp_rates[NL80211_BAND_2GHZ]; if (mac->opmode == NL80211_IFTYPE_ADHOC) ratr_bitmap = 0xfff; if (wirelessmode == WIRELESS_MODE_N_24G || wirelessmode == WIRELESS_MODE_N_5G) - ratr_bitmap |= (sta->ht_cap.mcs.rx_mask[1] << 20 | - sta->ht_cap.mcs.rx_mask[0] << 12); + ratr_bitmap |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20 | + sta->deflink.ht_cap.mcs.rx_mask[0] << 12); else if (wirelessmode == WIRELESS_MODE_AC_24G || wirelessmode == WIRELESS_MODE_AC_5G || wirelessmode == WIRELESS_MODE_AC_ONLY) ratr_bitmap |= _rtl8821ae_rate_to_bitmap_2ssvht( - sta->vht_cap.vht_mcs.rx_mcs_map) << 12; + sta->deflink.vht_cap.vht_mcs.rx_mcs_map) << 12; b_shortgi = _rtl8821ae_get_ra_shortgi(hw, sta, macid); rf_type = _rtl8821ae_get_ra_rftype(hw, wirelessmode, ratr_bitmap); diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c index 9d6f8dcbf2d6..8c48108960b9 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c @@ -761,7 +761,7 @@ void rtl8821ae_tx_fill_desc(struct ieee80211_hw *hw, set_tx_desc_linip(pdesc, 0); set_tx_desc_pkt_size(pdesc, (u16)skb_len); if (sta) { - u8 ampdu_density = sta->ht_cap.ampdu_density; + u8 ampdu_density = sta->deflink.ht_cap.ampdu_density; set_tx_desc_ampdu_density(pdesc, ampdu_density); } diff --git a/drivers/net/wireless/realtek/rtw88/bf.c b/drivers/net/wireless/realtek/rtw88/bf.c index aff70e4ae028..3938a5c30a25 100644 --- a/drivers/net/wireless/realtek/rtw88/bf.c +++ b/drivers/net/wireless/realtek/rtw88/bf.c @@ -55,7 +55,7 @@ void rtw_bf_assoc(struct rtw_dev *rtwdev, struct ieee80211_vif *vif, } ic_vht_cap = &hw->wiphy->bands[NL80211_BAND_5GHZ]->vht_cap; - vht_cap = &sta->vht_cap; + vht_cap = &sta->deflink.vht_cap; if ((ic_vht_cap->cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) && (vht_cap->cap & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE)) { diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index d7b7b2cce974..69f87747b9eb 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -848,7 +848,7 @@ static void rtw_hw_config_rf_ant_num(struct rtw_dev *rtwdev, u8 hw_ant_num) static u64 get_vht_ra_mask(struct ieee80211_sta *sta) { u64 ra_mask = 0; - u16 mcs_map = le16_to_cpu(sta->vht_cap.vht_mcs.rx_mcs_map); + u16 mcs_map = le16_to_cpu(sta->deflink.vht_cap.vht_mcs.rx_mcs_map); u8 vht_mcs_cap; int i, nss; @@ -1051,19 +1051,19 @@ void rtw_update_sta_info(struct rtw_dev *rtwdev, struct rtw_sta_info *si) bool is_vht_enable = false; bool is_support_sgi = false; - if (sta->vht_cap.vht_supported) { + if (sta->deflink.vht_cap.vht_supported) { 
is_vht_enable = true; ra_mask |= get_vht_ra_mask(sta); - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_RXSTBC_MASK) + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_RXSTBC_MASK) stbc_en = VHT_STBC_EN; - if (sta->vht_cap.cap & IEEE80211_VHT_CAP_RXLDPC) + if (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_RXLDPC) ldpc_en = VHT_LDPC_EN; - } else if (sta->ht_cap.ht_supported) { - ra_mask |= (sta->ht_cap.mcs.rx_mask[1] << 20) | - (sta->ht_cap.mcs.rx_mask[0] << 12); - if (sta->ht_cap.cap & IEEE80211_HT_CAP_RX_STBC) + } else if (sta->deflink.ht_cap.ht_supported) { + ra_mask |= (sta->deflink.ht_cap.mcs.rx_mask[1] << 20) | + (sta->deflink.ht_cap.mcs.rx_mask[0] << 12); + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_RX_STBC) stbc_en = HT_STBC_EN; - if (sta->ht_cap.cap & IEEE80211_HT_CAP_LDPC_CODING) + if (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_LDPC_CODING) ldpc_en = HT_LDPC_EN; } @@ -1071,11 +1071,11 @@ void rtw_update_sta_info(struct rtw_dev *rtwdev, struct rtw_sta_info *si) ra_mask &= RA_MASK_VHT_RATES_1SS | RA_MASK_HT_RATES_1SS; if (hal->current_band_type == RTW_BAND_5G) { - ra_mask |= (u64)sta->supp_rates[NL80211_BAND_5GHZ] << 4; - if (sta->vht_cap.vht_supported) { + ra_mask |= (u64)sta->deflink.supp_rates[NL80211_BAND_5GHZ] << 4; + if (sta->deflink.vht_cap.vht_supported) { ra_mask &= RA_MASK_VHT_RATES | RA_MASK_OFDM_IN_VHT; wireless_set = WIRELESS_OFDM | WIRELESS_VHT; - } else if (sta->ht_cap.ht_supported) { + } else if (sta->deflink.ht_cap.ht_supported) { ra_mask &= RA_MASK_HT_RATES | RA_MASK_OFDM_IN_HT_5G; wireless_set = WIRELESS_OFDM | WIRELESS_HT; } else { @@ -1083,18 +1083,18 @@ void rtw_update_sta_info(struct rtw_dev *rtwdev, struct rtw_sta_info *si) } dm_info->rrsr_val_init = RRSR_INIT_5G; } else if (hal->current_band_type == RTW_BAND_2G) { - ra_mask |= sta->supp_rates[NL80211_BAND_2GHZ]; - if (sta->vht_cap.vht_supported) { + ra_mask |= sta->deflink.supp_rates[NL80211_BAND_2GHZ]; + if (sta->deflink.vht_cap.vht_supported) { ra_mask &= RA_MASK_VHT_RATES | RA_MASK_CCK_IN_VHT | RA_MASK_OFDM_IN_VHT; wireless_set = WIRELESS_CCK | WIRELESS_OFDM | WIRELESS_HT | WIRELESS_VHT; - } else if (sta->ht_cap.ht_supported) { + } else if (sta->deflink.ht_cap.ht_supported) { ra_mask &= RA_MASK_HT_RATES | RA_MASK_CCK_IN_HT | RA_MASK_OFDM_IN_HT_2G; wireless_set = WIRELESS_CCK | WIRELESS_OFDM | WIRELESS_HT; - } else if (sta->supp_rates[0] <= 0xf) { + } else if (sta->deflink.supp_rates[0] <= 0xf) { wireless_set = WIRELESS_CCK; } else { wireless_set = WIRELESS_CCK | WIRELESS_OFDM; @@ -1105,28 +1105,28 @@ void rtw_update_sta_info(struct rtw_dev *rtwdev, struct rtw_sta_info *si) wireless_set = 0; } - switch (sta->bandwidth) { + switch (sta->deflink.bandwidth) { case IEEE80211_STA_RX_BW_80: bw_mode = RTW_CHANNEL_WIDTH_80; - is_support_sgi = sta->vht_cap.vht_supported && - (sta->vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80); + is_support_sgi = sta->deflink.vht_cap.vht_supported && + (sta->deflink.vht_cap.cap & IEEE80211_VHT_CAP_SHORT_GI_80); break; case IEEE80211_STA_RX_BW_40: bw_mode = RTW_CHANNEL_WIDTH_40; - is_support_sgi = sta->ht_cap.ht_supported && - (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40); + is_support_sgi = sta->deflink.ht_cap.ht_supported && + (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40); break; default: bw_mode = RTW_CHANNEL_WIDTH_20; - is_support_sgi = sta->ht_cap.ht_supported && - (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20); + is_support_sgi = sta->deflink.ht_cap.ht_supported && + (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20); break; } - if (sta->vht_cap.vht_supported && ra_mask & 
0xffc00000) { + if (sta->deflink.vht_cap.vht_supported && ra_mask & 0xffc00000) { tx_num = 2; rf_type = RF_2T2R; - } else if (sta->ht_cap.ht_supported && ra_mask & 0xfff00000) { + } else if (sta->deflink.ht_cap.ht_supported && ra_mask & 0xfff00000) { tx_num = 2; rf_type = RF_2T2R; } diff --git a/drivers/net/wireless/realtek/rtw88/tx.c b/drivers/net/wireless/realtek/rtw88/tx.c index 3a101aa139ed..c08ecc0b5051 100644 --- a/drivers/net/wireless/realtek/rtw88/tx.c +++ b/drivers/net/wireless/realtek/rtw88/tx.c @@ -72,7 +72,7 @@ EXPORT_SYMBOL(rtw_tx_fill_tx_desc); static u8 get_tx_ampdu_factor(struct ieee80211_sta *sta) { - u8 exp = sta->ht_cap.ampdu_factor; + u8 exp = sta->deflink.ht_cap.ampdu_factor; /* the least ampdu factor is 8K, and the value in the tx desc is the * max aggregation num, which represents val * 2 packets can be @@ -83,7 +83,7 @@ static u8 get_tx_ampdu_factor(struct ieee80211_sta *sta) static u8 get_tx_ampdu_density(struct ieee80211_sta *sta) { - return sta->ht_cap.ampdu_density; + return sta->deflink.ht_cap.ampdu_density; } static u8 get_highest_ht_tx_rate(struct rtw_dev *rtwdev, @@ -91,7 +91,7 @@ static u8 get_highest_ht_tx_rate(struct rtw_dev *rtwdev, { u8 rate; - if (rtwdev->hal.rf_type == RF_2T2R && sta->ht_cap.mcs.rx_mask[1] != 0) + if (rtwdev->hal.rf_type == RF_2T2R && sta->deflink.ht_cap.mcs.rx_mask[1] != 0) rate = DESC_RATEMCS15; else rate = DESC_RATEMCS7; @@ -106,7 +106,7 @@ static u8 get_highest_vht_tx_rate(struct rtw_dev *rtwdev, u8 rate; u16 tx_mcs_map; - tx_mcs_map = le16_to_cpu(sta->vht_cap.vht_mcs.tx_mcs_map); + tx_mcs_map = le16_to_cpu(sta->deflink.vht_cap.vht_mcs.tx_mcs_map); if (efuse->hw_cap.nss == 1) { switch (tx_mcs_map & 0x3) { case IEEE80211_VHT_MCS_SUPPORT_0_7: @@ -321,11 +321,11 @@ static void rtw_tx_data_pkt_info_update(struct rtw_dev *rtwdev, if (info->control.use_rts || skb->len > hw->wiphy->rts_threshold) pkt_info->rts = true; - if (sta->vht_cap.vht_supported) + if (sta->deflink.vht_cap.vht_supported) rate = get_highest_vht_tx_rate(rtwdev, sta); - else if (sta->ht_cap.ht_supported) + else if (sta->deflink.ht_cap.ht_supported) rate = get_highest_ht_tx_rate(rtwdev, sta); - else if (sta->supp_rates[0] <= 0xf) + else if (sta->deflink.supp_rates[0] <= 0xf) rate = DESC_RATE11M; else rate = DESC_RATE54M; diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index 70841d131d72..2f540b04b4a1 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -489,14 +489,16 @@ static int rndis_join_ibss(struct wiphy *wiphy, struct net_device *dev, static int rndis_leave_ibss(struct wiphy *wiphy, struct net_device *dev); static int rndis_add_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params); static int rndis_del_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr); static int rndis_set_default_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool unicast, bool multicast); + int link_id, u8 key_index, bool unicast, + bool multicast); static int rndis_get_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_info *sinfo); @@ -2370,8 +2372,8 @@ static int rndis_leave_ibss(struct wiphy *wiphy, struct net_device *dev) } static int rndis_add_key(struct wiphy *wiphy, struct 
net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct rndis_wlan_private *priv = wiphy_priv(wiphy); struct usbnet *usbdev = priv->usbdev; @@ -2406,7 +2408,8 @@ static int rndis_add_key(struct wiphy *wiphy, struct net_device *netdev, } static int rndis_del_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct rndis_wlan_private *priv = wiphy_priv(wiphy); struct usbnet *usbdev = priv->usbdev; @@ -2417,7 +2420,8 @@ static int rndis_del_key(struct wiphy *wiphy, struct net_device *netdev, } static int rndis_set_default_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool unicast, bool multicast) + int link_id, u8 key_index, bool unicast, + bool multicast) { struct rndis_wlan_private *priv = wiphy_priv(wiphy); struct usbnet *usbdev = priv->usbdev; @@ -2806,8 +2810,9 @@ static void rndis_wlan_do_link_up_work(struct usbnet *usbdev) resp_ie_len, 0, GFP_KERNEL); } else { struct cfg80211_roam_info roam_info = { - .channel = get_current_channel(usbdev, NULL), - .bssid = bssid, + .links[0].channel = + get_current_channel(usbdev, NULL), + .links[0].bssid = bssid, .req_ie = req_ie, .req_ie_len = req_ie_len, .resp_ie = resp_ie, diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c index e70c1c7fdf59..2794c9ed97f7 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c +++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c @@ -1487,13 +1487,13 @@ static int rsi_mac80211_sta_add(struct ieee80211_hw *hw, if ((vif->type == NL80211_IFTYPE_STATION) || (vif->type == NL80211_IFTYPE_P2P_CLIENT)) { - common->bitrate_mask[common->band] = sta->supp_rates[common->band]; - common->vif_info[0].is_ht = sta->ht_cap.ht_supported; - if (sta->ht_cap.ht_supported) { + common->bitrate_mask[common->band] = sta->deflink.supp_rates[common->band]; + common->vif_info[0].is_ht = sta->deflink.ht_cap.ht_supported; + if (sta->deflink.ht_cap.ht_supported) { common->bitrate_mask[NL80211_BAND_2GHZ] = - sta->supp_rates[NL80211_BAND_2GHZ]; - if ((sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) || - (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40)) + sta->deflink.supp_rates[NL80211_BAND_2GHZ]; + if ((sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) || + (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40)) common->vif_info[0].sgi = true; ieee80211_start_tx_ba_session(sta, 0, 0); } diff --git a/drivers/net/wireless/rsi/rsi_91x_mgmt.c b/drivers/net/wireless/rsi/rsi_91x_mgmt.c index 0848f7a7e76c..c14689266fec 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mgmt.c +++ b/drivers/net/wireless/rsi/rsi_91x_mgmt.c @@ -1357,10 +1357,10 @@ static int rsi_send_auto_rate_request(struct rsi_common *common, is_ht = common->vif_info[0].is_ht; is_sgi = common->vif_info[0].sgi; } else { - rate_bitmap = sta->supp_rates[band]; - is_ht = sta->ht_cap.ht_supported; - if ((sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) || - (sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_40)) + rate_bitmap = sta->deflink.supp_rates[band]; + is_ht = sta->deflink.ht_cap.ht_supported; + if ((sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_20) || + (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SGI_40)) is_sgi = true; } diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c index 236022d4ae2a..321df124d449 100644 --- 
a/drivers/net/wireless/st/cw1200/sta.c +++ b/drivers/net/wireless/st/cw1200/sta.c @@ -1907,10 +1907,10 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev, if (info->bssid && !info->ibss_joined) sta = ieee80211_find_sta(vif, info->bssid); if (sta) { - priv->ht_info.ht_cap = sta->ht_cap; + priv->ht_info.ht_cap = sta->deflink.ht_cap; priv->bss_params.operational_rate_set = cw1200_rate_mask_to_wsm(priv, - sta->supp_rates[priv->channel->band]); + sta->deflink.supp_rates[priv->channel->band]); priv->ht_info.channel_type = cfg80211_get_chandef_type(&dev->conf.chandef); priv->ht_info.operation_mode = info->ht_operation_mode; } else { diff --git a/drivers/net/wireless/ti/wlcore/cmd.c b/drivers/net/wireless/ti/wlcore/cmd.c index 8b798b5fcaf5..09eb1116be0e 100644 --- a/drivers/net/wireless/ti/wlcore/cmd.c +++ b/drivers/net/wireless/ti/wlcore/cmd.c @@ -1558,11 +1558,11 @@ int wl12xx_cmd_add_peer(struct wl1271 *wl, struct wl12xx_vif *wlvif, WL1271_PSD_LEGACY; - sta_rates = sta->supp_rates[wlvif->band]; - if (sta->ht_cap.ht_supported) + sta_rates = sta->deflink.supp_rates[wlvif->band]; + if (sta->deflink.ht_cap.ht_supported) sta_rates |= - (sta->ht_cap.mcs.rx_mask[0] << HW_HT_RATES_OFFSET) | - (sta->ht_cap.mcs.rx_mask[1] << HW_MIMO_RATES_OFFSET); + (sta->deflink.ht_cap.mcs.rx_mask[0] << HW_HT_RATES_OFFSET) | + (sta->deflink.ht_cap.mcs.rx_mask[1] << HW_MIMO_RATES_OFFSET); cmd->supported_rates = cpu_to_le32(wl1271_tx_enabled_rates_get(wl, sta_rates, diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 5669f17b395f..8440e0942674 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -4439,15 +4439,15 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl, rcu_read_lock(); sta = ieee80211_find_sta(vif, bss_conf->bssid); if (sta) { - u8 *rx_mask = sta->ht_cap.mcs.rx_mask; + u8 *rx_mask = sta->deflink.ht_cap.mcs.rx_mask; /* save the supp_rates of the ap */ - sta_rate_set = sta->supp_rates[wlvif->band]; - if (sta->ht_cap.ht_supported) + sta_rate_set = sta->deflink.supp_rates[wlvif->band]; + if (sta->deflink.ht_cap.ht_supported) sta_rate_set |= (rx_mask[0] << HW_HT_RATES_OFFSET) | (rx_mask[1] << HW_MIMO_RATES_OFFSET); - sta_ht_cap = sta->ht_cap; + sta_ht_cap = sta->deflink.ht_cap; sta_exists = true; } @@ -5225,7 +5225,8 @@ static int wl12xx_update_sta_state(struct wl1271 *wl, if (ret < 0) return ret; - ret = wl1271_acx_set_ht_capabilities(wl, &sta->ht_cap, true, + ret = wl1271_acx_set_ht_capabilities(wl, &sta->deflink.ht_cap, + true, wl_sta->hlid); if (ret) return ret; @@ -5786,8 +5787,9 @@ static void wlcore_op_sta_rc_update(struct ieee80211_hw *hw, return; /* this callback is atomic, so schedule a new work */ - wlvif->rc_update_bw = sta->bandwidth; - memcpy(&wlvif->rc_ht_cap, &sta->ht_cap, sizeof(sta->ht_cap)); + wlvif->rc_update_bw = sta->deflink.bandwidth; + memcpy(&wlvif->rc_ht_cap, &sta->deflink.ht_cap, + sizeof(sta->deflink.ht_cap)); ieee80211_queue_work(hw, &wlvif->rc_update_work); }
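All of the wireless-driver hunks above apply one mechanical conversion: with mac80211's multi-link (MLO) rework, the per-link station fields (supp_rates, ht_cap, vht_cap, bandwidth) moved out of struct ieee80211_sta itself into its embedded default link, sta->deflink, and non-MLO drivers now read them from there. A minimal sketch of the pattern, assuming the post-rework mac80211 headers; the helper name is invented for illustration:

#include <net/mac80211.h>

/* The same capability test the rtlwifi/rtw88 hunks perform, written
 * against the new layout: per-link data is read through sta->deflink.
 */
static inline bool sta_can_use_40mhz(struct ieee80211_sta *sta)
{
	return sta->deflink.ht_cap.ht_supported &&
	       (sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40);
}

diff --git a/drivers/net/wireless/virt_wifi.c b/drivers/net/wireless/virt_wifi.c index 514f2c1124b6..3cfd99067b2e 100644 --- a/drivers/net/wireless/virt_wifi.c +++ b/drivers/net/wireless/virt_wifi.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct wiphy *common_wiphy; @@ -21,6 +22,7 @@ struct virt_wifi_wiphy_priv { struct delayed_work scan_result; struct cfg80211_scan_request *scan_request; bool being_deleted; + struct virt_wifi_network_simulation *network_simulation; }; static struct ieee80211_channel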
channel_2ghz = { @@ -172,6 +174,9 @@ static int virt_wifi_scan(struct wiphy *wiphy, priv->scan_request = request; schedule_delayed_work(&priv->scan_result, HZ * 2); + if (priv->network_simulation && + priv->network_simulation->notify_scan_trigger) + priv->network_simulation->notify_scan_trigger(wiphy, request); return 0; } @@ -187,6 +192,12 @@ static void virt_wifi_scan_result(struct work_struct *work) virt_wifi_inform_bss(wiphy); + if (priv->network_simulation && + priv->network_simulation->generate_virt_scan_result) { + if (priv->network_simulation->generate_virt_scan_result(wiphy)) + wiphy_err(wiphy, "Failed to generate the simulated scan result.\n"); + } + /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; @@ -378,6 +389,8 @@ static struct wiphy *virt_wifi_make_wiphy(void) priv = wiphy_priv(wiphy); priv->being_deleted = false; priv->scan_request = NULL; + priv->network_simulation = NULL; + INIT_DELAYED_WORK(&priv->scan_result, virt_wifi_scan_result); err = wiphy_register(wiphy); @@ -393,7 +406,6 @@ static struct wiphy *virt_wifi_make_wiphy(void) static void virt_wifi_destroy_wiphy(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv; - WARN(!wiphy, "%s called with null wiphy", __func__); if (!wiphy) return; @@ -427,8 +439,13 @@ static netdev_tx_t virt_wifi_start_xmit(struct sk_buff *skb, static int virt_wifi_net_device_open(struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); - + struct virt_wifi_wiphy_priv *w_priv; priv->is_up = true; + w_priv = wiphy_priv(dev->ieee80211_ptr->wiphy); + if (w_priv->network_simulation && + w_priv->network_simulation->notify_device_open) + w_priv->network_simulation->notify_device_open(dev); + return 0; } @@ -436,16 +453,22 @@ static int virt_wifi_net_device_stop(struct net_device *dev) { struct virt_wifi_netdev_priv *n_priv = netdev_priv(dev); + struct virt_wifi_wiphy_priv *w_priv; n_priv->is_up = false; if (!dev->ieee80211_ptr) return 0; + w_priv = wiphy_priv(dev->ieee80211_ptr->wiphy); virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); virt_wifi_cancel_connect(dev); netif_carrier_off(dev); + if (w_priv->network_simulation && + w_priv->network_simulation->notify_device_stop) + w_priv->network_simulation->notify_device_stop(dev); + return 0; } @@ -688,6 +711,27 @@ static void __exit virt_wifi_cleanup_module(void) unregister_netdevice_notifier(&virt_wifi_notifier); } +int virt_wifi_register_network_simulation + (struct virt_wifi_network_simulation *ops) +{ + struct virt_wifi_wiphy_priv *priv = wiphy_priv(common_wiphy); + if (priv->network_simulation) + return -EEXIST; + priv->network_simulation = ops; + return 0; +} +EXPORT_SYMBOL_GPL(virt_wifi_register_network_simulation); + +int virt_wifi_unregister_network_simulation(void) +{ + struct virt_wifi_wiphy_priv *priv = wiphy_priv(common_wiphy); + if (!priv->network_simulation) + return -ENODATA; + priv->network_simulation = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(virt_wifi_unregister_network_simulation); + module_init(virt_wifi_init_module); module_exit(virt_wifi_cleanup_module);
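The register/unregister pair above is the whole external surface of the simulation hook. A rough usage sketch, assuming only the callback signatures visible in the hunks (struct virt_wifi_network_simulation itself is declared in the header this patch adds, whose name is not shown here); the module and function names are invented for illustration:

#include <linux/module.h>
#include <linux/netdevice.h>

static void sim_notify_device_open(struct net_device *dev)
{
	netdev_info(dev, "virt_wifi device is up\n");
}

static struct virt_wifi_network_simulation sim_ops = {
	.notify_device_open = sim_notify_device_open,
	/* notify_scan_trigger, generate_virt_scan_result and
	 * notify_device_stop may stay NULL: virt_wifi checks each
	 * pointer before calling it.
	 */
};

static int __init sim_init(void)
{
	/* Only one simulation may be attached; -EEXIST otherwise. */
	return virt_wifi_register_network_simulation(&sim_ops);
}

static void __exit sim_exit(void)
{
	virt_wifi_unregister_network_simulation();
}

module_init(sim_init);
module_exit(sim_exit);
MODULE_LICENSE("GPL");

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index cfd038551156..123141e9ed60 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1671,3 +1671,4 @@ module_init(nvmet_init); module_exit(nvmet_exit); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); diff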
--git a/drivers/of/fdt.c b/drivers/of/fdt.c index 338171c978cc..8d9f6f0771cf 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1136,16 +1136,40 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, return 0; } +/* + * Convert configs to something easy to use in C code + */ +#if defined(CONFIG_CMDLINE_FORCE) +static const int overwrite_incoming_cmdline = 1; +static const int read_dt_cmdline; +static const int concat_cmdline; +#elif defined(CONFIG_CMDLINE_EXTEND) +static const int overwrite_incoming_cmdline; +static const int read_dt_cmdline = 1; +static const int concat_cmdline = 1; +#else /* CMDLINE_FROM_BOOTLOADER */ +static const int overwrite_incoming_cmdline; +static const int read_dt_cmdline = 1; +static const int concat_cmdline; +#endif + +#ifdef CONFIG_CMDLINE +static const char *config_cmdline = CONFIG_CMDLINE; +#else +static const char *config_cmdline = ""; +#endif + int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data) { - int l; - const char *p; + int l = 0; + const char *p = NULL; const void *rng_seed; + char *cmdline = data; pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname); - if (depth != 1 || !data || + if (depth != 1 || !cmdline || (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)) return 0; @@ -1154,28 +1178,28 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, early_init_dt_check_for_initrd(node); early_init_dt_check_for_elfcorehdr(node); - /* Retrieve command line */ - p = of_get_flat_dt_prop(node, "bootargs", &l); - if (p != NULL && l > 0) - strlcpy(data, p, min(l, COMMAND_LINE_SIZE)); + /* Put CONFIG_CMDLINE in if forced or if data had nothing in it to start */ + if (overwrite_incoming_cmdline || !cmdline[0]) + strlcpy(cmdline, config_cmdline, COMMAND_LINE_SIZE); - /* - * CONFIG_CMDLINE is meant to be a default in case nothing else - * managed to set the command line, unless CONFIG_CMDLINE_FORCE - * is set in which case we override whatever was found earlier. 
- */ -#ifdef CONFIG_CMDLINE -#if defined(CONFIG_CMDLINE_EXTEND) - strlcat(data, " ", COMMAND_LINE_SIZE); - strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#elif defined(CONFIG_CMDLINE_FORCE) - strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#else - /* No arguments from boot loader, use kernel's cmdl*/ - if (!((char *)data)[0]) - strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#endif -#endif /* CONFIG_CMDLINE */ + /* Retrieve command line unless forcing */ + if (read_dt_cmdline) + p = of_get_flat_dt_prop(node, "bootargs", &l); + + if (p != NULL && l > 0) { + if (concat_cmdline) { + int cmdline_len; + int copy_len; + strlcat(cmdline, " ", COMMAND_LINE_SIZE); + cmdline_len = strlen(cmdline); + copy_len = COMMAND_LINE_SIZE - cmdline_len - 1; + copy_len = min((int)l, copy_len); + strncpy(cmdline + cmdline_len, p, copy_len); + cmdline[cmdline_len + copy_len] = '\0'; + } else { + strlcpy(cmdline, p, min(l, COMMAND_LINE_SIZE)); + } + } pr_debug("Command line is: %s\n", (char *)data); diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 9da8835ba5a5..1117320ea7d0 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -25,7 +25,7 @@ #include "of_private.h" -#define MAX_RESERVED_REGIONS 64 +#define MAX_RESERVED_REGIONS 128 static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS]; static int reserved_mem_count; diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 74afbb7a4f5e..4bd151d0f832 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -511,6 +511,7 @@ static const struct of_device_id reserved_mem_matches[] = { { .compatible = "qcom,cmd-db" }, { .compatible = "ramoops" }, { .compatible = "nvmem-rmem" }, + { .compatible = "google,open-dice" }, {} }; diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 2af4ed90e12b..89a524de83bd 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -83,6 +83,7 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar) for (func_no = 0; func_no < funcs; func_no++) __dw_pcie_ep_reset_bar(pci, func_no, bar, 0); } +EXPORT_SYMBOL_GPL(dw_pcie_ep_reset_bar); static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie_ep *ep, u8 func_no, u8 cap_ptr, u8 cap) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 7cd4593ad12f..d55915ee7b7f 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -83,6 +83,7 @@ irqreturn_t dw_handle_msi_irq(struct pcie_port *pp) return ret; } +EXPORT_SYMBOL_GPL(dw_handle_msi_irq); /* Chained MSI interrupt service routine */ static void dw_chained_msi_isr(struct irq_desc *desc) @@ -262,14 +263,6 @@ static void dw_pcie_free_msi(struct pcie_port *pp) irq_domain_remove(pp->msi_domain); irq_domain_remove(pp->irq_domain); - - if (pp->msi_data) { - struct dw_pcie *pci = to_dw_pcie_from_pp(pp); - struct device *dev = pci->dev; - - dma_unmap_single_attrs(dev, pp->msi_data, sizeof(pp->msi_msg), - DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); - } } static void dw_pcie_msi_init(struct pcie_port *pp) @@ -294,6 +287,7 @@ int dw_pcie_host_init(struct pcie_port *pp) struct resource_entry *win; struct pci_host_bridge *bridge; struct resource *cfg_res; + u64 *msi_vaddr; int ret; raw_spin_lock_init(&pci->pp.lock); @@ -372,19 +366,29 @@ int dw_pcie_host_init(struct pcie_port *pp) dw_chained_msi_isr, pp); - ret = 
dma_set_mask(pci->dev, DMA_BIT_MASK(32)); + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); if (ret) dev_warn(pci->dev, "Failed to set DMA mask to 32-bit. Devices with only 32-bit MSI support may not work properly\n"); - pp->msi_data = dma_map_single_attrs(pci->dev, &pp->msi_msg, - sizeof(pp->msi_msg), - DMA_FROM_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); - ret = dma_mapping_error(pci->dev, pp->msi_data); - if (ret) { - dev_err(pci->dev, "Failed to map MSI data\n"); - pp->msi_data = 0; - goto err_free_msi; + msi_vaddr = dmam_alloc_coherent(dev, sizeof(u64), &pp->msi_data, + GFP_KERNEL); + if (!msi_vaddr) { + u16 msi_capabilities; + + /* Retry the allocation with a 64-bit mask if supported. */ + msi_capabilities = dw_pcie_msi_capabilities(pci); + if ((msi_capabilities & PCI_MSI_FLAGS_ENABLE) && + (msi_capabilities & PCI_MSI_FLAGS_64BIT)) { + dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + msi_vaddr = dmam_alloc_coherent(dev, sizeof(u64), + &pp->msi_data, + GFP_KERNEL); + } + if (!msi_vaddr) { + dev_err(dev, "Failed to alloc and map MSI data\n"); + ret = -ENOMEM; + goto err_free_msi; + } } } } diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 00972a7bc976..930b5931f8dd 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -55,6 +55,14 @@ u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap) } EXPORT_SYMBOL_GPL(dw_pcie_find_capability); +u16 dw_pcie_msi_capabilities(struct dw_pcie *pci) +{ + u8 offset; + + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_MSI); + return dw_pcie_readw_dbi(pci, offset + PCI_MSI_FLAGS); +} + static u16 dw_pcie_find_next_ext_capability(struct dw_pcie *pci, u16 start, u8 cap) { @@ -552,6 +560,7 @@ int dw_pcie_link_up(struct dw_pcie *pci) return ((val & PCIE_PORT_DEBUG1_LINK_UP) && (!(val & PCIE_PORT_DEBUG1_LINK_IN_TRAINING))); } +EXPORT_SYMBOL_GPL(dw_pcie_link_up); void dw_pcie_upconfig_setup(struct dw_pcie *pci) { diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 7d6e9b7576be..18fd16d60252 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -190,7 +190,6 @@ struct pcie_port { int msi_irq; struct irq_domain *irq_domain; struct irq_domain *msi_domain; - u16 msi_msg; dma_addr_t msi_data; struct irq_chip *msi_irq_chip; u32 num_vectors; @@ -284,6 +283,7 @@ struct dw_pcie { u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap); u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap); +u16 dw_pcie_msi_capabilities(struct dw_pcie *pci); int dw_pcie_read(void __iomem *addr, int size, u32 *val); int dw_pcie_write(void __iomem *addr, int size, u32 val); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1cce56c2aea0..12f38ab0e82c 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -3,6 +3,7 @@ #define DRIVERS_PCI_H #include <linux/pci.h> +#include <linux/android_kabi.h> /* Number of possible devfns: 0.0 to 1f.7 inclusive */ #define MAX_NR_DEVFNS 256 @@ -339,6 +340,11 @@ struct pci_sriov { u16 subsystem_device; /* VF subsystem device */ resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ bool drivers_autoprobe; /* Auto probing of VFs by driver */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /** diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 25b7423f3cd0..a9d52daa361a 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -43,7 +43,7 @@ config ARM_CCN
config ARM_CMN tristate "Arm CMN-600 PMU support" - depends on ARM64 || (COMPILE_TEST && 64BIT) + depends on ARM64 || COMPILE_TEST help Support for PMU events monitoring on the Arm CMN-600 Coherent Mesh Network interconnect. diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 400eb7f579dc..0e48adce57ef 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -5,8 +5,10 @@ #include <linux/acpi.h> #include <linux/bitfield.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #include <linux/interrupt.h> #include <linux/io.h> +#include <linux/io-64-nonatomic-lo-hi.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> @@ -23,7 +25,10 @@ #define CMN_NI_LOGICAL_ID GENMASK_ULL(47, 32) #define CMN_NODEID_DEVID(reg) ((reg) & 3) +#define CMN_NODEID_EXT_DEVID(reg) ((reg) & 1) #define CMN_NODEID_PID(reg) (((reg) >> 2) & 1) +#define CMN_NODEID_EXT_PID(reg) (((reg) >> 1) & 3) +#define CMN_NODEID_1x1_PID(reg) (((reg) >> 2) & 7) #define CMN_NODEID_X(reg, bits) ((reg) >> (3 + (bits))) #define CMN_NODEID_Y(reg, bits) (((reg) >> 3) & ((1U << (bits)) - 1)) @@ -34,20 +39,28 @@ #define CMN_CHILD_NODE_ADDR GENMASK(27, 0) #define CMN_CHILD_NODE_EXTERNAL BIT(31) -#define CMN_ADDR_NODE_PTR GENMASK(27, 14) +#define CMN_MAX_DIMENSION 8 +#define CMN_MAX_XPS (CMN_MAX_DIMENSION * CMN_MAX_DIMENSION) +#define CMN_MAX_DTMS (CMN_MAX_XPS + (CMN_MAX_DIMENSION - 1) * 4) -#define CMN_NODE_PTR_DEVID(ptr) (((ptr) >> 2) & 3) -#define CMN_NODE_PTR_PID(ptr) ((ptr) & 1) -#define CMN_NODE_PTR_X(ptr, bits) ((ptr) >> (6 + (bits))) -#define CMN_NODE_PTR_Y(ptr, bits) (((ptr) >> 6) & ((1U << (bits)) - 1)) - -#define CMN_MAX_XPS (8 * 8) - -/* The CFG node has one other useful purpose */ +/* The CFG node has various info besides the discovery tree */ #define CMN_CFGM_PERIPH_ID_2 0x0010 #define CMN_CFGM_PID2_REVISION GENMASK(7, 4) -/* PMU registers occupy the 3rd 4KB page of each node's 16KB space */ +#define CMN_CFGM_INFO_GLOBAL 0x900 +#define CMN_INFO_MULTIPLE_DTM_EN BIT_ULL(63) +#define CMN_INFO_RSP_VC_NUM GENMASK_ULL(53, 52) +#define CMN_INFO_DAT_VC_NUM GENMASK_ULL(51, 50) + +/* XPs also have some local topology info which has uses too */ +#define CMN_MXP__CONNECT_INFO_P0 0x0008 +#define CMN_MXP__CONNECT_INFO_P1 0x0010 +#define CMN_MXP__CONNECT_INFO_P2 0x0028 +#define CMN_MXP__CONNECT_INFO_P3 0x0030 +#define CMN_MXP__CONNECT_INFO_P4 0x0038 +#define CMN_MXP__CONNECT_INFO_P5 0x0040 + +/* PMU registers occupy the 3rd 4KB page of each node's region */ #define CMN_PMU_OFFSET 0x2000 /* For most nodes, this is all there is */ @@ -57,6 +70,7 @@ /* DTMs live in the PMU space of XP registers */ #define CMN_DTM_WPn(n) (0x1A0 + (n) * 0x18) #define CMN_DTM_WPn_CONFIG(n) (CMN_DTM_WPn(n) + 0x00) +#define CMN_DTM_WPn_CONFIG_WP_DEV_SEL2 GENMASK_ULL(18,17) #define CMN_DTM_WPn_CONFIG_WP_COMBINE BIT(6) #define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE BIT(5) #define CMN_DTM_WPn_CONFIG_WP_GRP BIT(4) @@ -81,7 +95,11 @@ #define CMN_DTM_PMEVCNTSR 0x240 +#define CMN_DTM_UNIT_INFO 0x0910 + #define CMN_DTM_NUM_COUNTERS 4 +/* Want more local counters? Why not replicate the whole DTM! Ugh...
*/ +#define CMN_DTM_OFFSET(n) ((n) * 0x200) /* The DTC node is where the magic happens */ #define CMN_DT_DTC_CTL 0x0a00 @@ -122,11 +140,11 @@ /* Event attributes */ -#define CMN_CONFIG_TYPE GENMASK(15, 0) -#define CMN_CONFIG_EVENTID GENMASK(23, 16) -#define CMN_CONFIG_OCCUPID GENMASK(27, 24) -#define CMN_CONFIG_BYNODEID BIT(31) -#define CMN_CONFIG_NODEID GENMASK(47, 32) +#define CMN_CONFIG_TYPE GENMASK_ULL(15, 0) +#define CMN_CONFIG_EVENTID GENMASK_ULL(23, 16) +#define CMN_CONFIG_OCCUPID GENMASK_ULL(27, 24) +#define CMN_CONFIG_BYNODEID BIT_ULL(31) +#define CMN_CONFIG_NODEID GENMASK_ULL(47, 32) #define CMN_EVENT_TYPE(event) FIELD_GET(CMN_CONFIG_TYPE, (event)->attr.config) #define CMN_EVENT_EVENTID(event) FIELD_GET(CMN_CONFIG_EVENTID, (event)->attr.config) @@ -134,13 +152,13 @@ #define CMN_EVENT_BYNODEID(event) FIELD_GET(CMN_CONFIG_BYNODEID, (event)->attr.config) #define CMN_EVENT_NODEID(event) FIELD_GET(CMN_CONFIG_NODEID, (event)->attr.config) -#define CMN_CONFIG_WP_COMBINE GENMASK(27, 24) -#define CMN_CONFIG_WP_DEV_SEL BIT(48) -#define CMN_CONFIG_WP_CHN_SEL GENMASK(50, 49) -#define CMN_CONFIG_WP_GRP BIT(52) -#define CMN_CONFIG_WP_EXCLUSIVE BIT(53) -#define CMN_CONFIG1_WP_VAL GENMASK(63, 0) -#define CMN_CONFIG2_WP_MASK GENMASK(63, 0) +#define CMN_CONFIG_WP_COMBINE GENMASK_ULL(27, 24) +#define CMN_CONFIG_WP_DEV_SEL GENMASK_ULL(50, 48) +#define CMN_CONFIG_WP_CHN_SEL GENMASK_ULL(55, 51) +#define CMN_CONFIG_WP_GRP BIT_ULL(56) +#define CMN_CONFIG_WP_EXCLUSIVE BIT_ULL(57) +#define CMN_CONFIG1_WP_VAL GENMASK_ULL(63, 0) +#define CMN_CONFIG2_WP_MASK GENMASK_ULL(63, 0) #define CMN_EVENT_WP_COMBINE(event) FIELD_GET(CMN_CONFIG_WP_COMBINE, (event)->attr.config) #define CMN_EVENT_WP_DEV_SEL(event) FIELD_GET(CMN_CONFIG_WP_DEV_SEL, (event)->attr.config) @@ -155,7 +173,13 @@ #define CMN_WP_DOWN 2 -/* r0px probably don't exist in silicon, thankfully */ +enum cmn_model { + CMN_ANY = -1, + CMN600 = 1, + CI700 = 2, +}; + +/* CMN-600 r0px shouldn't exist in silicon, thankfully */ enum cmn_revision { CMN600_R1P0, CMN600_R1P1, @@ -163,6 +187,10 @@ enum cmn_revision { CMN600_R1P3, CMN600_R2P0, CMN600_R3P0, + CMN600_R3P1, + CI700_R0P0 = 0, + CI700_R1P0, + CI700_R2P0, }; enum cmn_node_type { @@ -174,9 +202,12 @@ enum cmn_node_type { CMN_TYPE_HNF, CMN_TYPE_XP, CMN_TYPE_SBSX, - CMN_TYPE_RNI = 0xa, + CMN_TYPE_MPAM_S, + CMN_TYPE_MPAM_NS, + CMN_TYPE_RNI, CMN_TYPE_RND = 0xd, CMN_TYPE_RNSAM = 0xf, + CMN_TYPE_MTSX, CMN_TYPE_CXRA = 0x100, CMN_TYPE_CXHA = 0x101, CMN_TYPE_CXLA = 0x102, @@ -189,32 +220,32 @@ struct arm_cmn_node { u16 id, logid; enum cmn_node_type type; + int dtm; union { - /* Device node */ + /* DN/HN-F/CXHA */ struct { - int to_xp; - /* DN/HN-F/CXHA */ - unsigned int occupid_val; - unsigned int occupid_count; + u8 occupid_val; + u8 occupid_count; }; /* XP */ - struct { - int dtc; - u32 pmu_config_low; - union { - u8 input_sel[4]; - __le32 pmu_config_high; - }; - s8 wp_event[4]; - }; + u8 dtc; }; - union { u8 event[4]; __le32 event_sel; }; }; +struct arm_cmn_dtm { + void __iomem *base; + u32 pmu_config_low; + union { + u8 input_sel[4]; + __le32 pmu_config_high; + }; + s8 wp_event[4]; +}; + struct arm_cmn_dtc { void __iomem *base; int irq; @@ -231,35 +262,238 @@ struct arm_cmn_dtc { struct arm_cmn { struct device *dev; void __iomem *base; + unsigned int state; enum cmn_revision rev; + enum cmn_model model; u8 mesh_x; u8 mesh_y; u16 num_xps; u16 num_dns; + bool multi_dtm; + u8 ports_used; + struct { + unsigned int rsp_vc_num : 2; + unsigned int dat_vc_num : 2; + }; + struct arm_cmn_node *xps; struct arm_cmn_node 
*dns; + struct arm_cmn_dtm *dtms; struct arm_cmn_dtc *dtc; unsigned int num_dtcs; int cpu; struct hlist_node cpuhp_node; - unsigned int state; struct pmu pmu; + struct dentry *debug; }; #define to_cmn(p) container_of(p, struct arm_cmn, pmu) static int arm_cmn_hp_state; +struct arm_cmn_nodeid { + u8 x; + u8 y; + u8 port; + u8 dev; +}; + +static int arm_cmn_xyidbits(const struct arm_cmn *cmn) +{ + int dim = max(cmn->mesh_x, cmn->mesh_y); + + return dim > 4 ? 3 : 2; +} + +static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id) +{ + struct arm_cmn_nodeid nid; + + if (cmn->num_xps == 1) { + nid.x = 0; + nid.y = 0; + nid.port = CMN_NODEID_1x1_PID(id); + nid.dev = CMN_NODEID_DEVID(id); + } else { + int bits = arm_cmn_xyidbits(cmn); + + nid.x = CMN_NODEID_X(id, bits); + nid.y = CMN_NODEID_Y(id, bits); + if (cmn->ports_used & 0xc) { + nid.port = CMN_NODEID_EXT_PID(id); + nid.dev = CMN_NODEID_EXT_DEVID(id); + } else { + nid.port = CMN_NODEID_PID(id); + nid.dev = CMN_NODEID_DEVID(id); + } + } + return nid; +} + +static struct arm_cmn_node *arm_cmn_node_to_xp(const struct arm_cmn *cmn, + const struct arm_cmn_node *dn) +{ + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + int xp_idx = cmn->mesh_x * nid.y + nid.x; + + return cmn->xps + xp_idx; +} +static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, + enum cmn_node_type type) +{ + struct arm_cmn_node *dn; + + for (dn = cmn->dns; dn->type; dn++) + if (dn->type == type) + return dn; + return NULL; +} + +struct dentry *arm_cmn_debugfs; + +#ifdef CONFIG_DEBUG_FS +static const char *arm_cmn_device_type(u8 type) +{ + switch(type) { + case 0x01: return " RN-I |"; + case 0x02: return " RN-D |"; + case 0x04: return " RN-F_B |"; + case 0x05: return "RN-F_B_E|"; + case 0x06: return " RN-F_A |"; + case 0x07: return "RN-F_A_E|"; + case 0x08: return " HN-T |"; + case 0x09: return " HN-I |"; + case 0x0a: return " HN-D |"; + case 0x0c: return " SN-F |"; + case 0x0d: return " SBSX |"; + case 0x0e: return " HN-F |"; + case 0x0f: return " SN-F_E |"; + case 0x10: return " SN-F_D |"; + case 0x11: return " CXHA |"; + case 0x12: return " CXRA |"; + case 0x13: return " CXRH |"; + case 0x14: return " RN-F_D |"; + case 0x15: return "RN-F_D_E|"; + case 0x16: return " RN-F_C |"; + case 0x17: return "RN-F_C_E|"; + case 0x1c: return " MTSX |"; + default: return " |"; + } +} + +static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d) +{ + struct arm_cmn *cmn = s->private; + struct arm_cmn_node *dn; + + for (dn = cmn->dns; dn->type; dn++) { + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + + if (dn->type == CMN_TYPE_XP) + continue; + /* Ignore the extra components that will overlap on some ports */ + if (dn->type < CMN_TYPE_HNI) + continue; + + if (nid.x != x || nid.y != y || nid.port != p || nid.dev != d) + continue; + + seq_printf(s, " #%-2d |", dn->logid); + return; + } + seq_puts(s, " |"); +} + +static int arm_cmn_map_show(struct seq_file *s, void *data) +{ + struct arm_cmn *cmn = s->private; + int x, y, p, pmax = fls(cmn->ports_used); + + seq_puts(s, " X"); + for (x = 0; x < cmn->mesh_x; x++) + seq_printf(s, " %d ", x); + seq_puts(s, "\nY P D+"); + y = cmn->mesh_y; + while (y--) { + int xp_base = cmn->mesh_x * y; + u8 port[6][CMN_MAX_DIMENSION]; + + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "--------+"); + + seq_printf(s, "\n%d |", y); + for (x = 0; x < cmn->mesh_x; x++) { + struct arm_cmn_node *xp = cmn->xps + xp_base + x; + void __iomem *base = xp->pmu_base - CMN_PMU_OFFSET; + + port[0][x] = 
readl_relaxed(base + CMN_MXP__CONNECT_INFO_P0); + port[1][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P1); + port[2][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P2); + port[3][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P3); + port[4][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P4); + port[5][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P5); + seq_printf(s, " XP #%-2d |", xp_base + x); + } + + seq_puts(s, "\n |"); + for (x = 0; x < cmn->mesh_x; x++) { + u8 dtc = cmn->xps[xp_base + x].dtc; + + if (dtc & (dtc - 1)) + seq_puts(s, " DTC ?? |"); + else + seq_printf(s, " DTC %ld |", __ffs(dtc)); + } + seq_puts(s, "\n |"); + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "........|"); + + for (p = 0; p < pmax; p++) { + seq_printf(s, "\n %d |", p); + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, arm_cmn_device_type(port[p][x])); + seq_puts(s, "\n 0|"); + for (x = 0; x < cmn->mesh_x; x++) + arm_cmn_show_logid(s, x, y, p, 0); + seq_puts(s, "\n 1|"); + for (x = 0; x < cmn->mesh_x; x++) + arm_cmn_show_logid(s, x, y, p, 1); + } + seq_puts(s, "\n-----+"); + } + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "--------+"); + seq_puts(s, "\n"); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(arm_cmn_map); + +static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) +{ + const char *name = "map"; + + if (id > 0) + name = devm_kasprintf(cmn->dev, GFP_KERNEL, "map_%d", id); + if (!name) + return; + + cmn->debug = debugfs_create_file(name, 0444, arm_cmn_debugfs, cmn, &arm_cmn_map_fops); +} +#else +static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} +#endif + struct arm_cmn_hw_event { struct arm_cmn_node *dn; u64 dtm_idx[2]; unsigned int dtc_idx; u8 dtcs_used; u8 num_dns; + u8 dtm_offset; }; #define for_each_hw_dn(hw, dn, i) \ @@ -283,6 +517,7 @@ static unsigned int arm_cmn_get_index(u64 x[], unsigned int pos) struct arm_cmn_event_attr { struct device_attribute attr; + enum cmn_model model; enum cmn_node_type type; u8 eventid; u8 occupid; @@ -294,50 +529,22 @@ struct arm_cmn_format_attr { int config; }; -static int arm_cmn_xyidbits(const struct arm_cmn *cmn) -{ - return cmn->mesh_x > 4 || cmn->mesh_y > 4 ? 3 : 2; -} - -static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn, - struct arm_cmn_node *dn) -{ - int bits = arm_cmn_xyidbits(cmn); - int x = CMN_NODEID_X(dn->id, bits); - int y = CMN_NODEID_Y(dn->id, bits); - int xp_idx = cmn->mesh_x * y + x; - - dn->to_xp = (cmn->xps + xp_idx) - dn; -} - -static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn) -{ - return dn->type == CMN_TYPE_XP ? 
dn : dn + dn->to_xp; -} - -static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, - enum cmn_node_type type) -{ - int i; - - for (i = 0; i < cmn->num_dns; i++) - if (cmn->dns[i].type == type) - return &cmn->dns[i]; - return NULL; -} - -#define CMN_EVENT_ATTR(_name, _type, _eventid, _occupid) \ +#define CMN_EVENT_ATTR(_model, _name, _type, _eventid, _occupid) \ (&((struct arm_cmn_event_attr[]) {{ \ .attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL), \ + .model = _model, \ .type = _type, \ .eventid = _eventid, \ .occupid = _occupid, \ }})[0].attr.attr) -static bool arm_cmn_is_occup_event(enum cmn_node_type type, unsigned int id) +static bool arm_cmn_is_occup_event(enum cmn_model model, + enum cmn_node_type type, unsigned int id) { - return (type == CMN_TYPE_DVM && id == 0x05) || - (type == CMN_TYPE_HNF && id == 0x0f); + if (type == CMN_TYPE_DVM) + return (model == CMN600 && id == 0x05) || + (model == CI700 && id == 0x0c); + return type == CMN_TYPE_HNF && id == 0x0f; } static ssize_t arm_cmn_event_show(struct device *dev, @@ -355,7 +562,7 @@ static ssize_t arm_cmn_event_show(struct device *dev, "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n", eattr->type, eattr->eventid); - if (arm_cmn_is_occup_event(eattr->type, eattr->eventid)) + if (arm_cmn_is_occup_event(eattr->model, eattr->type, eattr->eventid)) return sysfs_emit(buf, "type=0x%x,eventid=0x%x,occupid=0x%x\n", eattr->type, eattr->eventid, eattr->occupid); @@ -370,60 +577,81 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct arm_cmn *cmn = to_cmn(dev_get_drvdata(dev)); struct arm_cmn_event_attr *eattr; - enum cmn_node_type type; eattr = container_of(attr, typeof(*eattr), attr.attr); - type = eattr->type; - /* Watchpoints aren't nodes */ - if (type == CMN_TYPE_WP) - type = CMN_TYPE_XP; + if (!(eattr->model & cmn->model)) + return 0; - /* Revision-specific differences */ - if (cmn->rev < CMN600_R1P2) { - if (type == CMN_TYPE_HNF && eattr->eventid == 0x1b) + /* Watchpoints aren't nodes, so avoid confusion */ + if (eattr->type == CMN_TYPE_WP) + return attr->mode; + + /* Hide XP events for unused interfaces/channels */ + if (eattr->type == CMN_TYPE_XP) { + unsigned int intf = (eattr->eventid >> 2) & 7; + unsigned int chan = eattr->eventid >> 5; + + if ((intf & 4) && !(cmn->ports_used & BIT(intf & 3))) + return 0; + + if ((chan == 5 && cmn->rsp_vc_num < 2) || + (chan == 6 && cmn->dat_vc_num < 2)) return 0; } - if (!arm_cmn_node(cmn, type)) + /* Revision-specific differences */ + if (cmn->model == CMN600 && cmn->rev < CMN600_R1P2) { + if (eattr->type == CMN_TYPE_HNF && eattr->eventid == 0x1b) + return 0; + } + + if (!arm_cmn_node(cmn, eattr->type)) return 0; return attr->mode; } -#define _CMN_EVENT_DVM(_name, _event, _occup) \ - CMN_EVENT_ATTR(dn_##_name, CMN_TYPE_DVM, _event, _occup) +#define _CMN_EVENT_DVM(_model, _name, _event, _occup) \ + CMN_EVENT_ATTR(_model, dn_##_name, CMN_TYPE_DVM, _event, _occup) #define CMN_EVENT_DTC(_name) \ - CMN_EVENT_ATTR(dtc_##_name, CMN_TYPE_DTC, 0, 0) -#define _CMN_EVENT_HNF(_name, _event, _occup) \ - CMN_EVENT_ATTR(hnf_##_name, CMN_TYPE_HNF, _event, _occup) + CMN_EVENT_ATTR(CMN_ANY, dtc_##_name, CMN_TYPE_DTC, 0, 0) +#define _CMN_EVENT_HNF(_model, _name, _event, _occup) \ + CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup) #define CMN_EVENT_HNI(_name, _event) \ - CMN_EVENT_ATTR(hni_##_name, CMN_TYPE_HNI, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event, 0) 
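/*
 * [Editor's aside, not part of the patch] The _model parameter threaded
 * through these event macros acts as a bitmask: CMN_ANY is -1 (all bits
 * set), so it intersects with every model, while CMN600 (1) and CI700 (2)
 * only match themselves in the (eattr->model & cmn->model) test added to
 * arm_cmn_event_attr_is_visible() above. A minimal stand-alone sketch of
 * that check, reusing the enum values from this patch:
 */
#include <stdio.h>

enum cmn_model { CMN_ANY = -1, CMN600 = 1, CI700 = 2 };

/* Same visibility test as the patch: non-zero means "expose the event" */
static int event_matches(enum cmn_model attr_model, enum cmn_model hw_model)
{
	return (attr_model & hw_model) != 0;
}

int main(void)
{
	printf("CMN_ANY event on CI700: %d\n", event_matches(CMN_ANY, CI700)); /* 1 */
	printf("CMN600 event on CI700: %d\n", event_matches(CMN600, CI700));   /* 0 */
	return 0;
}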
#define __CMN_EVENT_XP(_name, _event) \ - CMN_EVENT_ATTR(mxp_##_name, CMN_TYPE_XP, _event, 0) -#define CMN_EVENT_SBSX(_name, _event) \ - CMN_EVENT_ATTR(sbsx_##_name, CMN_TYPE_SBSX, _event, 0) -#define CMN_EVENT_RNID(_name, _event) \ - CMN_EVENT_ATTR(rnid_##_name, CMN_TYPE_RNI, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, mxp_##_name, CMN_TYPE_XP, _event, 0) +#define CMN_EVENT_SBSX(_model, _name, _event) \ + CMN_EVENT_ATTR(_model, sbsx_##_name, CMN_TYPE_SBSX, _event, 0) +#define CMN_EVENT_RNID(_model, _name, _event) \ + CMN_EVENT_ATTR(_model, rnid_##_name, CMN_TYPE_RNI, _event, 0) +#define CMN_EVENT_MTSX(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, mtsx_##_name, CMN_TYPE_MTSX, _event, 0) -#define CMN_EVENT_DVM(_name, _event) \ - _CMN_EVENT_DVM(_name, _event, 0) -#define CMN_EVENT_HNF(_name, _event) \ - _CMN_EVENT_HNF(_name, _event, 0) +#define CMN_EVENT_DVM(_model, _name, _event) \ + _CMN_EVENT_DVM(_model, _name, _event, 0) +#define CMN_EVENT_HNF(_model, _name, _event) \ + _CMN_EVENT_HNF(_model, _name, _event, 0) #define _CMN_EVENT_XP(_name, _event) \ __CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)), \ __CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)), \ __CMN_EVENT_XP(n_##_name, (_event) | (2 << 2)), \ __CMN_EVENT_XP(s_##_name, (_event) | (3 << 2)), \ __CMN_EVENT_XP(p0_##_name, (_event) | (4 << 2)), \ - __CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)) + __CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)), \ + __CMN_EVENT_XP(p2_##_name, (_event) | (6 << 2)), \ + __CMN_EVENT_XP(p3_##_name, (_event) | (7 << 2)) /* Good thing there are only 3 fundamental XP events... */ #define CMN_EVENT_XP(_name, _event) \ _CMN_EVENT_XP(req_##_name, (_event) | (0 << 5)), \ _CMN_EVENT_XP(rsp_##_name, (_event) | (1 << 5)), \ _CMN_EVENT_XP(snp_##_name, (_event) | (2 << 5)), \ - _CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)) + _CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)), \ + _CMN_EVENT_XP(pub_##_name, (_event) | (4 << 5)), \ + _CMN_EVENT_XP(rsp2_##_name, (_event) | (5 << 5)), \ + _CMN_EVENT_XP(dat2_##_name, (_event) | (6 << 5)) static struct attribute *arm_cmn_event_attrs[] = { @@ -434,115 +662,152 @@ static struct attribute *arm_cmn_event_attrs[] = { * slot, but our lazy short-cut of using the DTM counter index for * the PMU index as well happens to avoid that by construction. 
*/ - CMN_EVENT_DVM(rxreq_dvmop, 0x01), - CMN_EVENT_DVM(rxreq_dvmsync, 0x02), - CMN_EVENT_DVM(rxreq_dvmop_vmid_filtered, 0x03), - CMN_EVENT_DVM(rxreq_retried, 0x04), - _CMN_EVENT_DVM(rxreq_trk_occupancy_all, 0x05, 0), - _CMN_EVENT_DVM(rxreq_trk_occupancy_dvmop, 0x05, 1), - _CMN_EVENT_DVM(rxreq_trk_occupancy_dvmsync, 0x05, 2), + CMN_EVENT_DVM(CMN600, rxreq_dvmop, 0x01), + CMN_EVENT_DVM(CMN600, rxreq_dvmsync, 0x02), + CMN_EVENT_DVM(CMN600, rxreq_dvmop_vmid_filtered, 0x03), + CMN_EVENT_DVM(CMN600, rxreq_retried, 0x04), + _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_all, 0x05, 0), + _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmop, 0x05, 1), + _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmsync, 0x05, 2), + CMN_EVENT_DVM(CI700, dvmop_tlbi, 0x01), + CMN_EVENT_DVM(CI700, dvmop_bpi, 0x02), + CMN_EVENT_DVM(CI700, dvmop_pici, 0x03), + CMN_EVENT_DVM(CI700, dvmop_vici, 0x04), + CMN_EVENT_DVM(CI700, dvmsync, 0x05), + CMN_EVENT_DVM(CI700, vmid_filtered, 0x06), + CMN_EVENT_DVM(CI700, rndop_filtered, 0x07), + CMN_EVENT_DVM(CI700, retry, 0x08), + CMN_EVENT_DVM(CI700, txsnp_flitv, 0x09), + CMN_EVENT_DVM(CI700, txsnp_stall, 0x0a), + CMN_EVENT_DVM(CI700, trkfull, 0x0b), + _CMN_EVENT_DVM(CI700, trk_occupancy_all, 0x0c, 0), + _CMN_EVENT_DVM(CI700, trk_occupancy_dvmop, 0x0c, 1), + _CMN_EVENT_DVM(CI700, trk_occupancy_dvmsync, 0x0c, 2), - CMN_EVENT_HNF(cache_miss, 0x01), - CMN_EVENT_HNF(slc_sf_cache_access, 0x02), - CMN_EVENT_HNF(cache_fill, 0x03), - CMN_EVENT_HNF(pocq_retry, 0x04), - CMN_EVENT_HNF(pocq_reqs_recvd, 0x05), - CMN_EVENT_HNF(sf_hit, 0x06), - CMN_EVENT_HNF(sf_evictions, 0x07), - CMN_EVENT_HNF(dir_snoops_sent, 0x08), - CMN_EVENT_HNF(brd_snoops_sent, 0x09), - CMN_EVENT_HNF(slc_eviction, 0x0a), - CMN_EVENT_HNF(slc_fill_invalid_way, 0x0b), - CMN_EVENT_HNF(mc_retries, 0x0c), - CMN_EVENT_HNF(mc_reqs, 0x0d), - CMN_EVENT_HNF(qos_hh_retry, 0x0e), - _CMN_EVENT_HNF(qos_pocq_occupancy_all, 0x0f, 0), - _CMN_EVENT_HNF(qos_pocq_occupancy_read, 0x0f, 1), - _CMN_EVENT_HNF(qos_pocq_occupancy_write, 0x0f, 2), - _CMN_EVENT_HNF(qos_pocq_occupancy_atomic, 0x0f, 3), - _CMN_EVENT_HNF(qos_pocq_occupancy_stash, 0x0f, 4), - CMN_EVENT_HNF(pocq_addrhaz, 0x10), - CMN_EVENT_HNF(pocq_atomic_addrhaz, 0x11), - CMN_EVENT_HNF(ld_st_swp_adq_full, 0x12), - CMN_EVENT_HNF(cmp_adq_full, 0x13), - CMN_EVENT_HNF(txdat_stall, 0x14), - CMN_EVENT_HNF(txrsp_stall, 0x15), - CMN_EVENT_HNF(seq_full, 0x16), - CMN_EVENT_HNF(seq_hit, 0x17), - CMN_EVENT_HNF(snp_sent, 0x18), - CMN_EVENT_HNF(sfbi_dir_snp_sent, 0x19), - CMN_EVENT_HNF(sfbi_brd_snp_sent, 0x1a), - CMN_EVENT_HNF(snp_sent_untrk, 0x1b), - CMN_EVENT_HNF(intv_dirty, 0x1c), - CMN_EVENT_HNF(stash_snp_sent, 0x1d), - CMN_EVENT_HNF(stash_data_pull, 0x1e), - CMN_EVENT_HNF(snp_fwded, 0x1f), + CMN_EVENT_HNF(CMN_ANY, cache_miss, 0x01), + CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access, 0x02), + CMN_EVENT_HNF(CMN_ANY, cache_fill, 0x03), + CMN_EVENT_HNF(CMN_ANY, pocq_retry, 0x04), + CMN_EVENT_HNF(CMN_ANY, pocq_reqs_recvd, 0x05), + CMN_EVENT_HNF(CMN_ANY, sf_hit, 0x06), + CMN_EVENT_HNF(CMN_ANY, sf_evictions, 0x07), + CMN_EVENT_HNF(CMN_ANY, dir_snoops_sent, 0x08), + CMN_EVENT_HNF(CMN_ANY, brd_snoops_sent, 0x09), + CMN_EVENT_HNF(CMN_ANY, slc_eviction, 0x0a), + CMN_EVENT_HNF(CMN_ANY, slc_fill_invalid_way, 0x0b), + CMN_EVENT_HNF(CMN_ANY, mc_retries, 0x0c), + CMN_EVENT_HNF(CMN_ANY, mc_reqs, 0x0d), + CMN_EVENT_HNF(CMN_ANY, qos_hh_retry, 0x0e), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_all, 0x0f, 0), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_read, 0x0f, 1), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_write, 0x0f, 2), + 
_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_atomic, 0x0f, 3), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_stash, 0x0f, 4), + CMN_EVENT_HNF(CMN_ANY, pocq_addrhaz, 0x10), + CMN_EVENT_HNF(CMN_ANY, pocq_atomic_addrhaz, 0x11), + CMN_EVENT_HNF(CMN_ANY, ld_st_swp_adq_full, 0x12), + CMN_EVENT_HNF(CMN_ANY, cmp_adq_full, 0x13), + CMN_EVENT_HNF(CMN_ANY, txdat_stall, 0x14), + CMN_EVENT_HNF(CMN_ANY, txrsp_stall, 0x15), + CMN_EVENT_HNF(CMN_ANY, seq_full, 0x16), + CMN_EVENT_HNF(CMN_ANY, seq_hit, 0x17), + CMN_EVENT_HNF(CMN_ANY, snp_sent, 0x18), + CMN_EVENT_HNF(CMN_ANY, sfbi_dir_snp_sent, 0x19), + CMN_EVENT_HNF(CMN_ANY, sfbi_brd_snp_sent, 0x1a), + CMN_EVENT_HNF(CMN_ANY, snp_sent_untrk, 0x1b), + CMN_EVENT_HNF(CMN_ANY, intv_dirty, 0x1c), + CMN_EVENT_HNF(CMN_ANY, stash_snp_sent, 0x1d), + CMN_EVENT_HNF(CMN_ANY, stash_data_pull, 0x1e), + CMN_EVENT_HNF(CMN_ANY, snp_fwded, 0x1f), + CMN_EVENT_HNF(CI700, atomic_fwd, 0x20), + CMN_EVENT_HNF(CI700, mpam_hardlim, 0x21), + CMN_EVENT_HNF(CI700, mpam_softlim, 0x22), - CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), - CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), - CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22), - CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23), - CMN_EVENT_HNI(wdb_occ_cnt_ovfl, 0x24), - CMN_EVENT_HNI(rrt_rd_alloc, 0x25), - CMN_EVENT_HNI(rrt_wr_alloc, 0x26), - CMN_EVENT_HNI(rdt_rd_alloc, 0x27), - CMN_EVENT_HNI(rdt_wr_alloc, 0x28), - CMN_EVENT_HNI(wdb_alloc, 0x29), - CMN_EVENT_HNI(txrsp_retryack, 0x2a), - CMN_EVENT_HNI(arvalid_no_arready, 0x2b), - CMN_EVENT_HNI(arready_no_arvalid, 0x2c), - CMN_EVENT_HNI(awvalid_no_awready, 0x2d), - CMN_EVENT_HNI(awready_no_awvalid, 0x2e), - CMN_EVENT_HNI(wvalid_no_wready, 0x2f), - CMN_EVENT_HNI(txdat_stall, 0x30), - CMN_EVENT_HNI(nonpcie_serialization, 0x31), - CMN_EVENT_HNI(pcie_serialization, 0x32), + CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), + CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), + CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22), + CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23), + CMN_EVENT_HNI(wdb_occ_cnt_ovfl, 0x24), + CMN_EVENT_HNI(rrt_rd_alloc, 0x25), + CMN_EVENT_HNI(rrt_wr_alloc, 0x26), + CMN_EVENT_HNI(rdt_rd_alloc, 0x27), + CMN_EVENT_HNI(rdt_wr_alloc, 0x28), + CMN_EVENT_HNI(wdb_alloc, 0x29), + CMN_EVENT_HNI(txrsp_retryack, 0x2a), + CMN_EVENT_HNI(arvalid_no_arready, 0x2b), + CMN_EVENT_HNI(arready_no_arvalid, 0x2c), + CMN_EVENT_HNI(awvalid_no_awready, 0x2d), + CMN_EVENT_HNI(awready_no_awvalid, 0x2e), + CMN_EVENT_HNI(wvalid_no_wready, 0x2f), + CMN_EVENT_HNI(txdat_stall, 0x30), + CMN_EVENT_HNI(nonpcie_serialization, 0x31), + CMN_EVENT_HNI(pcie_serialization, 0x32), - CMN_EVENT_XP(txflit_valid, 0x01), - CMN_EVENT_XP(txflit_stall, 0x02), - CMN_EVENT_XP(partial_dat_flit, 0x03), + CMN_EVENT_XP(txflit_valid, 0x01), + CMN_EVENT_XP(txflit_stall, 0x02), + CMN_EVENT_XP(partial_dat_flit, 0x03), /* We treat watchpoints as a special made-up class of XP events */ - CMN_EVENT_ATTR(watchpoint_up, CMN_TYPE_WP, 0, 0), - CMN_EVENT_ATTR(watchpoint_down, CMN_TYPE_WP, 2, 0), + CMN_EVENT_ATTR(CMN_ANY, watchpoint_up, CMN_TYPE_WP, CMN_WP_UP, 0), + CMN_EVENT_ATTR(CMN_ANY, watchpoint_down, CMN_TYPE_WP, CMN_WP_DOWN, 0), - CMN_EVENT_SBSX(rd_req, 0x01), - CMN_EVENT_SBSX(wr_req, 0x02), - CMN_EVENT_SBSX(cmo_req, 0x03), - CMN_EVENT_SBSX(txrsp_retryack, 0x04), - CMN_EVENT_SBSX(txdat_flitv, 0x05), - CMN_EVENT_SBSX(txrsp_flitv, 0x06), - CMN_EVENT_SBSX(rd_req_trkr_occ_cnt_ovfl, 0x11), - CMN_EVENT_SBSX(wr_req_trkr_occ_cnt_ovfl, 0x12), - CMN_EVENT_SBSX(cmo_req_trkr_occ_cnt_ovfl, 0x13), - CMN_EVENT_SBSX(wdb_occ_cnt_ovfl, 0x14), - CMN_EVENT_SBSX(rd_axi_trkr_occ_cnt_ovfl, 0x15), - 
CMN_EVENT_SBSX(cmo_axi_trkr_occ_cnt_ovfl, 0x16), - CMN_EVENT_SBSX(arvalid_no_arready, 0x21), - CMN_EVENT_SBSX(awvalid_no_awready, 0x22), - CMN_EVENT_SBSX(wvalid_no_wready, 0x23), - CMN_EVENT_SBSX(txdat_stall, 0x24), - CMN_EVENT_SBSX(txrsp_stall, 0x25), + CMN_EVENT_SBSX(CMN_ANY, rd_req, 0x01), + CMN_EVENT_SBSX(CMN_ANY, wr_req, 0x02), + CMN_EVENT_SBSX(CMN_ANY, cmo_req, 0x03), + CMN_EVENT_SBSX(CMN_ANY, txrsp_retryack, 0x04), + CMN_EVENT_SBSX(CMN_ANY, txdat_flitv, 0x05), + CMN_EVENT_SBSX(CMN_ANY, txrsp_flitv, 0x06), + CMN_EVENT_SBSX(CMN_ANY, rd_req_trkr_occ_cnt_ovfl, 0x11), + CMN_EVENT_SBSX(CMN_ANY, wr_req_trkr_occ_cnt_ovfl, 0x12), + CMN_EVENT_SBSX(CMN_ANY, cmo_req_trkr_occ_cnt_ovfl, 0x13), + CMN_EVENT_SBSX(CMN_ANY, wdb_occ_cnt_ovfl, 0x14), + CMN_EVENT_SBSX(CMN_ANY, rd_axi_trkr_occ_cnt_ovfl, 0x15), + CMN_EVENT_SBSX(CMN_ANY, cmo_axi_trkr_occ_cnt_ovfl, 0x16), + CMN_EVENT_SBSX(CI700, rdb_occ_cnt_ovfl, 0x17), + CMN_EVENT_SBSX(CMN_ANY, arvalid_no_arready, 0x21), + CMN_EVENT_SBSX(CMN_ANY, awvalid_no_awready, 0x22), + CMN_EVENT_SBSX(CMN_ANY, wvalid_no_wready, 0x23), + CMN_EVENT_SBSX(CMN_ANY, txdat_stall, 0x24), + CMN_EVENT_SBSX(CMN_ANY, txrsp_stall, 0x25), - CMN_EVENT_RNID(s0_rdata_beats, 0x01), - CMN_EVENT_RNID(s1_rdata_beats, 0x02), - CMN_EVENT_RNID(s2_rdata_beats, 0x03), - CMN_EVENT_RNID(rxdat_flits, 0x04), - CMN_EVENT_RNID(txdat_flits, 0x05), - CMN_EVENT_RNID(txreq_flits_total, 0x06), - CMN_EVENT_RNID(txreq_flits_retried, 0x07), - CMN_EVENT_RNID(rrt_occ_ovfl, 0x08), - CMN_EVENT_RNID(wrt_occ_ovfl, 0x09), - CMN_EVENT_RNID(txreq_flits_replayed, 0x0a), - CMN_EVENT_RNID(wrcancel_sent, 0x0b), - CMN_EVENT_RNID(s0_wdata_beats, 0x0c), - CMN_EVENT_RNID(s1_wdata_beats, 0x0d), - CMN_EVENT_RNID(s2_wdata_beats, 0x0e), - CMN_EVENT_RNID(rrt_alloc, 0x0f), - CMN_EVENT_RNID(wrt_alloc, 0x10), - CMN_EVENT_RNID(rdb_unord, 0x11), - CMN_EVENT_RNID(rdb_replay, 0x12), - CMN_EVENT_RNID(rdb_hybrid, 0x13), - CMN_EVENT_RNID(rdb_ord, 0x14), + CMN_EVENT_RNID(CMN_ANY, s0_rdata_beats, 0x01), + CMN_EVENT_RNID(CMN_ANY, s1_rdata_beats, 0x02), + CMN_EVENT_RNID(CMN_ANY, s2_rdata_beats, 0x03), + CMN_EVENT_RNID(CMN_ANY, rxdat_flits, 0x04), + CMN_EVENT_RNID(CMN_ANY, txdat_flits, 0x05), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_total, 0x06), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_retried, 0x07), + CMN_EVENT_RNID(CMN_ANY, rrt_occ_ovfl, 0x08), + CMN_EVENT_RNID(CMN_ANY, wrt_occ_ovfl, 0x09), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_replayed, 0x0a), + CMN_EVENT_RNID(CMN_ANY, wrcancel_sent, 0x0b), + CMN_EVENT_RNID(CMN_ANY, s0_wdata_beats, 0x0c), + CMN_EVENT_RNID(CMN_ANY, s1_wdata_beats, 0x0d), + CMN_EVENT_RNID(CMN_ANY, s2_wdata_beats, 0x0e), + CMN_EVENT_RNID(CMN_ANY, rrt_alloc, 0x0f), + CMN_EVENT_RNID(CMN_ANY, wrt_alloc, 0x10), + CMN_EVENT_RNID(CMN600, rdb_unord, 0x11), + CMN_EVENT_RNID(CMN600, rdb_replay, 0x12), + CMN_EVENT_RNID(CMN600, rdb_hybrid, 0x13), + CMN_EVENT_RNID(CMN600, rdb_ord, 0x14), + CMN_EVENT_RNID(CI700, padb_occ_ovfl, 0x11), + CMN_EVENT_RNID(CI700, rpdb_occ_ovfl, 0x12), + CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice1, 0x13), + CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice2, 0x14), + CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice3, 0x15), + CMN_EVENT_RNID(CI700, wrt_throttled, 0x16), + + CMN_EVENT_MTSX(tc_lookup, 0x01), + CMN_EVENT_MTSX(tc_fill, 0x02), + CMN_EVENT_MTSX(tc_miss, 0x03), + CMN_EVENT_MTSX(tdb_forward, 0x04), + CMN_EVENT_MTSX(tcq_hazard, 0x05), + CMN_EVENT_MTSX(tcq_rd_alloc, 0x06), + CMN_EVENT_MTSX(tcq_wr_alloc, 0x07), + CMN_EVENT_MTSX(tcq_cmo_alloc, 0x08), + CMN_EVENT_MTSX(axi_rd_req, 0x09), + CMN_EVENT_MTSX(axi_wr_req, 0x0a), + 
CMN_EVENT_MTSX(tcq_occ_cnt_ovfl, 0x0b), + CMN_EVENT_MTSX(tdb_occ_cnt_ovfl, 0x0c), NULL }; @@ -644,7 +909,8 @@ static u32 arm_cmn_wp_config(struct perf_event *event) config = FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL, dev) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_CHN_SEL, chn) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_GRP, grp) | - FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc); + FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc) | + FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL2, dev >> 1); if (combine && !grp) config |= CMN_DTM_WPn_CONFIG_WP_COMBINE; @@ -679,18 +945,19 @@ static void arm_cmn_pmu_disable(struct pmu *pmu) static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw, bool snapshot) { + struct arm_cmn_dtm *dtm = NULL; struct arm_cmn_node *dn; - unsigned int i, offset; - u64 count = 0; + unsigned int i, offset, dtm_idx; + u64 reg, count = 0; offset = snapshot ? CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT; for_each_hw_dn(hw, dn, i) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn); - int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); - u64 reg = readq_relaxed(xp->pmu_base + offset); - u16 dtm_count = reg >> (dtm_idx * 16); - - count += dtm_count; + if (dtm != &cmn->dtms[dn->dtm]) { + dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; + reg = readq_relaxed(dtm->base + offset); + } + dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); + count += (u16)(reg >> (dtm_idx * 16)); } return count; } @@ -774,8 +1041,10 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) u64 mask = CMN_EVENT_WP_MASK(event); for_each_hw_dn(hw, dn, i) { - writeq_relaxed(val, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx)); - writeq_relaxed(mask, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx)); + void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset); + + writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx)); + writeq_relaxed(mask, base + CMN_DTM_WPn_MASK(wp_idx)); } } else for_each_hw_dn(hw, dn, i) { int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); @@ -800,8 +1069,10 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) int wp_idx = arm_cmn_wp_idx(event); for_each_hw_dn(hw, dn, i) { - writeq_relaxed(0, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx)); - writeq_relaxed(~0ULL, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx)); + void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset); + + writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx)); + writeq_relaxed(~0ULL, base + CMN_DTM_WPn_VAL(wp_idx)); } } else for_each_hw_dn(hw, dn, i) { int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); @@ -814,14 +1085,15 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) } struct arm_cmn_val { - u8 dtm_count[CMN_MAX_XPS]; - u8 occupid[CMN_MAX_XPS]; - u8 wp[CMN_MAX_XPS][4]; + u8 dtm_count[CMN_MAX_DTMS]; + u8 occupid[CMN_MAX_DTMS]; + u8 wp[CMN_MAX_DTMS][4]; int dtc_count; bool cycles; }; -static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *event) +static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val, + struct perf_event *event) { struct arm_cmn_hw_event *hw = to_cmn_hw(event); struct arm_cmn_node *dn; @@ -839,33 +1111,33 @@ static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev } val->dtc_count++; - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) occupid = CMN_EVENT_OCCUPID(event) + 1; else occupid = 0; for_each_hw_dn(hw, dn, i) { - int wp_idx, xp = arm_cmn_node_to_xp(dn)->logid; + int wp_idx, dtm = dn->dtm; - val->dtm_count[xp]++; - val->occupid[xp] 
= occupid; + val->dtm_count[dtm]++; + val->occupid[dtm] = occupid; if (type != CMN_TYPE_WP) continue; wp_idx = arm_cmn_wp_idx(event); - val->wp[xp][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1; + val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1; } } -static int arm_cmn_validate_group(struct perf_event *event) +static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event) { struct arm_cmn_hw_event *hw = to_cmn_hw(event); struct arm_cmn_node *dn; struct perf_event *sibling, *leader = event->group_leader; enum cmn_node_type type; - struct arm_cmn_val val; - int i; + struct arm_cmn_val *val; + int i, ret = -EINVAL; u8 occupid; if (leader == event) @@ -874,54 +1146,61 @@ static int arm_cmn_validate_group(struct perf_event *event) if (event->pmu != leader->pmu && !is_software_event(leader)) return -EINVAL; - memset(&val, 0, sizeof(val)); + val = kzalloc(sizeof(*val), GFP_KERNEL); + if (!val) + return -ENOMEM; - arm_cmn_val_add_event(&val, leader); + arm_cmn_val_add_event(cmn, val, leader); for_each_sibling_event(sibling, leader) - arm_cmn_val_add_event(&val, sibling); + arm_cmn_val_add_event(cmn, val, sibling); type = CMN_EVENT_TYPE(event); - if (type == CMN_TYPE_DTC) - return val.cycles ? -EINVAL : 0; + if (type == CMN_TYPE_DTC) { + ret = val->cycles ? -EINVAL : 0; + goto done; + } - if (val.dtc_count == CMN_DT_NUM_COUNTERS) - return -EINVAL; + if (val->dtc_count == CMN_DT_NUM_COUNTERS) + goto done; - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) occupid = CMN_EVENT_OCCUPID(event) + 1; else occupid = 0; for_each_hw_dn(hw, dn, i) { - int wp_idx, wp_cmb, xp = arm_cmn_node_to_xp(dn)->logid; + int wp_idx, wp_cmb, dtm = dn->dtm; - if (val.dtm_count[xp] == CMN_DTM_NUM_COUNTERS) - return -EINVAL; + if (val->dtm_count[dtm] == CMN_DTM_NUM_COUNTERS) + goto done; - if (occupid && val.occupid[xp] && occupid != val.occupid[xp]) - return -EINVAL; + if (occupid && val->occupid[dtm] && occupid != val->occupid[dtm]) + goto done; if (type != CMN_TYPE_WP) continue; wp_idx = arm_cmn_wp_idx(event); - if (val.wp[xp][wp_idx]) - return -EINVAL; + if (val->wp[dtm][wp_idx]) + goto done; - wp_cmb = val.wp[xp][wp_idx ^ 1]; + wp_cmb = val->wp[dtm][wp_idx ^ 1]; if (wp_cmb && wp_cmb != CMN_EVENT_WP_COMBINE(event) + 1) - return -EINVAL; + goto done; } - return 0; + ret = 0; +done: + kfree(val); + return ret; } static int arm_cmn_event_init(struct perf_event *event) { struct arm_cmn *cmn = to_cmn(event->pmu); struct arm_cmn_hw_event *hw = to_cmn_hw(event); + struct arm_cmn_node *dn; enum cmn_node_type type; - unsigned int i; bool bynodeid; u16 nodeid, eventid; @@ -947,38 +1226,37 @@ static int arm_cmn_event_init(struct perf_event *event) eventid = CMN_EVENT_EVENTID(event); if (eventid != CMN_WP_UP && eventid != CMN_WP_DOWN) return -EINVAL; + /* ...but the DTM may depend on which port we're watching */ + if (cmn->multi_dtm) + hw->dtm_offset = CMN_EVENT_WP_DEV_SEL(event) / 2; } bynodeid = CMN_EVENT_BYNODEID(event); nodeid = CMN_EVENT_NODEID(event); hw->dn = arm_cmn_node(cmn, type); - for (i = hw->dn - cmn->dns; i < cmn->num_dns && cmn->dns[i].type == type; i++) { - if (!bynodeid) { - hw->num_dns++; - } else if (cmn->dns[i].id != nodeid) { + if (!hw->dn) + return -EINVAL; + for (dn = hw->dn; dn->type == type; dn++) { + if (bynodeid && dn->id != nodeid) { hw->dn++; - } else { - hw->num_dns = 1; - break; + continue; } + hw->dtcs_used |= arm_cmn_node_to_xp(cmn, dn)->dtc; + hw->num_dns++; + if (bynodeid) + break; } if 
(!hw->num_dns) { - int bits = arm_cmn_xyidbits(cmn); + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, nodeid); dev_dbg(cmn->dev, "invalid node 0x%x (%d,%d,%d,%d) type 0x%x\n", - nodeid, CMN_NODEID_X(nodeid, bits), CMN_NODEID_Y(nodeid, bits), - CMN_NODEID_PID(nodeid), CMN_NODEID_DEVID(nodeid), type); + nodeid, nid.x, nid.y, nid.port, nid.dev, type); return -EINVAL; } - /* - * By assuming events count in all DTC domains, we cunningly avoid - * needing to know anything about how XPs are assigned to domains. - */ - hw->dtcs_used = (1U << cmn->num_dtcs) - 1; - return arm_cmn_validate_group(event); + return arm_cmn_validate_group(cmn, event); } static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, @@ -988,17 +1266,17 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, enum cmn_node_type type = CMN_EVENT_TYPE(event); while (i--) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(hw->dn + i); + struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] + hw->dtm_offset; unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); if (type == CMN_TYPE_WP) - hw->dn[i].wp_event[arm_cmn_wp_idx(event)] = -1; + dtm->wp_event[arm_cmn_wp_idx(event)] = -1; - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) hw->dn[i].occupid_count--; - xp->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx); - writel_relaxed(xp->pmu_config_low, xp->pmu_base + CMN_DTM_PMU_CONFIG); + dtm->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx); + writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG); } memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx)); @@ -1040,12 +1318,12 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) /* ...then the local counters to feed it. 
*/ for_each_hw_dn(hw, dn, i) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn); + struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; unsigned int dtm_idx, shift; u64 reg; dtm_idx = 0; - while (xp->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx)) + while (dtm->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx)) if (++dtm_idx == CMN_DTM_NUM_COUNTERS) goto free_dtms; @@ -1055,26 +1333,28 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) int tmp, wp_idx = arm_cmn_wp_idx(event); u32 cfg = arm_cmn_wp_config(event); - if (dn->wp_event[wp_idx] >= 0) + if (dtm->wp_event[wp_idx] >= 0) goto free_dtms; - tmp = dn->wp_event[wp_idx ^ 1]; + tmp = dtm->wp_event[wp_idx ^ 1]; if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) != CMN_EVENT_WP_COMBINE(dtc->counters[tmp])) goto free_dtms; input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx; - dn->wp_event[wp_idx] = dtc_idx; - writel_relaxed(cfg, dn->pmu_base + CMN_DTM_WPn_CONFIG(wp_idx)); + dtm->wp_event[wp_idx] = dtc_idx; + writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx)); } else { - unsigned int port = CMN_NODEID_PID(dn->id); - unsigned int dev = CMN_NODEID_DEVID(dn->id); + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + + if (cmn->multi_dtm) + nid.port %= 2; input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx + - (port << 4) + (dev << 2); + (nid.port << 4) + (nid.dev << 2); - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) { - int occupid = CMN_EVENT_OCCUPID(event); + if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) { + u8 occupid = CMN_EVENT_OCCUPID(event); if (dn->occupid_count == 0) { dn->occupid_val = occupid; @@ -1089,13 +1369,13 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) arm_cmn_set_index(hw->dtm_idx, i, dtm_idx); - xp->input_sel[dtm_idx] = input_sel; + dtm->input_sel[dtm_idx] = input_sel; shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx); - xp->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); - xp->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; - xp->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); - reg = (u64)le32_to_cpu(xp->pmu_config_high) << 32 | xp->pmu_config_low; - writeq_relaxed(reg, xp->pmu_base + CMN_DTM_PMU_CONFIG); + dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); + dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; + dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); + reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low; + writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG); } /* Go go go! 
*/ @@ -1147,23 +1427,47 @@ static int arm_cmn_commit_txn(struct pmu *pmu) return 0; } -static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +static void arm_cmn_migrate(struct arm_cmn *cmn, unsigned int cpu) +{ + unsigned int i; + + perf_pmu_migrate_context(&cmn->pmu, cmn->cpu, cpu); + for (i = 0; i < cmn->num_dtcs; i++) + irq_set_affinity(cmn->dtc[i].irq, cpumask_of(cpu)); + cmn->cpu = cpu; +} + +static int arm_cmn_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) { struct arm_cmn *cmn; - unsigned int i, target; + int node; - cmn = hlist_entry_safe(node, struct arm_cmn, cpuhp_node); + cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node); + node = dev_to_node(cmn->dev); + if (node != NUMA_NO_NODE && cpu_to_node(cmn->cpu) != node && cpu_to_node(cpu) == node) + arm_cmn_migrate(cmn, cpu); + return 0; +} + +static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct arm_cmn *cmn; + unsigned int target; + int node; + cpumask_t mask; + + cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node); if (cpu != cmn->cpu) return 0; - target = cpumask_any_but(cpu_online_mask, cpu); - if (target >= nr_cpu_ids) - return 0; - - perf_pmu_migrate_context(&cmn->pmu, cpu, target); - for (i = 0; i < cmn->num_dtcs; i++) - irq_set_affinity(cmn->dtc[i].irq, cpumask_of(target)); - cmn->cpu = target; + node = dev_to_node(cmn->dev); + if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) && + cpumask_andnot(&mask, &mask, cpumask_of(cpu))) + target = cpumask_any(&mask); + else + target = cpumask_any_but(cpu_online_mask, cpu); + if (target < nr_cpu_ids) + arm_cmn_migrate(cmn, target); return 0; } @@ -1231,23 +1535,22 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn) return 0; } -static void arm_cmn_init_dtm(struct arm_cmn_node *xp) +static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, int idx) { int i; + dtm->base = xp->pmu_base + CMN_DTM_OFFSET(idx); + dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; for (i = 0; i < 4; i++) { - xp->wp_event[i] = -1; - writeq_relaxed(0, xp->pmu_base + CMN_DTM_WPn_MASK(i)); - writeq_relaxed(~0ULL, xp->pmu_base + CMN_DTM_WPn_VAL(i)); + dtm->wp_event[i] = -1; + writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(i)); + writeq_relaxed(~0ULL, dtm->base + CMN_DTM_WPn_VAL(i)); } - xp->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; - xp->dtc = -1; } static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx) { struct arm_cmn_dtc *dtc = cmn->dtc + idx; - struct arm_cmn_node *xp; dtc->base = dn->pmu_base - CMN_PMU_OFFSET; dtc->irq = platform_get_irq(to_platform_device(cmn->dev), idx); @@ -1258,10 +1561,6 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id writel_relaxed(0x1ff, dtc->base + CMN_DT_PMOVSR_CLR); writel_relaxed(CMN_DT_PMCR_OVFL_INTR_EN, dtc->base + CMN_DT_PMCR); - /* We do at least know that a DTC's XP must be in that DTC's domain */ - xp = arm_cmn_node_to_xp(dn); - xp->dtc = idx; - return 0; } @@ -1278,8 +1577,9 @@ static int arm_cmn_node_cmp(const void *a, const void *b) static int arm_cmn_init_dtcs(struct arm_cmn *cmn) { - struct arm_cmn_node *dn; + struct arm_cmn_node *dn, *xp; int dtc_idx = 0; + u8 dtcs_present = (1 << cmn->num_dtcs) - 1; cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL); if (!cmn->dtc) @@ -1289,14 +1589,26 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP); - for (dn = cmn->dns; dn < cmn->dns + cmn->num_dns; dn++) 
{ - if (dn->type != CMN_TYPE_XP) - arm_cmn_init_node_to_xp(cmn, dn); - else if (cmn->num_dtcs == 1) - dn->dtc = 0; + for (dn = cmn->dns; dn->type; dn++) { + if (dn->type == CMN_TYPE_XP) { + dn->dtc &= dtcs_present; + continue; + } - if (dn->type == CMN_TYPE_DTC) - arm_cmn_init_dtc(cmn, dn, dtc_idx++); + xp = arm_cmn_node_to_xp(cmn, dn); + dn->dtm = xp->dtm; + if (cmn->multi_dtm) + dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2; + + if (dn->type == CMN_TYPE_DTC) { + int err; + /* We do at least know that a DTC's XP must be in that DTC's domain */ + if (xp->dtc == 0xf) + xp->dtc = 1 << dtc_idx; + err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); + if (err) + return err; + } /* To the PMU, RN-Ds don't add anything over RN-Is, so smoosh them together */ if (dn->type == CMN_TYPE_RND) @@ -1335,20 +1647,26 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) { void __iomem *cfg_region; struct arm_cmn_node cfg, *dn; + struct arm_cmn_dtm *dtm; u16 child_count, child_poff; u32 xp_offset[CMN_MAX_XPS]; u64 reg; int i, j; - - cfg_region = cmn->base + rgn_offset; - reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2); - cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg); - dev_dbg(cmn->dev, "periph_id_2 revision: %d\n", cmn->rev); + size_t sz; arm_cmn_init_node_info(cmn, rgn_offset, &cfg); if (cfg.type != CMN_TYPE_CFG) return -ENODEV; + cfg_region = cmn->base + rgn_offset; + reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2); + cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg); + + reg = readq_relaxed(cfg_region + CMN_CFGM_INFO_GLOBAL); + cmn->multi_dtm = reg & CMN_INFO_MULTIPLE_DTM_EN; + cmn->rsp_vc_num = FIELD_GET(CMN_INFO_RSP_VC_NUM, reg); + cmn->dat_vc_num = FIELD_GET(CMN_INFO_DAT_VC_NUM, reg); + reg = readq_relaxed(cfg_region + CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); @@ -1365,20 +1683,28 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->num_dns += FIELD_GET(CMN_CI_CHILD_COUNT, reg); } - /* Cheeky +1 to help terminate pointer-based iteration */ - cmn->dns = devm_kcalloc(cmn->dev, cmn->num_dns + 1, - sizeof(*cmn->dns), GFP_KERNEL); - if (!cmn->dns) + /* Cheeky +1 to help terminate pointer-based iteration later */ + dn = devm_kcalloc(cmn->dev, cmn->num_dns + 1, sizeof(*dn), GFP_KERNEL); + if (!dn) + return -ENOMEM; + + /* Initial safe upper bound on DTMs for any possible mesh layout */ + i = cmn->num_xps; + if (cmn->multi_dtm) + i += cmn->num_xps + 1; + dtm = devm_kcalloc(cmn->dev, i, sizeof(*dtm), GFP_KERNEL); + if (!dtm) return -ENOMEM; /* Pass 2: now we can actually populate the nodes */ - dn = cmn->dns; + cmn->dns = dn; + cmn->dtms = dtm; for (i = 0; i < cmn->num_xps; i++) { void __iomem *xp_region = cmn->base + xp_offset[i]; struct arm_cmn_node *xp = dn++; + unsigned int xp_ports = 0; arm_cmn_init_node_info(cmn, xp_offset[i], xp); - arm_cmn_init_dtm(xp); /* * Thanks to the order in which XP logical IDs seem to be * assigned, we can handily infer the mesh X dimension by @@ -1388,6 +1714,40 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) if (xp->id == (1 << 3)) cmn->mesh_x = xp->logid; + if (cmn->model == CMN600) + xp->dtc = 0xf; + else + xp->dtc = 1 << readl_relaxed(xp_region + CMN_DTM_UNIT_INFO); + + xp->dtm = dtm - cmn->dtms; + arm_cmn_init_dtm(dtm++, xp, 0); + /* + * Keeping track of connected ports will let us filter out + * unnecessary XP events easily. 
We can also reliably infer the + * "extra device ports" configuration for the node ID format + * from this, since in that case we will see at least one XP + * with port 2 connected, for the HN-D. + */ + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P0)) + xp_ports |= BIT(0); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P1)) + xp_ports |= BIT(1); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P2)) + xp_ports |= BIT(2); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P3)) + xp_ports |= BIT(3); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P4)) + xp_ports |= BIT(4); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P5)) + xp_ports |= BIT(5); + + if (cmn->multi_dtm && (xp_ports & 0xc)) + arm_cmn_init_dtm(dtm++, xp, 1); + if (cmn->multi_dtm && (xp_ports & 0x30)) + arm_cmn_init_dtm(dtm++, xp, 2); + + cmn->ports_used |= xp_ports; + reg = readq_relaxed(xp_region + CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); @@ -1422,11 +1782,14 @@ case CMN_TYPE_SBSX: case CMN_TYPE_RNI: case CMN_TYPE_RND: + case CMN_TYPE_MTSX: case CMN_TYPE_CXRA: case CMN_TYPE_CXHA: dn++; break; /* Nothing to see here */ + case CMN_TYPE_MPAM_S: + case CMN_TYPE_MPAM_NS: case CMN_TYPE_RNSAM: case CMN_TYPE_CXLA: break; @@ -1441,6 +1804,16 @@ /* Correct for any nodes we skipped */ cmn->num_dns = dn - cmn->dns; + sz = (void *)(dn + 1) - (void *)cmn->dns; + dn = devm_krealloc(cmn->dev, cmn->dns, sz, GFP_KERNEL); + if (dn) + cmn->dns = dn; + + sz = (void *)dtm - (void *)cmn->dtms; + dtm = devm_krealloc(cmn->dev, cmn->dtms, sz, GFP_KERNEL); + if (dtm) + cmn->dtms = dtm; + /* * If mesh_x wasn't set during discovery then we never saw * an XP at (0,1), thus we must have an Nx1 configuration. @@ -1449,13 +1822,20 @@ cmn->mesh_x = cmn->num_xps; cmn->mesh_y = cmn->num_xps / cmn->mesh_x; - dev_dbg(cmn->dev, "mesh %dx%d, ID width %d\n", - cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn)); + /* 1x1 config plays havoc with XP event encodings */ + if (cmn->num_xps == 1) + dev_warn(cmn->dev, "1x1 config not fully supported, translate XP events manually\n"); + + dev_dbg(cmn->dev, "model %d, periph_id_2 revision %d\n", cmn->model, cmn->rev); + reg = cmn->ports_used; + dev_dbg(cmn->dev, "mesh %dx%d, ID width %d, ports %6pbl%s\n", + cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn), &reg, + cmn->multi_dtm ?
", multi-DTM" : ""); return 0; } -static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) +static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) { struct resource *cfg, *root; @@ -1482,21 +1862,11 @@ static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) return root->start - cfg->start; } -static int arm_cmn_of_probe(struct platform_device *pdev, struct arm_cmn *cmn) +static int arm_cmn600_of_probe(struct device_node *np) { - struct device_node *np = pdev->dev.of_node; u32 rootnode; - int ret; - cmn->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(cmn->base)) - return PTR_ERR(cmn->base); - - ret = of_property_read_u32(np, "arm,root-node", &rootnode); - if (ret) - return ret; - - return rootnode; + return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode; } static int arm_cmn_probe(struct platform_device *pdev) @@ -1504,19 +1874,26 @@ static int arm_cmn_probe(struct platform_device *pdev) struct arm_cmn *cmn; const char *name; static atomic_t id; - int err, rootnode; + int err, rootnode, this_id; cmn = devm_kzalloc(&pdev->dev, sizeof(*cmn), GFP_KERNEL); if (!cmn) return -ENOMEM; cmn->dev = &pdev->dev; + cmn->model = (unsigned long)device_get_match_data(cmn->dev); platform_set_drvdata(pdev, cmn); - if (has_acpi_companion(cmn->dev)) - rootnode = arm_cmn_acpi_probe(pdev, cmn); - else - rootnode = arm_cmn_of_probe(pdev, cmn); + if (cmn->model == CMN600 && has_acpi_companion(cmn->dev)) { + rootnode = arm_cmn600_acpi_probe(pdev, cmn); + } else { + rootnode = 0; + cmn->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(cmn->base)) + return PTR_ERR(cmn->base); + if (cmn->model == CMN600) + rootnode = arm_cmn600_of_probe(pdev->dev.of_node); + } if (rootnode < 0) return rootnode; @@ -1532,7 +1909,7 @@ static int arm_cmn_probe(struct platform_device *pdev) if (err) return err; - cmn->cpu = raw_smp_processor_id(); + cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev)); cmn->pmu = (struct pmu) { .module = THIS_MODULE, .attr_groups = arm_cmn_attr_groups, @@ -1551,7 +1928,8 @@ static int arm_cmn_probe(struct platform_device *pdev) .cancel_txn = arm_cmn_end_txn, }; - name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", atomic_fetch_inc(&id)); + this_id = atomic_fetch_inc(&id); + name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", this_id); if (!name) return -ENOMEM; @@ -1562,6 +1940,8 @@ static int arm_cmn_probe(struct platform_device *pdev) err = perf_pmu_register(&cmn->pmu, name, -1); if (err) cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); + else + arm_cmn_debugfs_init(cmn, this_id); return err; } @@ -1574,12 +1954,14 @@ static int arm_cmn_remove(struct platform_device *pdev) perf_pmu_unregister(&cmn->pmu); cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); + debugfs_remove(cmn->debug); return 0; } #ifdef CONFIG_OF static const struct of_device_id arm_cmn_of_match[] = { - { .compatible = "arm,cmn-600", }, + { .compatible = "arm,cmn-600", .data = (void *)CMN600 }, + { .compatible = "arm,ci-700", .data = (void *)CI700 }, {} }; MODULE_DEVICE_TABLE(of, arm_cmn_of_match); @@ -1587,7 +1969,7 @@ MODULE_DEVICE_TABLE(of, arm_cmn_of_match); #ifdef CONFIG_ACPI static const struct acpi_device_id arm_cmn_acpi_match[] = { - { "ARMHC600", }, + { "ARMHC600", CMN600 }, {} }; MODULE_DEVICE_TABLE(acpi, arm_cmn_acpi_match); @@ -1608,15 +1990,20 @@ static int __init arm_cmn_init(void) int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - 
"perf/arm/cmn:online", NULL, + "perf/arm/cmn:online", + arm_cmn_pmu_online_cpu, arm_cmn_pmu_offline_cpu); if (ret < 0) return ret; arm_cmn_hp_state = ret; + arm_cmn_debugfs = debugfs_create_dir("arm-cmn", NULL); + ret = platform_driver_register(&arm_cmn_driver); - if (ret) + if (ret) { cpuhp_remove_multi_state(arm_cmn_hp_state); + debugfs_remove(arm_cmn_debugfs); + } return ret; } @@ -1624,6 +2011,7 @@ static void __exit arm_cmn_exit(void) { platform_driver_unregister(&arm_cmn_driver); cpuhp_remove_multi_state(arm_cmn_hp_state); + debugfs_remove(arm_cmn_debugfs); } module_init(arm_cmn_init); diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index cd5945e17fdf..182a97eff314 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -674,9 +674,9 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) static u64 arm_spe_pmsevfr_res0(u16 pmsver) { switch (pmsver) { - case ID_AA64DFR0_PMSVER_8_2: + case ID_AA64DFR0_EL1_PMSVer_IMP: return SYS_PMSEVFR_EL1_RES0_8_2; - case ID_AA64DFR0_PMSVER_8_3: + case ID_AA64DFR0_EL1_PMSVer_V1P1: /* Return the highest version we support in default */ default: return SYS_PMSEVFR_EL1_RES0_8_3; @@ -958,7 +958,7 @@ static void __arm_spe_pmu_dev_probe(void *info) struct device *dev = &spe_pmu->pdev->dev; fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1), - ID_AA64DFR0_PMSVER_SHIFT); + ID_AA64DFR0_EL1_PMSVer_SHIFT); if (!fld) { dev_err(dev, "unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n", diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index 3f9c60c5b250..51d47fc72aaf 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -32,6 +32,13 @@ EXPORT_SYMBOL_GPL(power_supply_notifier); static struct device_type power_supply_dev_type; +struct match_device_node_array_param { + struct device_node *parent_of_node; + struct power_supply **psy; + ssize_t psy_size; + ssize_t psy_count; +}; + #define POWER_SUPPLY_DEFERRED_REGISTER_TIME msecs_to_jiffies(10) static bool __power_supply_is_supplied_by(struct power_supply *supplier, @@ -125,6 +132,7 @@ void power_supply_changed(struct power_supply *psy) } EXPORT_SYMBOL_GPL(power_supply_changed); +static int psy_register_cooler(struct power_supply *psy); /* * Notify that power supply was registered after parent finished the probing. * @@ -132,6 +140,8 @@ EXPORT_SYMBOL_GPL(power_supply_changed); * calling power_supply_changed() directly from power_supply_register() * would lead to execution of get_property() function provided by the driver * too early - before the probe ends. + * Also, registering cooling device from the probe will execute the + * get_property() function. So register the cooling device after the probe. * * Avoid that by waiting on parent's mutex. 
@@ -149,6 +159,7 @@ static void power_supply_deferred_register_work(struct work_struct *work) } power_supply_changed(psy); + psy_register_cooler(psy); if (psy->dev.parent) mutex_unlock(&psy->dev.parent->mutex); @@ -522,6 +533,77 @@ struct power_supply *power_supply_get_by_phandle(struct device_node *np, } EXPORT_SYMBOL_GPL(power_supply_get_by_phandle); +static int power_supply_match_device_node_array(struct device *dev, + void *data) +{ + struct match_device_node_array_param *param = + (struct match_device_node_array_param *)data; + struct power_supply **psy = param->psy; + ssize_t size = param->psy_size; + ssize_t *count = &param->psy_count; + + if (!dev->parent || dev->parent->of_node != param->parent_of_node) + return 0; + + if (*count >= size) + return -EOVERFLOW; + + psy[*count] = dev_get_drvdata(dev); + atomic_inc(&psy[*count]->use_cnt); + (*count)++; + + return 0; +} + +/** + * power_supply_get_by_phandle_array() - Similar to + * power_supply_get_by_phandle but returns an array of power supply + * objects which are associated with the phandle. + * @np: Pointer to device node holding phandle property. + * @property: Name of property holding a power supply name. + * @psy: Array of power_supply pointers provided by the client which is + * filled by power_supply_get_by_phandle_array. + * @size: size of power_supply pointer array. + * + * If power supply was found, it increases reference count for the + * internal power supply's device. The user should call power_supply_put() + * after usage. + * + * Return: On success returns the number of power supply objects filled + * in the @psy array. + * -EOVERFLOW when the size of the @psy array is not sufficient. + * -EINVAL when @psy is NULL or @size is 0. + * -ENODEV when matching device_node is not found. + */ +int power_supply_get_by_phandle_array(struct device_node *np, + const char *property, + struct power_supply **psy, + ssize_t size) +{ + struct device_node *power_supply_np; + int ret; + struct match_device_node_array_param param; + + if (!psy || !size) + return -EINVAL; + + power_supply_np = of_parse_phandle(np, property, 0); + if (!power_supply_np) + return -ENODEV; + + param.parent_of_node = power_supply_np; + param.psy = psy; + param.psy_size = size; + param.psy_count = 0; + ret = class_for_each_device(power_supply_class, NULL, &param, + power_supply_match_device_node_array); + + of_node_put(power_supply_np); + + return param.psy_count; +} +EXPORT_SYMBOL_GPL(power_supply_get_by_phandle_array); +
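A minimal usage sketch for the lookup helper added above, assuming a hypothetical consumer driver and a "power-supplies" DT property (both invented for illustration); per the kernel-doc, each returned supply carries a reference that must be released with power_supply_put().

/* Hypothetical consumer: collect every supply behind one phandle. */
static int foo_bind_supplies(struct device *dev)
{
	struct power_supply *supplies[4];
	int n, i;

	n = power_supply_get_by_phandle_array(dev->of_node, "power-supplies",
					      supplies, ARRAY_SIZE(supplies));
	if (n < 0)
		return n;	/* -EINVAL, -ENODEV, ... */

	for (i = 0; i < n; i++) {
		dev_info(dev, "found supply %s\n", supplies[i]->desc->name);
		power_supply_put(supplies[i]);	/* drop the reference taken for us */
	}

	return 0;
}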
static void devm_power_supply_put(struct device *dev, void *res) { struct power_supply **psy = res; @@ -1080,9 +1162,15 @@ static int psy_register_cooler(struct power_supply *psy) for (i = 0; i < psy->desc->num_properties; i++) { if (psy->desc->properties[i] == POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT) { - psy->tcd = thermal_cooling_device_register( - (char *)psy->desc->name, - psy, &psy_tcd_ops); + if (psy->dev.parent) + psy->tcd = thermal_of_cooling_device_register( + dev_of_node(psy->dev.parent), + (char *)psy->desc->name, + psy, &psy_tcd_ops); + else + psy->tcd = thermal_cooling_device_register( + (char *)psy->desc->name, + psy, &psy_tcd_ops); return PTR_ERR_OR_ZERO(psy->tcd); } } @@ -1188,10 +1276,6 @@ __power_supply_register(struct device *parent, if (rc) goto register_thermal_failed; - rc = psy_register_cooler(psy); - if (rc) - goto register_cooler_failed; - rc = power_supply_create_triggers(psy); if (rc) goto create_triggers_failed; @@ -1221,8 +1305,6 @@ __power_supply_register(struct device *parent, add_hwmon_sysfs_failed: power_supply_remove_triggers(psy); create_triggers_failed: - psy_unregister_cooler(psy); -register_cooler_failed: psy_unregister_thermal(psy); register_thermal_failed: wakeup_init_failed: diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index c3d7cbcd4fad..64fd5d30788d 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -89,6 +89,7 @@ static const char * const POWER_SUPPLY_CHARGE_TYPE_TEXT[] = { [POWER_SUPPLY_CHARGE_TYPE_ADAPTIVE] = "Adaptive", [POWER_SUPPLY_CHARGE_TYPE_CUSTOM] = "Custom", [POWER_SUPPLY_CHARGE_TYPE_LONGLIFE] = "Long Life", + [POWER_SUPPLY_CHARGE_TYPE_TAPER_EXT] = "Taper", }; static const char * const POWER_SUPPLY_HEALTH_TEXT[] = { diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 97e59f746126..0e13673f0d6d 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "remoteproc_internal.h" @@ -59,6 +60,7 @@ static int rproc_release_carveout(struct rproc *rproc, /* Unique indices for remoteproc devices */ static DEFINE_IDA(rproc_dev_index); +static struct workqueue_struct *rproc_recovery_wq; static const char * const rproc_crash_names[] = { [RPROC_MMUFAULT] = "mmufault", @@ -461,6 +463,7 @@ static void rproc_rvdev_release(struct device *dev) struct rproc_vdev *rvdev = container_of(dev, struct rproc_vdev, dev); of_reserved_mem_device_release(dev); + dma_release_coherent_memory(dev); kfree(rvdev); } @@ -1004,6 +1007,26 @@ void rproc_add_carveout(struct rproc *rproc, struct rproc_mem_entry *mem) } EXPORT_SYMBOL(rproc_add_carveout); +/** + * rproc_del_carveout() - remove an allocated carveout region + * @rproc: rproc handle + * @mem: memory entry to remove + * + * This function removes the specified memory entry from the @rproc carveouts list. + */ +void rproc_del_carveout(struct rproc *rproc, struct rproc_mem_entry *mem) +{ + struct rproc_mem_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &rproc->carveouts, node) { + if (entry == mem) { + list_del(&mem->node); + return; + } + } +} +EXPORT_SYMBOL(rproc_del_carveout); + /** * rproc_mem_entry_init() - allocate and initialize rproc_mem_entry struct * @dev: pointer on device struct @@ -1052,6 +1075,19 @@ rproc_mem_entry_init(struct device *dev, } EXPORT_SYMBOL(rproc_mem_entry_init); +/** + * rproc_mem_entry_free() - free a rproc_mem_entry struct + * @mem: rproc_mem_entry allocated by rproc_mem_entry_init() + * + * This function frees a rproc_mem_entry struct that was allocated by + * rproc_mem_entry_init().
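As a rough illustration of how the new carveout helpers pair up, here is a hypothetical platform-driver sketch (FOO_DA, foo_alloc and foo_release are invented; the callbacks take the usual int (*)(struct rproc *, struct rproc_mem_entry *) shape): rproc_del_carveout() only unlinks the entry, so the caller still releases it with rproc_mem_entry_free().

/* Hypothetical sketch: attach and detach a dynamically managed carveout. */
static int foo_attach_carveout(struct rproc *rproc, struct rproc_mem_entry **out)
{
	struct rproc_mem_entry *mem;

	mem = rproc_mem_entry_init(rproc->dev.parent, NULL, 0, SZ_1M, FOO_DA,
				   foo_alloc, foo_release, "foo-carveout");
	if (!mem)
		return -ENOMEM;

	rproc_add_carveout(rproc, mem);
	*out = mem;
	return 0;
}

static void foo_detach_carveout(struct rproc *rproc, struct rproc_mem_entry *mem)
{
	rproc_del_carveout(rproc, mem);	/* unlink from rproc->carveouts */
	rproc_mem_entry_free(mem);	/* then release the entry itself */
}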
+ */ +void rproc_mem_entry_free(struct rproc_mem_entry *mem) +{ + kfree(mem); +} +EXPORT_SYMBOL(rproc_mem_entry_free); + /** * rproc_of_resm_mem_entry_init() - allocate and initialize rproc_mem_entry struct * from a reserved memory phandle @@ -1977,6 +2013,7 @@ static void rproc_crash_handler_work(struct work_struct *work) rproc_trigger_recovery(rproc); out: + trace_android_vh_rproc_recovery(rproc); pm_relax(rproc->dev.parent); } @@ -2759,8 +2796,11 @@ void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type) dev_err(&rproc->dev, "crash detected in %s: type %s\n", rproc->name, rproc_crash_to_string(type)); + if (rproc_recovery_wq) + queue_work(rproc_recovery_wq, &rproc->crash_handler); + else /* Have a worker handle the error; ensure system is not suspended */ - queue_work(system_freezable_wq, &rproc->crash_handler); + queue_work(system_freezable_wq, &rproc->crash_handler); } EXPORT_SYMBOL(rproc_report_crash); @@ -2809,6 +2849,11 @@ static void __exit rproc_exit_panic(void) static int __init remoteproc_init(void) { + rproc_recovery_wq = alloc_workqueue("rproc_recovery_wq", + WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!rproc_recovery_wq) + pr_err("remoteproc: creation of rproc_recovery_wq failed\n"); + rproc_init_sysfs(); rproc_init_debugfs(); rproc_init_cdev(); @@ -2825,6 +2870,8 @@ static void __exit remoteproc_exit(void) rproc_exit_panic(); rproc_exit_debugfs(); rproc_exit_sysfs(); + if (rproc_recovery_wq) + destroy_workqueue(rproc_recovery_wq); } module_exit(remoteproc_exit); diff --git a/drivers/remoteproc/remoteproc_coredump.c b/drivers/remoteproc/remoteproc_coredump.c index c892f433a323..8ed597a2eef3 100644 --- a/drivers/remoteproc/remoteproc_coredump.c +++ b/drivers/remoteproc/remoteproc_coredump.c @@ -32,6 +32,7 @@ void rproc_coredump_cleanup(struct rproc *rproc) kfree(entry); } } +EXPORT_SYMBOL(rproc_coredump_cleanup); /** * rproc_coredump_add_segment() - add segment of device memory to coredump @@ -327,6 +328,7 @@ void rproc_coredump(struct rproc *rproc) */ wait_for_completion(&dump_state.dump_done); } +EXPORT_SYMBOL(rproc_coredump); /** * rproc_coredump_using_sections() - perform coredump using section headers diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h index a328e634b1de..72d4d3d7d94d 100644 --- a/drivers/remoteproc/remoteproc_internal.h +++ b/drivers/remoteproc/remoteproc_internal.h @@ -84,7 +84,6 @@ static inline void rproc_char_device_remove(struct rproc *rproc) void rproc_free_vring(struct rproc_vring *rvring); int rproc_alloc_vring(struct rproc_vdev *rvdev, int i); -void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem); phys_addr_t rproc_va_to_pa(void *cpu_addr); int rproc_trigger_recovery(struct rproc *rproc); diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index ea8b89f97d7b..1c7c34efe247 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -5,6 +5,7 @@ #include #include +#include #include "remoteproc_internal.h" @@ -51,9 +52,11 @@ static ssize_t recovery_store(struct device *dev, if (sysfs_streq(buf, "enabled")) { /* change the flag and begin the recovery process if needed */ rproc->recovery_disabled = false; + trace_android_vh_rproc_recovery_set(rproc); rproc_trigger_recovery(rproc); } else if (sysfs_streq(buf, "disabled")) { rproc->recovery_disabled = true; + trace_android_vh_rproc_recovery_set(rproc); } else if (sysfs_streq(buf, "recover")) { /* begin the recovery process without changing the 
flag */ rproc_trigger_recovery(rproc); diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig index 0b4407abdf13..d3795860f5c0 100644 --- a/drivers/rpmsg/Kconfig +++ b/drivers/rpmsg/Kconfig @@ -15,6 +15,14 @@ config RPMSG_CHAR in /dev. They make it possible for user-space programs to send and receive rpmsg packets. +config RPMSG_CTRL + tristate "RPMSG control interface" + depends on RPMSG && ( RPMSG_CHAR || RPMSG_CHAR=n ) + help + Say Y here to enable support for the /dev/rpmsg_ctrlX API. This API + allows user-space programs to create endpoints with a specific service name, + source and destination addresses. + config RPMSG_NS tristate "RPMSG name service announcement" depends on RPMSG diff --git a/drivers/rpmsg/Makefile b/drivers/rpmsg/Makefile index 8d452656f0ee..58e3b382e316 100644 --- a/drivers/rpmsg/Makefile +++ b/drivers/rpmsg/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_RPMSG) += rpmsg_core.o obj-$(CONFIG_RPMSG_CHAR) += rpmsg_char.o +obj-$(CONFIG_RPMSG_CTRL) += rpmsg_ctrl.o obj-$(CONFIG_RPMSG_NS) += rpmsg_ns.o obj-$(CONFIG_RPMSG_MTK_SCP) += mtk_rpmsg.o qcom_glink-objs := qcom_glink_native.o qcom_glink_ssr.o diff --git a/drivers/rpmsg/qcom_glink_native.c b/drivers/rpmsg/qcom_glink_native.c index 13c31372337a..65a7d567b6bd 100644 --- a/drivers/rpmsg/qcom_glink_native.c +++ b/drivers/rpmsg/qcom_glink_native.c @@ -1642,7 +1642,7 @@ static int qcom_glink_create_chrdev(struct qcom_glink *glink) rpdev->dev.parent = glink->dev; rpdev->dev.release = qcom_glink_device_release; - return rpmsg_chrdev_register_device(rpdev); + return rpmsg_ctrldev_register_device(rpdev); } struct qcom_glink *qcom_glink_native_probe(struct device *dev, diff --git a/drivers/rpmsg/qcom_smd.c b/drivers/rpmsg/qcom_smd.c index 56bc622de25e..754161b02407 100644 --- a/drivers/rpmsg/qcom_smd.c +++ b/drivers/rpmsg/qcom_smd.c @@ -1113,7 +1113,7 @@ static int qcom_smd_create_chrdev(struct qcom_smd_edge *edge) qsdev->rpdev.dev.parent = &edge->dev; qsdev->rpdev.dev.release = qcom_smd_release_device; - return rpmsg_chrdev_register_device(&qsdev->rpdev); + return rpmsg_ctrldev_register_device(&qsdev->rpdev); } /* diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c index 88c985f9e73a..aff402be53dc 100644 --- a/drivers/rpmsg/rpmsg_char.c +++ b/drivers/rpmsg/rpmsg_char.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2022, STMicroelectronics * Copyright (c) 2016, Linaro Ltd. * Copyright (c) 2012, Michal Simek * Copyright (c) 2012, PetaLogix @@ -9,6 +10,9 @@ * Based on rpmsg performance statistics driver by Michal Simek, which in turn * was based on TI & Google OMX rpmsg driver.
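To show what the control interface enabled above looks like from user space, a minimal sketch (the device path, addresses and service name are illustrative; struct rpmsg_endpoint_info and RPMSG_CREATE_EPT_IOCTL come from the <linux/rpmsg.h> UAPI header):

/* Hypothetical user-space example: ask rpmsg_ctrl to create an endpoint. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/rpmsg.h>

int main(void)
{
	struct rpmsg_endpoint_info info = { .src = 0x400, .dst = 0x401 };
	int fd = open("/dev/rpmsg_ctrl0", O_RDWR);

	if (fd < 0)
		return 1;
	strncpy(info.name, "my-service", sizeof(info.name) - 1);
	/* On success, a new /dev/rpmsgN endpoint device shows up. */
	if (ioctl(fd, RPMSG_CREATE_EPT_IOCTL, &info) < 0)
		perror("RPMSG_CREATE_EPT_IOCTL");
	close(fd);
	return 0;
}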
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -22,35 +26,19 @@ #include #include +#include "rpmsg_char.h" #include "rpmsg_internal.h" #define RPMSG_DEV_MAX (MINORMASK + 1) static dev_t rpmsg_major; -static struct class *rpmsg_class; -static DEFINE_IDA(rpmsg_ctrl_ida); static DEFINE_IDA(rpmsg_ept_ida); static DEFINE_IDA(rpmsg_minor_ida); #define dev_to_eptdev(dev) container_of(dev, struct rpmsg_eptdev, dev) #define cdev_to_eptdev(i_cdev) container_of(i_cdev, struct rpmsg_eptdev, cdev) -#define dev_to_ctrldev(dev) container_of(dev, struct rpmsg_ctrldev, dev) -#define cdev_to_ctrldev(i_cdev) container_of(i_cdev, struct rpmsg_ctrldev, cdev) - -/** - * struct rpmsg_ctrldev - control device for instantiating endpoint devices - * @rpdev: underlaying rpmsg device - * @cdev: cdev for the ctrl device - * @dev: device for the ctrl device - */ -struct rpmsg_ctrldev { - struct rpmsg_device *rpdev; - struct cdev cdev; - struct device dev; -}; - /** * struct rpmsg_eptdev - endpoint device context * @dev: endpoint device @@ -62,6 +50,8 @@ struct rpmsg_ctrldev { * @queue_lock: synchronization of @queue operations * @queue: incoming message queue * @readq: wait object for incoming queue + * @default_ept: set to channel default endpoint if the default endpoint should be re-used + * on device open to prevent endpoint address update. */ struct rpmsg_eptdev { struct device dev; @@ -72,19 +62,23 @@ struct rpmsg_eptdev { struct mutex ept_lock; struct rpmsg_endpoint *ept; + struct rpmsg_endpoint *default_ept; spinlock_t queue_lock; struct sk_buff_head queue; wait_queue_head_t readq; + }; -static int rpmsg_eptdev_destroy(struct device *dev, void *data) +int rpmsg_chrdev_eptdev_destroy(struct device *dev, void *data) { struct rpmsg_eptdev *eptdev = dev_to_eptdev(dev); mutex_lock(&eptdev->ept_lock); if (eptdev->ept) { - rpmsg_destroy_ept(eptdev->ept); + /* The default endpoint is released by the rpmsg core */ + if (!eptdev->default_ept) + rpmsg_destroy_ept(eptdev->ept); eptdev->ept = NULL; } mutex_unlock(&eptdev->ept_lock); @@ -97,9 +91,10 @@ static int rpmsg_eptdev_destroy(struct device *dev, void *data) return 0; } +EXPORT_SYMBOL(rpmsg_chrdev_eptdev_destroy); -static int rpmsg_ept_cb(struct rpmsg_device *rpdev, void *buf, int len, - void *priv, u32 addr) +static int rpmsg_ept_copy_cb(struct rpmsg_device *rpdev, void *buf, int len, + void *priv, u32 addr) { struct rpmsg_eptdev *eptdev = priv; struct sk_buff *skb; @@ -120,6 +115,43 @@ static int rpmsg_ept_cb(struct rpmsg_device *rpdev, void *buf, int len, return 0; } +static int rpmsg_ept_no_copy_cb(struct rpmsg_device *rpdev, void *buf, int len, + void *priv, u32 addr) +{ + struct rpmsg_eptdev *eptdev = priv; + struct sk_buff *skb; + + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + skb->head = buf; + skb->data = buf; + skb_reset_tail_pointer(skb); + skb_set_end_offset(skb, len); + skb_put(skb, len); + + spin_lock(&eptdev->queue_lock); + skb_queue_tail(&eptdev->queue, skb); + spin_unlock(&eptdev->queue_lock); + + /* wake up any blocking processes, waiting for new data */ + wake_up_interruptible(&eptdev->readq); + + return RPMSG_DEFER; +} + +static int rpmsg_ept_cb(struct rpmsg_device *rpdev, void *buf, int len, + void *priv, u32 addr) +{ + struct rpmsg_eptdev *eptdev = priv; + rpmsg_rx_cb_t cb; + + cb = (eptdev->ept->rx_done) ? 
rpmsg_ept_no_copy_cb : rpmsg_ept_copy_cb; + + return cb(rpdev, buf, len, priv, addr); +} + static int rpmsg_eptdev_open(struct inode *inode, struct file *filp) { struct rpmsg_eptdev *eptdev = cdev_to_eptdev(inode->i_cdev); @@ -135,7 +167,15 @@ static int rpmsg_eptdev_open(struct inode *inode, struct file *filp) get_device(dev); - ept = rpmsg_create_ept(rpdev, rpmsg_ept_cb, eptdev, eptdev->chinfo); + /* + * If the default_ept is set, the rpmsg device default endpoint is used. + * Else a new endpoint is created on open that will be destroyed on release. + */ + if (eptdev->default_ept) + ept = eptdev->default_ept; + else + ept = rpmsg_create_ept(rpdev, rpmsg_ept_cb, eptdev, eptdev->chinfo); + if (!ept) { dev_err(dev, "failed to open %s\n", eptdev->chinfo.name); put_device(dev); @@ -158,7 +198,8 @@ static int rpmsg_eptdev_release(struct inode *inode, struct file *filp) /* Close the endpoint, if it's not already destroyed by the parent */ mutex_lock(&eptdev->ept_lock); if (eptdev->ept) { - rpmsg_destroy_ept(eptdev->ept); + if (!eptdev->default_ept) + rpmsg_destroy_ept(eptdev->ept); eptdev->ept = NULL; } mutex_unlock(&eptdev->ept_lock); @@ -213,6 +254,15 @@ static ssize_t rpmsg_eptdev_read_iter(struct kiocb *iocb, struct iov_iter *to) if (copy_to_iter(skb->data, use, to) != use) use = -EFAULT; + if (eptdev->ept->rx_done) { + rpmsg_rx_done(eptdev->ept, skb->data); + /* + * Data memory is freed by rpmsg_rx_done(), reset the skb data + * pointers so kfree_skb() does not try to free a second time. + */ + skb->head = NULL; + skb->data = NULL; + } kfree_skb(skb); return use; @@ -285,7 +335,11 @@ static long rpmsg_eptdev_ioctl(struct file *fp, unsigned int cmd, if (cmd != RPMSG_DESTROY_EPT_IOCTL) return -EINVAL; - return rpmsg_eptdev_destroy(&eptdev->dev, NULL); + /* Don't allow to destroy a default endpoint. 
*/ + if (eptdev->default_ept) + return -EINVAL; + + return rpmsg_chrdev_eptdev_destroy(&eptdev->dev, NULL); } static const struct file_operations rpmsg_eptdev_fops = { @@ -343,21 +397,18 @@ static void rpmsg_eptdev_release_device(struct device *dev) kfree(eptdev); } -static int rpmsg_eptdev_create(struct rpmsg_ctrldev *ctrldev, - struct rpmsg_channel_info chinfo) +static struct rpmsg_eptdev *rpmsg_chrdev_eptdev_alloc(struct rpmsg_device *rpdev, + struct device *parent) { - struct rpmsg_device *rpdev = ctrldev->rpdev; struct rpmsg_eptdev *eptdev; struct device *dev; - int ret; eptdev = kzalloc(sizeof(*eptdev), GFP_KERNEL); if (!eptdev) - return -ENOMEM; + return ERR_PTR(-ENOMEM); dev = &eptdev->dev; eptdev->rpdev = rpdev; - eptdev->chinfo = chinfo; mutex_init(&eptdev->ept_lock); spin_lock_init(&eptdev->queue_lock); @@ -366,13 +417,23 @@ static int rpmsg_eptdev_create(struct rpmsg_ctrldev *ctrldev, device_initialize(dev); dev->class = rpmsg_class; - dev->parent = &ctrldev->dev; + dev->parent = parent; dev->groups = rpmsg_eptdev_groups; dev_set_drvdata(dev, eptdev); cdev_init(&eptdev->cdev, &rpmsg_eptdev_fops); eptdev->cdev.owner = THIS_MODULE; + return eptdev; +} + +static int rpmsg_chrdev_eptdev_add(struct rpmsg_eptdev *eptdev, struct rpmsg_channel_info chinfo) +{ + struct device *dev = &eptdev->dev; + int ret; + + eptdev->chinfo = chinfo; + ret = ida_simple_get(&rpmsg_minor_ida, 0, RPMSG_DEV_MAX, GFP_KERNEL); if (ret < 0) goto free_eptdev; @@ -404,163 +465,91 @@ free_eptdev: return ret; } -static int rpmsg_ctrldev_open(struct inode *inode, struct file *filp) +int rpmsg_chrdev_eptdev_create(struct rpmsg_device *rpdev, struct device *parent, + struct rpmsg_channel_info chinfo) { - struct rpmsg_ctrldev *ctrldev = cdev_to_ctrldev(inode->i_cdev); + struct rpmsg_eptdev *eptdev; + int ret; - get_device(&ctrldev->dev); - filp->private_data = ctrldev; + eptdev = rpmsg_chrdev_eptdev_alloc(rpdev, parent); + if (IS_ERR(eptdev)) + return PTR_ERR(eptdev); - return 0; -} - -static int rpmsg_ctrldev_release(struct inode *inode, struct file *filp) -{ - struct rpmsg_ctrldev *ctrldev = cdev_to_ctrldev(inode->i_cdev); - - put_device(&ctrldev->dev); - - return 0; -} - -static long rpmsg_ctrldev_ioctl(struct file *fp, unsigned int cmd, - unsigned long arg) -{ - struct rpmsg_ctrldev *ctrldev = fp->private_data; - void __user *argp = (void __user *)arg; - struct rpmsg_endpoint_info eptinfo; - struct rpmsg_channel_info chinfo; - - if (cmd != RPMSG_CREATE_EPT_IOCTL) - return -EINVAL; - - if (copy_from_user(&eptinfo, argp, sizeof(eptinfo))) - return -EFAULT; - - memcpy(chinfo.name, eptinfo.name, RPMSG_NAME_SIZE); - chinfo.name[RPMSG_NAME_SIZE-1] = '\0'; - chinfo.src = eptinfo.src; - chinfo.dst = eptinfo.dst; - - return rpmsg_eptdev_create(ctrldev, chinfo); -}; - -static const struct file_operations rpmsg_ctrldev_fops = { - .owner = THIS_MODULE, - .open = rpmsg_ctrldev_open, - .release = rpmsg_ctrldev_release, - .unlocked_ioctl = rpmsg_ctrldev_ioctl, - .compat_ioctl = compat_ptr_ioctl, -}; - -static void rpmsg_ctrldev_release_device(struct device *dev) -{ - struct rpmsg_ctrldev *ctrldev = dev_to_ctrldev(dev); - - ida_simple_remove(&rpmsg_ctrl_ida, dev->id); - ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); - kfree(ctrldev); + ret = rpmsg_chrdev_eptdev_add(eptdev, chinfo); + + return ret; } +EXPORT_SYMBOL(rpmsg_chrdev_eptdev_create); static int rpmsg_chrdev_probe(struct rpmsg_device *rpdev) { - struct rpmsg_ctrldev *ctrldev; - struct device *dev; - int ret; + struct rpmsg_channel_info chinfo; + struct 
rpmsg_eptdev *eptdev; struct device *dev = &rpdev->dev; - ctrldev = kzalloc(sizeof(*ctrldev), GFP_KERNEL); - if (!ctrldev) - return -ENOMEM; + memcpy(chinfo.name, rpdev->id.name, RPMSG_NAME_SIZE); + chinfo.src = rpdev->src; + chinfo.dst = rpdev->dst; - ctrldev->rpdev = rpdev; + eptdev = rpmsg_chrdev_eptdev_alloc(rpdev, dev); + if (IS_ERR(eptdev)) + return PTR_ERR(eptdev); - dev = &ctrldev->dev; - device_initialize(dev); - dev->parent = &rpdev->dev; - dev->class = rpmsg_class; + /* Set the default_ept to the rpmsg device endpoint */ + eptdev->default_ept = rpdev->ept; - cdev_init(&ctrldev->cdev, &rpmsg_ctrldev_fops); - ctrldev->cdev.owner = THIS_MODULE; + /* + * The rpmsg_ept_cb uses the *priv parameter to get its rpmsg_eptdev context. + * Store it in the default_ept *priv field. + */ + eptdev->default_ept->priv = eptdev; - ret = ida_simple_get(&rpmsg_minor_ida, 0, RPMSG_DEV_MAX, GFP_KERNEL); - if (ret < 0) - goto free_ctrldev; - dev->devt = MKDEV(MAJOR(rpmsg_major), ret); - - ret = ida_simple_get(&rpmsg_ctrl_ida, 0, 0, GFP_KERNEL); - if (ret < 0) - goto free_minor_ida; - dev->id = ret; - dev_set_name(&ctrldev->dev, "rpmsg_ctrl%d", ret); - - ret = cdev_device_add(&ctrldev->cdev, &ctrldev->dev); - if (ret) - goto free_ctrl_ida; - - /* We can now rely on the release function for cleanup */ - dev->release = rpmsg_ctrldev_release_device; - - dev_set_drvdata(&rpdev->dev, ctrldev); - - return ret; - -free_ctrl_ida: - ida_simple_remove(&rpmsg_ctrl_ida, dev->id); -free_minor_ida: - ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); -free_ctrldev: - put_device(dev); - kfree(ctrldev); - - return ret; + return rpmsg_chrdev_eptdev_add(eptdev, chinfo); } static void rpmsg_chrdev_remove(struct rpmsg_device *rpdev) { - struct rpmsg_ctrldev *ctrldev = dev_get_drvdata(&rpdev->dev); int ret; - /* Destroy all endpoints */ - ret = device_for_each_child(&ctrldev->dev, NULL, rpmsg_eptdev_destroy); + ret = device_for_each_child(&rpdev->dev, NULL, rpmsg_chrdev_eptdev_destroy); if (ret) - dev_warn(&rpdev->dev, "failed to nuke endpoints: %d\n", ret); - - cdev_device_del(&ctrldev->cdev, &ctrldev->dev); - put_device(&ctrldev->dev); + dev_warn(&rpdev->dev, "failed to destroy endpoints: %d\n", ret); } +static struct rpmsg_device_id rpmsg_chrdev_id_table[] = { + { .name = "rpmsg-raw" }, + { }, +}; + static struct rpmsg_driver rpmsg_chrdev_driver = { .probe = rpmsg_chrdev_probe, .remove = rpmsg_chrdev_remove, - .drv = { - .name = "rpmsg_chrdev", - }, + .callback = rpmsg_ept_cb, + .id_table = rpmsg_chrdev_id_table, + .drv.name = "rpmsg_chrdev", }; static int rpmsg_chrdev_init(void) { int ret; - ret = alloc_chrdev_region(&rpmsg_major, 0, RPMSG_DEV_MAX, "rpmsg"); + ret = alloc_chrdev_region(&rpmsg_major, 0, RPMSG_DEV_MAX, "rpmsg_char"); if (ret < 0) { - pr_err("rpmsg: failed to allocate char dev region\n"); + pr_err("failed to allocate char dev region\n"); return ret; } - rpmsg_class = class_create(THIS_MODULE, "rpmsg"); - if (IS_ERR(rpmsg_class)) { - pr_err("failed to create rpmsg class\n"); - unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); - return PTR_ERR(rpmsg_class); - } - ret = register_rpmsg_driver(&rpmsg_chrdev_driver); if (ret < 0) { - pr_err("rpmsgchr: failed to register rpmsg driver\n"); - class_destroy(rpmsg_class); - unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); + pr_err("rpmsg: failed to register rpmsg raw driver\n"); + goto free_region; } + return 0; + +free_region: + unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); + return ret; } postcore_initcall(rpmsg_chrdev_init); @@ -568,7 +557,6 @@
postcore_initcall(rpmsg_chrdev_init); static void rpmsg_chrdev_exit(void) { unregister_rpmsg_driver(&rpmsg_chrdev_driver); - class_destroy(rpmsg_class); unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); } module_exit(rpmsg_chrdev_exit); diff --git a/drivers/rpmsg/rpmsg_char.h b/drivers/rpmsg/rpmsg_char.h new file mode 100644 index 000000000000..117d9cbc52f0 --- /dev/null +++ b/drivers/rpmsg/rpmsg_char.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022, STMicroelectronics + */ + +#ifndef __RPMSG_CHRDEV_H__ +#define __RPMSG_CHRDEV_H__ + +#if IS_ENABLED(CONFIG_RPMSG_CHAR) +/** + * rpmsg_chrdev_eptdev_create() - register char device based on an endpoint + * @rpdev: prepared rpdev to be used for creating endpoints + * @parent: parent device + * @chinfo: associated endpoint channel information. + * + * This function creates a new rpmsg char endpoint device to instantiate a new + * endpoint based on chinfo information. + */ +int rpmsg_chrdev_eptdev_create(struct rpmsg_device *rpdev, struct device *parent, + struct rpmsg_channel_info chinfo); + +/** + * rpmsg_chrdev_eptdev_destroy() - destroy created char device endpoint. + * @dev: endpoint device to destroy + * @data: private data associated to the endpoint device + * + * This function destroys a rpmsg char endpoint device, for example in response + * to the RPMSG_DESTROY_EPT_IOCTL control. + */ +int rpmsg_chrdev_eptdev_destroy(struct device *dev, void *data); + +#else /*IS_ENABLED(CONFIG_RPMSG_CHAR) */ + +static inline int rpmsg_chrdev_eptdev_create(struct rpmsg_device *rpdev, struct device *parent, + struct rpmsg_channel_info chinfo) +{ + return -ENXIO; +} + +static inline int rpmsg_chrdev_eptdev_destroy(struct device *dev, void *data) +{ + return -ENXIO; +} + +#endif /*IS_ENABLED(CONFIG_RPMSG_CHAR) */ + +#endif /*__RPMSG_CHRDEV_H__ */ diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c index a71de08acc7b..79d6961663a9 100644 --- a/drivers/rpmsg/rpmsg_core.c +++ b/drivers/rpmsg/rpmsg_core.c @@ -20,6 +20,9 @@ #include "rpmsg_internal.h" +struct class *rpmsg_class; +EXPORT_SYMBOL(rpmsg_class); + /** * rpmsg_create_channel() - create a new rpmsg channel * using its name and address info. @@ -327,6 +330,62 @@ int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst, } EXPORT_SYMBOL(rpmsg_trysend_offchannel); +/** + * rpmsg_get_signals() - get the signals for this endpoint + * @ept: the rpmsg endpoint + * + * Returns signal bits on success and an appropriate error value on failure. + */ +int rpmsg_get_signals(struct rpmsg_endpoint *ept) +{ + if (WARN_ON(!ept)) + return -EINVAL; + if (!ept->ops->get_signals) + return -ENXIO; + + return ept->ops->get_signals(ept); +} +EXPORT_SYMBOL(rpmsg_get_signals); + +/** + * rpmsg_set_signals() - set the remote signals for this endpoint + * @ept: the rpmsg endpoint + * @set: set mask for signals + * @clear: clear mask for signals + * + * Returns 0 on success and an appropriate error value on failure. + */ +int rpmsg_set_signals(struct rpmsg_endpoint *ept, u32 set, u32 clear) +{ + if (WARN_ON(!ept)) + return -EINVAL; + if (!ept->ops->set_signals) + return -ENXIO; + + return ept->ops->set_signals(ept, set, clear); +} +EXPORT_SYMBOL(rpmsg_set_signals); + +/** + * rpmsg_rx_done() - release resources related to @data from a @rx_cb + * @ept: the rpmsg endpoint + * @data: payload from a message + * + * Returns 0 on success and an appropriate error value on failure. + */ +int rpmsg_rx_done(struct rpmsg_endpoint *ept, void *data) +{ + if (WARN_ON(!ept)) + return -EINVAL; + if (!ept->ops->rx_done) + return -ENXIO; + if (!ept->rx_done) + return -EINVAL; + + return ept->ops->rx_done(ept, data); +} +EXPORT_SYMBOL(rpmsg_rx_done); +
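A hedged sketch of the deferred-release receive flow enabled by rpmsg_rx_done(), assuming a transport backend that implements the new rx_done op (the foo_* names are invented; RPMSG_DEFER is the callback return value used by rpmsg_ept_no_copy_cb earlier in this patch to keep ownership of the buffer):

/* Hypothetical driver: defer buffer release, then return it via rpmsg_rx_done(). */
static int foo_rx_cb(struct rpmsg_device *rpdev, void *data, int len,
		     void *priv, u32 addr)
{
	struct foo_ctx *ctx = priv;

	foo_enqueue(ctx, data, len);	/* keep using the rx buffer after return */
	return RPMSG_DEFER;		/* tell the core not to recycle it yet */
}

static void foo_process_one(struct foo_ctx *ctx, struct rpmsg_endpoint *ept,
			    void *data)
{
	/* ... consume the payload ... */
	rpmsg_rx_done(ept, data);	/* hand the buffer back to the transport */
}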
/* * match a rpmsg channel with a channel info struct. * this is used to make sure we're not creating rpmsg devices for channels @@ -514,6 +573,10 @@ static int rpmsg_dev_probe(struct device *dev) rpdev->ept = ept; rpdev->src = ept->addr; + + if (rpdrv->signals) + ept->sig_cb = rpdrv->signals; + } err = rpdrv->probe(rpdev); @@ -641,10 +704,17 @@ static int __init rpmsg_init(void) { int ret; - ret = bus_register(&rpmsg_bus); - if (ret) - pr_err("failed to register rpmsg bus: %d\n", ret); + rpmsg_class = class_create(THIS_MODULE, "rpmsg"); + if (IS_ERR(rpmsg_class)) { + pr_err("failed to create rpmsg class\n"); + return PTR_ERR(rpmsg_class); + } + ret = bus_register(&rpmsg_bus); + if (ret) { + pr_err("failed to register rpmsg bus: %d\n", ret); + class_destroy(rpmsg_class); + } return ret; } postcore_initcall(rpmsg_init); @@ -652,6 +722,7 @@ postcore_initcall(rpmsg_init); static void __exit rpmsg_fini(void) { bus_unregister(&rpmsg_bus); + class_destroy(rpmsg_class); } module_exit(rpmsg_fini); diff --git a/drivers/rpmsg/rpmsg_ctrl.c b/drivers/rpmsg/rpmsg_ctrl.c new file mode 100644 index 000000000000..107da70fdbaa --- /dev/null +++ b/drivers/rpmsg/rpmsg_ctrl.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022, STMicroelectronics + * Copyright (c) 2016, Linaro Ltd. + * Copyright (c) 2012, Michal Simek + * Copyright (c) 2012, PetaLogix + * Copyright (c) 2011, Texas Instruments, Inc. + * Copyright (c) 2011, Google, Inc. + * + * Based on rpmsg performance statistics driver by Michal Simek, which in turn + * was based on TI & Google OMX rpmsg driver. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rpmsg_char.h" +#include "rpmsg_internal.h" + +#define RPMSG_DEV_MAX (MINORMASK + 1) + +static dev_t rpmsg_major; + +static DEFINE_IDA(rpmsg_ctrl_ida); +static DEFINE_IDA(rpmsg_minor_ida); + +#define dev_to_ctrldev(dev) container_of(dev, struct rpmsg_ctrldev, dev) +#define cdev_to_ctrldev(i_cdev) container_of(i_cdev, struct rpmsg_ctrldev, cdev) + +/** + * struct rpmsg_ctrldev - control device for instantiating endpoint devices + * @rpdev: underlying rpmsg device + * @cdev: cdev for the ctrl device + * @dev: device for the ctrl device + * @ctrl_lock: serialize the ioctls.
+ */ +struct rpmsg_ctrldev { + struct rpmsg_device *rpdev; + struct cdev cdev; + struct device dev; + struct mutex ctrl_lock; +}; + +static int rpmsg_ctrldev_open(struct inode *inode, struct file *filp) +{ + struct rpmsg_ctrldev *ctrldev = cdev_to_ctrldev(inode->i_cdev); + + get_device(&ctrldev->dev); + filp->private_data = ctrldev; + + return 0; +} + +static int rpmsg_ctrldev_release(struct inode *inode, struct file *filp) +{ + struct rpmsg_ctrldev *ctrldev = cdev_to_ctrldev(inode->i_cdev); + + put_device(&ctrldev->dev); + + return 0; +} + +static long rpmsg_ctrldev_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + struct rpmsg_ctrldev *ctrldev = fp->private_data; + void __user *argp = (void __user *)arg; + struct rpmsg_endpoint_info eptinfo; + struct rpmsg_channel_info chinfo; + struct rpmsg_device *rpdev; + int ret = 0; + + if (copy_from_user(&eptinfo, argp, sizeof(eptinfo))) + return -EFAULT; + + memcpy(chinfo.name, eptinfo.name, RPMSG_NAME_SIZE); + chinfo.name[RPMSG_NAME_SIZE - 1] = '\0'; + chinfo.src = eptinfo.src; + chinfo.dst = eptinfo.dst; + + mutex_lock(&ctrldev->ctrl_lock); + switch (cmd) { + case RPMSG_CREATE_EPT_IOCTL: + ret = rpmsg_chrdev_eptdev_create(ctrldev->rpdev, &ctrldev->dev, chinfo); + break; + + case RPMSG_CREATE_DEV_IOCTL: + rpdev = rpmsg_create_channel(ctrldev->rpdev, &chinfo); + if (!rpdev) { + dev_err(&ctrldev->dev, "failed to create %s channel\n", chinfo.name); + ret = -ENXIO; + } + break; + + case RPMSG_RELEASE_DEV_IOCTL: + ret = rpmsg_release_channel(ctrldev->rpdev, &chinfo); + if (ret) + dev_err(&ctrldev->dev, "failed to release %s channel (%d)\n", + chinfo.name, ret); + break; + + default: + ret = -EINVAL; + } + mutex_unlock(&ctrldev->ctrl_lock); + + return ret; +}; + +static const struct file_operations rpmsg_ctrldev_fops = { + .owner = THIS_MODULE, + .open = rpmsg_ctrldev_open, + .release = rpmsg_ctrldev_release, + .unlocked_ioctl = rpmsg_ctrldev_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static void rpmsg_ctrldev_release_device(struct device *dev) +{ + struct rpmsg_ctrldev *ctrldev = dev_to_ctrldev(dev); + + ida_simple_remove(&rpmsg_ctrl_ida, dev->id); + ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); + kfree(ctrldev); +} + +static int rpmsg_ctrldev_probe(struct rpmsg_device *rpdev) +{ + struct rpmsg_ctrldev *ctrldev; + struct device *dev; + int ret; + + ctrldev = kzalloc(sizeof(*ctrldev), GFP_KERNEL); + if (!ctrldev) + return -ENOMEM; + + ctrldev->rpdev = rpdev; + + dev = &ctrldev->dev; + device_initialize(dev); + dev->parent = &rpdev->dev; + dev->class = rpmsg_class; + + mutex_init(&ctrldev->ctrl_lock); + cdev_init(&ctrldev->cdev, &rpmsg_ctrldev_fops); + ctrldev->cdev.owner = THIS_MODULE; + + ret = ida_simple_get(&rpmsg_minor_ida, 0, RPMSG_DEV_MAX, GFP_KERNEL); + if (ret < 0) + goto free_ctrldev; + dev->devt = MKDEV(MAJOR(rpmsg_major), ret); + + ret = ida_simple_get(&rpmsg_ctrl_ida, 0, 0, GFP_KERNEL); + if (ret < 0) + goto free_minor_ida; + dev->id = ret; + dev_set_name(&ctrldev->dev, "rpmsg_ctrl%d", ret); + + ret = cdev_device_add(&ctrldev->cdev, &ctrldev->dev); + if (ret) + goto free_ctrl_ida; + + /* We can now rely on the release function for cleanup */ + dev->release = rpmsg_ctrldev_release_device; + + dev_set_drvdata(&rpdev->dev, ctrldev); + + return ret; + +free_ctrl_ida: + ida_simple_remove(&rpmsg_ctrl_ida, dev->id); +free_minor_ida: + ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); +free_ctrldev: + put_device(dev); + kfree(ctrldev); + + return ret; +} + +static void rpmsg_ctrldev_remove(struct rpmsg_device 
*rpdev) +{ + struct rpmsg_ctrldev *ctrldev = dev_get_drvdata(&rpdev->dev); + int ret; + + /* Destroy all endpoints */ + ret = device_for_each_child(&ctrldev->dev, NULL, rpmsg_chrdev_eptdev_destroy); + if (ret) + dev_warn(&rpdev->dev, "failed to nuke endpoints: %d\n", ret); + + cdev_device_del(&ctrldev->cdev, &ctrldev->dev); + put_device(&ctrldev->dev); +} + +static struct rpmsg_driver rpmsg_ctrldev_driver = { + .probe = rpmsg_ctrldev_probe, + .remove = rpmsg_ctrldev_remove, + .drv = { + .name = "rpmsg_ctrl", + }, +}; + +static int rpmsg_ctrldev_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&rpmsg_major, 0, RPMSG_DEV_MAX, "rpmsg_ctrl"); + if (ret < 0) { + pr_err("failed to allocate char dev region\n"); + return ret; + } + + ret = register_rpmsg_driver(&rpmsg_ctrldev_driver); + if (ret < 0) { + pr_err("failed to register rpmsg driver\n"); + unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); + } + + return ret; +} +postcore_initcall(rpmsg_ctrldev_init); + +static void rpmsg_ctrldev_exit(void) +{ + unregister_rpmsg_driver(&rpmsg_ctrldev_driver); + unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); +} +module_exit(rpmsg_ctrldev_exit); + +MODULE_DESCRIPTION("rpmsg control interface"); +MODULE_ALIAS("rpmsg:" KBUILD_MODNAME); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/rpmsg/rpmsg_internal.h b/drivers/rpmsg/rpmsg_internal.h index a76c344253bf..63cb74ea68d6 100644 --- a/drivers/rpmsg/rpmsg_internal.h +++ b/drivers/rpmsg/rpmsg_internal.h @@ -2,6 +2,7 @@ /* * remote processor messaging bus internals * + * Copyright (c) 2020 The Linux Foundation. * Copyright (C) 2011 Texas Instruments, Inc. * Copyright (C) 2011 Google, Inc. * @@ -18,6 +19,8 @@ #define to_rpmsg_device(d) container_of(d, struct rpmsg_device, dev) #define to_rpmsg_driver(d) container_of(d, struct rpmsg_driver, drv) +extern struct class *rpmsg_class; + /** * struct rpmsg_device_ops - indirection table for the rpmsg_device operations * @create_channel: create backend-specific channel, optional @@ -53,6 +56,8 @@ struct rpmsg_device_ops { * @trysendto: see @rpmsg_trysendto(), optional * @trysend_offchannel: see @rpmsg_trysend_offchannel(), optional * @poll: see @rpmsg_poll(), optional + * @get_signals: see @rpmsg_get_signals(), optional + * @set_signals: see @rpmsg_set_signals(), optional * * Indirection table for the operations that a rpmsg backend should implement. * In addition to @destroy_ept, the backend must at least implement @send and @@ -72,6 +77,9 @@ struct rpmsg_endpoint_ops { void *data, int len); __poll_t (*poll)(struct rpmsg_endpoint *ept, struct file *filp, poll_table *wait); + int (*rx_done)(struct rpmsg_endpoint *ept, void *data); + int (*get_signals)(struct rpmsg_endpoint *ept); + int (*set_signals)(struct rpmsg_endpoint *ept, u32 set, u32 clear); }; struct device *rpmsg_find_device(struct device *parent, @@ -82,16 +90,16 @@ struct rpmsg_device *rpmsg_create_channel(struct rpmsg_device *rpdev, int rpmsg_release_channel(struct rpmsg_device *rpdev, struct rpmsg_channel_info *chinfo); /** - * rpmsg_chrdev_register_device() - register chrdev device based on rpdev + * rpmsg_ctrldev_register_device() - register a char device for control based on rpdev * @rpdev: prepared rpdev to be used for creating endpoints * * This function wraps rpmsg_register_device() preparing the rpdev for use as * basis for the rpmsg chrdev. 
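For context, a hedged sketch of how a transport backend might use this wrapper (the foo_* names are invented; the pattern mirrors the qcom_glink and qcom_smd call sites converted earlier in this patch):

/* Hypothetical transport sketch: publish a control device for an edge. */
static int foo_create_ctrldev(struct foo_edge *edge)
{
	struct rpmsg_device *rpdev;

	rpdev = kzalloc(sizeof(*rpdev), GFP_KERNEL);
	if (!rpdev)
		return -ENOMEM;

	rpdev->ops = &foo_device_ops;		/* backend rpmsg_device_ops */
	rpdev->dev.parent = &edge->dev;
	rpdev->dev.release = foo_release_device;

	/* Binds to the rpmsg_ctrl driver and yields a /dev/rpmsg_ctrlN node. */
	return rpmsg_ctrldev_register_device(rpdev);
}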
*/ -static inline int rpmsg_chrdev_register_device(struct rpmsg_device *rpdev) +static inline int rpmsg_ctrldev_register_device(struct rpmsg_device *rpdev) { - strcpy(rpdev->id.name, "rpmsg_chrdev"); - rpdev->driver_override = "rpmsg_chrdev"; + strcpy(rpdev->id.name, "rpmsg_ctrl"); + rpdev->driver_override = "rpmsg_ctrl"; return rpmsg_register_device(rpdev); } diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index b03e7404212f..fca034aafd15 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -840,7 +840,7 @@ static struct rpmsg_device *rpmsg_virtio_add_ctrl_dev(struct virtio_device *vdev rpdev_ctrl->dev.release = virtio_rpmsg_release_device; rpdev_ctrl->little_endian = virtio_is_little_endian(vrp->vdev); - err = rpmsg_chrdev_register_device(rpdev_ctrl); + err = rpmsg_ctrldev_register_device(rpdev_ctrl); if (err) { /* vch will be free in virtio_rpmsg_release_device() */ return ERR_PTR(err); diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 9e066281e2d0..78f9adf56456 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -70,7 +70,7 @@ static void vmcp_response_alloc(struct vmcp_session *session) * anymore the system won't work anyway. */ if (order > 2) - page = cma_alloc(vmcp_cma, nr_pages, 0, false); + page = cma_alloc(vmcp_cma, nr_pages, 0, GFP_KERNEL); if (page) { session->response = (char *)page_to_phys(page); session->cma_alloc = 1; diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 6e3a04107bb6..a9fe5152addd 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -500,7 +500,6 @@ source "drivers/scsi/megaraid/Kconfig.megaraid" source "drivers/scsi/mpt3sas/Kconfig" source "drivers/scsi/mpi3mr/Kconfig" source "drivers/scsi/smartpqi/Kconfig" -source "drivers/scsi/ufs/Kconfig" config SCSI_HPTIOP tristate "HighPoint RocketRAID 3xxx/4xxx Controller support" diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index 19814c26c908..2ad3bc052531 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -101,7 +101,6 @@ obj-$(CONFIG_MEGARAID_NEWGEN) += megaraid/ obj-$(CONFIG_MEGARAID_SAS) += megaraid/ obj-$(CONFIG_SCSI_MPT3SAS) += mpt3sas/ obj-$(CONFIG_SCSI_MPI3MR) += mpi3mr/ -obj-$(CONFIG_SCSI_UFSHCD) += ufs/ obj-$(CONFIG_SCSI_ACARD) += atp870u.o obj-$(CONFIG_SCSI_SUNESP) += esp_scsi.o sun_esp.o obj-$(CONFIG_SCSI_INITIO) += initio.o diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 73d235540b98..f799a8d1ea41 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2007,9 +2007,9 @@ static int iscsi_has_ping_timed_out(struct iscsi_conn *conn) return 0; } -enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) +enum scsi_timeout_action iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) { - enum blk_eh_timer_return rc = BLK_EH_DONE; + enum scsi_timeout_action rc = SCSI_EH_NOT_HANDLED; struct iscsi_task *task = NULL, *running_task; struct iscsi_cls_session *cls_session; struct iscsi_session *session; @@ -2029,7 +2029,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) * Raced with completion. Blk layer has taken ownership * so let timeout code complete it now. 
*/ - rc = BLK_EH_DONE; + rc = SCSI_EH_NOT_HANDLED; spin_unlock(&session->back_lock); goto done; } @@ -2047,21 +2047,21 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) if (unlikely(system_state != SYSTEM_RUNNING)) { sc->result = DID_NO_CONNECT << 16; ISCSI_DBG_EH(session, "sc on shutdown, handled\n"); - rc = BLK_EH_DONE; + rc = SCSI_EH_NOT_HANDLED; goto done; } /* * We are probably in the middle of iscsi recovery so let * that complete and handle the error. */ - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; goto done; } conn = session->leadconn; if (!conn) { /* In the middle of shuting down */ - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; goto done; } @@ -2078,7 +2078,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) "Last data xfer at %lu. Last timeout was at " "%lu\n.", task->last_xfer, task->last_timeout); task->have_checked_conn = false; - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; goto done; } @@ -2089,7 +2089,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) * and can let the iscsi eh handle it */ if (iscsi_has_ping_timed_out(conn)) { - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; goto done; } @@ -2127,7 +2127,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) task->last_xfer, running_task->last_xfer, task->last_timeout); spin_unlock(&session->back_lock); - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; goto done; } } @@ -2143,14 +2143,14 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) */ if (READ_ONCE(conn->ping_task)) { task->have_checked_conn = true; - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; goto done; } /* Make sure there is a transport check done */ iscsi_send_nopout(conn, NULL); task->have_checked_conn = true; - rc = BLK_EH_RESET_TIMER; + rc = SCSI_EH_RESET_TIMER; done: spin_unlock_bh(&session->frwd_lock); @@ -2159,7 +2159,7 @@ done: task->last_timeout = jiffies; iscsi_put_task(task); } - ISCSI_DBG_EH(session, "return %s\n", rc == BLK_EH_RESET_TIMER ? + ISCSI_DBG_EH(session, "return %s\n", rc == SCSI_EH_RESET_TIMER ? "timer reset" : "shutdown or nh"); return rc; } diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 88e164e3d2ea..be72ad1111c2 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -2927,15 +2927,14 @@ static int megasas_generic_reset(struct scsi_cmnd *scmd) * Sets the FW busy flag and reduces the host->can_queue if the * cmd has not been completed within the timeout period. 
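The conversion pattern for low-level drivers is uniform; a minimal hypothetical handler (foo_* names invented) under the new enum looks like the sketch below. Returning SCSI_EH_DONE instead would tell the core that the driver completed the command itself.

/* Hypothetical LLD sketch: eh_timed_out with the new return type. */
static enum scsi_timeout_action foo_eh_timed_out(struct scsi_cmnd *scmd)
{
	struct foo_host *h = shost_priv(scmd->device->host);

	if (foo_link_recovering(h))
		return SCSI_EH_RESET_TIMER;	/* keep the command alive */

	return SCSI_EH_NOT_HANDLED;	/* let the midlayer abort/escalate */
}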
*/ -static enum -blk_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd) +static enum scsi_timeout_action megasas_reset_timer(struct scsi_cmnd *scmd) { struct megasas_instance *instance; unsigned long flags; if (time_after(jiffies, scmd->jiffies_at_alloc + (scmd_timeout * 2) * HZ)) { - return BLK_EH_DONE; + return SCSI_EH_NOT_HANDLED; } instance = (struct megasas_instance *)scmd->device->host->hostdata; @@ -2949,7 +2948,7 @@ blk_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd) spin_unlock_irqrestore(instance->host->host_lock, flags); } - return BLK_EH_RESET_TIMER; + return SCSI_EH_RESET_TIMER; } /** diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c index 4d251bf630a3..d125288f3741 100644 --- a/drivers/scsi/mvumi.c +++ b/drivers/scsi/mvumi.c @@ -2109,7 +2109,7 @@ out_return_cmd: return 0; } -static enum blk_eh_timer_return mvumi_timed_out(struct scsi_cmnd *scmd) +static enum scsi_timeout_action mvumi_timed_out(struct scsi_cmnd *scmd) { struct mvumi_cmd *cmd = (struct mvumi_cmd *) scmd->SCp.ptr; struct Scsi_Host *host = scmd->device->host; @@ -2137,7 +2137,7 @@ static enum blk_eh_timer_return mvumi_timed_out(struct scsi_cmnd *scmd) mvumi_return_cmd(mhba, cmd); spin_unlock_irqrestore(mhba->shost->host_lock, flags); - return BLK_EH_DONE; + return SCSI_EH_NOT_HANDLED; } static int diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index f1ea65c6e5f5..16f48fd8f19b 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -116,7 +116,7 @@ static int qla4xxx_iface_set_param(struct Scsi_Host *shost, void *data, static int qla4xxx_get_iface_param(struct iscsi_iface *iface, enum iscsi_param_type param_type, int param, char *buf); -static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc); +static enum scsi_timeout_action qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc); static struct iscsi_endpoint *qla4xxx_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, int non_blocking); @@ -1871,17 +1871,17 @@ exit_get_stats: return; } -static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc) +static enum scsi_timeout_action qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc) { struct iscsi_cls_session *session; unsigned long flags; - enum blk_eh_timer_return ret = BLK_EH_DONE; + enum scsi_timeout_action ret = SCSI_EH_NOT_HANDLED; session = starget_to_session(scsi_target(sc->device)); spin_lock_irqsave(&session->lock, flags); if (session->state == ISCSI_SESSION_FAILED) - ret = BLK_EH_RESET_TIMER; + ret = SCSI_EH_RESET_TIMER; spin_unlock_irqrestore(&session->lock, flags); return ret; diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 591df0a91057..c459164ade46 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -16,7 +16,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include - +#include #include #include #include @@ -98,6 +98,7 @@ static const char *sdebug_version_date = "20200710"; #define WRITE_BOUNDARY_ASCQ 0x5 #define READ_INVDATA_ASCQ 0x6 #define READ_BOUNDARY_ASCQ 0x7 +#define ATTEMPT_ACCESS_GAP 0x9 #define INSUFF_ZONE_ASCQ 0xe /* Additional Sense Code Qualifier (ASCQ) */ @@ -250,9 +251,11 @@ static const char *sdebug_version_date = "20200710"; /* Zone types (zbcr05 table 25) */ enum sdebug_z_type { - ZBC_ZONE_TYPE_CNV = 0x1, - ZBC_ZONE_TYPE_SWR = 0x2, - ZBC_ZONE_TYPE_SWP = 0x3, + ZBC_ZTYPE_CNV = 0x1, + ZBC_ZTYPE_SWR = 0x2, + ZBC_ZTYPE_SWP = 0x3, + /* ZBC_ZTYPE_SOBR = 0x4, */ + ZBC_ZTYPE_GAP = 0x5, }; /* enumeration names taken 
from table 26, zbcr05 */ @@ -290,10 +293,12 @@ struct sdebug_dev_info { /* For ZBC devices */ enum blk_zoned_model zmodel; + unsigned int zcap; unsigned int zsize; unsigned int zsize_shift; unsigned int nr_zones; unsigned int nr_conv_zones; + unsigned int nr_seq_zones; unsigned int nr_imp_open; unsigned int nr_exp_open; unsigned int nr_closed; @@ -827,6 +832,7 @@ static int dif_errors; /* ZBC global data */ static bool sdeb_zbc_in_use; /* true for host-aware and host-managed disks */ +static int sdeb_zbc_zone_cap_mb; static int sdeb_zbc_zone_size_mb; static int sdeb_zbc_max_open = DEF_ZBC_MAX_OPEN_ZONES; static int sdeb_zbc_nr_conv = DEF_ZBC_NR_CONV_ZONES; @@ -1551,6 +1557,12 @@ static int inquiry_vpd_b6(struct sdebug_dev_info *devip, unsigned char *arr) put_unaligned_be32(devip->max_open, &arr[12]); else put_unaligned_be32(0xffffffff, &arr[12]); + if (devip->zcap < devip->zsize) { + arr[19] = ZBC_CONSTANT_ZONE_START_OFFSET; + put_unaligned_be64(devip->zsize, &arr[20]); + } else { + arr[19] = 0; + } return 0x3c; } @@ -2677,12 +2689,38 @@ static inline bool sdebug_dev_is_zoned(struct sdebug_dev_info *devip) static struct sdeb_zone_state *zbc_zone(struct sdebug_dev_info *devip, unsigned long long lba) { - return &devip->zstate[lba >> devip->zsize_shift]; + u32 zno = lba >> devip->zsize_shift; + struct sdeb_zone_state *zsp; + + if (devip->zcap == devip->zsize || zno < devip->nr_conv_zones) + return &devip->zstate[zno]; + + /* + * If the zone capacity is less than the zone size, adjust for gap + * zones. + */ + zno = 2 * zno - devip->nr_conv_zones; + WARN_ONCE(zno >= devip->nr_zones, "%u > %u\n", zno, devip->nr_zones); + zsp = &devip->zstate[zno]; + if (lba >= zsp->z_start + zsp->z_size) + zsp++; + WARN_ON_ONCE(lba >= zsp->z_start + zsp->z_size); + return zsp; } static inline bool zbc_zone_is_conv(struct sdeb_zone_state *zsp) { - return zsp->z_type == ZBC_ZONE_TYPE_CNV; + return zsp->z_type == ZBC_ZTYPE_CNV; +} + +static inline bool zbc_zone_is_gap(struct sdeb_zone_state *zsp) +{ + return zsp->z_type == ZBC_ZTYPE_GAP; +} + +static inline bool zbc_zone_is_seq(struct sdeb_zone_state *zsp) +{ + return !zbc_zone_is_conv(zsp) && !zbc_zone_is_gap(zsp); } static void zbc_close_zone(struct sdebug_dev_info *devip, @@ -2690,7 +2728,7 @@ static void zbc_close_zone(struct sdebug_dev_info *devip, { enum sdebug_z_cond zc; - if (zbc_zone_is_conv(zsp)) + if (!zbc_zone_is_seq(zsp)) return; zc = zsp->z_cond; @@ -2728,7 +2766,7 @@ static void zbc_open_zone(struct sdebug_dev_info *devip, { enum sdebug_z_cond zc; - if (zbc_zone_is_conv(zsp)) + if (!zbc_zone_is_seq(zsp)) return; zc = zsp->z_cond; @@ -2778,10 +2816,10 @@ static void zbc_inc_wp(struct sdebug_dev_info *devip, struct sdeb_zone_state *zsp = zbc_zone(devip, lba); unsigned long long n, end, zend = zsp->z_start + zsp->z_size; - if (zbc_zone_is_conv(zsp)) + if (!zbc_zone_is_seq(zsp)) return; - if (zsp->z_type == ZBC_ZONE_TYPE_SWR) { + if (zsp->z_type == ZBC_ZTYPE_SWR) { zsp->z_wp += num; if (zsp->z_wp >= zend) zbc_set_zone_full(devip, zsp); @@ -2826,9 +2864,7 @@ static int check_zbc_access_params(struct scsi_cmnd *scp, if (devip->zmodel == BLK_ZONED_HA) return 0; /* For host-managed, reads cannot cross zone types boundaries */ - if (zsp_end != zsp && - zbc_zone_is_conv(zsp) && - !zbc_zone_is_conv(zsp_end)) { + if (zsp->z_type != zsp_end->z_type) { mk_sense_buffer(scp, ILLEGAL_REQUEST, LBA_OUT_OF_RANGE, READ_INVDATA_ASCQ); @@ -2837,6 +2873,13 @@ static int check_zbc_access_params(struct scsi_cmnd *scp, return 0; } + /* Writing into a gap zone is not allowed 
*/ + if (zbc_zone_is_gap(zsp)) { + mk_sense_buffer(scp, ILLEGAL_REQUEST, LBA_OUT_OF_RANGE, + ATTEMPT_ACCESS_GAP); + return check_condition_result; + } + /* No restrictions for writes within conventional zones */ if (zbc_zone_is_conv(zsp)) { if (!zbc_zone_is_conv(zsp_end)) { @@ -2848,7 +2891,7 @@ static int check_zbc_access_params(struct scsi_cmnd *scp, return 0; } - if (zsp->z_type == ZBC_ZONE_TYPE_SWR) { + if (zsp->z_type == ZBC_ZTYPE_SWR) { /* Writes cannot cross sequential zone boundaries */ if (zsp_end != zsp) { mk_sense_buffer(scp, ILLEGAL_REQUEST, @@ -4332,18 +4375,18 @@ cleanup: #define RZONES_DESC_HD 64 -/* Report zones depending on start LBA nad reporting options */ +/* Report zones depending on start LBA and reporting options */ static int resp_report_zones(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) { - unsigned int i, max_zones, rep_max_zones, nrz = 0; + unsigned int rep_max_zones, nrz = 0; int ret = 0; u32 alloc_len, rep_opts, rep_len; bool partial; u64 lba, zs_lba; u8 *arr = NULL, *desc; u8 *cmd = scp->cmnd; - struct sdeb_zone_state *zsp; + struct sdeb_zone_state *zsp = NULL; struct sdeb_store_info *sip = devip2sip(devip, false); rwlock_t *macc_lckp = sip ? &sip->macc_lck : &sdeb_fake_rw_lck; @@ -4363,9 +4406,7 @@ static int resp_report_zones(struct scsi_cmnd *scp, return check_condition_result; } - max_zones = devip->nr_zones - (zs_lba >> devip->zsize_shift); - rep_max_zones = min((alloc_len - 64) >> ilog2(RZONES_DESC_HD), - max_zones); + rep_max_zones = (alloc_len - 64) >> ilog2(RZONES_DESC_HD); arr = kzalloc(alloc_len, GFP_ATOMIC | __GFP_NOWARN); if (!arr) { @@ -4377,9 +4418,9 @@ static int resp_report_zones(struct scsi_cmnd *scp, read_lock(macc_lckp); desc = arr + 64; - for (i = 0; i < max_zones; i++) { - lba = zs_lba + devip->zsize * i; - if (lba > sdebug_capacity) + for (lba = zs_lba; lba < sdebug_capacity; + lba = zsp->z_start + zsp->z_size) { + if (WARN_ONCE(zbc_zone(devip, lba) == zsp, "lba = %llu\n", lba)) break; zsp = zbc_zone(devip, lba); switch (rep_opts) { @@ -4424,9 +4465,14 @@ static int resp_report_zones(struct scsi_cmnd *scp, if (!zsp->z_non_seq_resource) continue; break; + case 0x3e: + /* All zones except gap zones. */ + if (zbc_zone_is_gap(zsp)) + continue; + break; case 0x3f: /* Not write pointer (conventional) zones */ - if (!zbc_zone_is_conv(zsp)) + if (zbc_zone_is_seq(zsp)) continue; break; default: @@ -4455,8 +4501,13 @@ static int resp_report_zones(struct scsi_cmnd *scp, } /* Report header */ + /* Zone list length. */ put_unaligned_be32(nrz * RZONES_DESC_HD, arr + 0); + /* Maximum LBA */ put_unaligned_be64(sdebug_capacity - 1, arr + 8); + /* Zone starting LBA granularity. 
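To make the gap-zone bookkeeping concrete, a small worked example under assumed parameters (zone_size_mb=4, zone_cap_mb=3, zone_nr_conv=1, 512-byte sectors, so zsize=8192 and zcap=6144 sectors): the zstate[] array interleaves a gap entry after every sequential zone, which is what the 2 * zno - nr_conv_zones index in zbc_zone() accounts for.

/* Illustrative only: mirror zbc_zone()'s index math for the assumed layout. */
#include <stdio.h>

int main(void)
{
	const unsigned int zsize_shift = 13;	/* 8192-sector logical zones */
	const unsigned int nr_conv = 1;
	unsigned long long lba = 20000;		/* falls in logical zone 2 */
	unsigned int zno = lba >> zsize_shift;

	if (zno >= nr_conv)
		zno = 2 * zno - nr_conv;	/* index 3: [CNV][SWR][GAP][SWR][GAP]... */

	printf("zstate index %u\n", zno);	/* plus one more if lba lies past zcap */
	return 0;
}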
*/ + if (devip->zcap < devip->zsize) + put_unaligned_be64(devip->zsize, arr + 16); rep_len = (unsigned long)desc - (unsigned long)arr; ret = fill_from_dev_buffer(scp, arr, min_t(u32, alloc_len, rep_len)); @@ -4684,7 +4735,7 @@ static void zbc_rwp_zone(struct sdebug_dev_info *devip, enum sdebug_z_cond zc; struct sdeb_store_info *sip = devip2sip(devip, false); - if (zbc_zone_is_conv(zsp)) + if (!zbc_zone_is_seq(zsp)) return; zc = zsp->z_cond; @@ -4875,6 +4926,7 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip) { struct sdeb_zone_state *zsp; sector_t capacity = get_sdebug_capacity(); + sector_t conv_capacity; sector_t zstart = 0; unsigned int i; @@ -4909,11 +4961,30 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip) devip->zsize_shift = ilog2(devip->zsize); devip->nr_zones = (capacity + devip->zsize - 1) >> devip->zsize_shift; - if (sdeb_zbc_nr_conv >= devip->nr_zones) { + if (sdeb_zbc_zone_cap_mb == 0) { + devip->zcap = devip->zsize; + } else { + devip->zcap = (sdeb_zbc_zone_cap_mb * SZ_1M) >> + ilog2(sdebug_sector_size); + if (devip->zcap > devip->zsize) { + pr_err("Zone capacity too large\n"); + return -EINVAL; + } + } + + conv_capacity = (sector_t)sdeb_zbc_nr_conv << devip->zsize_shift; + if (conv_capacity >= capacity) { pr_err("Number of conventional zones too large\n"); return -EINVAL; } devip->nr_conv_zones = sdeb_zbc_nr_conv; + devip->nr_seq_zones = ALIGN(capacity - conv_capacity, devip->zsize) >> + devip->zsize_shift; + devip->nr_zones = devip->nr_conv_zones + devip->nr_seq_zones; + + /* Add gap zones if zone capacity is smaller than the zone size */ + if (devip->zcap < devip->zsize) + devip->nr_zones += devip->nr_seq_zones; if (devip->zmodel == BLK_ZONED_HM) { /* zbc_max_open_zones can be 0, meaning "not reported" */ @@ -4934,23 +5005,29 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip) zsp->z_start = zstart; if (i < devip->nr_conv_zones) { - zsp->z_type = ZBC_ZONE_TYPE_CNV; + zsp->z_type = ZBC_ZTYPE_CNV; zsp->z_cond = ZBC_NOT_WRITE_POINTER; zsp->z_wp = (sector_t)-1; - } else { + zsp->z_size = + min_t(u64, devip->zsize, capacity - zstart); + } else if ((zstart & (devip->zsize - 1)) == 0) { if (devip->zmodel == BLK_ZONED_HM) - zsp->z_type = ZBC_ZONE_TYPE_SWR; + zsp->z_type = ZBC_ZTYPE_SWR; else - zsp->z_type = ZBC_ZONE_TYPE_SWP; + zsp->z_type = ZBC_ZTYPE_SWP; zsp->z_cond = ZC1_EMPTY; zsp->z_wp = zsp->z_start; + zsp->z_size = + min_t(u64, devip->zcap, capacity - zstart); + } else { + zsp->z_type = ZBC_ZTYPE_GAP; + zsp->z_cond = ZBC_NOT_WRITE_POINTER; + zsp->z_wp = (sector_t)-1; + zsp->z_size = min_t(u64, devip->zsize - devip->zcap, + capacity - zstart); } - if (zsp->z_start + devip->zsize < capacity) - zsp->z_size = devip->zsize; - else - zsp->z_size = capacity - zsp->z_start; - + WARN_ON_ONCE((int)zsp->z_size <= 0); zstart += zsp->z_size; } @@ -5718,6 +5795,7 @@ module_param_named(wp, sdebug_wp, bool, S_IRUGO | S_IWUSR); module_param_named(write_same_length, sdebug_write_same_length, int, S_IRUGO | S_IWUSR); module_param_named(zbc, sdeb_zbc_model_s, charp, S_IRUGO); +module_param_named(zone_cap_mb, sdeb_zbc_zone_cap_mb, int, S_IRUGO); module_param_named(zone_max_open, sdeb_zbc_max_open, int, S_IRUGO); module_param_named(zone_nr_conv, sdeb_zbc_nr_conv, int, S_IRUGO); module_param_named(zone_size_mb, sdeb_zbc_zone_size_mb, int, S_IRUGO); @@ -5788,6 +5866,7 @@ MODULE_PARM_DESC(vpd_use_hostno, "0 -> dev ids ignore hostno (def=1 -> unique de MODULE_PARM_DESC(wp, "Write Protect (def=0)"); MODULE_PARM_DESC(write_same_length, 
"Maximum blocks per WRITE SAME cmd (def=0xffff)"); MODULE_PARM_DESC(zbc, "'none' [0]; 'aware' [1]; 'managed' [2] (def=0). Can have 'host-' prefix"); +MODULE_PARM_DESC(zone_cap_mb, "Zone capacity in MiB (def=zone size)"); MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones; [0] for no limit (def=auto)"); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones (def=1)"); MODULE_PARM_DESC(zone_size_mb, "Zone size in MiB (def=auto)"); diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index dd9f5778f687..e45f8652867c 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -329,7 +329,6 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd) enum blk_eh_timer_return scsi_times_out(struct request *req) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); - enum blk_eh_timer_return rtn = BLK_EH_DONE; struct Scsi_Host *host = scmd->device->host; trace_scsi_dispatch_cmd_timeout(scmd); @@ -338,23 +337,29 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) if (host->eh_deadline != -1 && !host->last_reset) host->last_reset = jiffies; - if (host->hostt->eh_timed_out) - rtn = host->hostt->eh_timed_out(scmd); - - if (rtn == BLK_EH_DONE) { - /* - * If scsi_done() has already set SCMD_STATE_COMPLETE, do not - * modify *scmd. - */ - if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state)) + if (host->hostt->eh_timed_out) { + switch (host->hostt->eh_timed_out(scmd)) { + case SCSI_EH_DONE: return BLK_EH_DONE; - if (scsi_abort_command(scmd) != SUCCESS) { - set_host_byte(scmd, DID_TIME_OUT); - scsi_eh_scmd_add(scmd); + case SCSI_EH_RESET_TIMER: + return BLK_EH_RESET_TIMER; + case SCSI_EH_NOT_HANDLED: + break; } } - return rtn; + /* + * If scsi_done() has already set SCMD_STATE_COMPLETE, do not modify + * *scmd. + */ + if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state)) + return BLK_EH_DONE; + if (scsi_abort_command(scmd) != SUCCESS) { + set_host_byte(scmd, DID_TIME_OUT); + scsi_eh_scmd_add(scmd); + } + + return BLK_EH_DONE; } /** @@ -666,6 +671,12 @@ enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd) fallthrough; case ILLEGAL_REQUEST: + /* + * Unaligned write command. This indicates that zoned writes got + * reordered. Retry after all pending commands have completed. 
+ */ + if (sshdr.asc == 0x21 && sshdr.ascq == 0x04) + return NEEDS_DELAYED_RETRY; if (sshdr.asc == 0x20 || /* Invalid command operation code */ sshdr.asc == 0x21 || /* Logical block address out of range */ sshdr.asc == 0x22 || /* Invalid function */ diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ef4361b2d142..eb62ac0c7f45 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1328,9 +1328,6 @@ static inline int scsi_host_queue_ready(struct request_queue *q, struct scsi_device *sdev, struct scsi_cmnd *cmd) { - if (scsi_host_in_recovery(shost)) - return 0; - if (atomic_read(&shost->host_blocked) > 0) { if (scsi_host_busy(shost) > 0) goto starved; @@ -1662,6 +1659,11 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, ret = BLK_STS_RESOURCE; if (!scsi_target_queue_ready(shost, sdev)) goto out_put_budget; + if (unlikely(scsi_host_in_recovery(shost))) { + if (cmd->flags & SCMD_FAIL_IF_RECOVERING) + ret = BLK_STS_IOERR; + goto out_dec_target_busy; + } if (!scsi_host_queue_ready(q, shost, sdev, cmd)) goto out_dec_target_busy; diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index a2524106206d..b86da3797ef0 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -2528,15 +2528,14 @@ static int fc_vport_match(struct attribute_container *cont, * Notes: * This routine assumes no locks are held on entry. */ -enum blk_eh_timer_return -fc_eh_timed_out(struct scsi_cmnd *scmd) +enum scsi_timeout_action fc_eh_timed_out(struct scsi_cmnd *scmd) { struct fc_rport *rport = starget_to_rport(scsi_target(scmd->device)); if (rport->port_state == FC_PORTSTATE_BLOCKED) - return BLK_EH_RESET_TIMER; + return SCSI_EH_RESET_TIMER; - return BLK_EH_DONE; + return SCSI_EH_NOT_HANDLED; } EXPORT_SYMBOL(fc_eh_timed_out); diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 98a34ed10f1a..87d0fb8dc503 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -594,13 +594,13 @@ EXPORT_SYMBOL(srp_reconnect_rport); * @scmd: SCSI command. * * If a timeout occurs while an rport is in the blocked state, ask the SCSI - * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core - * handle the timeout (BLK_EH_DONE). + * EH to continue waiting (SCSI_EH_RESET_TIMER). Otherwise let the SCSI core + * handle the timeout (SCSI_EH_NOT_HANDLED). * * Note: This function is called from soft-IRQ context and with the request * queue lock held. */ -enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd) +enum scsi_timeout_action srp_timed_out(struct scsi_cmnd *scmd) { struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; @@ -611,7 +611,7 @@ enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd) return rport && rport->fast_io_fail_tmo < 0 && rport->dev_loss_tmo < 0 && i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? 
- BLK_EH_RESET_TIMER : BLK_EH_DONE; + SCSI_EH_RESET_TIMER : SCSI_EH_NOT_HANDLED; } EXPORT_SYMBOL(srp_timed_out); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 1e887c11e83d..67c5d45ba78e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1304,6 +1304,9 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) cmd->transfersize = sdp->sector_size; cmd->underflow = nr_blocks << 9; cmd->allowed = sdkp->max_retries; + if (blk_queue_pipeline_zoned_writes(rq->q) && + blk_rq_is_seq_zone_write(rq)) + cmd->allowed += rq->q->nr_requests; cmd->sdb.length = nr_blocks * sdp->sector_size; SCSI_LOG_HLQUEUE(1, diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index b59136c4125b..697fad362cca 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -67,6 +67,20 @@ enum { SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */ }; +/** + * struct zoned_disk_info - Specific properties of a ZBC SCSI device. + * @nr_zones: number of zones. + * @zone_blocks: number of logical blocks per zone. + * + * This data structure holds the ZBC SCSI device properties that are retrieved + * twice: a first time before the gendisk capacity is known and a second time + * after the gendisk capacity is known. + */ +struct zoned_disk_info { + u32 nr_zones; + u32 zone_blocks; +}; + struct scsi_disk { struct scsi_driver *driver; /* always &sd_template */ struct scsi_device *device; @@ -74,13 +88,18 @@ struct scsi_disk { struct gendisk *disk; struct opal_dev *opal_dev; #ifdef CONFIG_BLK_DEV_ZONED - u32 nr_zones; - u32 rev_nr_zones; - u32 zone_blocks; - u32 rev_zone_blocks; + /* Updated during revalidation before the gendisk capacity is known. */ + struct zoned_disk_info early_zone_info; + /* Updated during revalidation after the gendisk capacity is known. */ + struct zoned_disk_info zone_info; u32 zones_optimal_open; u32 zones_optimal_nonseq; u32 zones_max_open; + /* + * Either zero or a power of two. If not zero it means that the offset + * between zone starting LBAs is constant. + */ + u32 zone_starting_lba_gran; u32 *zones_wp_offset; spinlock_t zones_wp_offset_lock; u32 *rev_wp_offset; @@ -125,6 +144,9 @@ struct scsi_disk { unsigned urswrz : 1; unsigned security : 1; unsigned ignore_medium_access_errors : 1; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev) @@ -217,7 +239,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED void sd_zbc_release_disk(struct scsi_disk *sdkp); -int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer); +int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); @@ -233,8 +255,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, static inline void sd_zbc_release_disk(struct scsi_disk *sdkp) {} -static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, - unsigned char *buf) +static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) { return 0; } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ed06798983f8..068a3832b5f5 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -20,6 +20,12 @@ #include "sd.h" +/** + * sd_zbc_get_zone_wp_offset - Get zone write pointer offset. + * @zone: Zone for which to return the write pointer offset. + * + * Return: offset of the write pointer from the start of the zone. 
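+ *
+ * Conventional zones have no write pointer and hence report an offset of
+ * zero, full zones report the zone length, and open or closed zones
+ * report the distance between the write pointer and the zone start.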
+ */ static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) { if (zone->type == ZBC_ZONE_TYPE_CONV) @@ -44,13 +50,37 @@ static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) } } -static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, +/* Whether or not a SCSI zone descriptor describes a gap zone. */ +static bool sd_zbc_is_gap_zone(const u8 buf[64]) +{ + return (buf[0] & 0xf) == ZBC_ZONE_TYPE_GAP; +} + +/** + * sd_zbc_parse_report - Parse a SCSI zone descriptor + * @sdkp: SCSI disk pointer. + * @buf: SCSI zone descriptor. + * @idx: Index of the zone relative to the first zone reported by the current + * sd_zbc_report_zones() call. + * @cb: Callback function pointer. + * @data: Second argument passed to @cb. + * + * Return: Value returned by @cb. + * + * Convert a SCSI zone descriptor into struct blk_zone format. Additionally, + * call @cb(blk_zone, @data). + */ +static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], unsigned int idx, report_zones_cb cb, void *data) { struct scsi_device *sdp = sdkp->device; struct blk_zone zone = { 0 }; + sector_t start_lba, gran; int ret; + if (WARN_ON_ONCE(sd_zbc_is_gap_zone(buf))) + return -EINVAL; + zone.type = buf[0] & 0x0f; zone.cond = (buf[1] >> 4) & 0xf; if (buf[1] & 0x01) @@ -58,13 +88,31 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, if (buf[1] & 0x02) zone.non_seq = 1; - zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); - zone.capacity = zone.len; - zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16])); - zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); - if (zone.type != ZBC_ZONE_TYPE_CONV && - zone.cond == ZBC_ZONE_COND_FULL) + start_lba = get_unaligned_be64(&buf[16]); + zone.start = logical_to_sectors(sdp, start_lba); + zone.capacity = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); + zone.len = zone.capacity; + if (sdkp->zone_starting_lba_gran) { + gran = logical_to_sectors(sdp, sdkp->zone_starting_lba_gran); + if (zone.len > gran) { + sd_printk(KERN_ERR, sdkp, + "Invalid zone at LBA %llu with capacity %llu and length %llu; granularity = %llu\n", + start_lba, + sectors_to_logical(sdp, zone.capacity), + sectors_to_logical(sdp, zone.len), + sectors_to_logical(sdp, gran)); + return -EINVAL; + } + /* + * Use the starting LBA granularity instead of the zone length + * obtained from the REPORT ZONES command. + */ + zone.len = gran; + } + if (zone.cond == ZBC_ZONE_COND_FULL) zone.wp = zone.start + zone.len; + else + zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); ret = cb(&zone, idx, data); if (ret) @@ -161,7 +209,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, * sure that the allocated buffer can always be mapped by limiting the * number of pages allocated to the HBA max segments limit. */ - nr_zones = min(nr_zones, sdkp->nr_zones); + nr_zones = min(nr_zones, sdkp->zone_info.nr_zones); bufsize = roundup((nr_zones + 1) * 64, SECTOR_SIZE); bufsize = min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); @@ -186,16 +234,28 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, */ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp) { - return logical_to_sectors(sdkp->device, sdkp->zone_blocks); + return logical_to_sectors(sdkp->device, sdkp->zone_info.zone_blocks); } +/** + * sd_zbc_report_zones - SCSI .report_zones() callback. + * @disk: Disk to report zones for. + * @sector: Start sector. + * @nr_zones: Maximum number of zones to report. 
+ * @cb: Callback function called to report zone information. + * @data: Second argument passed to @cb. + * + * Called by the block layer to iterate over zone information. See also the + * disk->fops->report_zones() calls in block/blk-zoned.c. + */ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct scsi_disk *sdkp = scsi_disk(disk); - sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity); + sector_t lba = sectors_to_logical(sdkp->device, sector); unsigned int nr, i; unsigned char *buf; + u64 zone_length, start_lba; size_t offset, buflen = 0; int zone_idx = 0; int ret; @@ -204,7 +264,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, /* Not a zoned device */ return -EOPNOTSUPP; - if (!capacity) + if (!sdkp->capacity) /* Device gone or invalid */ return -ENODEV; @@ -212,9 +272,19 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, if (!buf) return -ENOMEM; - while (zone_idx < nr_zones && sector < capacity) { - ret = sd_zbc_do_report_zones(sdkp, buf, buflen, - sectors_to_logical(sdkp->device, sector), true); + while (zone_idx < nr_zones && lba < sdkp->capacity) { + ret = sd_zbc_do_report_zones(sdkp, buf, buflen, lba, true); + if (ret && zone_idx) { + sd_printk(KERN_WARNING, sdkp, + "ZBC violation: %llu LBAs are not associated with a zone (zone length %llu)\n", + sdkp->capacity - lba, zone_length); + sdkp->capacity = lba; + set_capacity_and_notify(disk, + logical_to_sectors(sdkp->device, + sdkp->capacity)); + ret = 0; + break; + } if (ret) goto out; @@ -225,14 +295,36 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, for (i = 0; i < nr && zone_idx < nr_zones; i++) { offset += 64; + start_lba = get_unaligned_be64(&buf[offset + 16]); + zone_length = get_unaligned_be64(&buf[offset + 8]); + if ((zone_idx == 0 && + (lba < start_lba || + lba >= start_lba + zone_length)) || + (zone_idx > 0 && start_lba != lba) || + start_lba + zone_length < start_lba) { + sd_printk(KERN_ERR, sdkp, + "Zone %d at LBA %llu is invalid: %llu + %llu\n", + zone_idx, lba, start_lba, zone_length); + ret = -EINVAL; + goto out; + } + lba = start_lba + zone_length; + if (sd_zbc_is_gap_zone(&buf[offset])) { + if (sdkp->zone_starting_lba_gran) + continue; + sd_printk(KERN_ERR, sdkp, + "Gap zone without constant LBA offsets\n"); + ret = -EINVAL; + goto out; + } + ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx, cb, data); if (ret) goto out; + zone_idx++; } - - sector += sd_zbc_zone_sectors(sdkp) * i; } ret = zone_idx; @@ -276,6 +368,10 @@ static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx, return 0; } +/* + * An attempt to append a zone triggered an invalid write pointer error. + * Reread the write pointer of the zone(s) in which the append failed. 
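+ * This runs from workqueue context because REPORT ZONES may sleep: each
+ * zone whose cached offset was marked SD_ZBC_UPDATING_WP_OFST is
+ * re-reported with the lock dropped and then reparsed with it held again.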
+ */ static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) { struct scsi_disk *sdkp; @@ -286,14 +382,14 @@ static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work); spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - for (zno = 0; zno < sdkp->nr_zones; zno++) { + for (zno = 0; zno < sdkp->zone_info.nr_zones; zno++) { if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) continue; spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf, SD_BUF_SIZE, - zno * sdkp->zone_blocks, true); + zno * sdkp->zone_info.zone_blocks, true); spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); if (!ret) sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64, @@ -360,7 +456,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, break; default: wp_offset = sectors_to_logical(sdkp->device, wp_offset); - if (wp_offset + nr_blocks > sdkp->zone_blocks) { + if (wp_offset + nr_blocks > sdkp->zone_info.zone_blocks) { ret = BLK_STS_IOERR; break; } @@ -491,7 +587,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, break; case REQ_OP_ZONE_RESET_ALL: memset(sdkp->zones_wp_offset, 0, - sdkp->nr_zones * sizeof(unsigned int)); + sdkp->zone_info.nr_zones * sizeof(unsigned int)); break; default: break; @@ -547,6 +643,7 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, unsigned char *buf) { + u64 zone_starting_lba_gran; if (scsi_get_vpd_page(sdkp->device, 0xb6, buf, 64)) { sd_printk(KERN_NOTICE, sdkp, @@ -560,12 +657,36 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, sdkp->zones_optimal_open = get_unaligned_be32(&buf[8]); sdkp->zones_optimal_nonseq = get_unaligned_be32(&buf[12]); sdkp->zones_max_open = 0; - } else { - /* Host-managed */ - sdkp->urswrz = buf[4] & 1; - sdkp->zones_optimal_open = 0; - sdkp->zones_optimal_nonseq = 0; - sdkp->zones_max_open = get_unaligned_be32(&buf[16]); + return 0; + } + + /* Host-managed */ + sdkp->urswrz = buf[4] & 1; + sdkp->zones_optimal_open = 0; + sdkp->zones_optimal_nonseq = 0; + sdkp->zones_max_open = get_unaligned_be32(&buf[16]); + /* Check zone alignment method */ + switch (buf[23] & 0xf) { + case 0: + case ZBC_CONSTANT_ZONE_LENGTH: + /* Use zone length */ + break; + case ZBC_CONSTANT_ZONE_START_OFFSET: + zone_starting_lba_gran = get_unaligned_be64(&buf[24]); + if (zone_starting_lba_gran == 0 || + !is_power_of_2(zone_starting_lba_gran) || + logical_to_sectors(sdkp->device, zone_starting_lba_gran) > + UINT_MAX) { + sd_printk(KERN_ERR, sdkp, + "Invalid zone starting LBA granularity %llu\n", + zone_starting_lba_gran); + return -ENODEV; + } + sdkp->zone_starting_lba_gran = zone_starting_lba_gran; + break; + default: + sd_printk(KERN_ERR, sdkp, "Invalid zone alignment method\n"); + return -ENODEV; } /* @@ -587,7 +708,7 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, * sd_zbc_check_capacity - Check the device capacity * @sdkp: Target disk * @buf: command buffer - * @zblocks: zone size in number of blocks + * @zblocks: zone size in logical blocks * * Get the device zone size and check that the device capacity as reported * by READ CAPACITY matches the max_lba value (plus one) of the report zones @@ -621,14 +742,25 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf, } } - /* Get the size of the first 
reported zone */ - rec = buf + 64; - zone_blocks = get_unaligned_be64(&rec[8]); - if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) { - if (sdkp->first_scan) - sd_printk(KERN_NOTICE, sdkp, - "Zone size too large\n"); - return -EFBIG; + if (sdkp->zone_starting_lba_gran == 0) { + /* Get the size of the first reported zone */ + rec = buf + 64; + zone_blocks = get_unaligned_be64(&rec[8]); + if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) { + if (sdkp->first_scan) + sd_printk(KERN_NOTICE, sdkp, + "Zone size too large\n"); + return -EFBIG; + } + } else { + zone_blocks = sdkp->zone_starting_lba_gran; + } + + if (!is_power_of_2(zone_blocks)) { + sd_printk(KERN_ERR, sdkp, + "Zone size %llu is not a power of two.\n", + zone_blocks); + return -EINVAL; } *zblocks = zone_blocks; @@ -641,16 +773,16 @@ static void sd_zbc_print_zones(struct scsi_disk *sdkp) if (!sd_is_zoned(sdkp) || !sdkp->capacity) return; - if (sdkp->capacity & (sdkp->zone_blocks - 1)) + if (sdkp->capacity & (sdkp->zone_info.zone_blocks - 1)) sd_printk(KERN_NOTICE, sdkp, "%u zones of %u logical blocks + 1 runt zone\n", - sdkp->nr_zones - 1, - sdkp->zone_blocks); + sdkp->zone_info.nr_zones - 1, + sdkp->zone_info.zone_blocks); else sd_printk(KERN_NOTICE, sdkp, "%u zones of %u logical blocks\n", - sdkp->nr_zones, - sdkp->zone_blocks); + sdkp->zone_info.nr_zones, + sdkp->zone_info.zone_blocks); } static int sd_zbc_init_disk(struct scsi_disk *sdkp) @@ -677,10 +809,8 @@ static void sd_zbc_clear_zone_info(struct scsi_disk *sdkp) kfree(sdkp->zone_wp_update_buf); sdkp->zone_wp_update_buf = NULL; - sdkp->nr_zones = 0; - sdkp->rev_nr_zones = 0; - sdkp->zone_blocks = 0; - sdkp->rev_zone_blocks = 0; + sdkp->early_zone_info = (struct zoned_disk_info){ }; + sdkp->zone_info = (struct zoned_disk_info){ }; mutex_unlock(&sdkp->rev_mutex); } @@ -698,12 +828,17 @@ static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset); } +/* + * Call blk_revalidate_disk_zones() if any of the zoned disk properties have + * changed that make it necessary to call that function. Called by + * sd_revalidate_disk() after the gendisk capacity has been set. 
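+ * The early_zone_info captured by sd_zbc_read_zones() is compared with
+ * the currently published zone_info, so the full-disk zone scan done by
+ * blk_revalidate_disk_zones() only runs when the geometry has changed.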
+ */ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) { struct gendisk *disk = sdkp->disk; struct request_queue *q = disk->queue; - u32 zone_blocks = sdkp->rev_zone_blocks; - unsigned int nr_zones = sdkp->rev_nr_zones; + u32 zone_blocks = sdkp->early_zone_info.zone_blocks; + unsigned int nr_zones = sdkp->early_zone_info.nr_zones; u32 max_append; int ret = 0; unsigned int flags; @@ -734,14 +869,14 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) */ mutex_lock(&sdkp->rev_mutex); - if (sdkp->zone_blocks == zone_blocks && - sdkp->nr_zones == nr_zones && + if (sdkp->zone_info.zone_blocks == zone_blocks && + sdkp->zone_info.nr_zones == nr_zones && disk->queue->nr_zones == nr_zones) goto unlock; flags = memalloc_noio_save(); - sdkp->zone_blocks = zone_blocks; - sdkp->nr_zones = nr_zones; + sdkp->zone_info.zone_blocks = zone_blocks; + sdkp->zone_info.nr_zones = nr_zones; sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL); if (!sdkp->rev_wp_offset) { ret = -ENOMEM; @@ -756,8 +891,7 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) sdkp->rev_wp_offset = NULL; if (ret) { - sdkp->zone_blocks = 0; - sdkp->nr_zones = 0; + sdkp->zone_info = (struct zoned_disk_info){ }; sdkp->capacity = 0; goto unlock; } @@ -776,7 +910,16 @@ unlock: return ret; } -int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) +/** + * sd_zbc_read_zones - Read zone information and update the request queue + * @sdkp: SCSI disk pointer. + * @buf: 512 byte buffer used for storing SCSI command output. + * + * Read zone information and update the request queue zone characteristics and + * also the zoned device information in *sdkp. Called by sd_revalidate_disk() + * before the gendisk capacity has been set. + */ +int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) { struct gendisk *disk = sdkp->disk; struct request_queue *q = disk->queue; @@ -834,8 +977,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) if (blk_queue_zoned_model(q) == BLK_ZONED_HM) blk_queue_zone_write_granularity(q, sdkp->physical_block_size); - sdkp->rev_nr_zones = nr_zones; - sdkp->rev_zone_blocks = zone_blocks; + sdkp->early_zone_info.nr_zones = nr_zones; + sdkp->early_zone_info.zone_blocks = zone_blocks; return 0; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 6110dfd903f7..90dac9190432 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1711,13 +1711,13 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) * be unbounded on Azure. Reset the timer unconditionally to give the host a * chance to perform EH. 
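+ * SCSI_EH_RESET_TIMER makes scsi_times_out() return BLK_EH_RESET_TIMER,
+ * restarting the request timer rather than escalating to the SCSI error
+ * handler (see the scsi_timeout_action handling in scsi_error.c above).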
*/ -static enum blk_eh_timer_return storvsc_eh_timed_out(struct scsi_cmnd *scmnd) +static enum scsi_timeout_action storvsc_eh_timed_out(struct scsi_cmnd *scmnd) { #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) if (scmnd->device->host->transportt == fc_transport_template) return fc_eh_timed_out(scmnd); #endif - return BLK_EH_RESET_TIMER; + return SCSI_EH_RESET_TIMER; } static bool storvsc_scsi_cmd_ok(struct scsi_cmnd *scmnd) diff --git a/drivers/scsi/ufs/Kconfig b/drivers/scsi/ufs/Kconfig deleted file mode 100644 index 432df76e6318..000000000000 --- a/drivers/scsi/ufs/Kconfig +++ /dev/null @@ -1,201 +0,0 @@ -# -# Kernel configuration file for the UFS Host Controller -# -# This code is based on drivers/scsi/ufs/Kconfig -# Copyright (C) 2011-2013 Samsung India Software Operations -# -# Authors: -# Santosh Yaraganavi -# Vinayak Holikatti -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# See the COPYING file in the top-level directory or visit -# -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# This program is provided "AS IS" and "WITH ALL FAULTS" and -# without warranty of any kind. You are solely responsible for -# determining the appropriateness of using and distributing -# the program and assume all risks associated with your exercise -# of rights with respect to the program, including but not limited -# to infringement of third party rights, the risks and costs of -# program errors, damage to or loss of data, programs or equipment, -# and unavailability or interruption of operations. Under no -# circumstances will the contributor of this Program be liable for -# any damages of any kind arising from your use or distribution of -# this program. - -config SCSI_UFSHCD - tristate "Universal Flash Storage Controller Driver Core" - depends on SCSI && SCSI_DMA - select PM_DEVFREQ - select DEVFREQ_GOV_SIMPLE_ONDEMAND - select NLS - help - This selects the support for UFS devices in Linux, say Y and make - sure that you know the name of your UFS host adapter (the card - inside your computer that "speaks" the UFS protocol, also - called UFS Host Controller), because you will be asked for it. - The module will be called ufshcd. - - To compile this driver as a module, choose M here and read - . - However, do not compile this as a module if your root file system - (the one containing the directory /) is located on a UFS device. - -config SCSI_UFSHCD_PCI - tristate "PCI bus based UFS Controller support" - depends on SCSI_UFSHCD && PCI - help - This selects the PCI UFS Host Controller Interface. Select this if - you have UFS Host Controller with PCI Interface. - - If you have a controller with this interface, say Y or M here. - - If unsure, say N. - -config SCSI_UFS_DWC_TC_PCI - tristate "DesignWare pci support using a G210 Test Chip" - depends on SCSI_UFSHCD_PCI - help - Synopsys Test Chip is a PHY for prototyping purposes. - - If unsure, say N. - -config SCSI_UFSHCD_PLATFORM - tristate "Platform bus based UFS Controller support" - depends on SCSI_UFSHCD - depends on HAS_IOMEM - help - This selects the UFS host controller support. Select this if - you have an UFS controller on Platform bus. 
- - If you have a controller with this interface, say Y or M here. - - If unsure, say N. - -config SCSI_UFS_CDNS_PLATFORM - tristate "Cadence UFS Controller platform driver" - depends on SCSI_UFSHCD_PLATFORM - help - This selects the Cadence-specific additions to UFSHCD platform driver. - - If unsure, say N. - -config SCSI_UFS_DWC_TC_PLATFORM - tristate "DesignWare platform support using a G210 Test Chip" - depends on SCSI_UFSHCD_PLATFORM - help - Synopsys Test Chip is a PHY for prototyping purposes. - - If unsure, say N. - -config SCSI_UFS_QCOM - tristate "QCOM specific hooks to UFS controller platform driver" - depends on SCSI_UFSHCD_PLATFORM && ARCH_QCOM - select QCOM_SCM if SCSI_UFS_CRYPTO - select RESET_CONTROLLER - help - This selects the QCOM specific additions to UFSHCD platform driver. - UFS host on QCOM needs some vendor specific configuration before - accessing the hardware which includes PHY configuration and vendor - specific registers. - - Select this if you have UFS controller on QCOM chipset. - If unsure, say N. - -config SCSI_UFS_MEDIATEK - tristate "Mediatek specific hooks to UFS controller platform driver" - depends on SCSI_UFSHCD_PLATFORM && ARCH_MEDIATEK - select PHY_MTK_UFS - select RESET_TI_SYSCON - help - This selects the Mediatek specific additions to UFSHCD platform driver. - UFS host on Mediatek needs some vendor specific configuration before - accessing the hardware which includes PHY configuration and vendor - specific registers. - - Select this if you have UFS controller on Mediatek chipset. - - If unsure, say N. - -config SCSI_UFS_HISI - tristate "Hisilicon specific hooks to UFS controller platform driver" - depends on (ARCH_HISI || COMPILE_TEST) && SCSI_UFSHCD_PLATFORM - help - This selects the Hisilicon specific additions to UFSHCD platform driver. - - Select this if you have UFS controller on Hisilicon chipset. - If unsure, say N. - -config SCSI_UFS_TI_J721E - tristate "TI glue layer for Cadence UFS Controller" - depends on OF && HAS_IOMEM && (ARCH_K3 || COMPILE_TEST) - help - This selects driver for TI glue layer for Cadence UFS Host - Controller IP. - - Selects this if you have TI platform with UFS controller. - If unsure, say N. - -config SCSI_UFS_BSG - bool "Universal Flash Storage BSG device node" - depends on SCSI_UFSHCD - select BLK_DEV_BSGLIB - help - Universal Flash Storage (UFS) is SCSI transport specification for - accessing flash storage on digital cameras, mobile phones and - consumer electronic devices. - A UFS controller communicates with a UFS device by exchanging - UFS Protocol Information Units (UPIUs). - UPIUs can not only be used as a transport layer for the SCSI protocol - but are also used by the UFS native command set. - This transport driver supports exchanging UFS protocol information units - with a UFS device. See also the ufshcd driver, which is a SCSI driver - that supports UFS devices. - - Select this if you need a bsg device node for your UFS controller. - If unsure, say N. - -config SCSI_UFS_EXYNOS - tristate "EXYNOS specific hooks to UFS controller platform driver" - depends on SCSI_UFSHCD_PLATFORM && (ARCH_EXYNOS || COMPILE_TEST) - help - This selects the EXYNOS specific additions to UFSHCD platform driver. - UFS host on EXYNOS includes HCI and UNIPRO layer, and associates with - UFS-PHY driver. - - Select this if you have UFS host controller on EXYNOS chipset. - If unsure, say N. 
- -config SCSI_UFS_CRYPTO - bool "UFS Crypto Engine Support" - depends on SCSI_UFSHCD && BLK_INLINE_ENCRYPTION - help - Enable Crypto Engine Support in UFS. - Enabling this makes it possible for the kernel to use the crypto - capabilities of the UFS device (if present) to perform crypto - operations on data being transferred to/from the device. - -config SCSI_UFS_HPB - bool "Support UFS Host Performance Booster" - depends on SCSI_UFSHCD - help - The UFS HPB feature improves random read performance. It caches - L2P (logical to physical) map of UFS to host DRAM. The driver uses HPB - read command by piggybacking physical page number for bypassing FTL (flash - translation layer)'s L2P address translation. - -config SCSI_UFS_FAULT_INJECTION - bool "UFS Fault Injection Support" - depends on SCSI_UFSHCD && FAULT_INJECTION - help - Enable fault injection support in the UFS driver. This makes it easier - to test the UFS error handler and abort handler. diff --git a/drivers/scsi/ufs/ufs-mediatek.h b/drivers/scsi/ufs/ufs-mediatek.h deleted file mode 100644 index 3f0d3bb769e8..000000000000 --- a/drivers/scsi/ufs/ufs-mediatek.h +++ /dev/null @@ -1,118 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2019 MediaTek Inc. - */ - -#ifndef _UFS_MEDIATEK_H -#define _UFS_MEDIATEK_H - -#include -#include - -/* - * Vendor specific UFSHCI Registers - */ -#define REG_UFS_REFCLK_CTRL 0x144 -#define REG_UFS_EXTREG 0x2100 -#define REG_UFS_MPHYCTRL 0x2200 -#define REG_UFS_REJECT_MON 0x22AC -#define REG_UFS_DEBUG_SEL 0x22C0 -#define REG_UFS_PROBE 0x22C8 - -/* - * Ref-clk control - * - * Values for register REG_UFS_REFCLK_CTRL - */ -#define REFCLK_RELEASE 0x0 -#define REFCLK_REQUEST BIT(0) -#define REFCLK_ACK BIT(1) - -#define REFCLK_REQ_TIMEOUT_US 3000 - -/* - * Other attributes - */ -#define VS_DEBUGCLOCKENABLE 0xD0A1 -#define VS_SAVEPOWERCONTROL 0xD0A6 -#define VS_UNIPROPOWERDOWNCONTROL 0xD0A8 - -/* - * Vendor specific link state - */ -enum { - VS_LINK_DISABLED = 0, - VS_LINK_DOWN = 1, - VS_LINK_UP = 2, - VS_LINK_HIBERN8 = 3, - VS_LINK_LOST = 4, - VS_LINK_CFG = 5, -}; - -/* - * SiP commands - */ -#define MTK_SIP_UFS_CONTROL MTK_SIP_SMC_CMD(0x276) -#define UFS_MTK_SIP_VA09_PWR_CTRL BIT(0) -#define UFS_MTK_SIP_DEVICE_RESET BIT(1) -#define UFS_MTK_SIP_CRYPTO_CTRL BIT(2) -#define UFS_MTK_SIP_REF_CLK_NOTIFICATION BIT(3) - -/* - * VS_DEBUGCLOCKENABLE - */ -enum { - TX_SYMBOL_CLK_REQ_FORCE = 5, -}; - -/* - * VS_SAVEPOWERCONTROL - */ -enum { - RX_SYMBOL_CLK_GATE_EN = 0, - SYS_CLK_GATE_EN = 2, - TX_CLK_GATE_EN = 3, -}; - -/* - * Host capability - */ -enum ufs_mtk_host_caps { - UFS_MTK_CAP_BOOST_CRYPT_ENGINE = 1 << 0, - UFS_MTK_CAP_VA09_PWR_CTRL = 1 << 1, - UFS_MTK_CAP_DISABLE_AH8 = 1 << 2, - UFS_MTK_CAP_BROKEN_VCC = 1 << 3, -}; - -struct ufs_mtk_crypt_cfg { - struct regulator *reg_vcore; - struct clk *clk_crypt_perf; - struct clk *clk_crypt_mux; - struct clk *clk_crypt_lp; - int vcore_volt; -}; - -struct ufs_mtk_hw_ver { - u8 step; - u8 minor; - u8 major; -}; - -struct ufs_mtk_host { - struct phy *mphy; - struct regulator *reg_va09; - struct reset_control *hci_reset; - struct reset_control *unipro_reset; - struct reset_control *crypto_reset; - struct ufs_hba *hba; - struct ufs_mtk_crypt_cfg *crypt; - struct ufs_mtk_hw_ver hw_ver; - enum ufs_mtk_host_caps caps; - bool mphy_powered_on; - bool unipro_lpm; - bool ref_clk_enabled; - u16 ref_clk_ungating_wait_us; - u16 ref_clk_gating_wait_us; -}; - -#endif /* !_UFS_MEDIATEK_H */ diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 
07d0250f17c3..8b20df123fd1 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -731,9 +731,9 @@ static void virtscsi_commit_rqs(struct Scsi_Host *shost, u16 hwq) * latencies might be higher than on bare metal. Reset the timer * unconditionally to give the host a chance to perform EH. */ -static enum blk_eh_timer_return virtscsi_eh_timed_out(struct scsi_cmnd *scmnd) +static enum scsi_timeout_action virtscsi_eh_timed_out(struct scsi_cmnd *scmnd) { - return BLK_EH_RESET_TIMER; + return SCSI_EH_RESET_TIMER; } static struct scsi_host_template virtscsi_host_template = { diff --git a/drivers/soc/qcom/qcom-geni-se.c b/drivers/soc/qcom/qcom-geni-se.c index 7d649d2cf31e..83fd6f44689d 100644 --- a/drivers/soc/qcom/qcom-geni-se.c +++ b/drivers/soc/qcom/qcom-geni-se.c @@ -1,6 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2017-2018, The Linux Foundation. All rights reserved. +/* Disable MMIO tracing to prevent excessive logging of unwanted MMIO traces */ +#define __DISABLE_TRACE_MMIO__ + #include #include #include diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig index 70498adb1575..9aca9ce10f8a 100644 --- a/drivers/staging/android/Kconfig +++ b/drivers/staging/android/Kconfig @@ -14,6 +14,17 @@ config ASHMEM It is, in theory, a good memory allocator for low-memory devices, because it can discard shared memory units when under memory pressure. +config DEBUG_KINFO + bool "Debug Kernel Information Support" + depends on KALLSYMS + help + This supports kernel information backup for bootloader usage. + Specifics: + - The kallsyms symbols for unwind_backtrace + - Page directory pointer + - UTS_RELEASE + - BUILD_INFO(ro.build.fingerprint) + endif # if ANDROID endmenu diff --git a/drivers/staging/android/Makefile b/drivers/staging/android/Makefile index e9a55a5e6529..b92c5fe69bad 100644 --- a/drivers/staging/android/Makefile +++ b/drivers/staging/android/Makefile @@ -2,3 +2,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ASHMEM) += ashmem.o +obj-$(CONFIG_DEBUG_KINFO) += debug_kinfo.o diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index ddbde3f8430e..a998435a5a83 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -817,6 +817,7 @@ out_unlock: static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ashmem_area *asma = file->private_data; + unsigned long ino; long ret = -ENOTTY; switch (cmd) { @@ -860,6 +861,23 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ashmem_shrink_scan(&ashmem_shrinker, &sc); } break; + case ASHMEM_GET_FILE_ID: + /* Lock around our check to avoid racing with ashmem_mmap(). 
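+ * ashmem_mmap() assigns asma->file while holding the same mutex, so
+ * taking it here ensures file_inode() is only called on a file that has
+ * been fully set up.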
*/ + mutex_lock(&ashmem_mutex); + if (!asma || !asma->file) { + mutex_unlock(&ashmem_mutex); + ret = -EINVAL; + break; + } + ino = file_inode(asma->file)->i_ino; + mutex_unlock(&ashmem_mutex); + + if (copy_to_user((void __user *)arg, &ino, sizeof(ino))) { + ret = -EFAULT; + break; + } + ret = 0; + break; } return ret; @@ -916,6 +934,15 @@ static const struct file_operations ashmem_fops = { #endif }; +/* + * is_ashmem_file - Check if struct file* is associated with ashmem + */ +int is_ashmem_file(struct file *file) +{ + return file->f_op == &ashmem_fops; +} +EXPORT_SYMBOL_GPL(is_ashmem_file); + static struct miscdevice ashmem_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "ashmem", diff --git a/drivers/staging/android/ashmem.h b/drivers/staging/android/ashmem.h index 1a478173cd21..9fa72ed7b7ed 100644 --- a/drivers/staging/android/ashmem.h +++ b/drivers/staging/android/ashmem.h @@ -21,4 +21,6 @@ #define COMPAT_ASHMEM_SET_PROT_MASK _IOW(__ASHMEMIOC, 5, unsigned int) #endif +int is_ashmem_file(struct file *file); + #endif /* _LINUX_ASHMEM_H */ diff --git a/drivers/staging/android/debug_kinfo.c b/drivers/staging/android/debug_kinfo.c new file mode 100644 index 000000000000..0c68fc55b841 --- /dev/null +++ b/drivers/staging/android/debug_kinfo.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * debug_kinfo.c - backup kernel information for bootloader usage + * + * Copyright 2002 Rusty Russell IBM Corporation + * Copyright 2021 Google LLC + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "debug_kinfo.h" + +/* + * These will be re-linked against their real values + * during the second link stage. + */ +extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; +extern const u8 kallsyms_names[] __weak; + +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV). 
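+ * Without the explicit __section(".rodata") placement below, such an
+ * architecture could emit small-data relative accesses that do not match
+ * where the linker actually places the kallsyms tables.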
+ */ +extern const unsigned int kallsyms_num_syms __weak +__section(".rodata"); + +extern const unsigned long kallsyms_relative_base __weak +__section(".rodata"); + +extern const u8 kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; + +extern const unsigned int kallsyms_markers[] __weak; + +static void *all_info_addr; +static u32 all_info_size; + +static void update_kernel_all_info(struct kernel_all_info *all_info) +{ + int index; + struct kernel_info *info; + u32 *checksum_info; + + all_info->magic_number = DEBUG_KINFO_MAGIC; + all_info->combined_checksum = 0; + + info = &(all_info->info); + checksum_info = (u32 *)info; + for (index = 0; index < sizeof(*info) / sizeof(u32); index++) + all_info->combined_checksum ^= checksum_info[index]; +} + +static int build_info_set(const char *str, const struct kernel_param *kp) +{ + struct kernel_all_info *all_info; + size_t build_info_size; + int ret = 0; + + if (all_info_addr == 0 || all_info_size == 0) { + ret = -EPERM; + goto Exit; + } + + all_info = (struct kernel_all_info *)all_info_addr; + build_info_size = sizeof(all_info->info.build_info); + + memcpy(&all_info->info.build_info, str, min(build_info_size - 1, strlen(str))); + update_kernel_all_info(all_info); + + if (strlen(str) > build_info_size) { + pr_warn("%s: Build info buffer (len: %zd) can't hold entire string '%s'\n", + __func__, build_info_size, str); + ret = -ENOMEM; + } + +Exit: + return ret; +} + +static const struct kernel_param_ops build_info_op = { + .set = build_info_set, +}; + +module_param_cb(build_info, &build_info_op, NULL, 0200); +MODULE_PARM_DESC(build_info, "Write build info to field 'build_info' of debug kinfo."); + +static int debug_kinfo_probe(struct platform_device *pdev) +{ + struct device_node *mem_region; + struct reserved_mem *rmem; + struct kernel_all_info *all_info; + struct kernel_info *info; + + mem_region = of_parse_phandle(pdev->dev.of_node, "memory-region", 0); + if (!mem_region) { + dev_warn(&pdev->dev, "no such memory-region\n"); + return -ENODEV; + } + + rmem = of_reserved_mem_lookup(mem_region); + if (!rmem) { + dev_warn(&pdev->dev, "no such reserved mem of node name %s\n", + pdev->dev.of_node->name); + return -ENODEV; + } + + /* Need to wait for reserved memory to be mapped */ + if (!rmem->priv) { + return -EPROBE_DEFER; + } + + if (!rmem->base || !rmem->size) { + dev_warn(&pdev->dev, "unexpected reserved memory\n"); + return -EINVAL; + } + + if (rmem->size < sizeof(struct kernel_all_info)) { + dev_warn(&pdev->dev, "unexpected reserved memory size\n"); + return -EINVAL; + } + + all_info_addr = rmem->priv; + all_info_size = rmem->size; + + memset(all_info_addr, 0, sizeof(struct kernel_all_info)); + all_info = (struct kernel_all_info *)all_info_addr; + info = &(all_info->info); + info->enabled_all = IS_ENABLED(CONFIG_KALLSYMS_ALL); + info->enabled_base_relative = IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE); + info->enabled_absolute_percpu = IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU); + info->enabled_cfi_clang = IS_ENABLED(CONFIG_CFI_CLANG); + info->num_syms = kallsyms_num_syms; + info->name_len = KSYM_NAME_LEN; + info->bit_per_long = BITS_PER_LONG; + info->module_name_len = MODULE_NAME_LEN; + info->symbol_len = KSYM_SYMBOL_LEN; + if (!info->enabled_base_relative) + info->_addresses_pa = (u64)__pa_symbol((volatile void *)kallsyms_addresses); + else { + info->_relative_pa = (u64)__pa_symbol((volatile void *)kallsyms_relative_base); + info->_offsets_pa = (u64)__pa_symbol((volatile void *)kallsyms_offsets); + } + info->_stext_pa = 
(u64)__pa_symbol(_stext); + info->_etext_pa = (u64)__pa_symbol(_etext); + info->_sinittext_pa = (u64)__pa_symbol(_sinittext); + info->_einittext_pa = (u64)__pa_symbol(_einittext); + info->_end_pa = (u64)__pa_symbol(_end); + info->_names_pa = (u64)__pa_symbol((volatile void *)kallsyms_names); + info->_token_table_pa = (u64)__pa_symbol((volatile void *)kallsyms_token_table); + info->_token_index_pa = (u64)__pa_symbol((volatile void *)kallsyms_token_index); + info->_markers_pa = (u64)__pa_symbol((volatile void *)kallsyms_markers); + info->thread_size = THREAD_SIZE; + info->swapper_pg_dir_pa = (u64)__pa_symbol(swapper_pg_dir); + strlcpy(info->last_uts_release, init_utsname()->release, sizeof(info->last_uts_release)); + info->enabled_modules_tree_lookup = IS_ENABLED(CONFIG_MODULES_TREE_LOOKUP); + info->mod_core_layout_offset = offsetof(struct module, core_layout); + info->mod_init_layout_offset = offsetof(struct module, init_layout); + info->mod_kallsyms_offset = offsetof(struct module, kallsyms); +#if defined(CONFIG_RANDOMIZE_BASE) && defined(MODULES_VSIZE) + info->module_start_va = module_alloc_base; + info->module_end_va = info->module_start_va + MODULES_VSIZE; +#elif defined(CONFIG_MODULES) && defined(MODULES_VADDR) + info->module_start_va = MODULES_VADDR; + info->module_end_va = MODULES_END; +#else + info->module_start_va = VMALLOC_START; + info->module_end_va = VMALLOC_END; +#endif + info->enabled_module_scmversion = IS_ENABLED(CONFIG_MODULE_SCMVERSION); + info->mod_scmversion_offset = offsetof(struct module, scmversion); + update_kernel_all_info(all_info); + + return 0; +} + +static const struct of_device_id debug_kinfo_of_match[] = { + { .compatible = "google,debug-kinfo" }, + {}, +}; +MODULE_DEVICE_TABLE(of, debug_kinfo_of_match); + +static struct platform_driver debug_kinfo_driver = { + .probe = debug_kinfo_probe, + .driver = { + .name = "debug-kinfo", + .of_match_table = of_match_ptr(debug_kinfo_of_match), + }, +}; +module_platform_driver(debug_kinfo_driver); + +MODULE_AUTHOR("Jone Chou "); +MODULE_DESCRIPTION("Debug Kinfo Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/staging/android/debug_kinfo.h b/drivers/staging/android/debug_kinfo.h new file mode 100644 index 000000000000..43d0a4cc987c --- /dev/null +++ b/drivers/staging/android/debug_kinfo.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * debug_kinfo.h - backup kernel information for bootloader usage + * + * Copyright 2021 Google LLC + */ + +#ifndef DEBUG_KINFO_H +#define DEBUG_KINFO_H + +#include + +#define BUILD_INFO_LEN 256 +#define DEBUG_KINFO_MAGIC 0xCCEEDDFF + +/* + * Header structure must be byte-packed, since the table is provided to + * bootloader. 
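+ * Without __packed the compiler could insert padding between fields,
+ * making the field offsets ABI dependent; packing pins a single fixed
+ * layout that the bootloader can parse with constant offsets.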
+ */ +struct kernel_info { + /* For kallsyms */ + __u8 enabled_all; + __u8 enabled_base_relative; + __u8 enabled_absolute_percpu; + __u8 enabled_cfi_clang; + __u32 num_syms; + __u16 name_len; + __u16 bit_per_long; + __u16 module_name_len; + __u16 symbol_len; + __u64 _addresses_pa; + __u64 _relative_pa; + __u64 _stext_pa; + __u64 _etext_pa; + __u64 _sinittext_pa; + __u64 _einittext_pa; + __u64 _end_pa; + __u64 _offsets_pa; + __u64 _names_pa; + __u64 _token_table_pa; + __u64 _token_index_pa; + __u64 _markers_pa; + + /* For frame pointer */ + __u32 thread_size; + + /* For virt_to_phys */ + __u64 swapper_pg_dir_pa; + + /* For linux banner */ + __u8 last_uts_release[__NEW_UTS_LEN]; + + /* Info of running build */ + __u8 build_info[BUILD_INFO_LEN]; + + /* For module kallsyms */ + __u32 enabled_modules_tree_lookup; + __u32 mod_core_layout_offset; + __u32 mod_init_layout_offset; + __u32 mod_kallsyms_offset; + __u64 module_start_va; + __u64 module_end_va; + + /* For module scmversion */ + __u32 enabled_module_scmversion; + __u32 mod_scmversion_offset; +} __packed; + +struct kernel_all_info { + __u32 magic_number; + __u32 combined_checksum; + struct kernel_info info; +} __packed; + +#endif // DEBUG_KINFO_H diff --git a/drivers/staging/android/uapi/ashmem.h b/drivers/staging/android/uapi/ashmem.h index 134efacb3219..f962a5f38fde 100644 --- a/drivers/staging/android/uapi/ashmem.h +++ b/drivers/staging/android/uapi/ashmem.h @@ -39,5 +39,6 @@ struct ashmem_pin { #define ASHMEM_UNPIN _IOW(__ASHMEMIOC, 8, struct ashmem_pin) #define ASHMEM_GET_PIN_STATUS _IO(__ASHMEMIOC, 9) #define ASHMEM_PURGE_ALL_CACHES _IO(__ASHMEMIOC, 10) +#define ASHMEM_GET_FILE_ID _IOR(__ASHMEMIOC, 11, unsigned long) #endif /* _UAPI_LINUX_ASHMEM_H */ diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index 499ac3a77512..07156cb39a29 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -462,8 +462,8 @@ check_bss: notify_channel = ieee80211_get_channel(wiphy, freq); - roam_info.channel = notify_channel; - roam_info.bssid = cur_network->network.mac_address; + roam_info.links[0].channel = notify_channel; + roam_info.links[0].bssid = cur_network->network.mac_address; roam_info.req_ie = pmlmepriv->assoc_req+sizeof(struct ieee80211_hdr_3addr)+2; roam_info.req_ie_len = @@ -935,8 +935,8 @@ exit: } static int cfg80211_rtw_add_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { char *alg_name; u32 param_len; @@ -1027,8 +1027,8 @@ addkey_end: } static int cfg80211_rtw_get_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, const u8 *mac_addr, - void *cookie, + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { @@ -1036,7 +1036,8 @@ static int cfg80211_rtw_get_key(struct wiphy *wiphy, struct net_device *ndev, } static int cfg80211_rtw_del_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct adapter *padapter = rtw_netdev_priv(ndev); struct security_priv *psecuritypriv = &padapter->securitypriv; @@ -1051,7 +1052,7 @@ static int cfg80211_rtw_del_key(struct wiphy *wiphy, struct net_device *ndev, } static int 
cfg80211_rtw_set_default_key(struct wiphy *wiphy, - struct net_device *ndev, u8 key_index + struct net_device *ndev, int link_id, u8 key_index , bool unicast, bool multicast ) { @@ -2120,6 +2121,7 @@ static u8 rtw_get_chan_type(struct adapter *adapter) } static int cfg80211_rtw_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct adapter *adapter = wiphy_to_adapter(wiphy); @@ -2480,7 +2482,8 @@ static int cfg80211_rtw_change_beacon(struct wiphy *wiphy, struct net_device *nd return rtw_add_beacon(adapter, info->head, info->head_len, info->tail, info->tail_len); } -static int cfg80211_rtw_stop_ap(struct wiphy *wiphy, struct net_device *ndev) +static int cfg80211_rtw_stop_ap(struct wiphy *wiphy, struct net_device *ndev, + unsigned int link_id) { return 0; } diff --git a/drivers/staging/wfx/sta.c b/drivers/staging/wfx/sta.c index cb7e8abdf43c..bed830e26b8d 100644 --- a/drivers/staging/wfx/sta.c +++ b/drivers/staging/wfx/sta.c @@ -449,11 +449,11 @@ static void wfx_join_finalize(struct wfx_vif *wvif, rcu_read_lock(); // protect sta if (info->bssid && !info->ibss_joined) sta = ieee80211_find_sta(wvif->vif, info->bssid); - if (sta && sta->ht_cap.ht_supported) - ampdu_density = sta->ht_cap.ampdu_density; - if (sta && sta->ht_cap.ht_supported && + if (sta && sta->deflink.ht_cap.ht_supported) + ampdu_density = sta->deflink.ht_cap.ampdu_density; + if (sta && sta->deflink.ht_cap.ht_supported && !(info->ht_operation_mode & IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT)) - greenfield = !!(sta->ht_cap.cap & IEEE80211_HT_CAP_GRN_FLD); + greenfield = !!(sta->deflink.ht_cap.cap & IEEE80211_HT_CAP_GRN_FLD); rcu_read_unlock(); wvif->join_in_progress = false; diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c index 7951bd63816f..63804b28657b 100644 --- a/drivers/staging/wlan-ng/cfg80211.c +++ b/drivers/staging/wlan-ng/cfg80211.c @@ -143,8 +143,8 @@ exit: } static int prism2_add_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct wlandevice *wlandev = dev->ml_priv; u32 did; @@ -172,7 +172,7 @@ static int prism2_add_key(struct wiphy *wiphy, struct net_device *dev, } static int prism2_get_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { @@ -202,7 +202,8 @@ static int prism2_get_key(struct wiphy *wiphy, struct net_device *dev, } static int prism2_del_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct wlandevice *wlandev = dev->ml_priv; u32 did; @@ -227,7 +228,8 @@ static int prism2_del_key(struct wiphy *wiphy, struct net_device *dev, } static int prism2_set_default_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool unicast, bool multicast) + int link_id, u8 key_index, bool unicast, + bool multicast) { struct wlandevice *wlandev = dev->ml_priv; @@ -647,7 +649,7 @@ void prism2_disconnected(struct wlandevice *wlandev) void prism2_roamed(struct wlandevice *wlandev) { struct cfg80211_roam_info roam_info = { - .bssid = wlandev->bssid, + .links[0].bssid = wlandev->bssid, }; cfg80211_roamed(wlandev->netdev, &roam_info, GFP_KERNEL); diff --git 
a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 023bd4516a68..573a0913a70a 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3686,6 +3686,7 @@ static void __exit target_core_exit_configfs(void) MODULE_DESCRIPTION("Target_Core_Mod/ConfigFS"); MODULE_AUTHOR("nab@Linux-iSCSI.org"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); module_init(target_core_init_configfs); module_exit(target_core_exit_configfs); diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index ef4a8e189fba..394e915b68e1 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -947,6 +947,7 @@ static void __exit fileio_module_exit(void) MODULE_DESCRIPTION("TCM FILEIO subsystem plugin"); MODULE_AUTHOR("nab@Linux-iSCSI.org"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); module_init(fileio_module_init); module_exit(fileio_module_exit); diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 12a60415af95..ff386e50b10f 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -23,6 +23,7 @@ #include #include +#include /* * Cooling state <-> CPUFreq frequency @@ -218,6 +219,8 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev, freq = cpufreq_quick_get(policy->cpu); + trace_android_vh_modify_thermal_request_freq(policy, &freq); + if (trace_thermal_power_cpu_get_power_enabled()) { u32 ncpus = cpumask_weight(policy->related_cpus); @@ -317,6 +320,8 @@ static int cpufreq_power2state(struct thermal_cooling_device *cdev, normalised_power = (power * 100) / last_load; target_freq = cpu_power_to_freq(cpufreq_cdev, normalised_power); + trace_android_vh_modify_thermal_target_freq(policy, &target_freq); + *state = get_level(cpufreq_cdev, target_freq); trace_thermal_power_cpu_limit(policy->related_cpus, target_freq, *state, power); diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 1d5052470967..ee6022d73ce0 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -14,6 +14,8 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include #include "thermal_core.h" @@ -468,6 +470,7 @@ static int allocate_power(struct thermal_zone_device *tz, } power_range = pid_controller(tz, control_temp, max_allocatable_power); + trace_android_vh_thermal_power_cap(&power_range); divvy_up_power(weighted_req_power, max_power, num_actors, total_weighted_req_power, power_range, granted_power, @@ -711,6 +714,8 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip) int switch_on_temp, control_temp; struct power_allocator_params *params = tz->governor_data; bool update; + bool enable = true; + bool override = false; /* * We get called for every trip point but we only need to do @@ -719,9 +724,22 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip) if (trip != params->trip_max_desired_temperature) return 0; - ret = tz->ops->get_trip_temp(tz, params->trip_switch_on, - &switch_on_temp); - if (!ret && (tz->temperature < switch_on_temp)) { + /* + * Enable or disable IPA control by temperature and user power budget. + * About enable: + * true: enable IPA control when temperature >= switch_on_temp. + * false: disable IPA control when temperature < switch_on_temp.
+ * About override: + * true: power budget is overridden by user power budget. + * false: power budget is not overridden, there's no other thermal + * requirement. + */ + trace_android_vh_enable_thermal_power_throttle(&enable, &override); + if (enable) + ret = tz->ops->get_trip_temp(tz, params->trip_switch_on, + &switch_on_temp); + if (!enable || (!ret && (tz->temperature < switch_on_temp) && + !override)) { update = (tz->last_temperature >= switch_on_temp); tz->passive = 0; reset_pid_controller(params); diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 052e8e8fbb21..4d146d282690 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -23,6 +23,8 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include #include "thermal_core.h" #include "thermal_hwmon.h" @@ -505,6 +507,8 @@ void thermal_zone_device_update(struct thermal_zone_device *tz, for (count = 0; count < tz->num_trips; count++) handle_thermal_trip(tz, count); + + trace_android_vh_get_thermal_zone_device(tz); } EXPORT_SYMBOL_GPL(thermal_zone_device_update); diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c index c70d407c2c71..27dc5e7ff397 100644 --- a/drivers/thermal/thermal_netlink.c +++ b/drivers/thermal/thermal_netlink.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "thermal_core.h" @@ -229,6 +230,11 @@ static int thermal_genl_send_event(enum thermal_genl_event event, struct sk_buff *msg; int ret = -EMSGSIZE; void *hdr; + int enable_thermal_genl = 1; + + trace_android_vh_enable_thermal_genl_check(event, p->tz_id, &enable_thermal_genl); + if (!enable_thermal_genl) + return 0; msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index 9233f7e74454..ca73c9767891 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -203,6 +203,28 @@ static int of_thermal_get_trend(struct thermal_zone_device *tz, int trip, return data->ops->get_trend(data->sensor_data, trip, trend); } +static int of_thermal_change_mode(struct thermal_zone_device *tz, + enum thermal_device_mode mode) +{ + struct __thermal_zone *data = tz->devdata; + + return data->ops->change_mode(data->sensor_data, mode); +} + +static void of_thermal_hot_notify(struct thermal_zone_device *tz) +{ + struct __thermal_zone *data = tz->devdata; + + data->ops->hot(data->sensor_data); +} + +static void of_thermal_critical_notify(struct thermal_zone_device *tz) +{ + struct __thermal_zone *data = tz->devdata; + + data->ops->critical(data->sensor_data); +} + static int of_thermal_bind(struct thermal_zone_device *thermal, struct thermal_cooling_device *cdev) { @@ -408,6 +430,14 @@ thermal_zone_of_add_sensor(struct device_node *zone, if (ops->set_emul_temp) tzd->ops->set_emul_temp = of_thermal_set_emul_temp; + if (ops->change_mode) + tzd->ops->change_mode = of_thermal_change_mode; + + if (ops->hot) + tzd->ops->hot = of_thermal_hot_notify; + + if (ops->critical) + tzd->ops->critical = of_thermal_critical_notify; mutex_unlock(&tzd->lock); return tzd; @@ -569,6 +599,9 @@ void thermal_zone_of_sensor_unregister(struct device *dev, tzd->ops->get_temp = NULL; tzd->ops->get_trend = NULL; tzd->ops->set_emul_temp = NULL; + tzd->ops->change_mode = NULL; + tzd->ops->hot = NULL; + tzd->ops->critical = NULL; tz->ops = NULL; tz->sensor_data = NULL; diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c index de7cdec3db90..32d9238f73c4 100644 --- 
a/drivers/thermal/thermal_sysfs.c +++ b/drivers/thermal/thermal_sysfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "thermal_core.h" @@ -816,6 +817,15 @@ static void cooling_device_stats_setup(struct thermal_cooling_device *cdev) struct cooling_dev_stats *stats; unsigned long states; int var; + int disable_cdev_stats = 0; + + trace_android_vh_disable_thermal_cooling_stats(cdev, + &disable_cdev_stats); + if (disable_cdev_stats) { + var = ARRAY_SIZE(cooling_device_attr_groups) - 2; + cooling_device_attr_groups[var] = NULL; + return; + } if (cdev->ops->get_max_state(cdev, &states)) goto out; diff --git a/drivers/tty/hvc/hvc_console.h b/drivers/tty/hvc/hvc_console.h index 18d005814e4b..97200fbeeae8 100644 --- a/drivers/tty/hvc/hvc_console.h +++ b/drivers/tty/hvc/hvc_console.h @@ -30,7 +30,7 @@ * for the tty device. Since this driver supports hotplug of vty adapters we * need to make sure we have enough allocated. */ -#define HVC_ALLOC_TTY_ADAPTERS 8 +#define HVC_ALLOC_TTY_ADAPTERS 64 struct hvc_struct { struct tty_port port; diff --git a/drivers/tty/hvc/hvc_dcc.c b/drivers/tty/hvc/hvc_dcc.c index 8e0edb7d93fd..03c0a18f25c6 100644 --- a/drivers/tty/hvc/hvc_dcc.c +++ b/drivers/tty/hvc/hvc_dcc.c @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -11,6 +12,13 @@ #include "hvc_console.h" +/* + * The DCC driver is disabled at runtime by default: GKI kernels want it + * compiled in, but some devices do not support the DCC registers and crash + * when the driver pokes them. + */ +static bool enable; +module_param(enable, bool, 0444); + /* DCC Status Bits */ #define DCC_STATUS_RX (1 << 30) #define DCC_STATUS_TX (1 << 29) @@ -91,7 +99,7 @@ static int __init hvc_dcc_console_init(void) { int ret; - if (!hvc_dcc_check()) + if (!enable || !hvc_dcc_check()) return -ENODEV; /* Returns -1 if error */ @@ -105,7 +113,7 @@ static int __init hvc_dcc_init(void) { struct hvc_struct *p; - if (!hvc_dcc_check()) + if (!enable || !hvc_dcc_check()) return -ENODEV; p = hvc_alloc(0, 0, &hvc_dcc_get_put_ops, 128); diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 131a6a587acd..029210f4ae55 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -236,7 +236,6 @@ config SERIAL_CLPS711X_CONSOLE config SERIAL_SAMSUNG tristate "Samsung SoC serial support" - depends on PLAT_SAMSUNG || ARCH_S5PV210 || ARCH_EXYNOS || ARCH_APPLE || COMPILE_TEST select SERIAL_CORE help Support for the on-chip UARTs on the Samsung S3C24XX series CPUs, @@ -938,6 +937,18 @@ config SERIAL_QCOM_GENI depends on QCOM_GENI_SE select SERIAL_CORE +config SERIAL_QCOM_GENI_CONSOLE_DEFAULT_ENABLED + bool "Enable QCOM on-chip GENI serial port for kernel console usage" + default y + depends on SERIAL_QCOM_GENI + help + Runtime enablement of the serial console can be controlled with + qcom_geni_serial.con_enabled=1 on the kernel command line. This is + useful for kernels that must have the serial driver compiled in but, + for performance reasons, should not use it by default. This option + sets the default value of con_enabled. + + If unsure, say Y. + config SERIAL_QCOM_GENI_CONSOLE bool "QCOM GENI Serial Console support" depends on SERIAL_QCOM_GENI diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index ce1c81731a2a..77a82a04650b 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -1,5 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2017-2018, The Linux foundation. All rights reserved. +/* + * Copyright (c) 2017-2018, The Linux foundation. All rights reserved.
+ * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +/* Disable MMIO tracing to prevent excessive logging of unwanted MMIO traces */ +#define __DISABLE_TRACE_MMIO__ #include #include @@ -7,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +26,10 @@ #include #include #include +#include + +static bool con_enabled = IS_ENABLED(CONFIG_SERIAL_QCOM_GENI_CONSOLE_DEFAULT_ENABLED); +module_param(con_enabled, bool, 0644); /* UART specific GENI registers */ #define SE_UART_LOOPBACK_CFG 0x22c @@ -135,6 +146,7 @@ struct qcom_geni_serial_port { int wakeup_irq; bool rx_tx_swap; bool cts_rts_swap; + bool is_console; struct qcom_geni_private_data private_data; }; @@ -1365,6 +1377,18 @@ static int qcom_geni_serial_probe(struct platform_device *pdev) return PTR_ERR(port); } + port->is_console = console; + + if (console && !con_enabled) { + dev_err(&pdev->dev, "%s, Console Disabled\n", __func__); + ret = pinctrl_pm_select_sleep_state(&pdev->dev); + if (ret) + dev_err(&pdev->dev, + "failed to set pinctrl state to sleep %d\n", ret); + platform_set_drvdata(pdev, port); + return ret; + } + uport = &port->uport; /* Don't allow 2 drivers to access the same port */ if (uport->private_data) @@ -1482,6 +1506,12 @@ static int qcom_geni_serial_remove(struct platform_device *pdev) struct qcom_geni_serial_port *port = platform_get_drvdata(pdev); struct uart_driver *drv = port->private_data.drv; + /* The platform driver is registered even for the console port; when + * the console is disabled on the cmdline, simply return success. + */ + if (port->is_console && !con_enabled) + return 0; + dev_pm_clear_wake_irq(&pdev->dev); device_init_wakeup(&pdev->dev, false); uart_remove_one_port(drv, &port->uport); @@ -1491,9 +1521,18 @@ static int qcom_geni_serial_remove(struct platform_device *pdev) static int __maybe_unused qcom_geni_serial_sys_suspend(struct device *dev) { + struct uart_port *uport; + struct qcom_geni_private_data *private_data; struct qcom_geni_serial_port *port = dev_get_drvdata(dev); - struct uart_port *uport = &port->uport; - struct qcom_geni_private_data *private_data = uport->private_data; + + /* The platform driver is registered even for the console port; when + * the console is disabled on the cmdline, simply return success. + */ + if (port->is_console && !con_enabled) + return 0; + + uport = &port->uport; + private_data = uport->private_data; /* * This is done so we can hit the lowest possible state in suspend @@ -1509,9 +1548,18 @@ static int __maybe_unused qcom_geni_serial_sys_suspend(struct device *dev) static int __maybe_unused qcom_geni_serial_sys_resume(struct device *dev) { int ret; + struct uart_port *uport; + struct qcom_geni_private_data *private_data; struct qcom_geni_serial_port *port = dev_get_drvdata(dev); - struct uart_port *uport = &port->uport; - struct qcom_geni_private_data *private_data = uport->private_data; + + /* The platform driver is registered even for the console port; when + * the console is disabled on the cmdline, simply return success.
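+ * (For instance, booting with qcom_geni_serial.con_enabled=0 keeps the uart driver registered while console-only paths like this one bail out early.)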
+ */ + if (port->is_console && !con_enabled) + return 0; + + uport = &port->uport; + private_data = uport->private_data; ret = uart_resume_port(private_data->drv, uport); if (uart_console(uport)) { @@ -1547,19 +1595,22 @@ static int __init qcom_geni_serial_init(void) { int ret; - ret = console_register(&qcom_geni_console_driver); + ret = uart_register_driver(&qcom_geni_uart_driver); if (ret) return ret; - ret = uart_register_driver(&qcom_geni_uart_driver); - if (ret) { - console_unregister(&qcom_geni_console_driver); - return ret; + if (con_enabled) { + ret = console_register(&qcom_geni_console_driver); + if (ret) { + uart_unregister_driver(&qcom_geni_uart_driver); + return ret; + } } ret = platform_driver_register(&qcom_geni_serial_platform_driver); if (ret) { - console_unregister(&qcom_geni_console_driver); + if (con_enabled) + console_unregister(&qcom_geni_console_driver); uart_unregister_driver(&qcom_geni_uart_driver); } return ret; @@ -1569,7 +1620,8 @@ module_init(qcom_geni_serial_init); static void __exit qcom_geni_serial_exit(void) { platform_driver_unregister(&qcom_geni_serial_platform_driver); - console_unregister(&qcom_geni_console_driver); + if (con_enabled) + console_unregister(&qcom_geni_console_driver); uart_unregister_driver(&qcom_geni_uart_driver); } module_exit(qcom_geni_serial_exit); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 6b445ece8339..07c33c20f62a 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -55,6 +55,8 @@ #include #include +#include + /* Whether we react on sysrq keys or just ignore them */ static int __read_mostly sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static bool __read_mostly sysrq_always_enabled; @@ -152,6 +154,8 @@ static void sysrq_handle_crash(int key) /* release the RCU read lock before crashing */ rcu_read_unlock(); + trace_android_vh_sysrq_crash(current); + panic("sysrq triggered crash\n"); } static const struct sysrq_key_op sysrq_crash_op = { diff --git a/drivers/ufs/Kconfig b/drivers/ufs/Kconfig new file mode 100644 index 000000000000..90226f72c158 --- /dev/null +++ b/drivers/ufs/Kconfig @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# UFS subsystem configuration +# + +menuconfig SCSI_UFSHCD + tristate "Universal Flash Storage Controller" + depends on SCSI && SCSI_DMA + select PM_DEVFREQ + select DEVFREQ_GOV_SIMPLE_ONDEMAND + select NLS + help + Enables support for UFS (Universal Flash Storage) host controllers. + A UFS host controller is an electronic component that is able to + communicate with a UFS card. UFS host controllers occur in + smartphones, laptops, digital cameras and also in cars. + The kernel module will be called ufshcd. + + To compile this driver as a module, choose M here and read + . + However, do not compile this as a module if your root file system + (the one containing the directory /) is located on a UFS device. + +if SCSI_UFSHCD + +source "drivers/ufs/core/Kconfig" + +source "drivers/ufs/host/Kconfig" + +endif diff --git a/drivers/ufs/Makefile b/drivers/ufs/Makefile new file mode 100644 index 000000000000..5a199ef18d4c --- /dev/null +++ b/drivers/ufs/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +# The link order is important here. ufshcd-core must initialize +# before vendor drivers. 
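+# (Within one initcall level, built-in objects initialize in link order, so listing core/ before host/ on the obj rule below preserves that guarantee.)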
+obj-$(CONFIG_SCSI_UFSHCD) += core/ host/ diff --git a/drivers/ufs/core/Kconfig b/drivers/ufs/core/Kconfig new file mode 100644 index 000000000000..e11978171403 --- /dev/null +++ b/drivers/ufs/core/Kconfig @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0+ +# +# Kernel configuration file for the UFS Host Controller core. +# +# Copyright (C) 2011-2013 Samsung India Software Operations +# +# Authors: +# Santosh Yaraganavi +# Vinayak Holikatti + +config SCSI_UFS_BSG + bool "Universal Flash Storage BSG device node" + select BLK_DEV_BSGLIB + help + Universal Flash Storage (UFS) is SCSI transport specification for + accessing flash storage on digital cameras, mobile phones and + consumer electronic devices. + A UFS controller communicates with a UFS device by exchanging + UFS Protocol Information Units (UPIUs). + UPIUs can not only be used as a transport layer for the SCSI protocol + but are also used by the UFS native command set. + This transport driver supports exchanging UFS protocol information units + with a UFS device. See also the ufshcd driver, which is a SCSI driver + that supports UFS devices. + + Select this if you need a bsg device node for your UFS controller. + If unsure, say N. + +config SCSI_UFS_CRYPTO + bool "UFS Crypto Engine Support" + depends on BLK_INLINE_ENCRYPTION + help + Enable Crypto Engine Support in UFS. + Enabling this makes it possible for the kernel to use the crypto + capabilities of the UFS device (if present) to perform crypto + operations on data being transferred to/from the device. + +config SCSI_UFS_HPB + bool "Support UFS Host Performance Booster" + help + The UFS HPB feature improves random read performance. It caches + L2P (logical to physical) map of UFS to host DRAM. The driver uses HPB + read command by piggybacking physical page number for bypassing FTL (flash + translation layer)'s L2P address translation. + +config SCSI_UFS_FAULT_INJECTION + bool "UFS Fault Injection Support" + depends on FAULT_INJECTION + help + Enable fault injection support in the UFS driver. This makes it easier + to test the UFS error handler and abort handler. + +config SCSI_UFS_HWMON + bool "UFS Temperature Notification" + depends on SCSI_UFSHCD=HWMON || HWMON=y + help + This provides support for UFS hardware monitoring. If enabled, + a hardware monitoring device will be created for the UFS device. + + If unsure, say N. 
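As a quick illustration of how the options above compose, here is a hypothetical .config fragment that builds the UFS core into the kernel with the BSG node and hardware monitoring enabled (example values, not a recommendation):

    CONFIG_SCSI_UFSHCD=y
    CONFIG_SCSI_UFS_BSG=y
    CONFIG_HWMON=y
    CONFIG_SCSI_UFS_HWMON=y

With SCSI_UFSHCD=m, the SCSI_UFS_HWMON dependency is instead satisfied by HWMON=m or HWMON=y.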
diff --git a/drivers/ufs/core/Makefile b/drivers/ufs/core/Makefile new file mode 100644 index 000000000000..4d02e0f2de10 --- /dev/null +++ b/drivers/ufs/core/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_SCSI_UFSHCD) += ufshcd-core.o +ufshcd-core-y += ufshcd.o ufs-sysfs.o ufs-mcq.o +ufshcd-core-$(CONFIG_DEBUG_FS) += ufs-debugfs.o +ufshcd-core-$(CONFIG_SCSI_UFS_BSG) += ufs_bsg.o +ufshcd-core-$(CONFIG_SCSI_UFS_CRYPTO) += ufshcd-crypto.o +ufshcd-core-$(CONFIG_SCSI_UFS_HPB) += ufshpb.o +ufshcd-core-$(CONFIG_SCSI_UFS_FAULT_INJECTION) += ufs-fault-injection.o +ufshcd-core-$(CONFIG_SCSI_UFS_HWMON) += ufs-hwmon.o diff --git a/drivers/scsi/ufs/ufs-debugfs.c b/drivers/ufs/core/ufs-debugfs.c similarity index 60% rename from drivers/scsi/ufs/ufs-debugfs.c rename to drivers/ufs/core/ufs-debugfs.c index 4e1ff209b933..e3baed6c70bd 100644 --- a/drivers/scsi/ufs/ufs-debugfs.c +++ b/drivers/ufs/core/ufs-debugfs.c @@ -4,10 +4,23 @@ #include #include "ufs-debugfs.h" -#include "ufshcd.h" +#include +#include "ufshcd-priv.h" static struct dentry *ufs_debugfs_root; +struct ufs_debugfs_attr { + const char *name; + mode_t mode; + const struct file_operations *fops; +}; + +/* @file corresponds to a debugfs attribute in directory hba->debugfs_root. */ +static inline struct ufs_hba *hba_from_file(const struct file *file) +{ + return d_inode(file->f_path.dentry->d_parent)->i_private; +} + void __init ufs_debugfs_init(void) { ufs_debugfs_root = debugfs_create_dir("ufshcd", NULL); @@ -20,7 +33,7 @@ void ufs_debugfs_exit(void) static int ufs_debugfs_stats_show(struct seq_file *s, void *data) { - struct ufs_hba *hba = s->private; + struct ufs_hba *hba = hba_from_file(s->file); struct ufs_event_hist *e = hba->ufs_stats.event; #define PRT(fmt, typ) \ @@ -126,13 +139,93 @@ static void ufs_debugfs_restart_ee(struct work_struct *work) ufs_debugfs_put_user_access(hba); } +static int ufs_saved_err_show(struct seq_file *s, void *data) +{ + struct ufs_debugfs_attr *attr = s->private; + struct ufs_hba *hba = hba_from_file(s->file); + const int *p; + + if (strcmp(attr->name, "saved_err") == 0) { + p = &hba->saved_err; + } else if (strcmp(attr->name, "saved_uic_err") == 0) { + p = &hba->saved_uic_err; + } else { + return -ENOENT; + } + + seq_printf(s, "%d\n", *p); + return 0; +} + +static ssize_t ufs_saved_err_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ufs_debugfs_attr *attr = file->f_inode->i_private; + struct ufs_hba *hba = hba_from_file(file); + char val_str[16] = { }; + int val, ret; + + if (count > sizeof(val_str)) + return -EINVAL; + if (copy_from_user(val_str, buf, count)) + return -EFAULT; + ret = kstrtoint(val_str, 0, &val); + if (ret < 0) + return ret; + + spin_lock_irq(hba->host->host_lock); + if (strcmp(attr->name, "saved_err") == 0) { + hba->saved_err = val; + } else if (strcmp(attr->name, "saved_uic_err") == 0) { + hba->saved_uic_err = val; + } else { + ret = -ENOENT; + } + if (ret == 0) + ufshcd_schedule_eh_work(hba); + spin_unlock_irq(hba->host->host_lock); + + return ret < 0 ? 
ret : count; +} + +static int ufs_saved_err_open(struct inode *inode, struct file *file) +{ + return single_open(file, ufs_saved_err_show, inode->i_private); +} + +static const struct file_operations ufs_saved_err_fops = { + .owner = THIS_MODULE, + .open = ufs_saved_err_open, + .read = seq_read, + .write = ufs_saved_err_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct ufs_debugfs_attr ufs_attrs[] = { + { "stats", 0400, &ufs_debugfs_stats_fops }, + { "saved_err", 0600, &ufs_saved_err_fops }, + { "saved_uic_err", 0600, &ufs_saved_err_fops }, + { } +}; + void ufs_debugfs_hba_init(struct ufs_hba *hba) { + const struct ufs_debugfs_attr *attr; + struct dentry *root; + /* Set default exception event rate limit period to 20ms */ hba->debugfs_ee_rate_limit_ms = 20; INIT_DELAYED_WORK(&hba->debugfs_ee_work, ufs_debugfs_restart_ee); - hba->debugfs_root = debugfs_create_dir(dev_name(hba->dev), ufs_debugfs_root); - debugfs_create_file("stats", 0400, hba->debugfs_root, hba, &ufs_debugfs_stats_fops); + + root = debugfs_create_dir(dev_name(hba->dev), ufs_debugfs_root); + if (IS_ERR_OR_NULL(root)) + return; + hba->debugfs_root = root; + d_inode(root)->i_private = hba; + for (attr = ufs_attrs; attr->name; attr++) + debugfs_create_file(attr->name, attr->mode, root, (void *)attr, + attr->fops); debugfs_create_file("exception_event_mask", 0600, hba->debugfs_root, hba, &ee_usr_mask_fops); debugfs_create_u32("exception_event_rate_limit_ms", 0600, hba->debugfs_root, diff --git a/drivers/scsi/ufs/ufs-debugfs.h b/drivers/ufs/core/ufs-debugfs.h similarity index 100% rename from drivers/scsi/ufs/ufs-debugfs.h rename to drivers/ufs/core/ufs-debugfs.h diff --git a/drivers/scsi/ufs/ufs-fault-injection.c b/drivers/ufs/core/ufs-fault-injection.c similarity index 100% rename from drivers/scsi/ufs/ufs-fault-injection.c rename to drivers/ufs/core/ufs-fault-injection.c diff --git a/drivers/scsi/ufs/ufs-fault-injection.h b/drivers/ufs/core/ufs-fault-injection.h similarity index 100% rename from drivers/scsi/ufs/ufs-fault-injection.h rename to drivers/ufs/core/ufs-fault-injection.h diff --git a/drivers/ufs/core/ufs-hwmon.c b/drivers/ufs/core/ufs-hwmon.c new file mode 100644 index 000000000000..4c6a872b7a7c --- /dev/null +++ b/drivers/ufs/core/ufs-hwmon.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * UFS hardware monitoring support + * Copyright (c) 2021, Western Digital Corporation + */ + +#include +#include + +#include +#include "ufshcd-priv.h" + +struct ufs_hwmon_data { + struct ufs_hba *hba; + u8 mask; +}; + +static int ufs_read_temp_enable(struct ufs_hba *hba, u8 mask, long *val) +{ + u32 ee_mask; + int err; + + err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR, QUERY_ATTR_IDN_EE_CONTROL, 0, 0, + &ee_mask); + if (err) + return err; + + *val = (mask & ee_mask & MASK_EE_TOO_HIGH_TEMP) || (mask & ee_mask & MASK_EE_TOO_LOW_TEMP); + + return 0; +} + +static int ufs_get_temp(struct ufs_hba *hba, enum attr_idn idn, long *val) +{ + u32 value; + int err; + + err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR, idn, 0, 0, &value); + if (err) + return err; + + if (value == 0) + return -ENODATA; + + *val = ((long)value - 80) * MILLIDEGREE_PER_DEGREE; + + return 0; +} + +static int ufs_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, + long *val) +{ + struct ufs_hwmon_data *data = dev_get_drvdata(dev); + struct ufs_hba *hba = data->hba; + int err; + + down(&hba->host_sem); + + if (!ufshcd_is_user_access_allowed(hba)) { + 
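+		/* Host is shutting down: drop host_sem and report busy. */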
up(&hba->host_sem); + return -EBUSY; + } + + ufshcd_rpm_get_sync(hba); + + switch (attr) { + case hwmon_temp_enable: + err = ufs_read_temp_enable(hba, data->mask, val); + + break; + case hwmon_temp_crit: + err = ufs_get_temp(hba, QUERY_ATTR_IDN_HIGH_TEMP_BOUND, val); + + break; + case hwmon_temp_lcrit: + err = ufs_get_temp(hba, QUERY_ATTR_IDN_LOW_TEMP_BOUND, val); + + break; + case hwmon_temp_input: + err = ufs_get_temp(hba, QUERY_ATTR_IDN_CASE_ROUGH_TEMP, val); + + break; + default: + err = -EOPNOTSUPP; + + break; + } + + ufshcd_rpm_put_sync(hba); + + up(&hba->host_sem); + + return err; +} + +static int ufs_hwmon_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, + long val) +{ + struct ufs_hwmon_data *data = dev_get_drvdata(dev); + struct ufs_hba *hba = data->hba; + int err; + + if (attr != hwmon_temp_enable) + return -EINVAL; + + if (val != 0 && val != 1) + return -EINVAL; + + down(&hba->host_sem); + + if (!ufshcd_is_user_access_allowed(hba)) { + up(&hba->host_sem); + return -EBUSY; + } + + ufshcd_rpm_get_sync(hba); + + if (val == 1) + err = ufshcd_update_ee_usr_mask(hba, MASK_EE_URGENT_TEMP, 0); + else + err = ufshcd_update_ee_usr_mask(hba, 0, MASK_EE_URGENT_TEMP); + + ufshcd_rpm_put_sync(hba); + + up(&hba->host_sem); + + return err; +} + +static umode_t ufs_hwmon_is_visible(const void *_data, enum hwmon_sensor_types type, u32 attr, + int channel) +{ + if (type != hwmon_temp) + return 0; + + switch (attr) { + case hwmon_temp_enable: + return 0644; + case hwmon_temp_crit: + case hwmon_temp_lcrit: + case hwmon_temp_input: + return 0444; + default: + break; + } + return 0; +} + +static const struct hwmon_channel_info *ufs_hwmon_info[] = { + HWMON_CHANNEL_INFO(temp, HWMON_T_ENABLE | HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_LCRIT), + NULL +}; + +static const struct hwmon_ops ufs_hwmon_ops = { + .is_visible = ufs_hwmon_is_visible, + .read = ufs_hwmon_read, + .write = ufs_hwmon_write, +}; + +static const struct hwmon_chip_info ufs_hwmon_hba_info = { + .ops = &ufs_hwmon_ops, + .info = ufs_hwmon_info, +}; + +void ufs_hwmon_probe(struct ufs_hba *hba, u8 mask) +{ + struct device *dev = hba->dev; + struct ufs_hwmon_data *data; + struct device *hwmon; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + + data->hba = hba; + data->mask = mask; + + hwmon = hwmon_device_register_with_info(dev, "ufs", data, &ufs_hwmon_hba_info, NULL); + if (IS_ERR(hwmon)) { + dev_warn(dev, "Failed to instantiate hwmon device\n"); + kfree(data); + return; + } + + hba->hwmon_device = hwmon; +} + +void ufs_hwmon_remove(struct ufs_hba *hba) +{ + struct ufs_hwmon_data *data; + + if (!hba->hwmon_device) + return; + + data = dev_get_drvdata(hba->hwmon_device); + hwmon_device_unregister(hba->hwmon_device); + hba->hwmon_device = NULL; + kfree(data); +} + +void ufs_hwmon_notify_event(struct ufs_hba *hba, u8 ee_mask) +{ + if (!hba->hwmon_device) + return; + + if (ee_mask & MASK_EE_TOO_HIGH_TEMP) + hwmon_notify_event(hba->hwmon_device, hwmon_temp, hwmon_temp_max_alarm, 0); + + if (ee_mask & MASK_EE_TOO_LOW_TEMP) + hwmon_notify_event(hba->hwmon_device, hwmon_temp, hwmon_temp_min_alarm, 0); +} diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c new file mode 100644 index 000000000000..31df052fbc41 --- /dev/null +++ b/drivers/ufs/core/ufs-mcq.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Qualcomm Innovation Center. All rights reserved. 
+ * + * Authors: + * Asutosh Das + * Can Guo + */ + +#include +#include +#include +#include +#include "ufshcd-priv.h" + +#define MAX_QUEUE_SUP GENMASK(7, 0) +#define UFS_MCQ_MIN_RW_QUEUES 2 +#define UFS_MCQ_MIN_READ_QUEUES 0 +#define UFS_MCQ_NUM_DEV_CMD_QUEUES 1 +#define UFS_MCQ_MIN_POLL_QUEUES 0 +#define QUEUE_EN_OFFSET 31 +#define QUEUE_ID_OFFSET 16 + +#define MAX_DEV_CMD_ENTRIES 2 +#define MCQ_CFG_MAC_MASK GENMASK(16, 8) +#define MCQ_QCFG_SIZE 0x40 +#define MCQ_ENTRY_SIZE_IN_DWORD 8 +#define CQE_UCD_BA GENMASK_ULL(63, 7) + +static int rw_queue_count_set(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, UFS_MCQ_MIN_RW_QUEUES, + num_possible_cpus()); +} + +static const struct kernel_param_ops rw_queue_count_ops = { + .set = rw_queue_count_set, + .get = param_get_uint, +}; + +static unsigned int rw_queues; +module_param_cb(rw_queues, &rw_queue_count_ops, &rw_queues, 0644); +MODULE_PARM_DESC(rw_queues, + "Number of interrupt driven I/O queues used for rw. Default value is nr_cpus"); + +static int read_queue_count_set(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, UFS_MCQ_MIN_READ_QUEUES, + num_possible_cpus()); +} + +static const struct kernel_param_ops read_queue_count_ops = { + .set = read_queue_count_set, + .get = param_get_uint, +}; + +static unsigned int read_queues; +module_param_cb(read_queues, &read_queue_count_ops, &read_queues, 0644); +MODULE_PARM_DESC(read_queues, + "Number of interrupt driven read queues used for read. Default value is 0"); + +static int poll_queue_count_set(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, UFS_MCQ_MIN_POLL_QUEUES, + num_possible_cpus()); +} + +static const struct kernel_param_ops poll_queue_count_ops = { + .set = poll_queue_count_set, + .get = param_get_uint, +}; + +static unsigned int poll_queues = 1; +module_param_cb(poll_queues, &poll_queue_count_ops, &poll_queues, 0644); +MODULE_PARM_DESC(poll_queues, + "Number of poll queues used for r/w. Default value is 1"); + +/** + * ufshcd_mcq_config_mac - Set the maximum # of active commands (MAC). + * @hba: per adapter instance + * @max_active_cmds: maximum # of active commands to the device at any time. + * + * The controller won't send more than the max_active_cmds to the device at + * any time. + */ +void ufshcd_mcq_config_mac(struct ufs_hba *hba, u32 max_active_cmds) +{ + u32 val; + + val = ufshcd_readl(hba, REG_UFS_MCQ_CFG); + val &= ~MCQ_CFG_MAC_MASK; + val |= FIELD_PREP(MCQ_CFG_MAC_MASK, max_active_cmds); + ufshcd_writel(hba, val, REG_UFS_MCQ_CFG); +} + +/** + * ufshcd_mcq_req_to_hwq - find the hardware queue on which the + * request would be issued. + * @hba: per adapter instance + * @req: pointer to the request to be issued + * + * Returns the hardware queue instance on which the request would + * be queued. + */ +struct ufs_hw_queue *ufshcd_mcq_req_to_hwq(struct ufs_hba *hba, + struct request *req) +{ + u32 utag = blk_mq_unique_tag(req); + u32 hwq = blk_mq_unique_tag_to_hwq(utag); + + /* uhq[0] is used to serve device commands */ + return &hba->uhq[hwq + UFSHCD_MCQ_IO_QUEUE_OFFSET]; +} + +/** + * ufshcd_mcq_decide_queue_depth - decide the queue depth + * @hba: per adapter instance + * + * Returns queue-depth on success, non-zero on error + * + * MAC - Max. Active Command of the Host Controller (HC) + * The HC won't send more than this many commands to the device. + * It is mandatory to implement get_hba_mac() to enable MCQ mode.
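+ * (Worked example with hypothetical values: MAC = 64 and bQueueDepth = 32 give min(64, 32) = 32; see the min_t() below.)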
+ * Calculates and adjusts the queue depth based on the depth + * supported by the HC and ufs device. + */ +int ufshcd_mcq_decide_queue_depth(struct ufs_hba *hba) +{ + int mac; + + /* Mandatory to implement get_hba_mac() */ + mac = ufshcd_mcq_vops_get_hba_mac(hba); + if (mac < 0) { + dev_err(hba->dev, "Failed to get mac, err=%d\n", mac); + return mac; + } + + WARN_ON_ONCE(!hba->dev_info.bqueuedepth); + /* + * max. value of bqueuedepth = 256, mac is host dependent. + * It is mandatory for UFS device to define bQueueDepth if + * shared queuing architecture is enabled. + */ + return min_t(int, mac, hba->dev_info.bqueuedepth); +} + +static int ufshcd_mcq_config_nr_queues(struct ufs_hba *hba) +{ + int i; + u32 hba_maxq, rem, tot_queues; + struct Scsi_Host *host = hba->host; + + hba_maxq = FIELD_GET(MAX_QUEUE_SUP, hba->mcq_capabilities); + + tot_queues = UFS_MCQ_NUM_DEV_CMD_QUEUES + read_queues + poll_queues + + rw_queues; + + if (hba_maxq < tot_queues) { + dev_err(hba->dev, "Total queues (%d) exceeds HC capacity (%d)\n", + tot_queues, hba_maxq); + return -EOPNOTSUPP; + } + + rem = hba_maxq - UFS_MCQ_NUM_DEV_CMD_QUEUES; + + if (rw_queues) { + hba->nr_queues[HCTX_TYPE_DEFAULT] = rw_queues; + rem -= hba->nr_queues[HCTX_TYPE_DEFAULT]; + } else { + rw_queues = num_possible_cpus(); + } + + if (poll_queues) { + hba->nr_queues[HCTX_TYPE_POLL] = poll_queues; + rem -= hba->nr_queues[HCTX_TYPE_POLL]; + } + + if (read_queues) { + hba->nr_queues[HCTX_TYPE_READ] = read_queues; + rem -= hba->nr_queues[HCTX_TYPE_READ]; + } + + if (!hba->nr_queues[HCTX_TYPE_DEFAULT]) + hba->nr_queues[HCTX_TYPE_DEFAULT] = min3(rem, rw_queues, + num_possible_cpus()); + + for (i = 0; i < HCTX_MAX_TYPES; i++) + host->nr_hw_queues += hba->nr_queues[i]; + + hba->nr_hw_queues = host->nr_hw_queues + UFS_MCQ_NUM_DEV_CMD_QUEUES; + return 0; +} + +int ufshcd_mcq_memory_alloc(struct ufs_hba *hba) +{ + struct ufs_hw_queue *hwq; + size_t utrdl_size, cqe_size; + int i; + + for (i = 0; i < hba->nr_hw_queues; i++) { + hwq = &hba->uhq[i]; + + utrdl_size = sizeof(struct utp_transfer_req_desc) * + hwq->max_entries; + hwq->sqe_base_addr = dmam_alloc_coherent(hba->dev, utrdl_size, + &hwq->sqe_dma_addr, + GFP_KERNEL); + if (!hwq->sqe_dma_addr) { + dev_err(hba->dev, "SQE allocation failed\n"); + return -ENOMEM; + } + + cqe_size = sizeof(struct cq_entry) * hwq->max_entries; + hwq->cqe_base_addr = dmam_alloc_coherent(hba->dev, cqe_size, + &hwq->cqe_dma_addr, + GFP_KERNEL); + if (!hwq->cqe_dma_addr) { + dev_err(hba->dev, "CQE allocation failed\n"); + return -ENOMEM; + } + } + + return 0; +} + + +/* Operation and runtime registers configuration */ +#define MCQ_CFG_n(r, i) ((r) + MCQ_QCFG_SIZE * (i)) +#define MCQ_OPR_OFFSET_n(p, i) \ + (hba->mcq_opr[(p)].offset + hba->mcq_opr[(p)].stride * (i)) + +static void __iomem *mcq_opr_base(struct ufs_hba *hba, + enum ufshcd_mcq_opr n, int i) +{ + struct ufshcd_mcq_opr_info_t *opr = &hba->mcq_opr[n]; + + return opr->base + opr->stride * i; +} + +u32 ufshcd_mcq_read_cqis(struct ufs_hba *hba, int i) +{ + return readl(mcq_opr_base(hba, OPR_CQIS, i) + REG_CQIS); +} + +void ufshcd_mcq_write_cqis(struct ufs_hba *hba, u32 val, int i) +{ + writel(val, mcq_opr_base(hba, OPR_CQIS, i) + REG_CQIS); +} +EXPORT_SYMBOL_GPL(ufshcd_mcq_write_cqis); + +/* + * Current MCQ specification doesn't provide a Task Tag or its equivalent in + * the Completion Queue Entry. Find the Task Tag using an indirect method. 
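+ * As a worked example with hypothetical numbers: if sizeof(struct utp_transfer_cmd_desc) is 128 and the masked command_desc_base_addr lies 640 bytes past ucdl_dma_addr, the recovered tag is 640 / 128 = 5.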
+ */ +static int ufshcd_mcq_get_tag(struct ufs_hba *hba, + struct ufs_hw_queue *hwq, + struct cq_entry *cqe) +{ + u64 addr; + + /* sizeof(struct utp_transfer_cmd_desc) must be a multiple of 128 */ + BUILD_BUG_ON(sizeof(struct utp_transfer_cmd_desc) & GENMASK(6, 0)); + + /* Bits 63:7 UCD base address, 6:5 are reserved, 4:0 is SQ ID */ + addr = (le64_to_cpu(cqe->command_desc_base_addr) & CQE_UCD_BA) - + hba->ucdl_dma_addr; + + return div_u64(addr, sizeof(struct utp_transfer_cmd_desc)); +} + +static void ufshcd_mcq_process_cqe(struct ufs_hba *hba, + struct ufs_hw_queue *hwq) +{ + struct cq_entry *cqe = ufshcd_mcq_cur_cqe(hwq); + int tag = ufshcd_mcq_get_tag(hba, hwq, cqe); + + ufshcd_compl_one_cqe(hba, tag, cqe); +} + +unsigned long ufshcd_mcq_poll_cqe_nolock(struct ufs_hba *hba, + struct ufs_hw_queue *hwq) +{ + unsigned long completed_reqs = 0; + + ufshcd_mcq_update_cq_tail_slot(hwq); + while (!ufshcd_mcq_is_cq_empty(hwq)) { + ufshcd_mcq_process_cqe(hba, hwq); + ufshcd_mcq_inc_cq_head_slot(hwq); + completed_reqs++; + } + + if (completed_reqs) + ufshcd_mcq_update_cq_head(hwq); + + return completed_reqs; +} +EXPORT_SYMBOL_GPL(ufshcd_mcq_poll_cqe_nolock); + +unsigned long ufshcd_mcq_poll_cqe_lock(struct ufs_hba *hba, + struct ufs_hw_queue *hwq) +{ + unsigned long completed_reqs; + + spin_lock(&hwq->cq_lock); + completed_reqs = ufshcd_mcq_poll_cqe_nolock(hba, hwq); + spin_unlock(&hwq->cq_lock); + + return completed_reqs; +} + +void ufshcd_mcq_make_queues_operational(struct ufs_hba *hba) +{ + struct ufs_hw_queue *hwq; + u16 qsize; + int i; + + for (i = 0; i < hba->nr_hw_queues; i++) { + hwq = &hba->uhq[i]; + hwq->id = i; + qsize = hwq->max_entries * MCQ_ENTRY_SIZE_IN_DWORD - 1; + + /* Submission Queue Lower Base Address */ + ufsmcq_writelx(hba, lower_32_bits(hwq->sqe_dma_addr), + MCQ_CFG_n(REG_SQLBA, i)); + /* Submission Queue Upper Base Address */ + ufsmcq_writelx(hba, upper_32_bits(hwq->sqe_dma_addr), + MCQ_CFG_n(REG_SQUBA, i)); + /* Submission Queue Doorbell Address Offset */ + ufsmcq_writelx(hba, MCQ_OPR_OFFSET_n(OPR_SQD, i), + MCQ_CFG_n(REG_SQDAO, i)); + /* Submission Queue Interrupt Status Address Offset */ + ufsmcq_writelx(hba, MCQ_OPR_OFFSET_n(OPR_SQIS, i), + MCQ_CFG_n(REG_SQISAO, i)); + + /* Completion Queue Lower Base Address */ + ufsmcq_writelx(hba, lower_32_bits(hwq->cqe_dma_addr), + MCQ_CFG_n(REG_CQLBA, i)); + /* Completion Queue Upper Base Address */ + ufsmcq_writelx(hba, upper_32_bits(hwq->cqe_dma_addr), + MCQ_CFG_n(REG_CQUBA, i)); + /* Completion Queue Doorbell Address Offset */ + ufsmcq_writelx(hba, MCQ_OPR_OFFSET_n(OPR_CQD, i), + MCQ_CFG_n(REG_CQDAO, i)); + /* Completion Queue Interrupt Status Address Offset */ + ufsmcq_writelx(hba, MCQ_OPR_OFFSET_n(OPR_CQIS, i), + MCQ_CFG_n(REG_CQISAO, i)); + + /* Save the base addresses for quicker access */ + hwq->mcq_sq_head = mcq_opr_base(hba, OPR_SQD, i) + REG_SQHP; + hwq->mcq_sq_tail = mcq_opr_base(hba, OPR_SQD, i) + REG_SQTP; + hwq->mcq_cq_head = mcq_opr_base(hba, OPR_CQD, i) + REG_CQHP; + hwq->mcq_cq_tail = mcq_opr_base(hba, OPR_CQD, i) + REG_CQTP; + + /* Reinitializing is needed upon HC reset */ + hwq->sq_tail_slot = hwq->cq_tail_slot = hwq->cq_head_slot = 0; + + /* Enable Tail Entry Push Status interrupt only for non-poll queues */ + if (i < hba->nr_hw_queues - hba->nr_queues[HCTX_TYPE_POLL]) + writel(1, mcq_opr_base(hba, OPR_CQIS, i) + REG_CQIE); + + /* Completion Queue Enable|Size to Completion Queue Attribute */ + ufsmcq_writel(hba, (1 << QUEUE_EN_OFFSET) | qsize, + MCQ_CFG_n(REG_CQATTR, i)); + + /* + * Submission Queue
Enable|Size|Completion Queue ID to + * Submission Queue Attribute + */ + ufsmcq_writel(hba, (1 << QUEUE_EN_OFFSET) | qsize | + (i << QUEUE_ID_OFFSET), + MCQ_CFG_n(REG_SQATTR, i)); + } +} + +void ufshcd_mcq_enable_esi(struct ufs_hba *hba) +{ + ufshcd_writel(hba, ufshcd_readl(hba, REG_UFS_MEM_CFG) | 0x2, + REG_UFS_MEM_CFG); +} +EXPORT_SYMBOL_GPL(ufshcd_mcq_enable_esi); + +void ufshcd_mcq_config_esi(struct ufs_hba *hba, struct msi_msg *msg) +{ + ufshcd_writel(hba, msg->address_lo, REG_UFS_ESILBA); + ufshcd_writel(hba, msg->address_hi, REG_UFS_ESIUBA); +} +EXPORT_SYMBOL_GPL(ufshcd_mcq_config_esi); + +int ufshcd_mcq_init(struct ufs_hba *hba) +{ + struct Scsi_Host *host = hba->host; + struct ufs_hw_queue *hwq; + int ret, i; + + ret = ufshcd_mcq_config_nr_queues(hba); + if (ret) + return ret; + + ret = ufshcd_vops_mcq_config_resource(hba); + if (ret) + return ret; + + ret = ufshcd_mcq_vops_op_runtime_config(hba); + if (ret) { + dev_err(hba->dev, "Operation runtime config failed, ret=%d\n", + ret); + return ret; + } + hba->uhq = devm_kzalloc(hba->dev, + hba->nr_hw_queues * sizeof(struct ufs_hw_queue), + GFP_KERNEL); + if (!hba->uhq) { + dev_err(hba->dev, "ufs hw queue memory allocation failed\n"); + return -ENOMEM; + } + + for (i = 0; i < hba->nr_hw_queues; i++) { + hwq = &hba->uhq[i]; + hwq->max_entries = hba->nutrs; + spin_lock_init(&hwq->sq_lock); + spin_lock_init(&hwq->cq_lock); + } + + /* The very first HW queue serves device commands */ + hba->dev_cmd_queue = &hba->uhq[0]; + /* Give dev_cmd_queue the minimal number of entries */ + hba->dev_cmd_queue->max_entries = MAX_DEV_CMD_ENTRIES; + + host->host_tagset = 1; + return 0; +} diff --git a/drivers/scsi/ufs/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c similarity index 92% rename from drivers/scsi/ufs/ufs-sysfs.c rename to drivers/ufs/core/ufs-sysfs.c index 5c405ff7b6ea..db6cb5cbcb20 100644 --- a/drivers/scsi/ufs/ufs-sysfs.c +++ b/drivers/ufs/core/ufs-sysfs.c @@ -6,8 +6,11 @@ #include #include -#include "ufs.h" +#include #include "ufs-sysfs.h" +#include "ufshcd-priv.h" + +#include static const char *ufshcd_uic_link_state_to_string( enum uic_link_state state) @@ -224,12 +227,13 @@ static ssize_t wb_on_store(struct device *dev, struct device_attribute *attr, unsigned int wb_enable; ssize_t res; - if (!ufshcd_is_wb_allowed(hba) || ufshcd_is_clkscaling_supported(hba)) { + if (!ufshcd_is_wb_allowed(hba) || (ufshcd_is_clkscaling_supported(hba) + && ufshcd_enable_wb_if_scaling_up(hba))) { /* * If the platform supports UFSHCD_CAP_CLK_SCALING, turn WB * on/off will be done while clock scaling up/down. */ - dev_warn(dev, "To control WB through wb_on is not allowed!\n"); + dev_warn(dev, "It is not allowed to configure WB!\n"); return -EOPNOTSUPP; } @@ -253,6 +257,49 @@ out: return res < 0 ? 
res : count; } +static ssize_t enable_wb_buf_flush_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", hba->dev_info.wb_buf_flush_enabled); +} + +static ssize_t enable_wb_buf_flush_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + unsigned int enable_wb_buf_flush; + ssize_t res; + + if (!ufshcd_is_wb_buf_flush_allowed(hba)) { + dev_warn(dev, "It is not allowed to configure WB buf flushing!\n"); + return -EOPNOTSUPP; + } + + if (kstrtouint(buf, 0, &enable_wb_buf_flush)) + return -EINVAL; + + if (enable_wb_buf_flush != 0 && enable_wb_buf_flush != 1) + return -EINVAL; + + down(&hba->host_sem); + if (!ufshcd_is_user_access_allowed(hba)) { + res = -EBUSY; + goto out; + } + + ufshcd_rpm_get_sync(hba); + res = ufshcd_wb_toggle_buf_flush(hba, enable_wb_buf_flush); + ufshcd_rpm_put_sync(hba); + +out: + up(&hba->host_sem); + return res < 0 ? res : count; +} + static DEVICE_ATTR_RW(rpm_lvl); static DEVICE_ATTR_RO(rpm_target_dev_state); static DEVICE_ATTR_RO(rpm_target_link_state); @@ -261,6 +308,7 @@ static DEVICE_ATTR_RO(spm_target_dev_state); static DEVICE_ATTR_RO(spm_target_link_state); static DEVICE_ATTR_RW(auto_hibern8); static DEVICE_ATTR_RW(wb_on); +static DEVICE_ATTR_RW(enable_wb_buf_flush); static struct attribute *ufs_sysfs_ufshcd_attrs[] = { &dev_attr_rpm_lvl.attr, @@ -271,6 +319,7 @@ static struct attribute *ufs_sysfs_ufshcd_attrs[] = { &dev_attr_spm_target_link_state.attr, &dev_attr_auto_hibern8.attr, &dev_attr_wb_on.attr, + &dev_attr_enable_wb_buf_flush.attr, NULL }; @@ -278,6 +327,40 @@ static const struct attribute_group ufs_sysfs_default_group = { .attrs = ufs_sysfs_ufshcd_attrs, }; +static ssize_t clock_scaling_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", ufshcd_is_clkscaling_supported(hba)); +} + +static ssize_t write_booster_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", ufshcd_is_wb_allowed(hba)); +} + +static DEVICE_ATTR_RO(clock_scaling); +static DEVICE_ATTR_RO(write_booster); + +/* + * See Documentation/ABI/testing/sysfs-driver-ufs for the semantics of this + * group. 
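+ * From user space each flag reads back as 0 or 1, e.g. (hypothetical sysfs path): $ cat /sys/devices/.../capabilities/clock_scaling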
+ */ +static struct attribute *ufs_sysfs_capabilities_attrs[] = { + &dev_attr_clock_scaling.attr, + &dev_attr_write_booster.attr, + NULL +}; + +static const struct attribute_group ufs_sysfs_capabilities_group = { + .name = "capabilities", + .attrs = ufs_sysfs_capabilities_attrs, +}; + static ssize_t monitor_enable_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1133,6 +1216,7 @@ static const struct attribute_group ufs_sysfs_attributes_group = { static const struct attribute_group *ufs_sysfs_groups[] = { &ufs_sysfs_default_group, + &ufs_sysfs_capabilities_group, &ufs_sysfs_monitor_group, &ufs_sysfs_device_descriptor_group, &ufs_sysfs_interconnect_descriptor_group, @@ -1152,8 +1236,7 @@ static ssize_t _pname##_show(struct device *dev, \ struct scsi_device *sdev = to_scsi_device(dev); \ struct ufs_hba *hba = shost_priv(sdev->host); \ u8 lun = ufshcd_scsi_to_upiu_lun(sdev->lun); \ - if (!ufs_is_valid_unit_desc_lun(&hba->dev_info, lun, \ - _duname##_DESC_PARAM##_puname)) \ + if (!ufs_is_valid_unit_desc_lun(&hba->dev_info, lun)) \ return -EINVAL; \ return ufs_sysfs_read_desc_param(hba, QUERY_DESC_IDN_##_duname, \ lun, _duname##_DESC_PARAM##_puname, buf, _size); \ @@ -1204,9 +1287,27 @@ static struct attribute *ufs_sysfs_unit_descriptor[] = { NULL, }; +static umode_t ufs_unit_descriptor_is_visible(struct kobject *kobj, struct attribute *attr, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct scsi_device *sdev = to_scsi_device(dev); + u8 lun = ufshcd_scsi_to_upiu_lun(sdev->lun); + umode_t mode = attr->mode; + + if (lun == UFS_UPIU_BOOT_WLUN || lun == UFS_UPIU_UFS_DEVICE_WLUN) + /* Boot and device WLUN have no unit descriptors */ + mode = 0; + if (lun == UFS_UPIU_RPMB_WLUN && attr == &dev_attr_wb_buf_alloc_units.attr) + mode = 0; + + return mode; +} + + const struct attribute_group ufs_sysfs_unit_descriptor_group = { .name = "unit_descriptor", .attrs = ufs_sysfs_unit_descriptor, + .is_visible = ufs_unit_descriptor_is_visible, }; static ssize_t dyn_cap_needed_attribute_show(struct device *dev, @@ -1250,15 +1351,19 @@ const struct attribute_group ufs_sysfs_lun_attributes_group = { .attrs = ufs_sysfs_lun_attributes, }; -void ufs_sysfs_add_nodes(struct device *dev) +void ufs_sysfs_add_nodes(struct ufs_hba *hba) { int ret; - ret = sysfs_create_groups(&dev->kobj, ufs_sysfs_groups); - if (ret) - dev_err(dev, + ret = sysfs_create_groups(&hba->dev->kobj, ufs_sysfs_groups); + if (ret) { + dev_err(hba->dev, "%s: sysfs groups creation failed (err = %d)\n", __func__, ret); + return; + } + + trace_android_vh_ufs_update_sysfs(hba); } void ufs_sysfs_remove_nodes(struct device *dev) diff --git a/drivers/scsi/ufs/ufs-sysfs.h b/drivers/ufs/core/ufs-sysfs.h similarity index 82% rename from drivers/scsi/ufs/ufs-sysfs.h rename to drivers/ufs/core/ufs-sysfs.h index 0f4e750a6748..9f0074cac9cc 100644 --- a/drivers/scsi/ufs/ufs-sysfs.h +++ b/drivers/ufs/core/ufs-sysfs.h @@ -7,11 +7,13 @@ #include -#include "ufshcd.h" +struct device; +struct ufs_hba; -void ufs_sysfs_add_nodes(struct device *dev); +void ufs_sysfs_add_nodes(struct ufs_hba *hba); void ufs_sysfs_remove_nodes(struct device *dev); extern const struct attribute_group ufs_sysfs_unit_descriptor_group; extern const struct attribute_group ufs_sysfs_lun_attributes_group; + #endif diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/ufs/core/ufs_bsg.c similarity index 58% rename from drivers/scsi/ufs/ufs_bsg.c rename to drivers/ufs/core/ufs_bsg.c index 39bf204c6ec3..0d38e7fa34cc 100644 --- 
a/drivers/scsi/ufs/ufs_bsg.c +++ b/drivers/ufs/core/ufs_bsg.c @@ -4,37 +4,24 @@ * * Copyright (C) 2018 Western Digital Corporation */ + +#include +#include +#include +#include #include "ufs_bsg.h" +#include +#include "ufshcd-priv.h" static int ufs_bsg_get_query_desc_size(struct ufs_hba *hba, int *desc_len, struct utp_upiu_query *qr) { int desc_size = be16_to_cpu(qr->length); - int desc_id = qr->idn; if (desc_size <= 0) return -EINVAL; - ufshcd_map_desc_id_to_length(hba, desc_id, desc_len); - if (!*desc_len) - return -EINVAL; - - *desc_len = min_t(int, *desc_len, desc_size); - - return 0; -} - -static int ufs_bsg_verify_query_size(struct ufs_hba *hba, - unsigned int request_len, - unsigned int reply_len) -{ - int min_req_len = sizeof(struct ufs_bsg_request); - int min_rsp_len = sizeof(struct ufs_bsg_reply); - - if (min_req_len > request_len || min_rsp_len > reply_len) { - dev_err(hba->dev, "not enough space assigned\n"); - return -EINVAL; - } + *desc_len = min_t(int, QUERY_DESC_MAX_SIZE, desc_size); return 0; } @@ -77,23 +64,84 @@ out: return 0; } +static int ufs_bsg_exec_advanced_rpmb_req(struct ufs_hba *hba, struct bsg_job *job) +{ + struct ufs_rpmb_request *rpmb_request = job->request; + struct ufs_rpmb_reply *rpmb_reply = job->reply; + struct bsg_buffer *payload = NULL; + enum dma_data_direction dir; + struct scatterlist *sg_list = NULL; + int rpmb_req_type; + int sg_cnt = 0; + int ret; + int data_len; + + if (hba->ufs_version < ufshci_version(4, 0) || !hba->dev_info.b_advanced_rpmb_en || + !(hba->capabilities & MASK_EHSLUTRD_SUPPORTED)) + return -EINVAL; + + if (rpmb_request->ehs_req.length != 2 || rpmb_request->ehs_req.ehs_type != 1) + return -EINVAL; + + rpmb_req_type = be16_to_cpu(rpmb_request->ehs_req.meta.req_resp_type); + + switch (rpmb_req_type) { + case UFS_RPMB_WRITE_KEY: + case UFS_RPMB_READ_CNT: + case UFS_RPMB_PURGE_ENABLE: + dir = DMA_NONE; + break; + case UFS_RPMB_WRITE: + case UFS_RPMB_SEC_CONF_WRITE: + dir = DMA_TO_DEVICE; + break; + case UFS_RPMB_READ: + case UFS_RPMB_SEC_CONF_READ: + case UFS_RPMB_PURGE_STATUS_READ: + dir = DMA_FROM_DEVICE; + break; + default: + return -EINVAL; + } + + if (dir != DMA_NONE) { + payload = &job->request_payload; + if (!payload || !payload->payload_len || !payload->sg_cnt) + return -EINVAL; + + sg_cnt = dma_map_sg(hba->host->dma_dev, payload->sg_list, payload->sg_cnt, dir); + if (unlikely(!sg_cnt)) + return -ENOMEM; + sg_list = payload->sg_list; + data_len = payload->payload_len; + } + + ret = ufshcd_advanced_rpmb_req_handler(hba, &rpmb_request->bsg_request.upiu_req, + &rpmb_reply->bsg_reply.upiu_rsp, &rpmb_request->ehs_req, + &rpmb_reply->ehs_rsp, sg_cnt, sg_list, dir); + + if (dir != DMA_NONE) { + dma_unmap_sg(hba->host->dma_dev, payload->sg_list, payload->sg_cnt, dir); + + if (!ret) + rpmb_reply->bsg_reply.reply_payload_rcv_len = data_len; + } + + return ret; +} + static int ufs_bsg_request(struct bsg_job *job) { struct ufs_bsg_request *bsg_request = job->request; struct ufs_bsg_reply *bsg_reply = job->reply; struct ufs_hba *hba = shost_priv(dev_to_shost(job->dev->parent)); - unsigned int req_len = job->request_len; - unsigned int reply_len = job->reply_len; struct uic_command uc = {}; int msgcode; - uint8_t *desc_buff = NULL; + uint8_t *buff = NULL; int desc_len = 0; enum query_opcode desc_op = UPIU_QUERY_OPCODE_NOP; int ret; - - ret = ufs_bsg_verify_query_size(hba, req_len, reply_len); - if (ret) - goto out; + bool rpmb = false; bsg_reply->reply_payload_rcv_len = 0; @@ -103,34 +151,39 @@ static int ufs_bsg_request(struct bsg_job 
*job) switch (msgcode) { case UPIU_TRANSACTION_QUERY_REQ: desc_op = bsg_request->upiu_req.qr.opcode; - ret = ufs_bsg_alloc_desc_buffer(hba, job, &desc_buff, - &desc_len, desc_op); - if (ret) { - ufshcd_rpm_put_sync(hba); + ret = ufs_bsg_alloc_desc_buffer(hba, job, &buff, &desc_len, desc_op); + if (ret) goto out; - } - fallthrough; case UPIU_TRANSACTION_NOP_OUT: case UPIU_TRANSACTION_TASK_REQ: ret = ufshcd_exec_raw_upiu_cmd(hba, &bsg_request->upiu_req, &bsg_reply->upiu_rsp, msgcode, - desc_buff, &desc_len, desc_op); + buff, &desc_len, desc_op); if (ret) - dev_err(hba->dev, - "exe raw upiu: error code %d\n", ret); - + dev_err(hba->dev, "exe raw upiu: error code %d\n", ret); + else if (desc_op == UPIU_QUERY_OPCODE_READ_DESC && desc_len) { + bsg_reply->reply_payload_rcv_len = + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + buff, desc_len); + } break; case UPIU_TRANSACTION_UIC_CMD: memcpy(&uc, &bsg_request->upiu_req.uc, UIC_CMD_SIZE); ret = ufshcd_send_uic_cmd(hba, &uc); if (ret) - dev_err(hba->dev, - "send uic cmd: error code %d\n", ret); + dev_err(hba->dev, "send uic cmd: error code %d\n", ret); memcpy(&bsg_reply->upiu_rsp.uc, &uc, UIC_CMD_SIZE); break; + case UPIU_TRANSACTION_ARPMB_CMD: + rpmb = true; + ret = ufs_bsg_exec_advanced_rpmb_req(hba, job); + if (ret) + dev_err(hba->dev, "ARPMB OP failed: error code %d\n", ret); + break; default: ret = -ENOTSUPP; dev_err(hba->dev, "unsupported msgcode 0x%x\n", msgcode); @@ -138,22 +191,11 @@ static int ufs_bsg_request(struct bsg_job *job) break; } - ufshcd_rpm_put_sync(hba); - - if (!desc_buff) - goto out; - - if (desc_op == UPIU_QUERY_OPCODE_READ_DESC && desc_len) - bsg_reply->reply_payload_rcv_len = - sg_copy_from_buffer(job->request_payload.sg_list, - job->request_payload.sg_cnt, - desc_buff, desc_len); - - kfree(desc_buff); - out: + ufshcd_rpm_put_sync(hba); + kfree(buff); bsg_reply->result = ret; - job->reply_len = sizeof(struct ufs_bsg_reply); + job->reply_len = !rpmb ? 
sizeof(struct ufs_bsg_reply) : sizeof(struct ufs_rpmb_reply); /* complete the job here only if no error */ if (ret == 0) bsg_job_done(job, ret, bsg_reply->reply_payload_rcv_len); diff --git a/drivers/scsi/ufs/ufs_bsg.h b/drivers/ufs/core/ufs_bsg.h similarity index 77% rename from drivers/scsi/ufs/ufs_bsg.h rename to drivers/ufs/core/ufs_bsg.h index d09918758631..57712d2656d2 100644 --- a/drivers/scsi/ufs/ufs_bsg.h +++ b/drivers/ufs/core/ufs_bsg.h @@ -5,12 +5,7 @@ #ifndef UFS_BSG_H #define UFS_BSG_H -#include -#include -#include - -#include "ufshcd.h" -#include "ufs.h" +struct ufs_hba; #ifdef CONFIG_SCSI_UFS_BSG void ufs_bsg_remove(struct ufs_hba *hba); diff --git a/drivers/scsi/ufs/ufshcd-crypto.c b/drivers/ufs/core/ufshcd-crypto.c similarity index 79% rename from drivers/scsi/ufs/ufshcd-crypto.c rename to drivers/ufs/core/ufshcd-crypto.c index d70cdcd35e43..9702cd39f0ca 100644 --- a/drivers/scsi/ufs/ufshcd-crypto.c +++ b/drivers/ufs/core/ufshcd-crypto.c @@ -3,9 +3,15 @@ * Copyright 2019 Google LLC */ -#include "ufshcd.h" +#include #include "ufshcd-crypto.h" +#undef CREATE_TRACE_POINTS +#include + +#undef CREATE_TRACE_POINTS +#include + /* Blk-crypto modes supported by UFS crypto */ static const struct ufs_crypto_alg_entry { enum ufs_crypto_alg ufs_alg; @@ -48,11 +54,12 @@ out: return err; } -static int ufshcd_crypto_keyslot_program(struct blk_keyslot_manager *ksm, +static int ufshcd_crypto_keyslot_program(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm); + struct ufs_hba *hba = + container_of(profile, struct ufs_hba, crypto_profile); const union ufs_crypto_cap_entry *ccap_array = hba->crypto_cap_array; const struct ufs_crypto_alg_entry *alg = &ufs_crypto_algs[key->crypto_cfg.crypto_mode]; @@ -105,11 +112,12 @@ static int ufshcd_clear_keyslot(struct ufs_hba *hba, int slot) return ufshcd_program_key(hba, &cfg, slot); } -static int ufshcd_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, +static int ufshcd_crypto_keyslot_evict(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm); + struct ufs_hba *hba = + container_of(profile, struct ufs_hba, crypto_profile); return ufshcd_clear_keyslot(hba, slot); } @@ -120,11 +128,21 @@ bool ufshcd_crypto_enable(struct ufs_hba *hba) return false; /* Reset might clear all keys, so reprogram all the keys. 
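 * (In the hunk below, an Android vendor hook gets the first chance to reprogram the keys; if it leaves err at -EOPNOTSUPP, the core falls back to blk_crypto_reprogram_all_keys().)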
*/ - blk_ksm_reprogram_all_keys(&hba->ksm); + if (hba->crypto_profile.num_slots) { + int err = -EOPNOTSUPP; + + trace_android_rvh_ufs_reprogram_all_keys(hba, &err); + if (err == -EOPNOTSUPP) + blk_crypto_reprogram_all_keys(&hba->crypto_profile); + } + + if (hba->quirks & UFSHCD_QUIRK_BROKEN_CRYPTO_ENABLE) + return false; + return true; } -static const struct blk_ksm_ll_ops ufshcd_ksm_ops = { +static const struct blk_crypto_ll_ops ufshcd_crypto_ops = { .keyslot_program = ufshcd_crypto_keyslot_program, .keyslot_evict = ufshcd_crypto_keyslot_evict, }; @@ -157,6 +175,9 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba) int err = 0; enum blk_crypto_mode_num blk_mode_num; + if (hba->quirks & UFSHCD_QUIRK_CUSTOM_CRYPTO_PROFILE) + return 0; + /* * Don't use crypto if either the hardware doesn't advertise the * standard crypto capability bit *or* if the vendor specific driver @@ -179,15 +200,17 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba) } /* The actual number of configurations supported is (CFGC+1) */ - err = devm_blk_ksm_init(hba->dev, &hba->ksm, - hba->crypto_capabilities.config_count + 1); + err = devm_blk_crypto_profile_init( + hba->dev, &hba->crypto_profile, + hba->crypto_capabilities.config_count + 1); if (err) goto out; - hba->ksm.ksm_ll_ops = ufshcd_ksm_ops; + hba->crypto_profile.ll_ops = ufshcd_crypto_ops; /* UFS only supports 8 bytes for any DUN */ - hba->ksm.max_dun_bytes_supported = 8; - hba->ksm.dev = hba->dev; + hba->crypto_profile.max_dun_bytes_supported = 8; + hba->crypto_profile.key_types_supported = BLK_CRYPTO_KEY_TYPE_STANDARD; + hba->crypto_profile.dev = hba->dev; /* * Cache all the UFS crypto capabilities and advertise the supported @@ -202,7 +225,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba) blk_mode_num = ufshcd_find_blk_crypto_mode( hba->crypto_cap_array[cap_idx]); if (blk_mode_num != BLK_ENCRYPTION_MODE_INVALID) - hba->ksm.crypto_modes_supported[blk_mode_num] |= + hba->crypto_profile.modes_supported[blk_mode_num] |= hba->crypto_cap_array[cap_idx].sdus_mask * 512; } @@ -225,14 +248,14 @@ void ufshcd_init_crypto(struct ufs_hba *hba) if (!(hba->caps & UFSHCD_CAP_CRYPTO)) return; - /* Clear all keyslots - the number of keyslots is (CFGC + 1) */ - for (slot = 0; slot < hba->crypto_capabilities.config_count + 1; slot++) - ufshcd_clear_keyslot(hba, slot); + /* Clear all keyslots */ + for (slot = 0; slot < hba->crypto_profile.num_slots; slot++) + hba->crypto_profile.ll_ops.keyslot_evict(&hba->crypto_profile, + NULL, slot); } -void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba, - struct request_queue *q) +void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q) { if (hba->caps & UFSHCD_CAP_CRYPTO) - blk_ksm_register(&hba->ksm, q); + blk_crypto_register(&hba->crypto_profile, q); } diff --git a/drivers/scsi/ufs/ufshcd-crypto.h b/drivers/ufs/core/ufshcd-crypto.h similarity index 65% rename from drivers/scsi/ufs/ufshcd-crypto.h rename to drivers/ufs/core/ufshcd-crypto.h index 78a58e788dff..190195f131d7 100644 --- a/drivers/scsi/ufs/ufshcd-crypto.h +++ b/drivers/ufs/core/ufshcd-crypto.h @@ -6,9 +6,12 @@ #ifndef _UFSHCD_CRYPTO_H #define _UFSHCD_CRYPTO_H +#include +#include +#include "ufshcd-priv.h" +#include + #ifdef CONFIG_SCSI_UFS_CRYPTO -#include "ufshcd.h" -#include "ufshci.h" static inline void ufshcd_prepare_lrbp_crypto(struct request *rq, struct ufshcd_lrb *lrbp) @@ -18,7 +21,7 @@ static inline void ufshcd_prepare_lrbp_crypto(struct request *rq, return; } - lrbp->crypto_key_slot = 
blk_ksm_get_slot_idx(rq->crypt_keyslot); + lrbp->crypto_key_slot = blk_crypto_keyslot_index(rq->crypt_keyslot); lrbp->data_unit_num = rq->crypt_ctx->bc_dun[0]; } @@ -34,14 +37,26 @@ ufshcd_prepare_req_desc_hdr_crypto(struct ufshcd_lrb *lrbp, u32 *dword_0, } } +static inline void ufshcd_crypto_clear_prdt(struct ufs_hba *hba, + struct ufshcd_lrb *lrbp) +{ + if (!(hba->quirks & UFSHCD_QUIRK_KEYS_IN_PRDT)) + return; + + if (!(scsi_cmd_to_rq(lrbp->cmd)->crypt_ctx)) + return; + + memzero_explicit(lrbp->ucd_prdt_ptr, + ufshcd_sg_entry_size(hba) * scsi_sg_count(lrbp->cmd)); +} + bool ufshcd_crypto_enable(struct ufs_hba *hba); int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba); void ufshcd_init_crypto(struct ufs_hba *hba); -void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba, - struct request_queue *q); +void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q); #else /* CONFIG_SCSI_UFS_CRYPTO */ @@ -52,6 +67,9 @@ static inline void ufshcd_prepare_req_desc_hdr_crypto(struct ufshcd_lrb *lrbp, u32 *dword_0, u32 *dword_1, u32 *dword_3) { } +static inline void ufshcd_crypto_clear_prdt(struct ufs_hba *hba, + struct ufshcd_lrb *lrbp) { } + static inline bool ufshcd_crypto_enable(struct ufs_hba *hba) { return false; @@ -64,8 +82,8 @@ static inline int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba) static inline void ufshcd_init_crypto(struct ufs_hba *hba) { } -static inline void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba, - struct request_queue *q) { } +static inline void ufshcd_crypto_register(struct ufs_hba *hba, + struct request_queue *q) { } #endif /* CONFIG_SCSI_UFS_CRYPTO */ diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h new file mode 100644 index 000000000000..bee1e66f42fb --- /dev/null +++ b/drivers/ufs/core/ufshcd-priv.h @@ -0,0 +1,401 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _UFSHCD_PRIV_H_ +#define _UFSHCD_PRIV_H_ + +#include +#include + +static inline bool ufshcd_is_user_access_allowed(struct ufs_hba *hba) +{ + return !hba->shutting_down; +} + +void ufshcd_schedule_eh_work(struct ufs_hba *hba); + +static inline bool ufshcd_keep_autobkops_enabled_except_suspend( + struct ufs_hba *hba) +{ + return hba->caps & UFSHCD_CAP_KEEP_AUTO_BKOPS_ENABLED_EXCEPT_SUSPEND; +} + +static inline u8 ufshcd_wb_get_query_index(struct ufs_hba *hba) +{ + if (hba->dev_info.wb_buffer_type == WB_BUF_MODE_LU_DEDICATED) + return hba->dev_info.wb_dedicated_lu; + return 0; +} + +static inline bool ufshcd_is_wb_buf_flush_allowed(struct ufs_hba *hba) +{ + return ufshcd_is_wb_allowed(hba) && + !(hba->quirks & UFSHCI_QUIRK_SKIP_MANUAL_WB_FLUSH_CTRL); +} + +#ifdef CONFIG_SCSI_UFS_HWMON +void ufs_hwmon_probe(struct ufs_hba *hba, u8 mask); +void ufs_hwmon_remove(struct ufs_hba *hba); +void ufs_hwmon_notify_event(struct ufs_hba *hba, u8 ee_mask); +#else +static inline void ufs_hwmon_probe(struct ufs_hba *hba, u8 mask) {} +static inline void ufs_hwmon_remove(struct ufs_hba *hba) {} +static inline void ufs_hwmon_notify_event(struct ufs_hba *hba, u8 ee_mask) {} +#endif + +int ufshcd_read_desc_param(struct ufs_hba *hba, + enum desc_idn desc_id, + int desc_index, + u8 param_offset, + u8 *param_read_buf, + u8 param_size); +int ufshcd_query_attr_retry(struct ufs_hba *hba, enum query_opcode opcode, + enum attr_idn idn, u8 index, u8 selector, + u32 *attr_val); +int ufshcd_query_attr(struct ufs_hba *hba, enum query_opcode opcode, + enum attr_idn idn, u8 index, u8 selector, u32 *attr_val); +int ufshcd_query_flag(struct ufs_hba 
*hba, enum query_opcode opcode, + enum flag_idn idn, u8 index, bool *flag_res); +void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit); +void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag, + struct cq_entry *cqe); +int ufshcd_mcq_init(struct ufs_hba *hba); +int ufshcd_mcq_decide_queue_depth(struct ufs_hba *hba); +int ufshcd_mcq_memory_alloc(struct ufs_hba *hba); +void ufshcd_mcq_make_queues_operational(struct ufs_hba *hba); +void ufshcd_mcq_config_mac(struct ufs_hba *hba, u32 max_active_cmds); +void ufshcd_mcq_select_mcq_mode(struct ufs_hba *hba); +u32 ufshcd_mcq_read_cqis(struct ufs_hba *hba, int i); +void ufshcd_mcq_write_cqis(struct ufs_hba *hba, u32 val, int i); +unsigned long ufshcd_mcq_poll_cqe_nolock(struct ufs_hba *hba, + struct ufs_hw_queue *hwq); +struct ufs_hw_queue *ufshcd_mcq_req_to_hwq(struct ufs_hba *hba, + struct request *req); +unsigned long ufshcd_mcq_poll_cqe_lock(struct ufs_hba *hba, + struct ufs_hw_queue *hwq); + +#define UFSHCD_MCQ_IO_QUEUE_OFFSET 1 +#define SD_ASCII_STD true +#define SD_RAW false +int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index, + u8 **buf, bool ascii); + +int ufshcd_hold(struct ufs_hba *hba, bool async); +void ufshcd_release(struct ufs_hba *hba); + +int ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd); + +int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba, + struct utp_upiu_req *req_upiu, + struct utp_upiu_req *rsp_upiu, + int msgcode, + u8 *desc_buff, int *buff_len, + enum query_opcode desc_op); + +int ufshcd_wb_toggle(struct ufs_hba *hba, bool enable); + +/* Wrapper functions for safely calling variant operations */ +static inline const char *ufshcd_get_var_name(struct ufs_hba *hba) +{ + if (hba->vops) + return hba->vops->name; + return ""; +} + +static inline void ufshcd_vops_exit(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->exit) + return hba->vops->exit(hba); +} + +static inline u32 ufshcd_vops_get_ufs_hci_version(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->get_ufs_hci_version) + return hba->vops->get_ufs_hci_version(hba); + + return ufshcd_readl(hba, REG_UFS_VERSION); +} + +static inline int ufshcd_vops_clk_scale_notify(struct ufs_hba *hba, + bool up, enum ufs_notify_change_status status) +{ + if (hba->vops && hba->vops->clk_scale_notify) + return hba->vops->clk_scale_notify(hba, up, status); + return 0; +} + +static inline void ufshcd_vops_event_notify(struct ufs_hba *hba, + enum ufs_event_type evt, + void *data) +{ + if (hba->vops && hba->vops->event_notify) + hba->vops->event_notify(hba, evt, data); +} + +static inline int ufshcd_vops_setup_clocks(struct ufs_hba *hba, bool on, + enum ufs_notify_change_status status) +{ + if (hba->vops && hba->vops->setup_clocks) + return hba->vops->setup_clocks(hba, on, status); + return 0; +} + +static inline int ufshcd_vops_hce_enable_notify(struct ufs_hba *hba, + bool status) +{ + if (hba->vops && hba->vops->hce_enable_notify) + return hba->vops->hce_enable_notify(hba, status); + + return 0; +} +static inline int ufshcd_vops_link_startup_notify(struct ufs_hba *hba, + bool status) +{ + if (hba->vops && hba->vops->link_startup_notify) + return hba->vops->link_startup_notify(hba, status); + + return 0; +} + +static inline int ufshcd_vops_pwr_change_notify(struct ufs_hba *hba, + enum ufs_notify_change_status status, + struct ufs_pa_layer_attr *dev_max_params, + struct ufs_pa_layer_attr *dev_req_params) +{ + if (hba->vops && hba->vops->pwr_change_notify) + return hba->vops->pwr_change_notify(hba, status, + dev_max_params, dev_req_params); 
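+	/* No variant hook registered: power mode negotiation cannot proceed, hence the unsupported fallback. */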
+ + return -ENOTSUPP; +} + +static inline void ufshcd_vops_setup_task_mgmt(struct ufs_hba *hba, + int tag, u8 tm_function) +{ + if (hba->vops && hba->vops->setup_task_mgmt) + return hba->vops->setup_task_mgmt(hba, tag, tm_function); +} + +static inline void ufshcd_vops_hibern8_notify(struct ufs_hba *hba, + enum uic_cmd_dme cmd, + enum ufs_notify_change_status status) +{ + if (hba->vops && hba->vops->hibern8_notify) + return hba->vops->hibern8_notify(hba, cmd, status); +} + +static inline int ufshcd_vops_apply_dev_quirks(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->apply_dev_quirks) + return hba->vops->apply_dev_quirks(hba); + return 0; +} + +static inline void ufshcd_vops_fixup_dev_quirks(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->fixup_dev_quirks) + hba->vops->fixup_dev_quirks(hba); +} + +static inline int ufshcd_vops_suspend(struct ufs_hba *hba, enum ufs_pm_op op, + enum ufs_notify_change_status status) +{ + if (hba->vops && hba->vops->suspend) + return hba->vops->suspend(hba, op, status); + + return 0; +} + +static inline int ufshcd_vops_resume(struct ufs_hba *hba, enum ufs_pm_op op) +{ + if (hba->vops && hba->vops->resume) + return hba->vops->resume(hba, op); + + return 0; +} + +static inline void ufshcd_vops_dbg_register_dump(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->dbg_register_dump) + hba->vops->dbg_register_dump(hba); +} + +static inline int ufshcd_vops_device_reset(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->device_reset) + return hba->vops->device_reset(hba); + + return -EOPNOTSUPP; +} + +static inline void ufshcd_vops_config_scaling_param(struct ufs_hba *hba, + struct devfreq_dev_profile *p, + struct devfreq_simple_ondemand_data *data) +{ + if (hba->vops && hba->vops->config_scaling_param) + hba->vops->config_scaling_param(hba, p, data); +} + +static inline void ufshcd_vops_reinit_notify(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->reinit_notify) + hba->vops->reinit_notify(hba); +} + +static inline int ufshcd_vops_mcq_config_resource(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->mcq_config_resource) + return hba->vops->mcq_config_resource(hba); + + return -EOPNOTSUPP; +} + +static inline int ufshcd_mcq_vops_get_hba_mac(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->get_hba_mac) + return hba->vops->get_hba_mac(hba); + + return -EOPNOTSUPP; +} + +static inline int ufshcd_mcq_vops_op_runtime_config(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->op_runtime_config) + return hba->vops->op_runtime_config(hba); + + return -EOPNOTSUPP; +} + +static inline int ufshcd_vops_get_outstanding_cqs(struct ufs_hba *hba, + unsigned long *ocqs) +{ + if (hba->vops && hba->vops->get_outstanding_cqs) + return hba->vops->get_outstanding_cqs(hba, ocqs); + + return -EOPNOTSUPP; +} + +static inline int ufshcd_mcq_vops_config_esi(struct ufs_hba *hba) +{ + if (hba->vops && hba->vops->config_esi) + return hba->vops->config_esi(hba); + + return -EOPNOTSUPP; +} + +extern const struct ufs_pm_lvl_states ufs_pm_lvl_states[]; + +/** + * ufshcd_scsi_to_upiu_lun - maps scsi LUN to UPIU LUN + * @scsi_lun: scsi LUN id + * + * Returns UPIU LUN id + */ +static inline u8 ufshcd_scsi_to_upiu_lun(unsigned int scsi_lun) +{ + if (scsi_is_wlun(scsi_lun)) + return (scsi_lun & UFS_UPIU_MAX_UNIT_NUM_ID) + | UFS_UPIU_WLUN_ID; + else + return scsi_lun & UFS_UPIU_MAX_UNIT_NUM_ID; +} + +int __ufshcd_write_ee_control(struct ufs_hba *hba, u32 ee_ctrl_mask); +int ufshcd_write_ee_control(struct ufs_hba *hba); +int ufshcd_update_ee_control(struct ufs_hba *hba, u16 
*mask, + const u16 *other_mask, u16 set, u16 clr); + +static inline int ufshcd_update_ee_drv_mask(struct ufs_hba *hba, + u16 set, u16 clr) +{ + return ufshcd_update_ee_control(hba, &hba->ee_drv_mask, + &hba->ee_usr_mask, set, clr); +} + +static inline int ufshcd_update_ee_usr_mask(struct ufs_hba *hba, + u16 set, u16 clr) +{ + return ufshcd_update_ee_control(hba, &hba->ee_usr_mask, + &hba->ee_drv_mask, set, clr); +} + +static inline int ufshcd_rpm_get_sync(struct ufs_hba *hba) +{ + return pm_runtime_get_sync(&hba->ufs_device_wlun->sdev_gendev); +} + +static inline int ufshcd_rpm_put_sync(struct ufs_hba *hba) +{ + return pm_runtime_put_sync(&hba->ufs_device_wlun->sdev_gendev); +} + +static inline void ufshcd_rpm_get_noresume(struct ufs_hba *hba) +{ + pm_runtime_get_noresume(&hba->ufs_device_wlun->sdev_gendev); +} + +static inline int ufshcd_rpm_resume(struct ufs_hba *hba) +{ + return pm_runtime_resume(&hba->ufs_device_wlun->sdev_gendev); +} + +static inline int ufshcd_rpm_put(struct ufs_hba *hba) +{ + return pm_runtime_put(&hba->ufs_device_wlun->sdev_gendev); +} + +/** + * ufs_is_valid_unit_desc_lun - checks if the given LUN has a unit descriptor + * @dev_info: pointer of instance of struct ufs_dev_info + * @lun: LU number to check + * @return: true if the lun has a matching unit descriptor, false otherwise + */ +static inline bool ufs_is_valid_unit_desc_lun(struct ufs_dev_info *dev_info, u8 lun) +{ + if (!dev_info || !dev_info->max_lu_supported) { + pr_err("Max General LU supported by UFS isn't initialized\n"); + return false; + } + return lun == UFS_UPIU_RPMB_WLUN || (lun < dev_info->max_lu_supported); +} + +static inline void ufshcd_inc_sq_tail(struct ufs_hw_queue *q) +{ + u32 mask = q->max_entries - 1; + u32 val; + + q->sq_tail_slot = (q->sq_tail_slot + 1) & mask; + val = q->sq_tail_slot * sizeof(struct utp_transfer_req_desc); + writel(val, q->mcq_sq_tail); +} + +static inline void ufshcd_mcq_update_cq_tail_slot(struct ufs_hw_queue *q) +{ + u32 val = readl(q->mcq_cq_tail); + + q->cq_tail_slot = val / sizeof(struct cq_entry); +} + +static inline bool ufshcd_mcq_is_cq_empty(struct ufs_hw_queue *q) +{ + return q->cq_head_slot == q->cq_tail_slot; +} + +static inline void ufshcd_mcq_inc_cq_head_slot(struct ufs_hw_queue *q) +{ + q->cq_head_slot++; + if (q->cq_head_slot == q->max_entries) + q->cq_head_slot = 0; +} + +static inline void ufshcd_mcq_update_cq_head(struct ufs_hw_queue *q) +{ + writel(q->cq_head_slot * sizeof(struct cq_entry), q->mcq_cq_head); +} + +static inline struct cq_entry *ufshcd_mcq_cur_cqe(struct ufs_hw_queue *q) +{ + struct cq_entry *cqe = q->cqe_base_addr; + + return cqe + q->cq_head_slot; +} +#endif /* _UFSHCD_PRIV_H_ */ diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/ufs/core/ufshcd.c similarity index 83% rename from drivers/scsi/ufs/ufshcd.c rename to drivers/ufs/core/ufshcd.c index 120831428ec6..f1eb7bf71c82 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -16,10 +16,19 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "ufshcd.h" -#include "ufs_quirks.h" -#include "unipro.h" +#include +#include "ufshcd-priv.h" +#include +#include #include "ufs-sysfs.h" #include "ufs-debugfs.h" #include "ufs-fault-injection.h" @@ -31,9 +40,18 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include + #define UFSHCD_ENABLE_INTRS (UTP_TRANSFER_REQ_COMPL |\ UTP_TASK_REQ_COMPL |\ UFSHCD_ERROR_MASK) + +#define UFSHCD_ENABLE_MCQ_INTRS (UTP_TASK_REQ_COMPL |\ + 
UFSHCD_ERROR_MASK |\ + MCQ_CQ_EVENT_STATUS) + + /* UIC command timeout, unit: ms */ #define UIC_CMD_TIMEOUT 500 @@ -47,6 +65,9 @@ /* Query request timeout */ #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */ +/* Advanced RPMB request timeout */ +#define ADVANCED_RPMB_REQ_TIMEOUT 3000 /* 3 seconds */ + /* Task management command timeout */ #define TM_CMD_TIMEOUT 100 /* msecs */ @@ -56,12 +77,12 @@ /* maximum number of link-startup retries */ #define DME_LINKSTARTUP_RETRIES 3 -/* Maximum retries for Hibern8 enter */ -#define UIC_HIBERN8_ENTER_RETRIES 3 - /* maximum number of reset retries before giving up */ #define MAX_HOST_RESET_RETRIES 5 +/* Maximum number of error handler retries before giving up */ +#define MAX_ERR_HANDLER_RETRIES 5 + /* Expose the flag value from utp_upiu_query.value */ #define MASK_QUERY_UPIU_FLAG_LOC 0xFF @@ -80,7 +101,32 @@ /* Polling time to wait for fDeviceInit */ #define FDEVICEINIT_COMPL_TIMEOUT 1500 /* millisecs */ -#define wlun_dev_to_hba(dv) shost_priv(to_scsi_device(dv)->host) +/* UFSHC 4.0 compliant HC support this mode, refer param_set_mcq_mode() */ +static bool use_mcq_mode = true; + +static bool is_mcq_supported(struct ufs_hba *hba) +{ + return hba->mcq_sup && use_mcq_mode; +} + +static int param_set_mcq_mode(const char *val, const struct kernel_param *kp) +{ + int ret; + + ret = param_set_bool(val, kp); + if (ret) + return ret; + + return 0; +} + +static const struct kernel_param_ops mcq_mode_ops = { + .set = param_set_mcq_mode, + .get = param_get_bool, +}; + +module_param_cb(use_mcq_mode, &mcq_mode_ops, &use_mcq_mode, 0644); +MODULE_PARM_DESC(use_mcq_mode, "Control MCQ mode for controllers starting from UFSHCI 4.0. 1 - enable MCQ, 0 - disable MCQ. MCQ is enabled by default"); #define ufshcd_toggle_vreg(_dev, _vreg, _on) \ ({ \ @@ -135,6 +181,14 @@ enum { UFSHCD_CAN_QUEUE = 32 - UFSHCD_NUM_RESERVED, }; +static const char *const ufshcd_state_name[] = { + [UFSHCD_STATE_RESET] = "reset", + [UFSHCD_STATE_OPERATIONAL] = "operational", + [UFSHCD_STATE_ERROR] = "error", + [UFSHCD_STATE_EH_SCHEDULED_FATAL] = "eh_fatal", + [UFSHCD_STATE_EH_SCHEDULED_NON_FATAL] = "eh_non_fatal", +}; + /* UFSHCD error handling flags */ enum { UFSHCD_EH_IN_PROGRESS = (1 << 0), @@ -158,7 +212,7 @@ enum { #define ufshcd_clear_eh_in_progress(h) \ ((h)->eh_flags &= ~UFSHCD_EH_IN_PROGRESS) -struct ufs_pm_lvl_states ufs_pm_lvl_states[] = { +const struct ufs_pm_lvl_states ufs_pm_lvl_states[] = { [UFS_PM_LVL_0] = {UFS_ACTIVE_PWR_MODE, UIC_LINK_ACTIVE_STATE}, [UFS_PM_LVL_1] = {UFS_ACTIVE_PWR_MODE, UIC_LINK_HIBERN8_STATE}, [UFS_PM_LVL_2] = {UFS_SLEEP_PWR_MODE, UIC_LINK_ACTIVE_STATE}, @@ -200,26 +254,33 @@ ufs_get_desired_pm_lvl_for_dev_link_state(enum ufs_dev_pwr_mode dev_state, return UFS_PM_LVL_0; } -static struct ufs_dev_fix ufs_fixups[] = { +static const struct ufs_dev_quirk ufs_fixups[] = { /* UFS cards deviations table */ - UFS_FIX(UFS_VENDOR_MICRON, UFS_ANY_MODEL, - UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM | - UFS_DEVICE_QUIRK_SWAP_L2P_ENTRY_FOR_HPB_READ), - UFS_FIX(UFS_VENDOR_SAMSUNG, UFS_ANY_MODEL, - UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM | - UFS_DEVICE_QUIRK_HOST_PA_TACTIVATE | - UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS), - UFS_FIX(UFS_VENDOR_SKHYNIX, UFS_ANY_MODEL, - UFS_DEVICE_QUIRK_HOST_PA_SAVECONFIGTIME), - UFS_FIX(UFS_VENDOR_SKHYNIX, "hB8aL1" /*H28U62301AMR*/, - UFS_DEVICE_QUIRK_HOST_VS_DEBUGSAVECONFIGTIME), - UFS_FIX(UFS_VENDOR_TOSHIBA, UFS_ANY_MODEL, - UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM), - UFS_FIX(UFS_VENDOR_TOSHIBA, "THGLF2G9C8KBADG", - UFS_DEVICE_QUIRK_PA_TACTIVATE), - 
UFS_FIX(UFS_VENDOR_TOSHIBA, "THGLF2G9D8KBADG", - UFS_DEVICE_QUIRK_PA_TACTIVATE), - END_FIX + { .wmanufacturerid = UFS_VENDOR_MICRON, + .model = UFS_ANY_MODEL, + .quirk = UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM | + UFS_DEVICE_QUIRK_SWAP_L2P_ENTRY_FOR_HPB_READ }, + { .wmanufacturerid = UFS_VENDOR_SAMSUNG, + .model = UFS_ANY_MODEL, + .quirk = UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM | + UFS_DEVICE_QUIRK_HOST_PA_TACTIVATE | + UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS }, + { .wmanufacturerid = UFS_VENDOR_SKHYNIX, + .model = UFS_ANY_MODEL, + .quirk = UFS_DEVICE_QUIRK_HOST_PA_SAVECONFIGTIME }, + { .wmanufacturerid = UFS_VENDOR_SKHYNIX, + .model = "hB8aL1" /*H28U62301AMR*/, + .quirk = UFS_DEVICE_QUIRK_HOST_VS_DEBUGSAVECONFIGTIME }, + { .wmanufacturerid = UFS_VENDOR_TOSHIBA, + .model = UFS_ANY_MODEL, + .quirk = UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM }, + { .wmanufacturerid = UFS_VENDOR_TOSHIBA, + .model = "THGLF2G9C8KBADG", + .quirk = UFS_DEVICE_QUIRK_PA_TACTIVATE }, + { .wmanufacturerid = UFS_VENDOR_TOSHIBA, + .model = "THGLF2G9D8KBADG", + .quirk = UFS_DEVICE_QUIRK_PA_TACTIVATE }, + {} }; static irqreturn_t ufshcd_tmc_handler(struct ufs_hba *hba); @@ -230,7 +291,6 @@ static int ufshcd_clear_tm_cmd(struct ufs_hba *hba, int tag); static void ufshcd_hba_exit(struct ufs_hba *hba); static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params); static int ufshcd_setup_clocks(struct ufs_hba *hba, bool on); -static int ufshcd_uic_hibern8_enter(struct ufs_hba *hba); static inline void ufshcd_add_delay_before_dme_cmd(struct ufs_hba *hba); static int ufshcd_host_reset_and_restore(struct ufs_hba *hba); static void ufshcd_resume_clkscaling(struct ufs_hba *hba); @@ -240,14 +300,13 @@ static int ufshcd_scale_clks(struct ufs_hba *hba, bool scale_up); static irqreturn_t ufshcd_intr(int irq, void *__hba); static int ufshcd_change_power_mode(struct ufs_hba *hba, struct ufs_pa_layer_attr *pwr_mode); -static void ufshcd_schedule_eh_work(struct ufs_hba *hba); static int ufshcd_setup_hba_vreg(struct ufs_hba *hba, bool on); static int ufshcd_setup_vreg(struct ufs_hba *hba, bool on); static inline int ufshcd_config_vreg_hpm(struct ufs_hba *hba, struct ufs_vreg *vreg); static int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag); -static void ufshcd_wb_toggle_flush_during_h8(struct ufs_hba *hba, bool set); -static inline void ufshcd_wb_toggle_flush(struct ufs_hba *hba, bool enable); +static void ufshcd_wb_toggle_buf_flush_during_h8(struct ufs_hba *hba, + bool enable); static void ufshcd_hba_vreg_set_lpm(struct ufs_hba *hba); static void ufshcd_hba_vreg_set_hpm(struct ufs_hba *hba); @@ -267,16 +326,17 @@ static inline void ufshcd_disable_irq(struct ufs_hba *hba) } } -static inline void ufshcd_wb_config(struct ufs_hba *hba) +static void ufshcd_configure_wb(struct ufs_hba *hba) { if (!ufshcd_is_wb_allowed(hba)) return; ufshcd_wb_toggle(hba, true); - ufshcd_wb_toggle_flush_during_h8(hba, true); - if (!(hba->quirks & UFSHCI_QUIRK_SKIP_MANUAL_WB_FLUSH_CTRL)) - ufshcd_wb_toggle_flush(hba, true); + ufshcd_wb_toggle_buf_flush_during_h8(hba, true); + + if (ufshcd_is_wb_buf_flush_allowed(hba)) + ufshcd_wb_toggle_buf_flush(hba, true); } static void ufshcd_scsi_unblock_requests(struct ufs_hba *hba) @@ -325,6 +385,8 @@ static void ufshcd_add_tm_upiu_trace(struct ufs_hba *hba, unsigned int tag, { struct utp_task_req_desc *descp = &hba->utmrdl_base_addr[tag]; + trace_android_vh_ufs_send_tm_command(hba, tag, (int)str_t); + if (!trace_ufshcd_upiu_enabled()) return; @@ -341,11 +403,13 @@ static void ufshcd_add_tm_upiu_trace(struct ufs_hba 
*hba, unsigned int tag, } static void ufshcd_add_uic_command_trace(struct ufs_hba *hba, - struct uic_command *ucmd, + const struct uic_command *ucmd, enum ufs_trace_str_t str_t) { u32 cmd; + trace_android_vh_ufs_send_uic_command(hba, ucmd, (int)str_t); + if (!trace_ufshcd_uic_command_enabled()) return; @@ -421,11 +485,11 @@ static void ufshcd_print_clk_freqs(struct ufs_hba *hba) } static void ufshcd_print_evt(struct ufs_hba *hba, u32 id, - char *err_name) + const char *err_name) { int i; bool found = false; - struct ufs_event_hist *e; + const struct ufs_event_hist *e; if (id >= UFS_EVT_CNT) return; @@ -438,7 +502,7 @@ static void ufshcd_print_evt(struct ufs_hba *hba, u32 id, if (e->tstamp[p] == 0) continue; dev_err(hba->dev, "%s[%d] = 0x%x at %lld us\n", err_name, p, - e->val[p], ktime_to_us(e->tstamp[p])); + e->val[p], div_u64(e->tstamp[p], 1000)); found = true; } @@ -465,6 +529,9 @@ static void ufshcd_print_evt_hist(struct ufs_hba *hba) ufshcd_print_evt(hba, UFS_EVT_RESUME_ERR, "resume_fail"); ufshcd_print_evt(hba, UFS_EVT_SUSPEND_ERR, "suspend_fail"); + ufshcd_print_evt(hba, UFS_EVT_WL_RES_ERR, "wlun resume_fail"); + ufshcd_print_evt(hba, UFS_EVT_WL_SUSP_ERR, + "wlun suspend_fail"); ufshcd_print_evt(hba, UFS_EVT_DEV_RESET, "dev_reset"); ufshcd_print_evt(hba, UFS_EVT_HOST_RESET, "host_reset"); ufshcd_print_evt(hba, UFS_EVT_ABORT, "task_abort"); @@ -475,7 +542,7 @@ static void ufshcd_print_evt_hist(struct ufs_hba *hba) static void ufshcd_print_trs(struct ufs_hba *hba, unsigned long bitmap, bool pr_prdt) { - struct ufshcd_lrb *lrbp; + const struct ufshcd_lrb *lrbp; int prdt_length; int tag; @@ -483,9 +550,9 @@ void ufshcd_print_trs(struct ufs_hba *hba, unsigned long bitmap, bool pr_prdt) lrbp = &hba->lrb[tag]; dev_err(hba->dev, "UPIU[%d] - issue time %lld us\n", - tag, ktime_to_us(lrbp->issue_time_stamp)); + tag, div_u64(lrbp->issue_time_stamp_local_clock, 1000)); dev_err(hba->dev, "UPIU[%d] - complete time %lld us\n", - tag, ktime_to_us(lrbp->compl_time_stamp)); + tag, div_u64(lrbp->compl_time_stamp_local_clock, 1000)); dev_err(hba->dev, "UPIU[%d] - Transfer Request Descriptor phys@0x%llx\n", tag, (u64)lrbp->utrd_dma_addr); @@ -504,7 +571,7 @@ void ufshcd_print_trs(struct ufs_hba *hba, unsigned long bitmap, bool pr_prdt) prdt_length = le16_to_cpu( lrbp->utr_descriptor_ptr->prd_table_length); if (hba->quirks & UFSHCD_QUIRK_PRDT_BYTE_GRAN) - prdt_length /= sizeof(struct ufshcd_sg_entry); + prdt_length /= ufshcd_sg_entry_size(hba); dev_err(hba->dev, "UPIU[%d] - PRDT - %d entries phys@0x%llx\n", @@ -513,7 +580,7 @@ void ufshcd_print_trs(struct ufs_hba *hba, unsigned long bitmap, bool pr_prdt) if (pr_prdt) ufshcd_hex_dump("UPIU PRDT: ", lrbp->ucd_prdt_ptr, - sizeof(struct ufshcd_sg_entry) * prdt_length); + ufshcd_sg_entry_size(hba) * prdt_length); } } @@ -531,7 +598,7 @@ static void ufshcd_print_tmrs(struct ufs_hba *hba, unsigned long bitmap) static void ufshcd_print_host_state(struct ufs_hba *hba) { - struct scsi_device *sdev_ufs = hba->sdev_ufs_device; + const struct scsi_device *sdev_ufs = hba->ufs_device_wlun; dev_err(hba->dev, "UFS Host state=%d\n", hba->ufshcd_state); dev_err(hba->dev, "outstanding reqs=0x%lx tasks=0x%lx\n", @@ -547,10 +614,10 @@ static void ufshcd_print_host_state(struct ufs_hba *hba) dev_err(hba->dev, "Clk gate=%d\n", hba->clk_gating.state); dev_err(hba->dev, "last_hibern8_exit_tstamp at %lld us, hibern8_exit_cnt=%d\n", - ktime_to_us(hba->ufs_stats.last_hibern8_exit_tstamp), + div_u64(hba->ufs_stats.last_hibern8_exit_tstamp, 1000), hba->ufs_stats.hibern8_exit_cnt); 
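The hunks above and below swap ktime_get()-based timestamps for local_clock() values, which is why the prints now divide raw nanoseconds by 1000 via div_u64() instead of calling ktime_to_us(). A minimal userspace model of that unit math; div_u64() and local_clock() are kernel helpers, div_u64_model() below is a stand-in and the sample value is made up:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's div_u64(); the real helper exists because
 * 64-by-32 division needs library support on some 32-bit architectures. */
static uint64_t div_u64_model(uint64_t dividend, uint32_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	uint64_t ts_ns = 1234567890ULL;	/* hypothetical local_clock() sample */

	/* 1234567890 ns prints as 1234567 us, matching div_u64(ts, 1000) */
	printf("last intr at %llu us\n",
	       (unsigned long long)div_u64_model(ts_ns, 1000));
	return 0;
}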
dev_err(hba->dev, "last intr at %lld us, last intr status=0x%x\n", - ktime_to_us(hba->ufs_stats.last_intr_ts), + div_u64(hba->ufs_stats.last_intr_ts, 1000), hba->ufs_stats.last_intr_status); dev_err(hba->dev, "error handling flags=0x%x, req. abort count=%d\n", hba->eh_flags, hba->req_abort_count); @@ -637,7 +704,7 @@ EXPORT_SYMBOL_GPL(ufshcd_delay_us); * Return: * -ETIMEDOUT on error, zero on success. */ -int ufshcd_wait_for_register(struct ufs_hba *hba, u32 reg, u32 mask, +static int ufshcd_wait_for_register(struct ufs_hba *hba, u32 reg, u32 mask, u32 val, unsigned long interval_us, unsigned long timeout_ms) { @@ -710,38 +777,53 @@ static inline u32 ufshcd_get_ufs_version(struct ufs_hba *hba) */ static inline bool ufshcd_is_device_present(struct ufs_hba *hba) { - return (ufshcd_readl(hba, REG_CONTROLLER_STATUS) & - DEVICE_PRESENT) ? true : false; + return ufshcd_readl(hba, REG_CONTROLLER_STATUS) & DEVICE_PRESENT; } /** * ufshcd_get_tr_ocs - Get the UTRD Overall Command Status * @lrbp: pointer to local command reference block + * @cqe: pointer to the completion queue entry * * This function is used to get the OCS field from UTRD * Returns the OCS field in the UTRD */ -static inline int ufshcd_get_tr_ocs(struct ufshcd_lrb *lrbp) +static enum utp_ocs ufshcd_get_tr_ocs(struct ufshcd_lrb *lrbp, + struct cq_entry *cqe) { + if (cqe) + return le32_to_cpu(cqe->status) & MASK_OCS; + return le32_to_cpu(lrbp->utr_descriptor_ptr->header.dword_2) & MASK_OCS; } /** - * ufshcd_utrl_clear - Clear a bit in UTRLCLR register + * ufshcd_utrl_clear() - Clear requests from the controller request list. * @hba: per adapter instance - * @pos: position of the bit to be cleared + * @mask: mask with one bit set for each request to be cleared */ -static inline void ufshcd_utrl_clear(struct ufs_hba *hba, u32 pos) +static inline void ufshcd_utrl_clear(struct ufs_hba *hba, u32 mask) { if (hba->quirks & UFSHCI_QUIRK_BROKEN_REQ_LIST_CLR) - ufshcd_writel(hba, (1 << pos), REG_UTP_TRANSFER_REQ_LIST_CLEAR); - else - ufshcd_writel(hba, ~(1 << pos), - REG_UTP_TRANSFER_REQ_LIST_CLEAR); + mask = ~mask; + /* + * From the UFSHCI specification: "UTP Transfer Request List CLear + * Register (UTRLCLR): This field is bit significant. Each bit + * corresponds to a slot in the UTP Transfer Request List, where bit 0 + * corresponds to request slot 0. A bit in this field is set to ‘0’ + * by host software to indicate to the host controller that a transfer + * request slot is cleared. The host controller + * shall free up any resources associated to the request slot + * immediately, and shall set the associated bit in UTRLDBR to ‘0’. The + * host software indicates no change to request slots by setting the + * associated bits in this field to ‘1’. Bits in this field shall only + * be set ‘1’ or ‘0’ by host software when UTRLRSR is set to ‘1’." + */ + ufshcd_writel(hba, ~mask, REG_UTP_TRANSFER_REQ_LIST_CLEAR); } /** - * ufshcd_utmrl_clear - Clear a bit in UTRMLCLR register + * ufshcd_utmrl_clear - Clear a bit in UTMRLCLR register * @hba: per adapter instance * @pos: position of the bit to be cleared */ @@ -838,7 +920,7 @@ ufshcd_get_rsp_upiu_data_seg_len(struct utp_upiu_rsp *ucd_rsp_ptr) static inline bool ufshcd_is_exception_event(struct utp_upiu_rsp *ucd_rsp_ptr) { return be32_to_cpu(ucd_rsp_ptr->header.dword_2) & - MASK_RSP_EXCEPTION_EVENT ? 
true : false; + MASK_RSP_EXCEPTION_EVENT; } /** @@ -909,12 +991,11 @@ static inline void ufshcd_hba_start(struct ufs_hba *hba) * ufshcd_is_hba_active - Get controller state * @hba: per adapter instance * - * Returns false if controller is active, true otherwise + * Returns true if and only if the controller is active. */ static inline bool ufshcd_is_hba_active(struct ufs_hba *hba) { - return (ufshcd_readl(hba, REG_CONTROLLER_ENABLE) & CONTROLLER_ENABLE) - ? false : true; + return ufshcd_readl(hba, REG_CONTROLLER_ENABLE) & CONTROLLER_ENABLE; } u32 ufshcd_get_local_unipro_ver(struct ufs_hba *hba) @@ -938,10 +1019,7 @@ static bool ufshcd_is_unipro_pa_params_tuning_req(struct ufs_hba *hba) * logic simple, we will only do manual tuning if local unipro version * doesn't support ver1.6 or later. */ - if (ufshcd_get_local_unipro_ver(hba) < UFS_UNIPRO_VER_1_6) - return true; - else - return false; + return ufshcd_get_local_unipro_ver(hba) < UFS_UNIPRO_VER_1_6; } /** @@ -1072,13 +1150,38 @@ static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba, return false; } +/* + * Determine the number of pending commands by counting the bits in the SCSI + * device budget maps. This approach has been selected because a bit is set in + * the budget map before scsi_host_queue_ready() checks the host_self_blocked + * flag. The host_self_blocked flag can be modified by calling + * scsi_block_requests() or scsi_unblock_requests(). + */ +static u32 ufshcd_pending_cmds(struct ufs_hba *hba) +{ + const struct scsi_device *sdev; + u32 pending = 0; + + lockdep_assert_held(hba->host->host_lock); + __shost_for_each_device(sdev, hba->host) + pending += sbitmap_weight(&sdev->budget_map); + + return pending; +} + +/* + * Wait until all pending SCSI commands and TMFs have finished or the timeout + * has expired. + * + * Return: 0 upon success; -EBUSY upon timeout. + */ static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba, u64 wait_timeout_us) { unsigned long flags; int ret = 0; u32 tm_doorbell; - u32 tr_doorbell; + u32 tr_pending; bool timeout = false, do_last_check = false; ktime_t start; @@ -1096,8 +1199,8 @@ static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba, } tm_doorbell = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL); - tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL); - if (!tm_doorbell && !tr_doorbell) { + tr_pending = ufshcd_pending_cmds(hba); + if (!tm_doorbell && !tr_pending) { timeout = false; break; } else if (do_last_check) { @@ -1105,7 +1208,7 @@ static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba, } spin_unlock_irqrestore(hba->host->host_lock, flags); - schedule(); + io_schedule_timeout(msecs_to_jiffies(20)); if (ktime_to_us(ktime_sub(ktime_get(), start)) > wait_timeout_us) { timeout = true; @@ -1117,12 +1220,12 @@ static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba, do_last_check = true; } spin_lock_irqsave(hba->host->host_lock, flags); - } while (tm_doorbell || tr_doorbell); + } while (tm_doorbell || tr_pending); if (timeout) { dev_err(hba->dev, "%s: timedout waiting for doorbell to clear (tm=0x%x, tr=0x%x)\n", - __func__, tm_doorbell, tr_doorbell); + __func__, tm_doorbell, tr_pending); ret = -EBUSY; } out: @@ -1176,9 +1279,14 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up) return ret; } -static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba) +/* + * Wait until all pending SCSI commands and TMFs have finished or the timeout + * has expired. + * + * Return: 0 upon success; -EBUSY upon timeout. 
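ufshcd_wait_for_doorbell_clr() now polls ufshcd_pending_cmds() alongside the task-management doorbell until both drain or the deadline passes. A sketch of that drain-and-timeout shape in plain C, assuming a POSIX monotonic clock; the names wait_for_drain() and now_us() are illustrative, and the real code sleeps in io_schedule_timeout() and rechecks under the host lock rather than busy-waiting:

#include <stdint.h>
#include <time.h>

static uint64_t now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000u + (uint64_t)(ts.tv_nsec / 1000);
}

/* 0 when both counters drained, -1 (stand-in for -EBUSY) on timeout */
static int wait_for_drain(volatile int *tm_pending, volatile int *tr_pending,
			  uint64_t timeout_us)
{
	uint64_t start = now_us();

	while (*tm_pending || *tr_pending) {
		if (now_us() - start > timeout_us)
			return -1;
		/* the driver sleeps here instead of spinning */
	}
	return 0;
}

int main(void)
{
	volatile int tm = 0, tr = 0;

	return wait_for_drain(&tm, &tr, 1000);
}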
+ */ +static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba, u64 timeout_us) { - #define DOORBELL_CLR_TOUT_US (1000 * 1000) /* 1 sec */ int ret = 0; /* * make sure that there are no outstanding requests when @@ -1188,8 +1296,7 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba) mutex_lock(&hba->wb_mutex); down_write(&hba->clk_scaling_lock); - if (!hba->clk_scaling.is_allowed || - ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) { + if (ufshcd_wait_for_doorbell_clr(hba, timeout_us)) { ret = -EBUSY; up_write(&hba->clk_scaling_lock); mutex_unlock(&hba->wb_mutex); @@ -1209,7 +1316,8 @@ static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, int err, bool sc up_write(&hba->clk_scaling_lock); /* Enable Write Booster if we have scaled up else disable it */ - ufshcd_wb_toggle(hba, scale_up); + if (ufshcd_enable_wb_if_scaling_up(hba) && !err) + ufshcd_wb_toggle(hba, scale_up); mutex_unlock(&hba->wb_mutex); @@ -1230,10 +1338,18 @@ static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up) { int ret = 0; - ret = ufshcd_clock_scaling_prepare(hba); + if (!hba->clk_scaling.is_allowed) + return -EBUSY; + + ret = ufshcd_clock_scaling_prepare(hba, 1 * USEC_PER_SEC); if (ret) return ret; + if (!hba->clk_scaling.is_allowed) { + ret = -EBUSY; + goto out_unprepare; + } + /* scale down the gear before scaling down clocks */ if (!scale_up) { ret = ufshcd_scale_gear(hba, false); @@ -1306,6 +1422,8 @@ static int ufshcd_devfreq_target(struct device *dev, struct list_head *clk_list = &hba->clk_list_head; struct ufs_clk_info *clki; unsigned long irq_flags; + bool force_out = false; + bool force_scaling = false; if (!ufshcd_is_clkscaling_supported(hba)) return -EINVAL; @@ -1328,11 +1446,14 @@ static int ufshcd_devfreq_target(struct device *dev, } /* Decide based on the rounded-off frequency and update */ - scale_up = (*freq == clki->max_freq) ? true : false; + scale_up = *freq == clki->max_freq; if (!scale_up) *freq = clki->min_freq; + + trace_android_vh_ufs_clock_scaling(hba, &force_out, &force_scaling, &scale_up); + /* Update the frequency */ - if (!ufshcd_is_devfreq_scaling_required(hba, scale_up)) { + if (force_out || (!force_scaling && !ufshcd_is_devfreq_scaling_required(hba, scale_up))) { spin_unlock_irqrestore(hba->host->host_lock, irq_flags); ret = 0; goto out; /* no state change required */ @@ -1354,25 +1475,6 @@ out: return ret; } -static bool ufshcd_is_busy(struct request *req, void *priv, bool reserved) -{ - int *busy = priv; - - WARN_ON_ONCE(reserved); - (*busy)++; - return false; -} - -/* Whether or not any tag is in use by a request that is in progress. */ -static bool ufshcd_any_tag_in_use(struct ufs_hba *hba) -{ - struct request_queue *q = hba->cmd_queue; - int busy = 0; - - blk_mq_tagset_busy_iter(q->tag_set, ufshcd_is_busy, &busy); - return busy; -} - static int ufshcd_devfreq_get_dev_status(struct device *dev, struct devfreq_dev_status *stat) { @@ -1656,6 +1758,26 @@ unblock_reqs: ufshcd_scsi_unblock_requests(hba); } +/* + * Block processing of new SCSI commands and wait until pending SCSI + * commands and TMFs have finished. ufshcd_exec_dev_cmd() and + * ufshcd_issue_devman_upiu_cmd() are not affected by this function. + * + * Return: 0 upon success; -EBUSY upon timeout. + */ +int ufshcd_freeze_scsi_devs(struct ufs_hba *hba, u64 timeout_us) +{ + return ufshcd_clock_scaling_prepare(hba, timeout_us); +} +EXPORT_SYMBOL_GPL(ufshcd_freeze_scsi_devs); + +/* Resume processing of SCSI commands. 
*/ +void ufshcd_unfreeze_scsi_devs(struct ufs_hba *hba) +{ + ufshcd_clock_scaling_unprepare(hba, 0, true); +} +EXPORT_SYMBOL_GPL(ufshcd_unfreeze_scsi_devs); + /** * ufshcd_hold - Enable clocks that were gated earlier due to ufshcd_release. * Also, exit from hibern8 mode and set the link as active. @@ -1772,7 +1894,7 @@ static void ufshcd_gate_work(struct work_struct *work) if (hba->clk_gating.active_reqs || hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL - || ufshcd_any_tag_in_use(hba) || hba->outstanding_tasks + || hba->outstanding_reqs || hba->outstanding_tasks || hba->active_uic_cmd || hba->uic_async_done) goto rel_lock; @@ -1859,18 +1981,26 @@ static ssize_t ufshcd_clkgate_delay_show(struct device *dev, return sysfs_emit(buf, "%lu\n", hba->clk_gating.delay_ms); } -static ssize_t ufshcd_clkgate_delay_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) +void ufshcd_clkgate_delay_set(struct device *dev, unsigned long value) { struct ufs_hba *hba = dev_get_drvdata(dev); - unsigned long flags, value; - - if (kstrtoul(buf, 0, &value)) - return -EINVAL; + unsigned long flags; spin_lock_irqsave(hba->host->host_lock, flags); hba->clk_gating.delay_ms = value; spin_unlock_irqrestore(hba->host->host_lock, flags); +} +EXPORT_SYMBOL_GPL(ufshcd_clkgate_delay_set); + +static ssize_t ufshcd_clkgate_delay_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + unsigned long value; + + if (kstrtoul(buf, 0, &value)) + return -EINVAL; + + ufshcd_clkgate_delay_set(dev, value); return count; } @@ -1975,7 +2105,6 @@ static void ufshcd_exit_clk_gating(struct ufs_hba *hba) destroy_workqueue(hba->clk_gating.clk_gating_workq); } -/* Must be called with host lock acquired */ static void ufshcd_clk_scaling_start_busy(struct ufs_hba *hba) { bool queue_resume_work = false; @@ -2043,14 +2172,15 @@ static inline int ufshcd_monitor_opcode2dir(u8 opcode) static inline bool ufshcd_should_inform_monitor(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) { - struct ufs_hba_monitor *m = &hba->monitor; + const struct ufs_hba_monitor *m = &hba->monitor; return (m->enabled && lrbp && lrbp->cmd && (!m->chunk_size || m->chunk_size == lrbp->cmd->sdb.length) && ktime_before(hba->monitor.enabled_ts, lrbp->issue_time_stamp)); } -static void ufshcd_start_monitor(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) +static void ufshcd_start_monitor(struct ufs_hba *hba, + const struct ufshcd_lrb *lrbp) { int dir = ufshcd_monitor_opcode2dir(*lrbp->cmd->cmnd); unsigned long flags; @@ -2061,14 +2191,14 @@ static void ufshcd_start_monitor(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) spin_unlock_irqrestore(hba->host->host_lock, flags); } -static void ufshcd_update_monitor(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) +static void ufshcd_update_monitor(struct ufs_hba *hba, const struct ufshcd_lrb *lrbp) { int dir = ufshcd_monitor_opcode2dir(*lrbp->cmd->cmnd); unsigned long flags; spin_lock_irqsave(hba->host->host_lock, flags); if (dir >= 0 && hba->monitor.nr_queued[dir] > 0) { - struct request *req = scsi_cmd_to_rq(lrbp->cmd); + const struct request *req = scsi_cmd_to_rq(lrbp->cmd); struct ufs_hba_monitor *m = &hba->monitor; ktime_t now, inc, lat; @@ -2097,29 +2227,43 @@ static void ufshcd_update_monitor(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) * ufshcd_send_command - Send SCSI or device management commands * @hba: per adapter instance * @task_tag: Task tag of the command + * @hwq: pointer to hardware queue instance */ static inline -void ufshcd_send_command(struct ufs_hba 
*hba, unsigned int task_tag) +void ufshcd_send_command(struct ufs_hba *hba, unsigned int task_tag, + struct ufs_hw_queue *hwq) { struct ufshcd_lrb *lrbp = &hba->lrb[task_tag]; unsigned long flags; lrbp->issue_time_stamp = ktime_get(); + lrbp->issue_time_stamp_local_clock = local_clock(); lrbp->compl_time_stamp = ktime_set(0, 0); + lrbp->compl_time_stamp_local_clock = 0; + trace_android_vh_ufs_send_command(hba, lrbp); ufshcd_add_command_trace(hba, task_tag, UFS_CMD_SEND); ufshcd_clk_scaling_start_busy(hba); if (unlikely(ufshcd_should_inform_monitor(hba, lrbp))) ufshcd_start_monitor(hba, lrbp); - spin_lock_irqsave(&hba->outstanding_lock, flags); - if (hba->vops && hba->vops->setup_xfer_req) - hba->vops->setup_xfer_req(hba, task_tag, !!lrbp->cmd); - __set_bit(task_tag, &hba->outstanding_reqs); - ufshcd_writel(hba, 1 << task_tag, REG_UTP_TRANSFER_REQ_DOOR_BELL); - spin_unlock_irqrestore(&hba->outstanding_lock, flags); + if (is_mcq_enabled(hba)) { + int utrd_size = sizeof(struct utp_transfer_req_desc); - /* Make sure that doorbell is committed immediately */ - wmb(); + spin_lock(&hwq->sq_lock); + memcpy(hwq->sqe_base_addr + (hwq->sq_tail_slot * utrd_size), + lrbp->utr_descriptor_ptr, utrd_size); + ufshcd_inc_sq_tail(hwq); + spin_unlock(&hwq->sq_lock); + } else { + spin_lock_irqsave(&hba->outstanding_lock, flags); + if (hba->vops && hba->vops->setup_xfer_req) + hba->vops->setup_xfer_req(hba, lrbp->task_tag, + !!lrbp->cmd); + __set_bit(lrbp->task_tag, &hba->outstanding_reqs); + ufshcd_writel(hba, 1 << lrbp->task_tag, + REG_UTP_TRANSFER_REQ_DOOR_BELL); + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + } } /** @@ -2128,15 +2272,17 @@ void ufshcd_send_command(struct ufs_hba *hba, unsigned int task_tag) */ static inline void ufshcd_copy_sense_data(struct ufshcd_lrb *lrbp) { + u8 *const sense_buffer = lrbp->cmd->sense_buffer; int len; - if (lrbp->sense_buffer && + + if (sense_buffer && ufshcd_get_rsp_upiu_data_seg_len(lrbp->ucd_rsp_ptr)) { int len_to_copy; len = be16_to_cpu(lrbp->ucd_rsp_ptr->sr.sense_data_len); len_to_copy = min_t(int, UFS_SENSE_SIZE, len); - memcpy(lrbp->sense_buffer, lrbp->ucd_rsp_ptr->sr.sense_data, + memcpy(sense_buffer, lrbp->ucd_rsp_ptr->sr.sense_data, len_to_copy); } } @@ -2191,6 +2337,8 @@ static inline int ufshcd_hba_capabilities(struct ufs_hba *hba) int err; hba->capabilities = ufshcd_readl(hba, REG_CONTROLLER_CAPABILITIES); + if (hba->quirks & UFSHCD_QUIRK_BROKEN_64BIT_ADDRESS) + hba->capabilities &= ~MASK_64_ADDRESSING_SUPPORT; /* nutrs and nutmrs are 0 based values */ hba->nutrs = (hba->capabilities & MASK_TRANSFER_REQUESTS_SLOTS) + 1; @@ -2203,6 +2351,14 @@ static inline int ufshcd_hba_capabilities(struct ufs_hba *hba) if (err) dev_err(hba->dev, "crypto setup failed\n"); + hba->mcq_sup = FIELD_GET(MASK_MCQ_SUPPORT, hba->capabilities); + if (!hba->mcq_sup) + return err; + + hba->mcq_capabilities = ufshcd_readl(hba, REG_MCQCAP); + hba->ext_iid_sup = FIELD_GET(MASK_EXT_IID_SUPPORT, + hba->mcq_capabilities); + return err; } @@ -2214,10 +2370,7 @@ static inline int ufshcd_hba_capabilities(struct ufs_hba *hba) */ static inline bool ufshcd_ready_for_uic_cmd(struct ufs_hba *hba) { - if (ufshcd_readl(hba, REG_CONTROLLER_STATUS) & UIC_COMMAND_READY) - return true; - else - return false; + return ufshcd_readl(hba, REG_CONTROLLER_STATUS) & UIC_COMMAND_READY; } /** @@ -2338,6 +2491,9 @@ int ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd) int ret; unsigned long flags; + if (hba->quirks & UFSHCD_QUIRK_BROKEN_UIC_CMD) + return 0; + ufshcd_hold(hba, 
false); mutex_lock(&hba->uic_cmd_mutex); ufshcd_add_delay_before_dme_cmd(hba); @@ -2354,6 +2510,52 @@ int ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd) return ret; } +/** + * ufshcd_sgl_to_prdt - SG list to PRTD (Physical Region Description Table, 4DW format) + * @hba: per-adapter instance + * @lrbp: pointer to local reference block + * @sg_entries: The number of sg lists actually used + * @sg_list: Pointer to SG list + */ +static void ufshcd_sgl_to_prdt(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, int sg_entries, + struct scatterlist *sg_list) +{ + struct ufshcd_sg_entry *prd; + struct scatterlist *sg; + int i; + + if (sg_entries) { + + if (hba->quirks & UFSHCD_QUIRK_PRDT_BYTE_GRAN) + lrbp->utr_descriptor_ptr->prd_table_length = + cpu_to_le16(sg_entries * ufshcd_sg_entry_size(hba)); + else + lrbp->utr_descriptor_ptr->prd_table_length = cpu_to_le16(sg_entries); + + prd = lrbp->ucd_prdt_ptr; + + for_each_sg(sg_list, sg, sg_entries, i) { + const unsigned int len = sg_dma_len(sg); + + /* + * From the UFSHCI spec: "Data Byte Count (DBC): A '0' + * based value that indicates the length, in bytes, of + * the data block. A maximum of length of 256KB may + * exist for any entry. Bits 1:0 of this field shall be + * 11b to indicate Dword granularity. A value of '3' + * indicates 4 bytes, '7' indicates 8 bytes, etc." + */ + WARN_ONCE(len > 256 * 1024, "len = %#x\n", len); + prd->size = cpu_to_le32(len - 1); + prd->addr = cpu_to_le64(sg->dma_address); + prd->reserved = 0; + prd = (void *)prd + ufshcd_sg_entry_size(hba); + } + } else { + lrbp->utr_descriptor_ptr->prd_table_length = 0; + } +} + /** * ufshcd_map_sg - Map scatter-gather list to prdt * @hba: per adapter instance @@ -2363,43 +2565,18 @@ int ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd) */ static int ufshcd_map_sg(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) { - struct ufshcd_sg_entry *prd_table; - struct scatterlist *sg; - struct scsi_cmnd *cmd; - int sg_segments; - int i; + struct scsi_cmnd *cmd = lrbp->cmd; + int sg_segments = scsi_dma_map(cmd); + int err; - cmd = lrbp->cmd; - sg_segments = scsi_dma_map(cmd); if (sg_segments < 0) return sg_segments; - if (sg_segments) { + ufshcd_sgl_to_prdt(hba, lrbp, sg_segments, scsi_sglist(cmd)); - if (hba->quirks & UFSHCD_QUIRK_PRDT_BYTE_GRAN) - lrbp->utr_descriptor_ptr->prd_table_length = - cpu_to_le16((sg_segments * - sizeof(struct ufshcd_sg_entry))); - else - lrbp->utr_descriptor_ptr->prd_table_length = - cpu_to_le16((u16) (sg_segments)); - - prd_table = (struct ufshcd_sg_entry *)lrbp->ucd_prdt_ptr; - - scsi_for_each_sg(cmd, sg, sg_segments, i) { - prd_table[i].size = - cpu_to_le32(((u32) sg_dma_len(sg))-1); - prd_table[i].base_addr = - cpu_to_le32(lower_32_bits(sg->dma_address)); - prd_table[i].upper_addr = - cpu_to_le32(upper_32_bits(sg->dma_address)); - prd_table[i].reserved = 0; - } - } else { - lrbp->utr_descriptor_ptr->prd_table_length = 0; - } - - return 0; + err = 0; + trace_android_vh_ufs_fill_prdt(hba, lrbp, sg_segments, &err); + return err; } /** @@ -2445,14 +2622,15 @@ static void ufshcd_disable_intr(struct ufs_hba *hba, u32 intrs) } /** - * ufshcd_prepare_req_desc_hdr() - Fills the requests header + * ufshcd_prepare_req_desc_hdr - Fill UTP Transfer request descriptor header according to request * descriptor according to request * @lrbp: pointer to local reference block * @upiu_flags: flags required in the header * @cmd_dir: requests data direction + * @ehs_length: Total EHS Length (in 32‐bytes units of all Extra Header Segments) */ 
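The UFSHCI excerpt quoted inside ufshcd_sgl_to_prdt() above is why the code stores sg_dma_len(sg) - 1: the Data Byte Count field is zero-based and Dword-granular, capped at 256KB per entry. A compilable model of that encoding with the same bounds the WARN_ONCE() enforces; prdt_encode_dbc() is a hypothetical helper name, not a driver function:

#include <assert.h>
#include <stdint.h>

static uint32_t prdt_encode_dbc(uint32_t len)
{
	assert(len != 0 && len <= 256 * 1024);	/* max 256KB per PRDT entry */
	assert((len & 3) == 0);			/* Dword granularity */
	return len - 1;				/* 0-based: 4 bytes -> 3, 8 -> 7 */
}

int main(void)
{
	/* a 4096-byte segment encodes as 0xfff, bits 1:0 read back as 11b */
	return prdt_encode_dbc(4096) == 0xfff ? 0 : 1;
}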
-static void ufshcd_prepare_req_desc_hdr(struct ufshcd_lrb *lrbp, - u8 *upiu_flags, enum dma_data_direction cmd_dir) +static void ufshcd_prepare_req_desc_hdr(struct ufshcd_lrb *lrbp, u8 *upiu_flags, + enum dma_data_direction cmd_dir, int ehs_length) { struct utp_transfer_req_desc *req_desc = lrbp->utr_descriptor_ptr; u32 data_direction; @@ -2471,8 +2649,8 @@ static void ufshcd_prepare_req_desc_hdr(struct ufshcd_lrb *lrbp, *upiu_flags = UPIU_CMD_FLAGS_NONE; } - dword_0 = data_direction | (lrbp->command_type - << UPIU_COMMAND_TYPE_OFFSET); + dword_0 = data_direction | (lrbp->command_type << UPIU_COMMAND_TYPE_OFFSET) | + ehs_length << 8; if (lrbp->intr_cmd) dword_0 |= UTP_REQ_DESC_INT_CMD; @@ -2527,8 +2705,7 @@ void ufshcd_prepare_utp_scsi_cmd_upiu(struct ufshcd_lrb *lrbp, u8 upiu_flags) } /** - * ufshcd_prepare_utp_query_req_upiu() - fills the utp_transfer_req_desc, - * for query requsts + * ufshcd_prepare_utp_query_req_upiu() - fill the utp_transfer_req_desc for query request * @hba: UFS hba * @lrbp: local reference block pointer * @upiu_flags: flags @@ -2599,7 +2776,7 @@ static int ufshcd_compose_devman_upiu(struct ufs_hba *hba, else lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE; - ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, DMA_NONE); + ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, DMA_NONE, 0); if (hba->dev_cmd.type == DEV_CMD_TYPE_QUERY) ufshcd_prepare_utp_query_req_upiu(hba, lrbp, upiu_flags); else if (hba->dev_cmd.type == DEV_CMD_TYPE_NOP) @@ -2627,8 +2804,7 @@ static int ufshcd_comp_scsi_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE; if (likely(lrbp->cmd)) { - ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, - lrbp->cmd->sc_data_direction); + ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, lrbp->cmd->sc_data_direction, 0); ufshcd_prepare_utp_scsi_cmd_upiu(lrbp, upiu_flags); } else { ret = -EINVAL; @@ -2648,23 +2824,52 @@ static inline u16 ufshcd_upiu_wlun_to_scsi_wlun(u8 upiu_wlun_id) return (upiu_wlun_id & ~UFS_UPIU_WLUN_ID) | SCSI_W_LUN_BASE; } -static inline bool is_rpmb_wlun(struct scsi_device *sdev) -{ - return sdev->lun == ufshcd_upiu_wlun_to_scsi_wlun(UFS_UPIU_RPMB_WLUN); -} - static inline bool is_device_wlun(struct scsi_device *sdev) { return sdev->lun == ufshcd_upiu_wlun_to_scsi_wlun(UFS_UPIU_UFS_DEVICE_WLUN); } +/* + * Associate the UFS controller queue with the default and poll HCTX types. + * Initialize the mq_map[] arrays. 
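The ufshcd_map_queues() implementation follows below; before it, a userspace model of its offset bookkeeping. Each hctx type gets a contiguous block of hardware queues, and the poll map of a single-queue (non-MCQ) host aliases queue 0. The enum values mirror the kernel's HCTX_TYPE_* constants and the nr_queues layout shown is the non-MCQ one the function sets up; both are assumptions of this sketch:

#include <stdio.h>

enum { HCTX_DEFAULT, HCTX_READ, HCTX_POLL, HCTX_TYPES };

int main(void)
{
	int nr_queues[HCTX_TYPES] = {1, 0, 1};	/* non-MCQ: one I/O, one poll */
	int offset = 0;

	for (int i = 0; i < HCTX_TYPES; i++) {
		if (!nr_queues[i])
			continue;	/* HCTX_READ is unused here */
		/* the poll map reuses queue 0 on a single-queue host */
		int queue_offset = (i == HCTX_POLL) ? 0 : offset;

		printf("type %d: %d queue(s) at offset %d\n",
		       i, nr_queues[i], queue_offset);
		offset += nr_queues[i];
	}
	return 0;
}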
+ */ +static int ufshcd_map_queues(struct Scsi_Host *shost) +{ + struct ufs_hba *hba = shost_priv(shost); + int i, queue_offset = 0; + + if (!is_mcq_supported(hba)) { + hba->nr_queues[HCTX_TYPE_DEFAULT] = 1; + hba->nr_queues[HCTX_TYPE_READ] = 0; + hba->nr_queues[HCTX_TYPE_POLL] = 1; + hba->nr_hw_queues = 1; + } + + for (i = 0; i < shost->nr_maps; i++) { + struct blk_mq_queue_map *map = &shost->tag_set.map[i]; + + map->nr_queues = hba->nr_queues[i]; + if (!map->nr_queues) + continue; + map->queue_offset = queue_offset; + if (i == HCTX_TYPE_POLL && !is_mcq_supported(hba)) + map->queue_offset = 0; + + blk_mq_map_queues(map); + queue_offset += map->nr_queues; + } + + return 0; +} + static void ufshcd_init_lrb(struct ufs_hba *hba, struct ufshcd_lrb *lrb, int i) { - struct utp_transfer_cmd_desc *cmd_descp = hba->ucdl_base_addr; + struct utp_transfer_cmd_desc *cmd_descp = (void *)hba->ucdl_base_addr + + i * sizeof_utp_transfer_cmd_desc(hba); struct utp_transfer_req_desc *utrdlp = hba->utrdl_base_addr; dma_addr_t cmd_desc_element_addr = hba->ucdl_dma_addr + - i * sizeof(struct utp_transfer_cmd_desc); + i * sizeof_utp_transfer_cmd_desc(hba); u16 response_offset = offsetof(struct utp_transfer_cmd_desc, response_upiu); u16 prdt_offset = offsetof(struct utp_transfer_cmd_desc, prd_table); @@ -2672,11 +2877,11 @@ static void ufshcd_init_lrb(struct ufs_hba *hba, struct ufshcd_lrb *lrb, int i) lrb->utr_descriptor_ptr = utrdlp + i; lrb->utrd_dma_addr = hba->utrdl_dma_addr + i * sizeof(struct utp_transfer_req_desc); - lrb->ucd_req_ptr = (struct utp_upiu_req *)(cmd_descp + i); + lrb->ucd_req_ptr = (struct utp_upiu_req *)cmd_descp->command_upiu; lrb->ucd_req_dma_addr = cmd_desc_element_addr; - lrb->ucd_rsp_ptr = (struct utp_upiu_rsp *)cmd_descp[i].response_upiu; + lrb->ucd_rsp_ptr = (struct utp_upiu_rsp *)cmd_descp->response_upiu; lrb->ucd_rsp_dma_addr = cmd_desc_element_addr + response_offset; - lrb->ucd_prdt_ptr = (struct ufshcd_sg_entry *)cmd_descp[i].prd_table; + lrb->ucd_prdt_ptr = (struct ufshcd_sg_entry *)cmd_descp->prd_table; lrb->ucd_prdt_dma_addr = cmd_desc_element_addr + prdt_offset; } @@ -2693,11 +2898,9 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) int tag = scsi_cmd_to_rq(cmd)->tag; struct ufshcd_lrb *lrbp; int err = 0; + struct ufs_hw_queue *hwq = NULL; - WARN_ONCE(tag < 0, "Invalid tag %d\n", tag); - - if (!down_read_trylock(&hba->clk_scaling_lock)) - return SCSI_MLQUEUE_HOST_BUSY; + WARN_ONCE(tag < 0 || tag >= hba->nutrs, "Invalid tag %d\n", tag); /* * Allows the UFS error handler to wait for prior ufshcd_queuecommand() @@ -2707,7 +2910,19 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) switch (hba->ufshcd_state) { case UFSHCD_STATE_OPERATIONAL: + break; case UFSHCD_STATE_EH_SCHEDULED_NON_FATAL: + /* + * SCSI error handler can call ->queuecommand() while UFS error + * handler is in progress. Error interrupts could change the + * state from UFSHCD_STATE_RESET to + * UFSHCD_STATE_EH_SCHEDULED_NON_FATAL. Prevent requests + * being issued in that case. 
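ufshcd_init_lrb() above derives every per-tag pointer from one DMA-coherent array whose element stride is now sizeof_utp_transfer_cmd_desc(hba) rather than a fixed struct size, with the response UPIU and PRDT at fixed offsets inside each element. A model of that addressing; the stride, offsets, and base address below are invented for illustration, the driver derives them from the controller's capabilities and offsetof():

#include <stdint.h>
#include <stdio.h>

#define CMD_DESC_SIZE	1024u	/* hypothetical sizeof_utp_transfer_cmd_desc() */
#define RESPONSE_OFFSET	 256u	/* offsetof(..., response_upiu), illustrative */
#define PRDT_OFFSET	 512u	/* offsetof(..., prd_table), illustrative */

int main(void)
{
	uint64_t ucdl_dma_addr = 0x80000000u;	/* hypothetical DMA base */

	for (unsigned int tag = 0; tag < 4; tag++) {
		uint64_t elem = ucdl_dma_addr + (uint64_t)tag * CMD_DESC_SIZE;

		printf("tag %u: req %#llx rsp %#llx prdt %#llx\n", tag,
		       (unsigned long long)elem,
		       (unsigned long long)(elem + RESPONSE_OFFSET),
		       (unsigned long long)(elem + PRDT_OFFSET));
	}
	return 0;
}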
+ */ + if (ufshcd_eh_in_progress(hba)) { + err = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } break; case UFSHCD_STATE_EH_SCHEDULED_FATAL: /* @@ -2749,14 +2964,20 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) lrbp = &hba->lrb[tag]; WARN_ON(lrbp->cmd); lrbp->cmd = cmd; - lrbp->sense_bufflen = UFS_SENSE_SIZE; - lrbp->sense_buffer = cmd->sense_buffer; lrbp->task_tag = tag; lrbp->lun = ufshcd_scsi_to_upiu_lun(cmd->device->lun); - lrbp->intr_cmd = !ufshcd_is_intr_aggr_allowed(hba) ? true : false; + lrbp->intr_cmd = !ufshcd_is_intr_aggr_allowed(hba); ufshcd_prepare_lrbp_crypto(scsi_cmd_to_rq(cmd), lrbp); + trace_android_vh_ufs_prepare_command(hba, scsi_cmd_to_rq(cmd), lrbp, + &err); + if (err) { + lrbp->cmd = NULL; + ufshcd_release(hba); + goto out; + } + lrbp->req_abort_skip = false; ufshpb_prep(hba, lrbp); @@ -2770,13 +2991,14 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) goto out; } - ufshcd_send_command(hba, tag); + if (is_mcq_enabled(hba)) + hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(cmd)); + + ufshcd_send_command(hba, tag, hwq); out: rcu_read_unlock(); - up_read(&hba->clk_scaling_lock); - if (ufs_trigger_eh()) { unsigned long flags; @@ -2792,8 +3014,6 @@ static int ufshcd_compose_dev_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, enum dev_cmd_type cmd_type, int tag) { lrbp->cmd = NULL; - lrbp->sense_bufflen = 0; - lrbp->sense_buffer = NULL; lrbp->task_tag = tag; lrbp->lun = 0; /* device management cmd is not specific to any LUN */ lrbp->intr_cmd = true; /* No interrupt aggregation */ @@ -2803,27 +3023,26 @@ static int ufshcd_compose_dev_cmd(struct ufs_hba *hba, return ufshcd_compose_devman_upiu(hba, lrbp); } -static int -ufshcd_clear_cmd(struct ufs_hba *hba, int tag) +/* + * Clear all the requests from the controller for which a bit has been set in + * @mask and wait until the controller confirms that these requests have been + * cleared. + */ +static int ufshcd_clear_cmds(struct ufs_hba *hba, u32 mask) { - int err = 0; unsigned long flags; - u32 mask = 1 << tag; /* clear outstanding transaction before retry */ spin_lock_irqsave(hba->host->host_lock, flags); - ufshcd_utrl_clear(hba, tag); + ufshcd_utrl_clear(hba, mask); spin_unlock_irqrestore(hba->host->host_lock, flags); /* * wait for h/w to clear corresponding bit in door-bell. * max. wait is 1 sec. 
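ufshcd_clear_cmds() above funnels into the UTRLCLR semantics documented earlier in this patch: software writes '0' to each slot it wants cleared, so the spec-compliant write is ~mask, and UFSHCI_QUIRK_BROKEN_REQ_LIST_CLR pre-inverts for controllers that expect the straight mask. Restated as a pure function, assuming a 32-slot transfer request list; utrlclr_value() is a made-up name for illustration:

static unsigned int utrlclr_value(unsigned int mask, int broken_clr_quirk)
{
	if (broken_clr_quirk)
		mask = ~mask;	/* these HCs want 1 = clear this slot */
	return ~mask;		/* per spec, 0 = clear, 1 = leave alone */
}

int main(void)
{
	/* clearing tag 5 on a spec-compliant HC: all bits 1 except bit 5 */
	return utrlclr_value(1u << 5, 0) == ~(1u << 5) ? 0 : 1;
}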
*/ - err = ufshcd_wait_for_register(hba, - REG_UTP_TRANSFER_REQ_DOOR_BELL, - mask, ~mask, 1000, 1000); - - return err; + return ufshcd_wait_for_register(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL, + mask, ~mask, 1000, 1000); } static int @@ -2870,6 +3089,12 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) dev_err(hba->dev, "%s: Reject UPIU not fully implemented\n", __func__); break; + case UPIU_TRANSACTION_RESPONSE: + if (hba->dev_cmd.type != DEV_CMD_TYPE_RPMB) { + err = -EINVAL; + dev_err(hba->dev, "%s: unexpected response %x\n", __func__, resp); + } + break; default: err = -EINVAL; dev_err(hba->dev, "%s: Invalid device management cmd response: %x\n", @@ -2883,37 +3108,75 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) static int ufshcd_wait_for_dev_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, int max_timeout) { - int err = 0; - unsigned long time_left; + unsigned long time_left = msecs_to_jiffies(max_timeout); unsigned long flags; + bool pending; + int err; +retry: time_left = wait_for_completion_timeout(hba->dev_cmd.complete, - msecs_to_jiffies(max_timeout)); + time_left); - spin_lock_irqsave(hba->host->host_lock, flags); - hba->dev_cmd.complete = NULL; if (likely(time_left)) { - err = ufshcd_get_tr_ocs(lrbp); + /* + * The completion handler called complete() and the caller of + * this function still owns the @lrbp tag so the code below does + * not trigger any race conditions. + */ + hba->dev_cmd.complete = NULL; + err = ufshcd_get_tr_ocs(lrbp, hba->dev_cmd.cqe); if (!err) err = ufshcd_dev_cmd_completion(hba, lrbp); - } - spin_unlock_irqrestore(hba->host->host_lock, flags); - - if (!time_left) { + } else { err = -ETIMEDOUT; dev_dbg(hba->dev, "%s: dev_cmd request timedout, tag %d\n", __func__, lrbp->task_tag); - if (!ufshcd_clear_cmd(hba, lrbp->task_tag)) + if (ufshcd_clear_cmds(hba, 1U << lrbp->task_tag) == 0) { /* successfully cleared the command, retry if needed */ err = -EAGAIN; - /* - * in case of an error, after clearing the doorbell, - * we also need to clear the outstanding_request - * field in hba - */ - spin_lock_irqsave(&hba->outstanding_lock, flags); - __clear_bit(lrbp->task_tag, &hba->outstanding_reqs); - spin_unlock_irqrestore(&hba->outstanding_lock, flags); + /* + * Since clearing the command succeeded we also need to + * clear the task tag bit from the outstanding_reqs + * variable. + */ + spin_lock_irqsave(&hba->outstanding_lock, flags); + pending = test_bit(lrbp->task_tag, + &hba->outstanding_reqs); + if (pending) { + hba->dev_cmd.complete = NULL; + __clear_bit(lrbp->task_tag, + &hba->outstanding_reqs); + } + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + if (!pending) { + /* + * The completion handler ran while we tried to + * clear the command. + */ + time_left = 1; + goto retry; + } + } else { + dev_err(hba->dev, "%s: failed to clear tag %d\n", + __func__, lrbp->task_tag); + + spin_lock_irqsave(&hba->outstanding_lock, flags); + pending = test_bit(lrbp->task_tag, + &hba->outstanding_reqs); + if (pending) + hba->dev_cmd.complete = NULL; + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + if (!pending) { + /* + * The completion handler ran while we tried to + * clear the command. 
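The retry logic in ufshcd_wait_for_dev_cmd() is subtle: after a timeout, the tag is only treated as dead if its bit is still set in outstanding_reqs under the lock; if the completion path already cleared it, the function retries the wait with time_left = 1 to pick up the completion. A pthread-based skeleton of just that claim-or-retry decision, where the mutex and bitmap are stand-ins for the driver's outstanding_lock and outstanding_reqs:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long outstanding_reqs;

/* true: caller owns the timeout (tag was still pending and is now cleared);
 * false: the completion handler won the race, caller should wait again. */
static bool claim_timed_out_tag(int tag)
{
	bool pending;

	pthread_mutex_lock(&lock);
	pending = outstanding_reqs & (1UL << tag);
	if (pending)
		outstanding_reqs &= ~(1UL << tag);
	pthread_mutex_unlock(&lock);

	return pending;
}

int main(void)
{
	outstanding_reqs = 1UL << 3;
	return claim_timed_out_tag(3) ? 0 : 1;
}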
+ */ + time_left = 1; + goto retry; + } + } } return err; @@ -2948,10 +3211,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba, goto out; hba->dev_cmd.complete = &wait; + hba->dev_cmd.cqe = NULL; ufshcd_add_query_upiu_trace(hba, UFS_QUERY_SEND, lrbp->ucd_req_ptr); - ufshcd_send_command(hba, tag); + ufshcd_send_command(hba, tag, hba->dev_cmd_queue); err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout); ufshcd_add_query_upiu_trace(hba, err ? UFS_QUERY_ERR : UFS_QUERY_COMP, (struct utp_upiu_req *)lrbp->ucd_rsp_ptr); @@ -2985,7 +3249,7 @@ static inline void ufshcd_init_query(struct ufs_hba *hba, (*request)->upiu_req.selector = selector; } -static int ufshcd_query_flag_retry(struct ufs_hba *hba, +int ufshcd_query_flag_retry(struct ufs_hba *hba, enum query_opcode opcode, enum flag_idn idn, u8 index, bool *flag_res) { int ret; @@ -3003,10 +3267,11 @@ static int ufshcd_query_flag_retry(struct ufs_hba *hba, if (ret) dev_err(hba->dev, - "%s: query attribute, opcode %d, idn %d, failed with error %d after %d retires\n", + "%s: query flag, opcode %d, idn %d, failed with error %d after %d retries\n", __func__, opcode, idn, ret, retries); return ret; } +EXPORT_SYMBOL_GPL(ufshcd_query_flag_retry); /** * ufshcd_query_flag() - API function for sending flag query requests @@ -3075,6 +3340,7 @@ out_unlock: ufshcd_release(hba); return err; } +EXPORT_SYMBOL_GPL(ufshcd_query_flag); /** * ufshcd_query_attr - API function for sending attribute requests @@ -3138,6 +3404,7 @@ out_unlock: ufshcd_release(hba); return err; } +EXPORT_SYMBOL_GPL(ufshcd_query_attr); /** * ufshcd_query_attr_retry() - API function for sending query @@ -3171,10 +3438,11 @@ int ufshcd_query_attr_retry(struct ufs_hba *hba, if (ret) dev_err(hba->dev, - "%s: query attribute, idn %d, failed with error %d after %d retires\n", + "%s: query attribute, idn %d, failed with error %d after %d retries\n", __func__, idn, ret, QUERY_REQ_RETRIES); return ret; } +EXPORT_SYMBOL_GPL(ufshcd_query_attr_retry); static int __ufshcd_query_descriptor(struct ufs_hba *hba, enum query_opcode opcode, enum desc_idn idn, u8 index, @@ -3270,37 +3538,7 @@ int ufshcd_query_descriptor_retry(struct ufs_hba *hba, return err; } - -/** - * ufshcd_map_desc_id_to_length - map descriptor IDN to its length - * @hba: Pointer to adapter instance - * @desc_id: descriptor idn value - * @desc_len: mapped desc length (out) - */ -void ufshcd_map_desc_id_to_length(struct ufs_hba *hba, enum desc_idn desc_id, - int *desc_len) -{ - if (desc_id >= QUERY_DESC_IDN_MAX || desc_id == QUERY_DESC_IDN_RFU_0 || - desc_id == QUERY_DESC_IDN_RFU_1) - *desc_len = 0; - else - *desc_len = hba->desc_size[desc_id]; -} -EXPORT_SYMBOL(ufshcd_map_desc_id_to_length); - -static void ufshcd_update_desc_length(struct ufs_hba *hba, - enum desc_idn desc_id, int desc_index, - unsigned char desc_len) -{ - if (hba->desc_size[desc_id] == QUERY_DESC_MAX_SIZE && - desc_id != QUERY_DESC_IDN_STRING && desc_index != UFS_RPMB_UNIT) - /* For UFS 3.1, the normal unit descriptor is 10 bytes larger - * than the RPMB unit, however, both descriptors share the same - * desc_idn, to cover both unit descriptors with one length, we - * choose the normal unit descriptor length by desc_index. 
- */ - hba->desc_size[desc_id] = desc_len; -} +EXPORT_SYMBOL_GPL(ufshcd_query_descriptor_retry); /** * ufshcd_read_desc_param - read the specified descriptor parameter @@ -3322,26 +3560,13 @@ int ufshcd_read_desc_param(struct ufs_hba *hba, { int ret; u8 *desc_buf; - int buff_len; + int buff_len = QUERY_DESC_MAX_SIZE; bool is_kmalloc = true; /* Safety check */ if (desc_id >= QUERY_DESC_IDN_MAX || !param_size) return -EINVAL; - /* Get the length of descriptor */ - ufshcd_map_desc_id_to_length(hba, desc_id, &buff_len); - if (!buff_len) { - dev_err(hba->dev, "%s: Failed to get desc length\n", __func__); - return -EINVAL; - } - - if (param_offset >= buff_len) { - dev_err(hba->dev, "%s: Invalid offset 0x%x in descriptor IDN 0x%x, length 0x%x\n", - __func__, param_offset, desc_id, buff_len); - return -EINVAL; - } - /* Check whether we need temp memory */ if (param_offset != 0 || param_size < buff_len) { desc_buf = kzalloc(buff_len, GFP_KERNEL); @@ -3354,15 +3579,24 @@ int ufshcd_read_desc_param(struct ufs_hba *hba, /* Request for full descriptor */ ret = ufshcd_query_descriptor_retry(hba, UPIU_QUERY_OPCODE_READ_DESC, - desc_id, desc_index, 0, - desc_buf, &buff_len); - + desc_id, desc_index, 0, + desc_buf, &buff_len); if (ret) { dev_err(hba->dev, "%s: Failed reading descriptor. desc_id %d, desc_index %d, param_offset %d, ret %d\n", __func__, desc_id, desc_index, param_offset, ret); goto out; } + /* Update descriptor length */ + buff_len = desc_buf[QUERY_DESC_LENGTH_OFFSET]; + + if (param_offset >= buff_len) { + dev_err(hba->dev, "%s: Invalid offset 0x%x in descriptor IDN 0x%x, length 0x%x\n", + __func__, param_offset, desc_id, buff_len); + ret = -EINVAL; + goto out; + } + /* Sanity check */ if (desc_buf[QUERY_DESC_DESC_TYPE_OFFSET] != desc_id) { dev_err(hba->dev, "%s: invalid desc_id %d in descriptor header\n", @@ -3371,10 +3605,6 @@ int ufshcd_read_desc_param(struct ufs_hba *hba, goto out; } - /* Update descriptor length */ - buff_len = desc_buf[QUERY_DESC_LENGTH_OFFSET]; - ufshcd_update_desc_length(hba, desc_id, desc_index, buff_len); - if (is_kmalloc) { /* Make sure we don't copy more data than available */ if (param_offset >= buff_len) @@ -3388,6 +3618,7 @@ out: kfree(desc_buf); return ret; } +EXPORT_SYMBOL_GPL(ufshcd_read_desc_param); /** * struct uc_string_id - unicode string @@ -3511,7 +3742,7 @@ static inline int ufshcd_read_unit_desc_param(struct ufs_hba *hba, * Unit descriptors are only available for general purpose LUs (LUN id * from 0 to 7) and RPMB Well known LU. */ - if (!ufs_is_valid_unit_desc_lun(&hba->dev_info, lun, param_offset)) + if (!ufs_is_valid_unit_desc_lun(&hba->dev_info, lun)) return -EOPNOTSUPP; return ufshcd_read_desc_param(hba, QUERY_DESC_IDN_UNIT, lun, @@ -3561,7 +3792,7 @@ static int ufshcd_memory_alloc(struct ufs_hba *hba) size_t utmrdl_size, utrdl_size, ucdl_size; /* Allocate memory for UTP command descriptors */ - ucdl_size = (sizeof(struct utp_transfer_cmd_desc) * hba->nutrs); + ucdl_size = sizeof_utp_transfer_cmd_desc(hba) * hba->nutrs; hba->ucdl_base_addr = dmam_alloc_coherent(hba->dev, ucdl_size, &hba->ucdl_dma_addr, @@ -3569,12 +3800,9 @@ static int ufshcd_memory_alloc(struct ufs_hba *hba) /* * UFSHCI requires UTP command descriptor to be 128 byte aligned. 
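The reworked WARN_ONs that follow check the alignments UFSHCI actually mandates, 128 bytes for the UTP command descriptor array and 1024 bytes for the UTRD/UTMRD lists, instead of the previous PAGE_SIZE test. The check is the usual power-of-two trick, shown here as a standalone helper:

#include <stdbool.h>
#include <stdint.h>

/* align must be a power of two (128, 1024, ...) */
static bool is_aligned(uint64_t dma_addr, uint64_t align)
{
	return (dma_addr & (align - 1)) == 0;
}

int main(void)
{
	/* models WARN_ON(hba->ucdl_dma_addr & (128 - 1)) in the hunk below */
	return is_aligned(0x80000000u, 128) ? 0 : 1;
}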
- * make sure hba->ucdl_dma_addr is aligned to PAGE_SIZE - * if hba->ucdl_dma_addr is aligned to PAGE_SIZE, then it will - * be aligned to 128 bytes as well */ if (!hba->ucdl_base_addr || - WARN_ON(hba->ucdl_dma_addr & (PAGE_SIZE - 1))) { + WARN_ON(hba->ucdl_dma_addr & (128 - 1))) { dev_err(hba->dev, "Command Descriptor Memory allocation failed\n"); goto out; @@ -3590,12 +3818,20 @@ static int ufshcd_memory_alloc(struct ufs_hba *hba) &hba->utrdl_dma_addr, GFP_KERNEL); if (!hba->utrdl_base_addr || - WARN_ON(hba->utrdl_dma_addr & (PAGE_SIZE - 1))) { + WARN_ON(hba->utrdl_dma_addr & (1024 - 1))) { dev_err(hba->dev, "Transfer Descriptor Memory allocation failed\n"); goto out; } + /* + * Skip utmrdl allocation; it may have been + * allocated during first pass and not released during + * MCQ memory allocation. + * See ufshcd_release_sdb_queue() and ufshcd_config_mcq() + */ + if (hba->utmrdl_base_addr) + goto skip_utmrdl; /* * Allocate memory for UTP Task Management descriptors * UFSHCI requires 1024 byte alignment of UTMRD @@ -3606,12 +3842,13 @@ static int ufshcd_memory_alloc(struct ufs_hba *hba) &hba->utmrdl_dma_addr, GFP_KERNEL); if (!hba->utmrdl_base_addr || - WARN_ON(hba->utmrdl_dma_addr & (PAGE_SIZE - 1))) { + WARN_ON(hba->utmrdl_dma_addr & (1024 - 1))) { dev_err(hba->dev, "Task Management Descriptor Memory allocation failed\n"); goto out; } +skip_utmrdl: /* Allocate memory for local reference block */ hba->lrb = devm_kcalloc(hba->dev, hba->nutrs, sizeof(struct ufshcd_lrb), @@ -3655,7 +3892,7 @@ static void ufshcd_host_memory_configure(struct ufs_hba *hba) prdt_offset = offsetof(struct utp_transfer_cmd_desc, prd_table); - cmd_desc_size = sizeof(struct utp_transfer_cmd_desc); + cmd_desc_size = sizeof_utp_transfer_cmd_desc(hba); cmd_desc_dma_addr = hba->ucdl_dma_addr; for (i = 0; i < hba->nutrs; i++) { @@ -3742,7 +3979,7 @@ int ufshcd_dme_configure_adapt(struct ufs_hba *hba, { int ret; - if (agreed_gear != UFS_HS_G4) + if (agreed_gear < UFS_HS_G4) adapt_val = PA_NO_ADAPT; ret = ufshcd_dme_set(hba, @@ -4031,7 +4268,7 @@ out_unlock: * * Returns 0 on success, non-zero value on failure */ -static int ufshcd_uic_change_pwr_mode(struct ufs_hba *hba, u8 mode) +int ufshcd_uic_change_pwr_mode(struct ufs_hba *hba, u8 mode) { struct uic_command uic_cmd = {0}; int ret; @@ -4056,6 +4293,7 @@ static int ufshcd_uic_change_pwr_mode(struct ufs_hba *hba, u8 mode) out: return ret; } +EXPORT_SYMBOL_GPL(ufshcd_uic_change_pwr_mode); int ufshcd_link_recovery(struct ufs_hba *hba) { @@ -4086,7 +4324,7 @@ int ufshcd_link_recovery(struct ufs_hba *hba) } EXPORT_SYMBOL_GPL(ufshcd_link_recovery); -static int ufshcd_uic_hibern8_enter(struct ufs_hba *hba) +int ufshcd_uic_hibern8_enter(struct ufs_hba *hba) { int ret; struct uic_command uic_cmd = {0}; @@ -4108,6 +4346,7 @@ static int ufshcd_uic_hibern8_enter(struct ufs_hba *hba) return ret; } +EXPORT_SYMBOL_GPL(ufshcd_uic_hibern8_enter); int ufshcd_uic_hibern8_exit(struct ufs_hba *hba) { @@ -4128,7 +4367,7 @@ int ufshcd_uic_hibern8_exit(struct ufs_hba *hba) } else { ufshcd_vops_hibern8_notify(hba, UIC_CMD_DME_HIBER_EXIT, POST_CHANGE); - hba->ufs_stats.last_hibern8_exit_tstamp = ktime_get(); + hba->ufs_stats.last_hibern8_exit_tstamp = local_clock(); hba->ufs_stats.hibern8_exit_cnt++; } @@ -4152,7 +4391,7 @@ void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit) spin_unlock_irqrestore(hba->host->host_lock, flags); if (update && - !pm_runtime_suspended(&hba->sdev_ufs_device->sdev_gendev)) { + !pm_runtime_suspended(&hba->ufs_device_wlun->sdev_gendev)) { 
ufshcd_rpm_get_sync(hba); ufshcd_hold(hba, false); ufshcd_auto_hibern8_enable(hba); @@ -4164,14 +4403,10 @@ EXPORT_SYMBOL_GPL(ufshcd_auto_hibern8_update); void ufshcd_auto_hibern8_enable(struct ufs_hba *hba) { - unsigned long flags; - if (!ufshcd_is_auto_hibern8_supported(hba)) return; - spin_lock_irqsave(hba->host->host_lock, flags); ufshcd_writel(hba, hba->ahit, REG_AUTO_HIBERNATE_IDLE_TIMER); - spin_unlock_irqrestore(hba->host->host_lock, flags); } /** @@ -4201,8 +4436,13 @@ static int ufshcd_get_max_pwr_mode(struct ufs_hba *hba) if (hba->max_pwr_info.is_valid) return 0; - pwr_info->pwr_tx = FAST_MODE; - pwr_info->pwr_rx = FAST_MODE; + if (hba->quirks & UFSHCD_QUIRK_HIBERN_FASTAUTO) { + pwr_info->pwr_tx = FASTAUTO_MODE; + pwr_info->pwr_rx = FASTAUTO_MODE; + } else { + pwr_info->pwr_tx = FAST_MODE; + pwr_info->pwr_rx = FAST_MODE; + } pwr_info->hs_rate = PA_HS_MODE_B; /* Get the connected lane count */ @@ -4282,18 +4522,18 @@ static int ufshcd_change_power_mode(struct ufs_hba *hba, pwr_mode->lane_rx); if (pwr_mode->pwr_rx == FASTAUTO_MODE || pwr_mode->pwr_rx == FAST_MODE) - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXTERMINATION), TRUE); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXTERMINATION), true); else - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXTERMINATION), FALSE); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXTERMINATION), false); ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXGEAR), pwr_mode->gear_tx); ufshcd_dme_set(hba, UIC_ARG_MIB(PA_ACTIVETXDATALANES), pwr_mode->lane_tx); if (pwr_mode->pwr_tx == FASTAUTO_MODE || pwr_mode->pwr_tx == FAST_MODE) - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXTERMINATION), TRUE); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXTERMINATION), true); else - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXTERMINATION), FALSE); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXTERMINATION), false); if (pwr_mode->pwr_rx == FASTAUTO_MODE || pwr_mode->pwr_tx == FASTAUTO_MODE || @@ -4380,7 +4620,7 @@ static int ufshcd_complete_dev_init(struct ufs_hba *hba) QUERY_FLAG_IDN_FDEVICEINIT, 0, NULL); if (err) { dev_err(hba->dev, - "%s setting fDeviceInit flag failed with error %d\n", + "%s: setting fDeviceInit flag failed with error %d\n", __func__, err); goto out; } @@ -4392,16 +4632,16 @@ static int ufshcd_complete_dev_init(struct ufs_hba *hba) QUERY_FLAG_IDN_FDEVICEINIT, 0, &flag_res); if (!flag_res) break; - usleep_range(5000, 10000); + usleep_range(500, 1000); } while (ktime_before(ktime_get(), timeout)); if (err) { dev_err(hba->dev, - "%s reading fDeviceInit flag failed with error %d\n", + "%s: reading fDeviceInit flag failed with error %d\n", __func__, err); } else if (flag_res) { dev_err(hba->dev, - "%s fDeviceInit was not cleared by the device\n", + "%s: fDeviceInit was not cleared by the device\n", __func__); err = -EBUSY; } @@ -4508,7 +4748,7 @@ static int ufshcd_hba_execute_hce(struct ufs_hba *hba) int retry_inner; start: - if (!ufshcd_is_hba_active(hba)) + if (ufshcd_is_hba_active(hba)) /* change controller state to "reset state" */ ufshcd_hba_stop(hba); @@ -4534,7 +4774,7 @@ start: /* wait for the host controller to complete initialization */ retry_inner = 50; - while (ufshcd_is_hba_active(hba)) { + while (!ufshcd_is_hba_active(hba)) { if (retry_inner) { retry_inner--; } else { @@ -4568,14 +4808,18 @@ int ufshcd_hba_enable(struct ufs_hba *hba) /* enable UIC related interrupts */ ufshcd_enable_intr(hba, UFSHCD_UIC_MASK); ret = ufshcd_dme_reset(hba); - if (!ret) { - ret = ufshcd_dme_enable(hba); - if (!ret) - ufshcd_vops_hce_enable_notify(hba, POST_CHANGE); - if (ret) - dev_err(hba->dev, - "Host controller enable failed 
with non-hce\n"); + if (ret) { + dev_err(hba->dev, "DME_RESET failed\n"); + return ret; } + + ret = ufshcd_dme_enable(hba); + if (ret) { + dev_err(hba->dev, "Enabling DME failed\n"); + return ret; + } + + ufshcd_vops_hce_enable_notify(hba, POST_CHANGE); } else { ret = ufshcd_hba_execute_hce(hba); } @@ -4629,7 +4873,7 @@ void ufshcd_update_evt_hist(struct ufs_hba *hba, u32 id, u32 val) e = &hba->ufs_stats.event[id]; e->val[e->pos] = val; - e->tstamp[e->pos] = ktime_get(); + e->tstamp[e->pos] = local_clock(); e->cnt += 1; e->pos = (e->pos + 1) % UFS_EVENT_HIST_LENGTH; @@ -4677,7 +4921,7 @@ link_startup: * but we can't be sure if the link is up until link startup * succeeds. So reset the local Uni-Pro and try again. */ - if (ret && ufshcd_hba_enable(hba)) { + if (ret && retries && ufshcd_hba_enable(hba)) { ufshcd_update_evt_hist(hba, UFS_EVT_LINK_STARTUP_FAIL, (u32)ret); @@ -4761,100 +5005,6 @@ static int ufshcd_verify_dev_init(struct ufs_hba *hba) return err; } -/** - * ufshcd_set_queue_depth - set lun queue depth - * @sdev: pointer to SCSI device - * - * Read bLUQueueDepth value and activate scsi tagged command - * queueing. For WLUN, queue depth is set to 1. For best-effort - * cases (bLUQueueDepth = 0) the queue depth is set to a maximum - * value that host can queue. - */ -static void ufshcd_set_queue_depth(struct scsi_device *sdev) -{ - int ret = 0; - u8 lun_qdepth; - struct ufs_hba *hba; - - hba = shost_priv(sdev->host); - - lun_qdepth = hba->nutrs; - ret = ufshcd_read_unit_desc_param(hba, - ufshcd_scsi_to_upiu_lun(sdev->lun), - UNIT_DESC_PARAM_LU_Q_DEPTH, - &lun_qdepth, - sizeof(lun_qdepth)); - - /* Some WLUN doesn't support unit descriptor */ - if (ret == -EOPNOTSUPP) - lun_qdepth = 1; - else if (!lun_qdepth) - /* eventually, we can figure out the real queue depth */ - lun_qdepth = hba->nutrs; - else - lun_qdepth = min_t(int, lun_qdepth, hba->nutrs); - - dev_dbg(hba->dev, "%s: activate tcq with queue depth %d\n", - __func__, lun_qdepth); - scsi_change_queue_depth(sdev, lun_qdepth); -} - -/* - * ufshcd_get_lu_wp - returns the "b_lu_write_protect" from UNIT DESCRIPTOR - * @hba: per-adapter instance - * @lun: UFS device lun id - * @b_lu_write_protect: pointer to buffer to hold the LU's write protect info - * - * Returns 0 in case of success and b_lu_write_protect status would be returned - * @b_lu_write_protect parameter. - * Returns -ENOTSUPP if reading b_lu_write_protect is not supported. - * Returns -EINVAL in case of invalid parameters passed to this function. - */ -static int ufshcd_get_lu_wp(struct ufs_hba *hba, - u8 lun, - u8 *b_lu_write_protect) -{ - int ret; - - if (!b_lu_write_protect) - ret = -EINVAL; - /* - * According to UFS device spec, RPMB LU can't be write - * protected so skip reading bLUWriteProtect parameter for - * it. For other W-LUs, UNIT DESCRIPTOR is not available. 
- */ - else if (lun >= hba->dev_info.max_lu_supported) - ret = -ENOTSUPP; - else - ret = ufshcd_read_unit_desc_param(hba, - lun, - UNIT_DESC_PARAM_LU_WR_PROTECT, - b_lu_write_protect, - sizeof(*b_lu_write_protect)); - return ret; -} - -/** - * ufshcd_get_lu_power_on_wp_status - get LU's power on write protect - * status - * @hba: per-adapter instance - * @sdev: pointer to SCSI device - * - */ -static inline void ufshcd_get_lu_power_on_wp_status(struct ufs_hba *hba, - struct scsi_device *sdev) -{ - if (hba->dev_info.f_power_on_wp_en && - !hba->dev_info.is_lu_power_on_wp) { - u8 b_lu_write_protect; - - if (!ufshcd_get_lu_wp(hba, ufshcd_scsi_to_upiu_lun(sdev->lun), - &b_lu_write_protect) && - (b_lu_write_protect == UFS_LU_POWER_ON_WP)) - hba->dev_info.is_lu_power_on_wp = true; - } -} - /** * ufshcd_setup_links - associate link b/w device wlun and other luns * @sdev: pointer to SCSI device @@ -4868,13 +5018,13 @@ static void ufshcd_setup_links(struct ufs_hba *hba, struct scsi_device *sdev) * Device wlun is the supplier & rest of the luns are consumers. * This ensures that device wlun suspends after all other luns. */ - if (hba->sdev_ufs_device) { + if (hba->ufs_device_wlun) { link = device_link_add(&sdev->sdev_gendev, - &hba->sdev_ufs_device->sdev_gendev, + &hba->ufs_device_wlun->sdev_gendev, DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE); if (!link) { dev_err(&sdev->sdev_gendev, "Failed establishing link - %s\n", - dev_name(&hba->sdev_ufs_device->sdev_gendev)); + dev_name(&hba->ufs_device_wlun->sdev_gendev)); return; } hba->luns_avail--; @@ -4892,6 +5042,64 @@ static void ufshcd_setup_links(struct ufs_hba *hba, struct scsi_device *sdev) } } +/** + * ufshcd_lu_init - Initialize the relevant parameters of the LU + * @hba: per-adapter instance + * @sdev: pointer to SCSI device + */ +static void ufshcd_lu_init(struct ufs_hba *hba, struct scsi_device *sdev) +{ + int len = QUERY_DESC_MAX_SIZE; + u8 lun = ufshcd_scsi_to_upiu_lun(sdev->lun); + u8 lun_qdepth = hba->nutrs; + u8 *desc_buf; + int ret; + + desc_buf = kzalloc(len, GFP_KERNEL); + if (!desc_buf) + goto set_qdepth; + + ret = ufshcd_read_unit_desc_param(hba, lun, 0, desc_buf, len); + if (ret < 0) { + if (ret == -EOPNOTSUPP) + /* If the LU doesn't support the unit descriptor, its queue depth is set to 1 */ + lun_qdepth = 1; + kfree(desc_buf); + goto set_qdepth; + } + + if (desc_buf[UNIT_DESC_PARAM_LU_Q_DEPTH]) { + /* + * In the per-LU queueing architecture, bLUQueueDepth will not be 0, so we + * use the smaller of UFSHCI CAP.NUTRS and the LU's bLUQueueDepth + */ + lun_qdepth = min_t(int, desc_buf[UNIT_DESC_PARAM_LU_Q_DEPTH], hba->nutrs); + } + /* + * According to the UFS device specification, the write protection mode is only + * supported by normal LUs, not by WLUNs. + */ + if (hba->dev_info.f_power_on_wp_en && lun < hba->dev_info.max_lu_supported && + !hba->dev_info.is_lu_power_on_wp && + desc_buf[UNIT_DESC_PARAM_LU_WR_PROTECT] == UFS_LU_POWER_ON_WP) + hba->dev_info.is_lu_power_on_wp = true; + + /* In case of RPMB LU, check if advanced RPMB mode is enabled */ + if (desc_buf[UNIT_DESC_PARAM_UNIT_INDEX] == UFS_UPIU_RPMB_WLUN && + desc_buf[RPMB_UNIT_DESC_PARAM_REGION_EN] & BIT(4)) + hba->dev_info.b_advanced_rpmb_en = true; + + kfree(desc_buf); +set_qdepth: + /* + * For WLUNs that don't support the unit descriptor, the queue depth is set to 1. For LUs whose + * bLUQueueDepth == 0, the queue depth is set to the maximum number of requests the host can queue. 
+ */ + dev_dbg(hba->dev, "Set LU %x queue depth %d\n", lun, lun_qdepth); + scsi_change_queue_depth(sdev, lun_qdepth); +} + /** * ufshcd_slave_alloc - handle initial SCSI device configurations * @sdev: pointer to SCSI device @@ -4919,9 +5127,7 @@ static int ufshcd_slave_alloc(struct scsi_device *sdev) /* WRITE_SAME command is not supported */ sdev->no_write_same = 1; - ufshcd_set_queue_depth(sdev); - - ufshcd_get_lu_power_on_wp_status(hba, sdev); + ufshcd_lu_init(hba, sdev); ufshcd_setup_links(hba, sdev); @@ -4937,11 +5143,7 @@ static int ufshcd_slave_alloc(struct scsi_device *sdev) */ static int ufshcd_change_queue_depth(struct scsi_device *sdev, int depth) { - struct ufs_hba *hba = shost_priv(sdev->host); - - if (depth > hba->nutrs) - depth = hba->nutrs; - return scsi_change_queue_depth(sdev, depth); + return scsi_change_queue_depth(sdev, min(depth, sdev->host->can_queue)); } static void ufshcd_hpb_destroy(struct ufs_hba *hba, struct scsi_device *sdev) @@ -4975,9 +5177,10 @@ static int ufshcd_slave_configure(struct scsi_device *sdev) ufshcd_hpb_configure(hba, sdev); + blk_queue_flag_set(QUEUE_FLAG_PIPELINE_ZONED_WRITES, q); blk_queue_update_dma_pad(q, PRDT_DATA_BYTE_COUNT_PAD - 1); - if (hba->quirks & UFSHCD_QUIRK_ALIGN_SG_WITH_PAGE_SIZE) - blk_queue_update_dma_alignment(q, PAGE_SIZE - 1); + if (hba->quirks & UFSHCD_QUIRK_4KB_DMA_ALIGNMENT) + blk_queue_update_dma_alignment(q, 4096 - 1); /* * Block runtime-pm until all consumers are added. * Refer ufshcd_setup_links(). @@ -4993,7 +5196,9 @@ static int ufshcd_slave_configure(struct scsi_device *sdev) */ sdev->silence_suspend = 1; - ufshcd_crypto_setup_rq_keyslot_manager(hba, q); + ufshcd_crypto_register(hba, q); + + trace_android_vh_ufs_update_sdev(sdev); return 0; } @@ -5014,15 +5219,15 @@ static void ufshcd_slave_destroy(struct scsi_device *sdev) /* Drop the reference as it won't be needed anymore */ if (ufshcd_scsi_to_upiu_lun(sdev->lun) == UFS_UPIU_UFS_DEVICE_WLUN) { spin_lock_irqsave(hba->host->host_lock, flags); - hba->sdev_ufs_device = NULL; + hba->ufs_device_wlun = NULL; spin_unlock_irqrestore(hba->host->host_lock, flags); - } else if (hba->sdev_ufs_device) { + } else if (hba->ufs_device_wlun) { struct device *supplier = NULL; /* Ensure UFS Device WLUN exists and does not disappear */ spin_lock_irqsave(hba->host->host_lock, flags); - if (hba->sdev_ufs_device) { - supplier = &hba->sdev_ufs_device->sdev_gendev; + if (hba->ufs_device_wlun) { + supplier = &hba->ufs_device_wlun->sdev_gendev; get_device(supplier); } spin_unlock_irqrestore(hba->host->host_lock, flags); @@ -5076,18 +5281,20 @@ ufshcd_scsi_cmd_status(struct ufshcd_lrb *lrbp, int scsi_status) * ufshcd_transfer_rsp_status - Get overall status of the response * @hba: per adapter instance * @lrbp: pointer to local reference block of completed command + * @cqe: pointer to the completion queue entry * * Returns result of the command to notify SCSI midlayer */ static inline int -ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) +ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, + struct cq_entry *cqe) { int result = 0; int scsi_status; - int ocs; + enum utp_ocs ocs; /* overall command status of utrd */ - ocs = ufshcd_get_tr_ocs(lrbp); + ocs = ufshcd_get_tr_ocs(lrbp, cqe); if (hba->quirks & UFSHCD_QUIRK_BROKEN_OCS_FATAL_ERROR) { if (be32_to_cpu(lrbp->ucd_rsp_ptr->header.dword_1) & @@ -5239,69 +5446,135 @@ static irqreturn_t ufshcd_uic_cmd_compl(struct ufs_hba *hba, u32 intr_status) return retval; } +/* Release the resources allocated for 
processing a SCSI command. */ +static void ufshcd_release_scsi_cmd(struct ufs_hba *hba, + struct ufshcd_lrb *lrbp) +{ + struct scsi_cmnd *cmd = lrbp->cmd; + + scsi_dma_unmap(cmd); + ufshcd_crypto_clear_prdt(hba, lrbp); + lrbp->cmd = NULL; /* Mark the command as completed. */ + ufshcd_release(hba); + ufshcd_clk_scaling_update_busy(hba); +} + +/** + * ufshcd_compl_one_cqe - handle a completion queue entry + * @hba: per adapter instance + * @task_tag: the task tag of the request to be completed + * @cqe: pointer to the completion queue entry + */ +void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag, + struct cq_entry *cqe) +{ + struct ufshcd_lrb *lrbp; + struct scsi_cmnd *cmd; + + lrbp = &hba->lrb[task_tag]; + lrbp->compl_time_stamp = ktime_get(); + cmd = lrbp->cmd; + if (cmd) { + trace_android_vh_ufs_compl_command(hba, lrbp); + if (unlikely(ufshcd_should_inform_monitor(hba, lrbp))) + ufshcd_update_monitor(hba, lrbp); + ufshcd_add_command_trace(hba, task_tag, UFS_CMD_COMP); + cmd->result = ufshcd_transfer_rsp_status(hba, lrbp, cqe); + ufshcd_release_scsi_cmd(hba, lrbp); + /* Do not touch lrbp after scsi done */ + cmd->scsi_done(cmd); + } else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE || + lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) { + if (hba->dev_cmd.complete) { + trace_android_vh_ufs_compl_command(hba, lrbp); + hba->dev_cmd.cqe = cqe; + ufshcd_add_command_trace(hba, task_tag, UFS_DEV_COMP); + complete(hba->dev_cmd.complete); + ufshcd_clk_scaling_update_busy(hba); + } + } +} + /** * __ufshcd_transfer_req_compl - handle SCSI and query command completion * @hba: per adapter instance * @completed_reqs: bitmask that indicates which requests to complete - * @retry_requests: whether to ask the SCSI core to retry completed requests */ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba, - unsigned long completed_reqs, - bool retry_requests) + unsigned long completed_reqs) { - struct ufshcd_lrb *lrbp; - struct scsi_cmnd *cmd; - int result; - int index; - bool update_scaling = false; + int tag; - for_each_set_bit(index, &completed_reqs, hba->nutrs) { - lrbp = &hba->lrb[index]; - lrbp->compl_time_stamp = ktime_get(); - cmd = lrbp->cmd; - if (cmd) { - if (unlikely(ufshcd_should_inform_monitor(hba, lrbp))) - ufshcd_update_monitor(hba, lrbp); - ufshcd_add_command_trace(hba, index, UFS_CMD_COMP); - result = retry_requests ? DID_BUS_BUSY << 16 : - ufshcd_transfer_rsp_status(hba, lrbp); - scsi_dma_unmap(cmd); - cmd->result = result; - /* Mark completed command as NULL in LRB */ - lrbp->cmd = NULL; - /* Do not touch lrbp after scsi done */ - cmd->scsi_done(cmd); - ufshcd_release(hba); - update_scaling = true; - } else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE || - lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) { - if (hba->dev_cmd.complete) { - ufshcd_add_command_trace(hba, index, - UFS_DEV_COMP); - complete(hba->dev_cmd.complete); - update_scaling = true; - } - } - if (update_scaling) - ufshcd_clk_scaling_update_busy(hba); + for_each_set_bit(tag, &completed_reqs, hba->nutrs) + ufshcd_compl_one_cqe(hba, tag, NULL); +} + +/* Any value that is not an existing queue number is fine for this constant. 
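+ * ufshcd_transfer_req_compl() passes it to ufshcd_poll() so that polled (REQ_HIPRI) requests are left for the mq_poll path instead of being completed from interrupt context. 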
*/ +enum { + UFSHCD_POLL_FROM_INTERRUPT_CONTEXT = -1 +}; + +static void ufshcd_clear_polled(struct ufs_hba *hba, + unsigned long *completed_reqs) +{ + int tag; + + for_each_set_bit(tag, completed_reqs, hba->nutrs) { + struct scsi_cmnd *cmd = hba->lrb[tag].cmd; + + if (!cmd) + continue; + if (scsi_cmd_to_rq(cmd)->cmd_flags & REQ_HIPRI) + __clear_bit(tag, completed_reqs); } } +/* + * Returns > 0 if one or more commands have been completed or 0 if no + * requests have been completed. + */ +static int ufshcd_poll(struct Scsi_Host *shost, unsigned int queue_num) +{ + struct ufs_hba *hba = shost_priv(shost); + unsigned long completed_reqs, flags; + u32 tr_doorbell; + struct ufs_hw_queue *hwq; + + if (is_mcq_enabled(hba)) { + hwq = &hba->uhq[queue_num + UFSHCD_MCQ_IO_QUEUE_OFFSET]; + + return ufshcd_mcq_poll_cqe_lock(hba, hwq); + } + + spin_lock_irqsave(&hba->outstanding_lock, flags); + tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL); + completed_reqs = ~tr_doorbell & hba->outstanding_reqs; + WARN_ONCE(completed_reqs & ~hba->outstanding_reqs, + "completed: %#lx; outstanding: %#lx\n", completed_reqs, + hba->outstanding_reqs); + if (queue_num == UFSHCD_POLL_FROM_INTERRUPT_CONTEXT) { + /* Do not complete polled requests from interrupt context. */ + ufshcd_clear_polled(hba, &completed_reqs); + } + hba->outstanding_reqs &= ~completed_reqs; + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + if (completed_reqs) + __ufshcd_transfer_req_compl(hba, completed_reqs); + + return completed_reqs != 0; +} + /** * ufshcd_transfer_req_compl - handle SCSI and query command completion * @hba: per adapter instance - * @retry_requests: whether or not to ask to retry requests * * Returns * IRQ_HANDLED - If interrupt is valid * IRQ_NONE - If invalid interrupt */ -static irqreturn_t ufshcd_transfer_req_compl(struct ufs_hba *hba, - bool retry_requests) +static irqreturn_t ufshcd_transfer_req_compl(struct ufs_hba *hba) { - unsigned long completed_reqs, flags; - u32 tr_doorbell; - /* Resetting interrupt aggregation counters first and reading the * DOOR_BELL afterward allows us to handle all the completed requests. * In order to prevent other interrupts starvation the DB is read once @@ -5316,22 +5589,13 @@ static irqreturn_t ufshcd_transfer_req_compl(struct ufs_hba *hba, if (ufs_fail_completion()) return IRQ_HANDLED; - spin_lock_irqsave(&hba->outstanding_lock, flags); - tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL); - completed_reqs = ~tr_doorbell & hba->outstanding_reqs; - WARN_ONCE(completed_reqs & ~hba->outstanding_reqs, - "completed: %#lx; outstanding: %#lx\n", completed_reqs, - hba->outstanding_reqs); - hba->outstanding_reqs &= ~completed_reqs; - spin_unlock_irqrestore(&hba->outstanding_lock, flags); + /* + * Ignore the ufshcd_poll() return value and return IRQ_HANDLED since we + * do not want polling to trigger spurious interrupt complaints. 
+ */ + ufshcd_poll(hba->host, UFSHCD_POLL_FROM_INTERRUPT_CONTEXT); - if (completed_reqs) { - __ufshcd_transfer_req_compl(hba, completed_reqs, - retry_requests); - return IRQ_HANDLED; - } else { - return IRQ_NONE; - } + return IRQ_HANDLED; } int __ufshcd_write_ee_control(struct ufs_hba *hba, u32 ee_ctrl_mask) @@ -5354,8 +5618,8 @@ int ufshcd_write_ee_control(struct ufs_hba *hba) return err; } -int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, u16 *other_mask, - u16 set, u16 clr) +int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, + const u16 *other_mask, u16 set, u16 clr) { u16 new_mask, ee_ctrl_mask; int err = 0; @@ -5534,7 +5798,7 @@ static inline int ufshcd_get_bkops_status(struct ufs_hba *hba, u32 *status) * to know whether auto bkops is enabled or disabled after this function * returns control to it. */ -static int ufshcd_bkops_ctrl(struct ufs_hba *hba, +int ufshcd_bkops_ctrl(struct ufs_hba *hba, enum bkops_status status) { int err; @@ -5559,6 +5823,7 @@ static int ufshcd_bkops_ctrl(struct ufs_hba *hba, out: return err; } +EXPORT_SYMBOL_GPL(ufshcd_bkops_ctrl); /** * ufshcd_urgent_bkops - handle urgent bkops exception event @@ -5618,6 +5883,24 @@ out: __func__, err); } +static void ufshcd_temp_exception_event_handler(struct ufs_hba *hba, u16 status) +{ + u32 value; + + if (ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_READ_ATTR, + QUERY_ATTR_IDN_CASE_ROUGH_TEMP, 0, 0, &value)) + return; + + dev_info(hba->dev, "exception Tcase %d\n", value - 80); + + ufs_hwmon_notify_event(hba, status & MASK_EE_URGENT_TEMP); + + /* + * A placeholder for the platform vendors to add whatever additional + * steps required + */ +} + static int __ufshcd_wb_toggle(struct ufs_hba *hba, bool set, enum flag_idn idn) { u8 index; @@ -5632,60 +5915,60 @@ int ufshcd_wb_toggle(struct ufs_hba *hba, bool enable) { int ret; - if (!ufshcd_is_wb_allowed(hba)) - return 0; - - if (!(enable ^ hba->dev_info.wb_enabled)) + if (!ufshcd_is_wb_allowed(hba) || + hba->dev_info.wb_enabled == enable) return 0; ret = __ufshcd_wb_toggle(hba, enable, QUERY_FLAG_IDN_WB_EN); if (ret) { - dev_err(hba->dev, "%s Write Booster %s failed %d\n", - __func__, enable ? "enable" : "disable", ret); + dev_err(hba->dev, "%s: Write Booster %s failed %d\n", + __func__, enable ? "enabling" : "disabling", ret); return ret; } hba->dev_info.wb_enabled = enable; - dev_dbg(hba->dev, "%s Write Booster %s\n", + dev_dbg(hba->dev, "%s: Write Booster %s\n", __func__, enable ? "enabled" : "disabled"); return ret; } -static void ufshcd_wb_toggle_flush_during_h8(struct ufs_hba *hba, bool set) +static void ufshcd_wb_toggle_buf_flush_during_h8(struct ufs_hba *hba, + bool enable) { int ret; - ret = __ufshcd_wb_toggle(hba, set, + ret = __ufshcd_wb_toggle(hba, enable, QUERY_FLAG_IDN_WB_BUFF_FLUSH_DURING_HIBERN8); if (ret) { - dev_err(hba->dev, "%s: WB-Buf Flush during H8 %s failed: %d\n", - __func__, set ? "enable" : "disable", ret); + dev_err(hba->dev, "%s: WB-Buf Flush during H8 %s failed %d\n", + __func__, enable ? "enabling" : "disabling", ret); return; } - dev_dbg(hba->dev, "%s WB-Buf Flush during H8 %s\n", - __func__, set ? "enabled" : "disabled"); + dev_dbg(hba->dev, "%s: WB-Buf Flush during H8 %s\n", + __func__, enable ? 
"enabled" : "disabled"); } -static inline void ufshcd_wb_toggle_flush(struct ufs_hba *hba, bool enable) +int ufshcd_wb_toggle_buf_flush(struct ufs_hba *hba, bool enable) { int ret; if (!ufshcd_is_wb_allowed(hba) || hba->dev_info.wb_buf_flush_enabled == enable) - return; + return 0; ret = __ufshcd_wb_toggle(hba, enable, QUERY_FLAG_IDN_WB_BUFF_FLUSH_EN); if (ret) { - dev_err(hba->dev, "%s WB-Buf Flush %s failed %d\n", __func__, - enable ? "enable" : "disable", ret); - return; + dev_err(hba->dev, "%s: WB-Buf Flush %s failed %d\n", + __func__, enable ? "enabling" : "disabling", ret); + return ret; } hba->dev_info.wb_buf_flush_enabled = enable; - - dev_dbg(hba->dev, "%s WB-Buf Flush %s\n", + dev_dbg(hba->dev, "%s: WB-Buf Flush %s\n", __func__, enable ? "enabled" : "disabled"); + + return ret; } static bool ufshcd_wb_presrv_usrspc_keep_vcc_on(struct ufs_hba *hba, @@ -5700,7 +5983,7 @@ static bool ufshcd_wb_presrv_usrspc_keep_vcc_on(struct ufs_hba *hba, QUERY_ATTR_IDN_CURR_WB_BUFF_SIZE, index, 0, &cur_buf); if (ret) { - dev_err(hba->dev, "%s dCurWriteBoosterBufferSize read failed %d\n", + dev_err(hba->dev, "%s: dCurWriteBoosterBufferSize read failed %d\n", __func__, ret); return false; } @@ -5711,10 +5994,48 @@ static bool ufshcd_wb_presrv_usrspc_keep_vcc_on(struct ufs_hba *hba, return false; } /* Let it continue to flush when available buffer exceeds threshold */ - if (avail_buf < hba->vps->wb_flush_threshold) - return true; + return avail_buf < hba->vps->wb_flush_threshold; +} - return false; +static void ufshcd_wb_force_disable(struct ufs_hba *hba) +{ + if (ufshcd_is_wb_buf_flush_allowed(hba)) + ufshcd_wb_toggle_buf_flush(hba, false); + + ufshcd_wb_toggle_buf_flush_during_h8(hba, false); + ufshcd_wb_toggle(hba, false); + hba->caps &= ~UFSHCD_CAP_WB_EN; + + dev_info(hba->dev, "%s: WB force disabled\n", __func__); +} + +static bool ufshcd_is_wb_buf_lifetime_available(struct ufs_hba *hba) +{ + u32 lifetime; + int ret; + u8 index; + + index = ufshcd_wb_get_query_index(hba); + ret = ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_READ_ATTR, + QUERY_ATTR_IDN_WB_BUFF_LIFE_TIME_EST, + index, 0, &lifetime); + if (ret) { + dev_err(hba->dev, + "%s: bWriteBoosterBufferLifeTimeEst read failed %d\n", + __func__, ret); + return false; + } + + if (lifetime == UFS_WB_EXCEED_LIFETIME) { + dev_err(hba->dev, "%s: WB buf lifetime is exhausted 0x%02X\n", + __func__, lifetime); + return false; + } + + dev_dbg(hba->dev, "%s: WB buf lifetime is 0x%02X\n", + __func__, lifetime); + + return true; } static bool ufshcd_wb_need_flush(struct ufs_hba *hba) @@ -5725,6 +6046,12 @@ static bool ufshcd_wb_need_flush(struct ufs_hba *hba) if (!ufshcd_is_wb_allowed(hba)) return false; + + if (!ufshcd_is_wb_buf_lifetime_available(hba)) { + ufshcd_wb_force_disable(hba); + return false; + } + /* * The ufs device needs the vcc to be ON to flush. 
* With user-space reduction enabled, it's enough to enable flush @@ -5741,16 +6068,13 @@ static bool ufshcd_wb_need_flush(struct ufs_hba *hba) QUERY_ATTR_IDN_AVAIL_WB_BUFF_SIZE, index, 0, &avail_buf); if (ret) { - dev_warn(hba->dev, "%s dAvailableWriteBoosterBufferSize read failed %d\n", + dev_warn(hba->dev, "%s: dAvailableWriteBoosterBufferSize read failed %d\n", __func__, ret); return false; } - if (!hba->dev_info.b_presrv_uspc_en) { - if (avail_buf <= UFS_WB_BUF_REMAIN_PERCENT(10)) - return true; - return false; - } + if (!hba->dev_info.b_presrv_uspc_en) + return avail_buf <= UFS_WB_BUF_REMAIN_PERCENT(10); return ufshcd_wb_presrv_usrspc_keep_vcc_on(hba, avail_buf); } @@ -5797,22 +6121,18 @@ static void ufshcd_exception_event_handler(struct work_struct *work) if (status & hba->ee_drv_mask & MASK_EE_URGENT_BKOPS) ufshcd_bkops_exception_event_handler(hba); + if (status & hba->ee_drv_mask & MASK_EE_URGENT_TEMP) + ufshcd_temp_exception_event_handler(hba, status); + ufs_debugfs_exception_event(hba, status); out: ufshcd_scsi_unblock_requests(hba); - return; } /* Complete requests that have door-bell cleared */ static void ufshcd_complete_requests(struct ufs_hba *hba) { - ufshcd_transfer_req_compl(hba, /*retry_requests=*/false); - ufshcd_tmc_handler(hba); -} - -static void ufshcd_retry_aborted_requests(struct ufs_hba *hba) -{ - ufshcd_transfer_req_compl(hba, /*retry_requests=*/true); + ufshcd_transfer_req_compl(hba); ufshcd_tmc_handler(hba); } @@ -5894,9 +6214,10 @@ static inline bool ufshcd_is_saved_err_fatal(struct ufs_hba *hba) (hba->saved_err & (INT_FATAL_ERRORS | UFSHCD_UIC_HIBERN8_MASK)); } -/* host lock must be held before calling this func */ -static inline void ufshcd_schedule_eh_work(struct ufs_hba *hba) +void ufshcd_schedule_eh_work(struct ufs_hba *hba) { + lockdep_assert_held(hba->host->host_lock); + /* handle fatal errors only when link is not in error state */ if (hba->ufshcd_state != UFSHCD_STATE_ERROR) { if (hba->force_reset || ufshcd_is_link_broken(hba) || @@ -5941,7 +6262,7 @@ static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend) static void ufshcd_err_handling_prepare(struct ufs_hba *hba) { ufshcd_rpm_get_sync(hba); - if (pm_runtime_status_suspended(&hba->sdev_ufs_device->sdev_gendev) || + if (pm_runtime_status_suspended(&hba->ufs_device_wlun->sdev_gendev) || hba->is_sys_suspended) { enum ufs_pm_op pm_op; @@ -5986,7 +6307,7 @@ static void ufshcd_err_handling_unprepare(struct ufs_hba *hba) static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba) { return (!hba->is_powered || hba->shutting_down || - !hba->sdev_ufs_device || + !hba->ufs_device_wlun || hba->ufshcd_state == UFSHCD_STATE_ERROR || (!(hba->saved_err || hba->saved_uic_err || hba->force_reset || ufshcd_is_link_broken(hba)))); @@ -6005,7 +6326,7 @@ static void ufshcd_recover_pm_error(struct ufs_hba *hba) * Set RPM status of wlun device to RPM_ACTIVE, * this also clears its runtime error. 
*/ - ret = pm_runtime_set_active(&hba->sdev_ufs_device->sdev_gendev); + ret = pm_runtime_set_active(&hba->ufs_device_wlun->sdev_gendev); /* hba device might have a runtime error otherwise */ if (ret) @@ -6047,22 +6368,60 @@ static bool ufshcd_is_pwr_mode_restore_needed(struct ufs_hba *hba) return false; } +static bool ufshcd_abort_all(struct ufs_hba *hba) +{ + bool needs_reset = false; + int tag, ret; + + /* Clear pending transfer requests */ + for_each_set_bit(tag, &hba->outstanding_reqs, hba->nutrs) { + ret = ufshcd_try_to_abort_task(hba, tag); + dev_err(hba->dev, "Aborting tag %d / CDB %#02x %s\n", tag, + hba->lrb[tag].cmd ? hba->lrb[tag].cmd->cmnd[0] : -1, + ret ? "failed" : "succeeded"); + if (ret) { + needs_reset = true; + goto out; + } + } + + /* Clear pending task management requests */ + for_each_set_bit(tag, &hba->outstanding_tasks, hba->nutmrs) { + if (ufshcd_clear_tm_cmd(hba, tag)) { + needs_reset = true; + goto out; + } + } + +out: + /* Complete the requests that are cleared by s/w */ + ufshcd_complete_requests(hba); + + return needs_reset; +} + /** * ufshcd_err_handler - handle UFS errors that require s/w attention * @work: pointer to work structure */ static void ufshcd_err_handler(struct work_struct *work) { + int retries = MAX_ERR_HANDLER_RETRIES; struct ufs_hba *hba; unsigned long flags; - bool err_xfer = false; - bool err_tm = false; - int err = 0, pmc_err; - int tag; - bool needs_reset = false, needs_restore = false; + bool needs_restore; + bool needs_reset; + int pmc_err; hba = container_of(work, struct ufs_hba, eh_work); + dev_info(hba->dev, + "%s started; HBA state %s; powered %d; shutting down %d; saved_err = %d; saved_uic_err = %d; force_reset = %d%s\n", + __func__, ufshcd_state_name[hba->ufshcd_state], + hba->is_powered, hba->shutting_down, hba->saved_err, + hba->saved_uic_err, hba->force_reset, + ufshcd_is_link_broken(hba) ? 
"; link is broken" : ""); + down(&hba->host_sem); spin_lock_irqsave(hba->host->host_lock, flags); if (ufshcd_err_handling_should_stop(hba)) { @@ -6078,6 +6437,10 @@ static void ufshcd_err_handler(struct work_struct *work) /* Complete requests that have door-bell cleared by h/w */ ufshcd_complete_requests(hba); spin_lock_irqsave(hba->host->host_lock, flags); +again: + needs_restore = false; + needs_reset = false; + if (hba->ufshcd_state != UFSHCD_STATE_ERROR) hba->ufshcd_state = UFSHCD_STATE_RESET; /* @@ -6145,31 +6508,13 @@ static void ufshcd_err_handler(struct work_struct *work) hba->silence_err_logs = true; /* release lock as clear command might sleep */ spin_unlock_irqrestore(hba->host->host_lock, flags); - /* Clear pending transfer requests */ - for_each_set_bit(tag, &hba->outstanding_reqs, hba->nutrs) { - if (ufshcd_try_to_abort_task(hba, tag)) { - err_xfer = true; - goto lock_skip_pending_xfer_clear; - } - } - /* Clear pending task management requests */ - for_each_set_bit(tag, &hba->outstanding_tasks, hba->nutmrs) { - if (ufshcd_clear_tm_cmd(hba, tag)) { - err_tm = true; - goto lock_skip_pending_xfer_clear; - } - } - -lock_skip_pending_xfer_clear: - ufshcd_retry_aborted_requests(hba); + needs_reset = ufshcd_abort_all(hba); spin_lock_irqsave(hba->host->host_lock, flags); hba->silence_err_logs = false; - if (err_xfer || err_tm) { - needs_reset = true; + if (needs_reset) goto do_reset; - } /* * After all reqs and tasks are cleared from doorbell, @@ -6198,6 +6543,8 @@ lock_skip_pending_xfer_clear: do_reset: /* Fatal errors need reset */ if (needs_reset) { + int err; + hba->force_reset = false; spin_unlock_irqrestore(hba->host->host_lock, flags); err = ufshcd_reset_and_restore(hba); @@ -6217,10 +6564,20 @@ skip_err_handling: dev_err_ratelimited(hba->dev, "%s: exit: saved_err 0x%x saved_uic_err 0x%x", __func__, hba->saved_err, hba->saved_uic_err); } + /* Exit in an operational state or dead */ + if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL && + hba->ufshcd_state != UFSHCD_STATE_ERROR) { + if (--retries) + goto again; + hba->ufshcd_state = UFSHCD_STATE_ERROR; + } ufshcd_clear_eh_in_progress(hba); spin_unlock_irqrestore(hba->host->host_lock, flags); ufshcd_err_handling_unprepare(hba); up(&hba->host_sem); + + dev_info(hba->dev, "%s finished; HBA state %s\n", __func__, + ufshcd_state_name[hba->ufshcd_state]); } /** @@ -6357,6 +6714,8 @@ static irqreturn_t ufshcd_check_errors(struct ufs_hba *hba, u32 intr_status) queue_eh_work = true; } + trace_android_vh_ufs_check_int_errors(hba, queue_eh_work); + if (queue_eh_work) { /* * update the transfer error masks to sticky bits, let's do this @@ -6421,6 +6780,40 @@ static irqreturn_t ufshcd_tmc_handler(struct ufs_hba *hba) return ret; } +/** + * ufshcd_handle_mcq_cq_events - handle MCQ completion queue events + * @hba: per adapter instance + * + * Returns IRQ_HANDLED if interrupt is handled + */ +static irqreturn_t ufshcd_handle_mcq_cq_events(struct ufs_hba *hba) +{ + struct ufs_hw_queue *hwq; + unsigned long outstanding_cqs; + unsigned int nr_queues; + int i, ret; + u32 events; + + ret = ufshcd_vops_get_outstanding_cqs(hba, &outstanding_cqs); + if (ret) + outstanding_cqs = (1U << hba->nr_hw_queues) - 1; + + /* Exclude the poll queues */ + nr_queues = hba->nr_hw_queues - hba->nr_queues[HCTX_TYPE_POLL]; + for_each_set_bit(i, &outstanding_cqs, nr_queues) { + hwq = &hba->uhq[i]; + + events = ufshcd_mcq_read_cqis(hba, i); + if (events) + ufshcd_mcq_write_cqis(hba, events, i); + + if (events & UFSHCD_MCQ_CQIS_TAIL_ENT_PUSH_STS) + 
ufshcd_mcq_poll_cqe_nolock(hba, hwq); + } + + return IRQ_HANDLED; +} + /** * ufshcd_sl_intr - Interrupt service routine * @hba: per adapter instance @@ -6444,7 +6837,10 @@ static irqreturn_t ufshcd_sl_intr(struct ufs_hba *hba, u32 intr_status) retval |= ufshcd_tmc_handler(hba); if (intr_status & UTP_TRANSFER_REQ_COMPL) - retval |= ufshcd_transfer_req_compl(hba, /*retry_requests=*/false); + retval |= ufshcd_transfer_req_compl(hba); + + if (intr_status & MCQ_CQ_EVENT_STATUS) + retval |= ufshcd_handle_mcq_cq_events(hba); return retval; } @@ -6467,7 +6863,7 @@ static irqreturn_t ufshcd_intr(int irq, void *__hba) intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS); hba->ufs_stats.last_intr_status = intr_status; - hba->ufs_stats.last_intr_ts = ktime_get(); + hba->ufs_stats.last_intr_ts = local_clock(); /* * There could be max of hba->nutrs reqs in flight and in worst case @@ -6516,6 +6912,10 @@ static int ufshcd_clear_tm_cmd(struct ufs_hba *hba, int tag) err = ufshcd_wait_for_register(hba, REG_UTP_TASK_REQ_DOOR_BELL, mask, 0, 1000, 1000); + + dev_err(hba->dev, "Clearing task management function with tag %d %s\n", + tag, err ? "failed" : "succeeded"); + out: return err; } @@ -6543,6 +6943,8 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba, spin_lock_irqsave(host->host_lock, flags); task_tag = req->tag; + WARN_ONCE(task_tag < 0 || task_tag >= hba->nutmrs, "Invalid tag %d\n", + task_tag); hba->tmf_rqs[req->tag] = req; treq->upiu_req.req_header.dword_0 |= cpu_to_be32(task_tag); @@ -6603,7 +7005,8 @@ static int ufshcd_issue_tm_cmd(struct ufs_hba *hba, int lun_id, int task_id, u8 tm_function, u8 *tm_response) { struct utp_task_req_desc treq = { { 0 }, }; - int ocs_value, err; + enum utp_ocs ocs_value; + int err; /* Configure task request descriptor */ treq.header.dword_0 = cpu_to_le32(UTP_REQ_DESC_INT_CMD); @@ -6673,8 +7076,6 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba, lrbp = &hba->lrb[tag]; WARN_ON(lrbp->cmd); lrbp->cmd = NULL; - lrbp->sense_bufflen = 0; - lrbp->sense_buffer = NULL; lrbp->task_tag = tag; lrbp->lun = 0; lrbp->intr_cmd = true; @@ -6689,7 +7090,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba, /* update the task tag in the request upiu */ req_upiu->header.dword_0 |= cpu_to_be32(tag); - ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, DMA_NONE); + ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, DMA_NONE, 0); /* just copy the upiu request as it is */ memcpy(lrbp->ucd_req_ptr, req_upiu, sizeof(*lrbp->ucd_req_ptr)); @@ -6708,7 +7109,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba, ufshcd_add_query_upiu_trace(hba, UFS_QUERY_SEND, lrbp->ucd_req_ptr); - ufshcd_send_command(hba, tag); + ufshcd_send_command(hba, tag, hba->dev_cmd_queue); /* * ignore the returning value here - ufshcd_check_query_response is * bound to fail since dev_cmd.query and dev_cmd.type were left empty. @@ -6766,7 +7167,7 @@ int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba, int err; enum dev_cmd_type cmd_type = DEV_CMD_TYPE_QUERY; struct utp_task_req_desc treq = { { 0 }, }; - int ocs_value; + enum utp_ocs ocs_value; u8 tm_f = be32_to_cpu(req_upiu->header.dword_1) >> 16 & MASK_TM_FUNC; switch (msgcode) { @@ -6813,14 +7214,108 @@ int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba, } /** - * ufshcd_eh_device_reset_handler - device reset handler registered to - * scsi layer. 
+ * ufshcd_advanced_rpmb_req_handler - handle advanced RPMB request + * @hba: per adapter instance + * @req_upiu: upiu request + * @rsp_upiu: upiu reply + * @req_ehs: EHS field which contains Advanced RPMB Request Message + * @rsp_ehs: EHS field which returns Advanced RPMB Response Message + * @sg_cnt: The number of sg lists actually used + * @sg_list: Pointer to SG list when DATA IN/OUT UPIU is required in ARPMB operation + * @dir: DMA direction + * + * Returns zero on success, non-zero on failure + */ +int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *req_upiu, + struct utp_upiu_req *rsp_upiu, struct ufs_ehs *req_ehs, + struct ufs_ehs *rsp_ehs, int sg_cnt, struct scatterlist *sg_list, + enum dma_data_direction dir) +{ + DECLARE_COMPLETION_ONSTACK(wait); + const u32 tag = hba->reserved_slot; + struct ufshcd_lrb *lrbp; + int err = 0; + int result; + u8 upiu_flags; + u8 *ehs_data; + u16 ehs_len; + + /* Protects use of hba->reserved_slot. */ + ufshcd_hold(hba, false); + mutex_lock(&hba->dev_cmd.lock); + down_read(&hba->clk_scaling_lock); + + lrbp = &hba->lrb[tag]; + WARN_ON(lrbp->cmd); + lrbp->cmd = NULL; + lrbp->task_tag = tag; + lrbp->lun = UFS_UPIU_RPMB_WLUN; + + lrbp->intr_cmd = true; + ufshcd_prepare_lrbp_crypto(NULL, lrbp); + hba->dev_cmd.type = DEV_CMD_TYPE_RPMB; + + /* Advanced RPMB starts from UFS 4.0, so its command type is UTP_CMD_TYPE_UFS_STORAGE */ + lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE; + + ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, dir, 2); + + /* update the task tag and LUN in the request upiu */ + req_upiu->header.dword_0 |= cpu_to_be32(upiu_flags << 16 | UFS_UPIU_RPMB_WLUN << 8 | tag); + + /* copy the UPIU(contains CDB) request as it is */ + memcpy(lrbp->ucd_req_ptr, req_upiu, sizeof(*lrbp->ucd_req_ptr)); + /* Copy EHS, starting with byte32, immediately after the CDB package */ + memcpy(lrbp->ucd_req_ptr + 1, req_ehs, sizeof(*req_ehs)); + + if (dir != DMA_NONE && sg_list) + ufshcd_sgl_to_prdt(hba, lrbp, sg_cnt, sg_list); + + memset(lrbp->ucd_rsp_ptr, 0, sizeof(struct utp_upiu_rsp)); + + hba->dev_cmd.complete = &wait; + + ufshcd_send_command(hba, tag, hba->dev_cmd_queue); + + err = ufshcd_wait_for_dev_cmd(hba, lrbp, ADVANCED_RPMB_REQ_TIMEOUT); + + if (!err) { + /* Just copy the upiu response as it is */ + memcpy(rsp_upiu, lrbp->ucd_rsp_ptr, sizeof(*rsp_upiu)); + /* Get the response UPIU result */ + result = ufshcd_get_rsp_upiu_result(lrbp->ucd_rsp_ptr); + + ehs_len = be32_to_cpu(lrbp->ucd_rsp_ptr->header.dword_2) >> 24; + /* + * Since the bLength in EHS indicates the total size of the EHS Header and EHS Data + * in 32 Byte units, the value of the bLength Request/Response for Advanced RPMB + * Message is 02h + */ + if (ehs_len == 2 && rsp_ehs) { + /* + * ucd_rsp_ptr points to a buffer with a length of 512 bytes + * (ALIGNED_UPIU_SIZE = 512), and the EHS data just starts from byte32 + */ + ehs_data = (u8 *)lrbp->ucd_rsp_ptr + EHS_OFFSET_IN_RESPONSE; + memcpy(rsp_ehs, ehs_data, ehs_len * 32); + } + } + + up_read(&hba->clk_scaling_lock); + mutex_unlock(&hba->dev_cmd.lock); + ufshcd_release(hba); + return err ? : result; +} + +/** + * ufshcd_eh_device_reset_handler() - Reset a single logical unit. 
* @cmd: SCSI command pointer * * Returns SUCCESS/FAILED */ static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd) { + unsigned long flags, pending_reqs = 0, not_cleared = 0; struct Scsi_Host *host; struct ufs_hba *hba; u32 pos; @@ -6839,14 +7334,24 @@ static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd) } /* clear the commands that were pending for corresponding LUN */ - for_each_set_bit(pos, &hba->outstanding_reqs, hba->nutrs) { - if (hba->lrb[pos].lun == lun) { - err = ufshcd_clear_cmd(hba, pos); - if (err) - break; - __ufshcd_transfer_req_compl(hba, 1U << pos, false); - } + spin_lock_irqsave(&hba->outstanding_lock, flags); + for_each_set_bit(pos, &hba->outstanding_reqs, hba->nutrs) + if (hba->lrb[pos].lun == lun) + __set_bit(pos, &pending_reqs); + hba->outstanding_reqs &= ~pending_reqs; + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + if (ufshcd_clear_cmds(hba, pending_reqs) < 0) { + spin_lock_irqsave(&hba->outstanding_lock, flags); + not_cleared = pending_reqs & + ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL); + hba->outstanding_reqs |= not_cleared; + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + dev_err(hba->dev, "%s: failed to clear requests %#lx\n", + __func__, not_cleared); } + __ufshcd_transfer_req_compl(hba, pending_reqs & ~not_cleared); out: hba->req_abort_count = 0; @@ -6943,7 +7448,7 @@ static int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag) goto out; } - err = ufshcd_clear_cmd(hba, tag); + err = ufshcd_clear_cmds(hba, 1U << tag); if (err) dev_err(hba->dev, "%s: Failed clearing cmd at tag %d, err %d\n", __func__, tag, err); @@ -6966,6 +7471,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd) struct ufshcd_lrb *lrbp = &hba->lrb[tag]; unsigned long flags; int err = FAILED; + bool outstanding; u32 reg; WARN_ONCE(tag < 0, "Invalid tag %d\n", tag); @@ -7006,7 +7512,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd) dev_err(hba->dev, "%s: cmd was completed, but without a notifying intr, tag = %d", __func__, tag); - __ufshcd_transfer_req_compl(hba, 1UL << tag, /*retry_requests=*/false); + __ufshcd_transfer_req_compl(hba, 1UL << tag); goto release; } @@ -7043,7 +7549,17 @@ static int ufshcd_abort(struct scsi_cmnd *cmd) goto release; } - lrbp->cmd = NULL; + /* + * Clear the corresponding bit from outstanding_reqs since the command + * has been aborted successfully. 
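+ * The __test_and_clear_bit() below also keeps this path from racing with a + * regular completion of the same tag: only whoever clears the bit releases + * the command. 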
+ */ + spin_lock_irqsave(&hba->outstanding_lock, flags); + outstanding = __test_and_clear_bit(tag, &hba->outstanding_reqs); + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + if (outstanding) + ufshcd_release_scsi_cmd(hba, lrbp); + err = SUCCESS; release: @@ -7070,14 +7586,14 @@ static int ufshcd_host_reset_and_restore(struct ufs_hba *hba) * Stop the host controller and complete the requests * cleared by h/w */ - ufshpb_reset_host(hba); + ufshpb_toggle_state(hba, HPB_PRESENT, HPB_RESET); ufshcd_hba_stop(hba); hba->silence_err_logs = true; - ufshcd_retry_aborted_requests(hba); + ufshcd_complete_requests(hba); hba->silence_err_logs = false; /* scale up clocks to max frequency before full reinitialization */ - ufshcd_set_clk_freq(hba, true); + ufshcd_scale_clks(hba, true); err = ufshcd_hba_enable(hba); @@ -7102,31 +7618,41 @@ static int ufshcd_host_reset_and_restore(struct ufs_hba *hba) */ static int ufshcd_reset_and_restore(struct ufs_hba *hba) { - u32 saved_err; - u32 saved_uic_err; + u32 saved_err = 0; + u32 saved_uic_err = 0; int err = 0; unsigned long flags; int retries = MAX_HOST_RESET_RETRIES; - /* - * This is a fresh start, cache and clear saved error first, - * in case new error generated during reset and restore. - */ spin_lock_irqsave(hba->host->host_lock, flags); - saved_err = hba->saved_err; - saved_uic_err = hba->saved_uic_err; - hba->saved_err = 0; - hba->saved_uic_err = 0; - spin_unlock_irqrestore(hba->host->host_lock, flags); - do { + /* + * This is a fresh start, cache and clear saved error first, + * in case new error generated during reset and restore. + */ + saved_err |= hba->saved_err; + saved_uic_err |= hba->saved_uic_err; + hba->saved_err = 0; + hba->saved_uic_err = 0; + hba->force_reset = false; + hba->ufshcd_state = UFSHCD_STATE_RESET; + spin_unlock_irqrestore(hba->host->host_lock, flags); + /* Reset the attached device */ ufshcd_device_reset(hba); err = ufshcd_host_reset_and_restore(hba); + + spin_lock_irqsave(hba->host->host_lock, flags); + if (err) + continue; + /* Do not exit unless operational or dead */ + if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL && + hba->ufshcd_state != UFSHCD_STATE_ERROR && + hba->ufshcd_state != UFSHCD_STATE_EH_SCHEDULED_NON_FATAL) + err = -EAGAIN; } while (err && --retries); - spin_lock_irqsave(hba->host->host_lock, flags); /* * Inform scsi mid-layer that we did reset and allow to handle * Unit Attention properly. @@ -7180,7 +7706,8 @@ static int ufshcd_eh_host_reset_handler(struct scsi_cmnd *cmd) * * Returns calculated max ICC level for specific regulator */ -static u32 ufshcd_get_max_icc_level(int sup_curr_uA, u32 start_scan, char *buff) +static u32 ufshcd_get_max_icc_level(int sup_curr_uA, u32 start_scan, + const char *buff) { int i; int curr_uA; @@ -7188,7 +7715,7 @@ static u32 ufshcd_get_max_icc_level(int sup_curr_uA, u32 start_scan, char *buff) u16 unit; for (i = start_scan; i >= 0; i--) { - data = be16_to_cpup((__be16 *)&buff[2 * i]); + data = get_unaligned_be16(&buff[2 * i]); unit = (data & ATTR_ICC_LVL_UNIT_MASK) >> ATTR_ICC_LVL_UNIT_OFFSET; curr_uA = data & ATTR_ICC_LVL_VALUE_MASK; @@ -7222,12 +7749,11 @@ static u32 ufshcd_get_max_icc_level(int sup_curr_uA, u32 start_scan, char *buff) * In case regulators are not initialized we'll return 0 * @hba: per-adapter instance * @desc_buf: power descriptor buffer to extract ICC levels from. 
- * @len: length of desc_buff * * Returns calculated ICC level */ static u32 ufshcd_find_max_sup_active_icc_level(struct ufs_hba *hba, - u8 *desc_buf, int len) + const u8 *desc_buf) { u32 icc_level = 0; @@ -7269,25 +7795,23 @@ out: static void ufshcd_set_active_icc_lvl(struct ufs_hba *hba) { int ret; - int buff_len = hba->desc_size[QUERY_DESC_IDN_POWER]; u8 *desc_buf; u32 icc_level; - desc_buf = kmalloc(buff_len, GFP_KERNEL); + desc_buf = kzalloc(QUERY_DESC_MAX_SIZE, GFP_KERNEL); if (!desc_buf) return; ret = ufshcd_read_desc_param(hba, QUERY_DESC_IDN_POWER, 0, 0, - desc_buf, buff_len); + desc_buf, QUERY_DESC_MAX_SIZE); if (ret) { dev_err(hba->dev, - "%s: Failed reading power descriptor.len = %d ret = %d", - __func__, buff_len, ret); + "%s: Failed reading power descriptor ret = %d", + __func__, ret); goto out; } - icc_level = ufshcd_find_max_sup_active_icc_level(hba, desc_buf, - buff_len); + icc_level = ufshcd_find_max_sup_active_icc_level(hba, desc_buf); dev_dbg(hba->dev, "%s: setting icc_level 0x%x", __func__, icc_level); ret = ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_WRITE_ATTR, @@ -7341,25 +7865,25 @@ static inline void ufshcd_blk_pm_runtime_init(struct scsi_device *sdev) static int ufshcd_scsi_add_wlus(struct ufs_hba *hba) { int ret = 0; - struct scsi_device *sdev_boot; + struct scsi_device *sdev_boot, *sdev_rpmb; - hba->sdev_ufs_device = __scsi_add_device(hba->host, 0, 0, + hba->ufs_device_wlun = __scsi_add_device(hba->host, 0, 0, ufshcd_upiu_wlun_to_scsi_wlun(UFS_UPIU_UFS_DEVICE_WLUN), NULL); - if (IS_ERR(hba->sdev_ufs_device)) { - ret = PTR_ERR(hba->sdev_ufs_device); - hba->sdev_ufs_device = NULL; + if (IS_ERR(hba->ufs_device_wlun)) { + ret = PTR_ERR(hba->ufs_device_wlun); + hba->ufs_device_wlun = NULL; goto out; } - scsi_device_put(hba->sdev_ufs_device); + scsi_device_put(hba->ufs_device_wlun); - hba->sdev_rpmb = __scsi_add_device(hba->host, 0, 0, + sdev_rpmb = __scsi_add_device(hba->host, 0, 0, ufshcd_upiu_wlun_to_scsi_wlun(UFS_UPIU_RPMB_WLUN), NULL); - if (IS_ERR(hba->sdev_rpmb)) { - ret = PTR_ERR(hba->sdev_rpmb); - goto remove_sdev_ufs_device; + if (IS_ERR(sdev_rpmb)) { + ret = PTR_ERR(sdev_rpmb); + goto remove_ufs_device_wlun; } - ufshcd_blk_pm_runtime_init(hba->sdev_rpmb); - scsi_device_put(hba->sdev_rpmb); + ufshcd_blk_pm_runtime_init(sdev_rpmb); + scsi_device_put(sdev_rpmb); sdev_boot = __scsi_add_device(hba->host, 0, 0, ufshcd_upiu_wlun_to_scsi_wlun(UFS_UPIU_BOOT_WLUN), NULL); @@ -7371,13 +7895,13 @@ static int ufshcd_scsi_add_wlus(struct ufs_hba *hba) } goto out; -remove_sdev_ufs_device: - scsi_remove_device(hba->sdev_ufs_device); +remove_ufs_device_wlun: + scsi_remove_device(hba->ufs_device_wlun); out: return ret; } -static void ufshcd_wb_probe(struct ufs_hba *hba, u8 *desc_buf) +static void ufshcd_wb_probe(struct ufs_hba *hba, const u8 *desc_buf) { struct ufs_dev_info *dev_info = &hba->dev_info; u8 lun; @@ -7386,6 +7910,7 @@ static void ufshcd_wb_probe(struct ufs_hba *hba, u8 *desc_buf) if (!ufshcd_is_wb_allowed(hba)) return; + /* * Probe WB only for UFS-2.2 and UFS-3.1 (and later) devices or * UFS devices with quirk UFS_DEVICE_QUIRK_SUPPORT_EXTENDED_FEATURES @@ -7396,10 +7921,6 @@ static void ufshcd_wb_probe(struct ufs_hba *hba, u8 *desc_buf) (hba->dev_quirks & UFS_DEVICE_QUIRK_SUPPORT_EXTENDED_FEATURES))) goto wb_disabled; - if (hba->desc_size[QUERY_DESC_IDN_DEVICE] < - DEVICE_DESC_PARAM_EXT_UFS_FEATURE_SUP + 4) - goto wb_disabled; - ext_ufs_feature = get_unaligned_be32(desc_buf + DEVICE_DESC_PARAM_EXT_UFS_FEATURE_SUP); @@ -7437,15 +7958,68 @@ static void 
ufshcd_wb_probe(struct ufs_hba *hba, u8 *desc_buf) if (!d_lu_wb_buf_alloc) goto wb_disabled; } + + if (!ufshcd_is_wb_buf_lifetime_available(hba)) + goto wb_disabled; + return; wb_disabled: hba->caps &= ~UFSHCD_CAP_WB_EN; } -void ufshcd_fixup_dev_quirks(struct ufs_hba *hba, struct ufs_dev_fix *fixups) +static void ufshcd_temp_notif_probe(struct ufs_hba *hba, const u8 *desc_buf) { - struct ufs_dev_fix *f; + struct ufs_dev_info *dev_info = &hba->dev_info; + u32 ext_ufs_feature; + u8 mask = 0; + + if (!(hba->caps & UFSHCD_CAP_TEMP_NOTIF) || dev_info->wspecversion < 0x300) + return; + + ext_ufs_feature = get_unaligned_be32(desc_buf + DEVICE_DESC_PARAM_EXT_UFS_FEATURE_SUP); + + if (ext_ufs_feature & UFS_DEV_LOW_TEMP_NOTIF) + mask |= MASK_EE_TOO_LOW_TEMP; + + if (ext_ufs_feature & UFS_DEV_HIGH_TEMP_NOTIF) + mask |= MASK_EE_TOO_HIGH_TEMP; + + if (mask) { + ufshcd_enable_ee(hba, mask); + ufs_hwmon_probe(hba, mask); + } +} + +static void ufshcd_ext_iid_probe(struct ufs_hba *hba, u8 *desc_buf) +{ + struct ufs_dev_info *dev_info = &hba->dev_info; + u32 ext_ufs_feature; + u32 ext_iid_en = 0; + int err; + + /* Only UFS-4.0 and above may support EXT_IID */ + if (dev_info->wspecversion < 0x400) + goto out; + + ext_ufs_feature = get_unaligned_be32(desc_buf + + DEVICE_DESC_PARAM_EXT_UFS_FEATURE_SUP); + if (!(ext_ufs_feature & UFS_DEV_EXT_IID_SUP)) + goto out; + + err = ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_READ_ATTR, + QUERY_ATTR_IDN_EXT_IID_EN, 0, 0, &ext_iid_en); + if (err) + dev_err(hba->dev, "failed reading bEXTIIDEn. err = %d\n", err); + +out: + dev_info->b_ext_iid_en = ext_iid_en; +} + +void ufshcd_fixup_dev_quirks(struct ufs_hba *hba, + const struct ufs_dev_quirk *fixups) +{ + const struct ufs_dev_quirk *f; struct ufs_dev_info *dev_info = &hba->dev_info; if (!fixups) @@ -7479,14 +8053,14 @@ static int ufs_get_device_desc(struct ufs_hba *hba) u8 *desc_buf; struct ufs_dev_info *dev_info = &hba->dev_info; - desc_buf = kmalloc(QUERY_DESC_MAX_SIZE, GFP_KERNEL); + desc_buf = kzalloc(QUERY_DESC_MAX_SIZE, GFP_KERNEL); if (!desc_buf) { err = -ENOMEM; goto out; } err = ufshcd_read_desc_param(hba, QUERY_DESC_IDN_DEVICE, 0, 0, desc_buf, - hba->desc_size[QUERY_DESC_IDN_DEVICE]); + QUERY_DESC_MAX_SIZE); if (err) { dev_err(hba->dev, "%s: Failed reading Device Desc. 
err = %d\n", __func__, err); @@ -7503,6 +8077,7 @@ static int ufs_get_device_desc(struct ufs_hba *hba) /* getting Specification Version in big endian format */ dev_info->wspecversion = desc_buf[DEVICE_DESC_PARAM_SPEC_VER] << 8 | desc_buf[DEVICE_DESC_PARAM_SPEC_VER + 1]; + dev_info->bqueuedepth = desc_buf[DEVICE_DESC_PARAM_Q_DPTH]; b_ufs_feature_sup = desc_buf[DEVICE_DESC_PARAM_UFS_FEAT]; model_index = desc_buf[DEVICE_DESC_PARAM_PRDCT_NAME]; @@ -7538,6 +8113,11 @@ static int ufs_get_device_desc(struct ufs_hba *hba) ufshcd_wb_probe(hba, desc_buf); + ufshcd_temp_notif_probe(hba, desc_buf); + + if (hba->ext_iid_sup) + ufshcd_ext_iid_probe(hba, desc_buf); + /* * ufshcd_read_string_desc returns size of the string * reset the error value @@ -7651,7 +8231,7 @@ static int ufshcd_quirk_tune_host_pa_tactivate(struct ufs_hba *hba) u32 granularity, peer_granularity; u32 pa_tactivate, peer_pa_tactivate; u32 pa_tactivate_us, peer_pa_tactivate_us; - u8 gran_to_us_table[] = {1, 4, 8, 16, 32, 100}; + static const u8 gran_to_us_table[] = {1, 4, 8, 16, 32, 100}; ret = ufshcd_dme_get(hba, UIC_ARG_MIB(PA_GRANULARITY), &granularity); @@ -7690,7 +8270,7 @@ static int ufshcd_quirk_tune_host_pa_tactivate(struct ufs_hba *hba) peer_pa_tactivate_us = peer_pa_tactivate * gran_to_us_table[peer_granularity - 1]; - if (pa_tactivate_us > peer_pa_tactivate_us) { + if (pa_tactivate_us >= peer_pa_tactivate_us) { u32 new_peer_pa_tactivate; new_peer_pa_tactivate = pa_tactivate_us / @@ -7731,18 +8311,16 @@ static void ufshcd_clear_dbg_ufs_stats(struct ufs_hba *hba) static int ufshcd_device_geo_params_init(struct ufs_hba *hba) { int err; - size_t buff_len; u8 *desc_buf; - buff_len = hba->desc_size[QUERY_DESC_IDN_GEOMETRY]; - desc_buf = kmalloc(buff_len, GFP_KERNEL); + desc_buf = kzalloc(QUERY_DESC_MAX_SIZE, GFP_KERNEL); if (!desc_buf) { err = -ENOMEM; goto out; } err = ufshcd_read_desc_param(hba, QUERY_DESC_IDN_GEOMETRY, 0, 0, - desc_buf, buff_len); + desc_buf, QUERY_DESC_MAX_SIZE); if (err) { dev_err(hba->dev, "%s: Failed reading Geometry Desc. err = %d\n", __func__, err); @@ -7754,7 +8332,7 @@ static int ufshcd_device_geo_params_init(struct ufs_hba *hba) else if (desc_buf[GEOMETRY_DESC_PARAM_MAX_NUM_LUN] == 0) hba->dev_info.max_lu_supported = 8; - if (hba->desc_size[QUERY_DESC_IDN_GEOMETRY] >= + if (desc_buf[QUERY_DESC_LENGTH_OFFSET] >= GEOMETRY_DESC_PARAM_HPB_MAX_ACTIVE_REGS) ufshpb_get_geo_info(hba, desc_buf); @@ -7763,7 +8341,12 @@ out: return err; } -static struct ufs_ref_clk ufs_ref_clk_freqs[] = { +struct ufs_ref_clk { + unsigned long freq_hz; + enum ufs_ref_clk_freq val; +}; + +static const struct ufs_ref_clk ufs_ref_clk_freqs[] = { {19200000, REF_CLK_FREQ_19_2_MHZ}, {26000000, REF_CLK_FREQ_26_MHZ}, {38400000, REF_CLK_FREQ_38_4_MHZ}, @@ -7834,11 +8417,7 @@ out: static int ufshcd_device_params_init(struct ufs_hba *hba) { bool flag; - int ret, i; - - /* Init device descriptor sizes */ - for (i = 0; i < QUERY_DESC_IDN_MAX; i++) - hba->desc_size[i] = QUERY_DESC_MAX_SIZE; + int ret; /* Init UFS geometry descriptor related parameters */ ret = ufshcd_device_geo_params_init(hba); @@ -7906,24 +8485,96 @@ out: return ret; } -/** - * ufshcd_probe_hba - probe hba to detect device and initialize it - * @hba: per-adapter instance - * @init_dev_params: whether or not to call ufshcd_device_params_init(). 
- * - * Execute link-startup and verify device initialization - */ -static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) +/* SDB - Single Doorbell */ +static void ufshcd_release_sdb_queue(struct ufs_hba *hba, int nutrs) +{ + size_t ucdl_size, utrdl_size; + + ucdl_size = sizeof(struct utp_transfer_cmd_desc) * nutrs; + dmam_free_coherent(hba->dev, ucdl_size, hba->ucdl_base_addr, + hba->ucdl_dma_addr); + + utrdl_size = sizeof(struct utp_transfer_req_desc) * nutrs; + dmam_free_coherent(hba->dev, utrdl_size, hba->utrdl_base_addr, + hba->utrdl_dma_addr); + + devm_kfree(hba->dev, hba->lrb); +} + +static int ufshcd_alloc_mcq(struct ufs_hba *hba) { int ret; - unsigned long flags; - ktime_t start = ktime_get(); + int old_nutrs = hba->nutrs; + + ret = ufshcd_mcq_decide_queue_depth(hba); + if (ret < 0) + return ret; + + hba->nutrs = ret; + ret = ufshcd_mcq_init(hba); + if (ret) + goto err; + + /* + * Previously allocated memory for nutrs may not be enough in MCQ mode. + * Number of supported tags in MCQ mode may be larger than SDB mode. + */ + if (hba->nutrs != old_nutrs) { + ufshcd_release_sdb_queue(hba, old_nutrs); + ret = ufshcd_memory_alloc(hba); + if (ret) + goto err; + ufshcd_host_memory_configure(hba); + } + + ret = ufshcd_mcq_memory_alloc(hba); + if (ret) + goto err; + + return 0; +err: + hba->nutrs = old_nutrs; + return ret; +} + +static void ufshcd_config_mcq(struct ufs_hba *hba) +{ + int ret; + + ret = ufshcd_mcq_vops_config_esi(hba); + dev_info(hba->dev, "ESI %sconfigured\n", ret ? "is not " : ""); + + ufshcd_enable_intr(hba, UFSHCD_ENABLE_MCQ_INTRS); + ufshcd_mcq_make_queues_operational(hba); + ufshcd_mcq_config_mac(hba, hba->nutrs); + + hba->host->can_queue = hba->nutrs - UFSHCD_NUM_RESERVED; + hba->reserved_slot = hba->nutrs - UFSHCD_NUM_RESERVED; + + /* Select MCQ mode */ + ufshcd_writel(hba, ufshcd_readl(hba, REG_UFS_MEM_CFG) | 0x1, + REG_UFS_MEM_CFG); + hba->mcq_enabled = true; + + dev_info(hba->dev, "MCQ configured, nr_queues=%d, io_queues=%d, read_queue=%d, poll_queues=%d, queue_depth=%d\n", + hba->nr_hw_queues, hba->nr_queues[HCTX_TYPE_DEFAULT], + hba->nr_queues[HCTX_TYPE_READ], hba->nr_queues[HCTX_TYPE_POLL], + hba->nutrs); +} + +static int ufshcd_device_init(struct ufs_hba *hba, bool init_dev_params) +{ + int ret; + struct Scsi_Host *host = hba->host; hba->ufshcd_state = UFSHCD_STATE_RESET; ret = ufshcd_link_startup(hba); if (ret) - goto out; + return ret; + + if (hba->quirks & UFSHCD_QUIRK_SKIP_PH_CONFIGURATION) + return ret; /* Debug counters initialization */ ufshcd_clear_dbg_ufs_stats(hba); @@ -7931,15 +8582,19 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) /* UniPro link is active now */ ufshcd_set_link_active(hba); + /* Reconfigure MCQ upon reset */ + if (is_mcq_enabled(hba) && !init_dev_params) + ufshcd_config_mcq(hba); + /* Verify device initialization by sending NOP OUT UPIU */ ret = ufshcd_verify_dev_init(hba); if (ret) - goto out; + return ret; /* Initiate UFS initialization, and waiting until completion */ ret = ufshcd_complete_dev_init(hba); if (ret) - goto out; + return ret; /* * Initialize UFS device parameters used by driver, these @@ -7948,7 +8603,25 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) if (init_dev_params) { ret = ufshcd_device_params_init(hba); if (ret) - goto out; + return ret; + if (is_mcq_supported(hba) && !hba->scsi_host_added) { + ret = ufshcd_alloc_mcq(hba); + if (ret) { + /* Continue with SDB mode */ + use_mcq_mode = false; + dev_err(hba->dev, "MCQ mode is disabled, 
err=%d\n", + ret); + } + ret = scsi_add_host(host, hba->dev); + if (ret) { + dev_err(hba->dev, "scsi_add_host failed\n"); + return ret; + } + hba->scsi_host_added = true; + } + /* MCQ may be disabled if ufshcd_alloc_mcq() fails */ + if (is_mcq_supported(hba) && use_mcq_mode) + ufshcd_config_mcq(hba); } ufshcd_tune_unipro_params(hba); @@ -7969,11 +8642,51 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) if (ret) { dev_err(hba->dev, "%s: Failed setting power mode, err = %d\n", __func__, ret); + return ret; + } + } + + return 0; +} + +/** + * ufshcd_probe_hba - probe hba to detect device and initialize it + * @hba: per-adapter instance + * @init_dev_params: whether or not to call ufshcd_device_params_init(). + * + * Execute link-startup and verify device initialization + */ +static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) +{ + ktime_t start = ktime_get(); + unsigned long flags; + int ret; + + ret = ufshcd_device_init(hba, init_dev_params); + if (ret) + goto out; + + if (hba->quirks & UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH) { + /* Reset the device and controller before doing reinit */ + ufshcd_device_reset(hba); + ufshcd_hba_stop(hba); + ufshcd_vops_reinit_notify(hba); + ret = ufshcd_hba_enable(hba); + if (ret) { + dev_err(hba->dev, "Host controller enable failed\n"); + ufshcd_print_evt_hist(hba); + ufshcd_print_host_state(hba); goto out; } - ufshcd_print_pwr_info(hba); + + /* Reinit the device */ + ret = ufshcd_device_init(hba, init_dev_params); + if (ret) + goto out; } + ufshcd_print_pwr_info(hba); + /* * bActiveICCLevel is volatile for UFS device (as per latest v2.1 spec) * and for removable UFS card as well, hence always set the parameter. @@ -7982,13 +8695,17 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) */ ufshcd_set_active_icc_lvl(hba); - ufshcd_wb_config(hba); + /* Enable UFS Write Booster if supported */ + ufshcd_configure_wb(hba); + if (hba->ee_usr_mask) ufshcd_write_ee_control(hba); /* Enable Auto-Hibernate if configured */ ufshcd_auto_hibern8_enable(hba); - ufshpb_reset(hba); + ufshpb_toggle_state(hba, HPB_RESET, HPB_PRESENT); + + trace_android_rvh_ufs_complete_init(hba); out: spin_lock_irqsave(hba->host->host_lock, flags); if (ret) @@ -8033,6 +8750,28 @@ out: } } +static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) +{ + struct ufs_hba *hba = shost_priv(scmd->device->host); + + if (!hba->system_suspending) { + /* Activate the error handler in the SCSI core. */ + return SCSI_EH_NOT_HANDLED; + } + + /* + * If we get here we know that no TMFs are outstanding and also that + * the only pending command is a START STOP UNIT command. Handle the + * timeout of that command directly to prevent a deadlock between + * ufshcd_set_dev_pwr_mode() and ufshcd_err_handler(). + */ + ufshcd_link_recovery(hba); + dev_info(hba->dev, "%s() finished; outstanding_tasks = %#lx.\n", + __func__, hba->outstanding_tasks); + + return hba->outstanding_reqs ? 
SCSI_EH_RESET_TIMER : SCSI_EH_DONE; +} + static const struct attribute_group *ufshcd_driver_groups[] = { &ufs_sysfs_unit_descriptor_group, &ufs_sysfs_lun_attributes_group, @@ -8057,7 +8796,9 @@ static struct scsi_host_template ufshcd_driver_template = { .module = THIS_MODULE, .name = UFSHCD, .proc_name = UFSHCD, + .map_queues = ufshcd_map_queues, .queuecommand = ufshcd_queuecommand, + .mq_poll = ufshcd_poll, .slave_alloc = ufshcd_slave_alloc, .slave_configure = ufshcd_slave_configure, .slave_destroy = ufshcd_slave_destroy, @@ -8065,15 +8806,16 @@ static struct scsi_host_template ufshcd_driver_template = { .eh_abort_handler = ufshcd_abort, .eh_device_reset_handler = ufshcd_eh_device_reset_handler, .eh_host_reset_handler = ufshcd_eh_host_reset_handler, + .eh_timed_out = ufshcd_eh_timed_out, .this_id = -1, .sg_tablesize = SG_ALL, .cmd_per_lun = UFSHCD_CMD_PER_LUN, .can_queue = UFSHCD_CAN_QUEUE, .max_segment_size = PRDT_DATA_BYTE_COUNT_MAX, + .max_sectors = (1 << 20) / SECTOR_SIZE, /* 1 MiB */ .max_host_blocked = 1, .track_queue_depth = 1, .sdev_groups = ufshcd_driver_groups, - .dma_boundary = PAGE_SIZE - 1, .rpm_autosuspend_delay = RPM_AUTOSUSPEND_DELAY_MS, }; @@ -8121,33 +8863,10 @@ static inline int ufshcd_config_vreg_hpm(struct ufs_hba *hba, static int ufshcd_config_vreg(struct device *dev, struct ufs_vreg *vreg, bool on) { - int ret = 0; - struct regulator *reg; - const char *name; - int min_uV, uA_load; + if (regulator_count_voltages(vreg->reg) <= 0) + return 0; - BUG_ON(!vreg); - - reg = vreg->reg; - name = vreg->name; - - if (regulator_count_voltages(reg) > 0) { - uA_load = on ? vreg->max_uA : 0; - ret = ufshcd_config_vreg_load(dev, vreg, uA_load); - if (ret) - goto out; - - if (vreg->min_uV && vreg->max_uV) { - min_uV = on ? vreg->min_uV : 0; - ret = regulator_set_voltage(reg, min_uV, vreg->max_uV); - if (ret) - dev_err(dev, - "%s: %s set voltage failed, err=%d\n", - __func__, name, ret); - } - } -out: - return ret; + return ufshcd_config_vreg_load(dev, vreg, on ? 
vreg->max_uA : 0); } static int ufshcd_enable_vreg(struct device *dev, struct ufs_vreg *vreg) @@ -8223,7 +8942,7 @@ static int ufshcd_setup_hba_vreg(struct ufs_hba *hba, bool on) return ufshcd_toggle_vreg(hba->dev, info->vdd_hba, on); } -static int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg) +int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg) { int ret = 0; @@ -8239,6 +8958,7 @@ static int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg) out: return ret; } +EXPORT_SYMBOL_GPL(ufshcd_get_vreg); static int ufshcd_init_vreg(struct ufs_hba *hba) { @@ -8261,10 +8981,7 @@ static int ufshcd_init_hba_vreg(struct ufs_hba *hba) { struct ufs_vreg_info *info = &hba->vreg_info; - if (info) - return ufshcd_get_vreg(hba->dev, info->vdd_hba); - - return 0; + return ufshcd_get_vreg(hba->dev, info->vdd_hba); } static int ufshcd_setup_clocks(struct ufs_hba *hba, bool on) @@ -8335,6 +9052,19 @@ out: return ret; } +static enum ufs_ref_clk_freq ufshcd_parse_ref_clk_property(struct ufs_hba *hba) +{ + u32 freq; + int ret = device_property_read_u32(hba->dev, "ref-clk-freq", &freq); + + if (ret) { + dev_dbg(hba->dev, "Cannot query 'ref-clk-freq' property = %d", ret); + return REF_CLK_FREQ_INVAL; + } + + return ufs_get_bref_clk_from_hz(freq); +} + static int ufshcd_init_clocks(struct ufs_hba *hba) { int ret = 0; @@ -8428,6 +9158,9 @@ static int ufshcd_hba_init(struct ufs_hba *hba) if (err) goto out_disable_hba_vreg; + if (hba->dev_ref_clk_freq == REF_CLK_FREQ_INVAL) + hba->dev_ref_clk_freq = ufshcd_parse_ref_clk_property(hba); + err = ufshcd_setup_clocks(hba, true); if (err) goto out_disable_hba_vreg; @@ -8476,6 +9209,41 @@ static void ufshcd_hba_exit(struct ufs_hba *hba) } } +static int ufshcd_execute_start_stop(struct scsi_device *sdev, + enum ufs_dev_pwr_mode pwr_mode, + struct scsi_sense_hdr *sshdr) +{ + unsigned char cdb[6] = { START_STOP, 0, 0, 0, pwr_mode << 4, 0 }; + struct request *req; + struct scsi_request *rq; + struct scsi_cmnd *scmd; + int ret; + + req = blk_get_request(sdev->request_queue, REQ_OP_DRV_IN, + BLK_MQ_REQ_PM); + if (IS_ERR(req)) + return PTR_ERR(req); + + rq = scsi_req(req); + scmd = blk_mq_rq_to_pdu(req); + rq->cmd_len = COMMAND_SIZE(cdb[0]); + memcpy(rq->cmd, cdb, rq->cmd_len); + rq->retries = 0/*retries*/; + scmd->flags |= SCMD_FAIL_IF_RECOVERING; + req->timeout = 1 * HZ; + req->rq_flags |= RQF_PM | RQF_QUIET; + + blk_execute_rq(NULL, req, /*at_head=*/true); + + if (sshdr) + scsi_normalize_sense(rq->sense, rq->sense_len, sshdr); + ret = rq->result; + + blk_put_request(req); + + return ret; +} + /** * ufshcd_set_dev_pwr_mode - sends START STOP UNIT command to set device * power mode @@ -8488,23 +9256,17 @@ static void ufshcd_hba_exit(struct ufs_hba *hba) static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, enum ufs_dev_pwr_mode pwr_mode) { - unsigned char cmd[6] = { START_STOP }; struct scsi_sense_hdr sshdr; struct scsi_device *sdp; unsigned long flags; int ret, retries; spin_lock_irqsave(hba->host->host_lock, flags); - sdp = hba->sdev_ufs_device; - if (sdp) { + sdp = hba->ufs_device_wlun; + if (sdp && scsi_device_online(sdp)) ret = scsi_device_get(sdp); - if (!ret && !scsi_device_online(sdp)) { - ret = -ENODEV; - scsi_device_put(sdp); - } - } else { + else ret = -ENODEV; - } spin_unlock_irqrestore(hba->host->host_lock, flags); if (ret) @@ -8518,19 +9280,18 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, */ hba->host->eh_noresume = 1; - cmd[4] = pwr_mode << 4; - /* * Current function would be generally called from the power management * 
callbacks hence set the RQF_PM flag so that it doesn't resume the already suspended children. */ for (retries = 3; retries > 0; --retries) { - ret = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, &sshdr, - HZ, 0, 0, RQF_PM, NULL); - if (!scsi_status_is_check_condition(ret) || - !scsi_sense_valid(&sshdr) || - sshdr.sense_key != UNIT_ATTENTION) + ret = ufshcd_execute_start_stop(sdp, pwr_mode, &sshdr); + /* + * scsi_execute() only returns a negative value if the request + * queue is dying. + */ + if (ret <= 0) break; } if (ret) { @@ -8542,10 +9303,9 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, scsi_print_sense_hdr(sdp, NULL, &sshdr); ret = -EIO; } - } - - if (!ret) + } else { hba->curr_dev_pwr_mode = pwr_mode; + } scsi_device_put(sdp); hba->host->eh_noresume = 0; @@ -8554,7 +9314,7 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, static int ufshcd_link_state_transition(struct ufs_hba *hba, enum uic_link_state req_link_state, - int check_for_bkops) + bool check_for_bkops) { int ret = 0; @@ -8705,7 +9465,7 @@ static void ufshcd_hba_vreg_set_hpm(struct ufs_hba *hba) static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) { int ret = 0; - int check_for_bkops; + bool check_for_bkops; enum ufs_pm_level pm_lvl; enum ufs_dev_pwr_mode req_dev_pwr_mode; enum uic_link_state req_link_state; @@ -8777,6 +9537,10 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) flush_work(&hba->eeh_work); + ret = ufshcd_vops_suspend(hba, pm_op, PRE_CHANGE); + if (ret) + goto enable_scaling; + if (req_dev_pwr_mode != hba->curr_dev_pwr_mode) { if (pm_op != UFS_RUNTIME_PM) /* ensure that bkops is disabled */ @@ -8822,7 +9586,7 @@ vops_suspend: * vendor specific host controller register space call them before the * host clocks are ON. 
*/ - ret = ufshcd_vops_suspend(hba, pm_op); + ret = ufshcd_vops_suspend(hba, pm_op, POST_CHANGE); if (ret) goto set_link_active; goto out; @@ -8950,7 +9714,8 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) set_old_link_state: ufshcd_link_state_transition(hba, old_link_state, 0); vendor_suspend: - ufshcd_vops_suspend(hba, pm_op); + ufshcd_vops_suspend(hba, pm_op, PRE_CHANGE); + ufshcd_vops_suspend(hba, pm_op, POST_CHANGE); out: if (ret) ufshcd_update_evt_hist(hba, UFS_EVT_WL_RES_ERR, (u32)ret); @@ -9011,6 +9776,7 @@ static int ufshcd_wl_suspend(struct device *dev) hba = shost_priv(sdev->host); down(&hba->host_sem); + hba->system_suspending = true; if (pm_runtime_suspended(dev)) goto out; @@ -9052,6 +9818,7 @@ out: hba->curr_dev_pwr_mode, hba->uic_link_state); if (!ret) hba->is_sys_suspended = false; + hba->system_suspending = false; up(&hba->host_sem); return ret; } @@ -9072,7 +9839,7 @@ static void ufshcd_wl_shutdown(struct device *dev) ufshcd_rpm_get_sync(hba); scsi_device_quiesce(sdev); shost_for_each_device(sdev, hba->host) { - if (sdev == hba->sdev_ufs_device) + if (sdev == hba->ufs_device_wlun) continue; scsi_device_quiesce(sdev); } @@ -9143,6 +9910,7 @@ static int ufshcd_resume(struct ufs_hba *hba) /* enable the host irq as host controller would be active soon */ ufshcd_enable_irq(hba); + goto out; disable_vreg: @@ -9289,14 +10057,14 @@ EXPORT_SYMBOL(ufshcd_shutdown); */ void ufshcd_remove(struct ufs_hba *hba) { - if (hba->sdev_ufs_device) + if (hba->ufs_device_wlun) ufshcd_rpm_get_sync(hba); + ufs_hwmon_remove(hba); ufs_bsg_remove(hba); ufshpb_remove(hba); ufs_sysfs_remove_nodes(hba->dev); blk_cleanup_queue(hba->tmf_queue); blk_mq_free_tag_set(&hba->tmf_tag_set); - blk_cleanup_queue(hba->cmd_queue); scsi_remove_host(hba->host); /* disable interrupts */ ufshcd_disable_intr(hba, hba->intr_mask); @@ -9305,6 +10073,56 @@ void ufshcd_remove(struct ufs_hba *hba) } EXPORT_SYMBOL_GPL(ufshcd_remove); +#ifdef CONFIG_PM_SLEEP +int ufshcd_system_freeze(struct device *dev) +{ + + return ufshcd_system_suspend(dev); + +} +EXPORT_SYMBOL_GPL(ufshcd_system_freeze); + +int ufshcd_system_restore(struct device *dev) +{ + + struct ufs_hba *hba = dev_get_drvdata(dev); + int ret; + + ret = ufshcd_system_resume(dev); + if (ret) + return ret; + + /* Configure UTRL and UTMRL base address registers */ + ufshcd_writel(hba, lower_32_bits(hba->utrdl_dma_addr), + REG_UTP_TRANSFER_REQ_LIST_BASE_L); + ufshcd_writel(hba, upper_32_bits(hba->utrdl_dma_addr), + REG_UTP_TRANSFER_REQ_LIST_BASE_H); + ufshcd_writel(hba, lower_32_bits(hba->utmrdl_dma_addr), + REG_UTP_TASK_REQ_LIST_BASE_L); + ufshcd_writel(hba, upper_32_bits(hba->utmrdl_dma_addr), + REG_UTP_TASK_REQ_LIST_BASE_H); + /* + * Make sure that UTRL and UTMRL base address registers + * are updated with the latest queue addresses. Only after + * updating these addresses, we can queue the new commands. 
+ */ + mb(); + + /* Resuming from hibernate, assume that link was OFF */ + ufshcd_set_link_off(hba); + + return 0; + +} +EXPORT_SYMBOL_GPL(ufshcd_system_restore); + +int ufshcd_system_thaw(struct device *dev) +{ + return ufshcd_system_resume(dev); +} +EXPORT_SYMBOL_GPL(ufshcd_system_thaw); +#endif /* CONFIG_PM_SLEEP */ + /** * ufshcd_dealloc_host - deallocate Host Bus Adapter (HBA) * @hba: pointer to Host Bus Adapter (HBA) @@ -9357,11 +10175,13 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) err = -ENOMEM; goto out_error; } + host->nr_maps = HCTX_TYPE_POLL + 1; hba = shost_priv(host); hba->host = host; hba->dev = dev; hba->dev_ref_clk_freq = REF_CLK_FREQ_INVAL; hba->nop_out_timeout = NOP_OUT_TIMEOUT; + ufshcd_set_sg_entry_size(hba, sizeof(struct ufshcd_sg_entry)); INIT_LIST_HEAD(&hba->clk_list_head); spin_lock_init(&hba->outstanding_lock); @@ -9511,16 +10331,12 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) hba->is_irq_enabled = true; } - err = scsi_add_host(host, hba->dev); - if (err) { - dev_err(hba->dev, "scsi_add_host failed\n"); - goto out_disable; - } - - hba->cmd_queue = blk_mq_init_queue(&hba->host->tag_set); - if (IS_ERR(hba->cmd_queue)) { - err = PTR_ERR(hba->cmd_queue); - goto out_remove_scsi_host; + if (!is_mcq_supported(hba)) { + err = scsi_add_host(host, hba->dev); + if (err) { + dev_err(hba->dev, "scsi_add_host failed\n"); + goto out_disable; + } } hba->tmf_tag_set = (struct blk_mq_tag_set) { @@ -9531,7 +10347,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) }; err = blk_mq_alloc_tag_set(&hba->tmf_tag_set); if (err < 0) - goto free_cmd_queue; + goto out_remove_scsi_host; hba->tmf_queue = blk_mq_init_queue(&hba->tmf_tag_set); if (IS_ERR(hba->tmf_queue)) { err = PTR_ERR(hba->tmf_queue); @@ -9591,7 +10407,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) ufshcd_set_ufs_dev_active(hba); async_schedule(ufshcd_async_scan, hba); - ufs_sysfs_add_nodes(hba->dev); + ufs_sysfs_add_nodes(hba); device_enable_async_suspend(dev); return 0; @@ -9600,8 +10416,6 @@ free_tmf_queue: blk_cleanup_queue(hba->tmf_queue); free_tmf_tag_set: blk_mq_free_tag_set(&hba->tmf_tag_set); -free_cmd_queue: - blk_cleanup_queue(hba->cmd_queue); out_remove_scsi_host: scsi_remove_host(hba->host); out_disable: @@ -9623,7 +10437,27 @@ void ufshcd_resume_complete(struct device *dev) } EXPORT_SYMBOL_GPL(ufshcd_resume_complete); -int ufshcd_suspend_prepare(struct device *dev) +static bool ufshcd_rpm_ok_for_spm(struct ufs_hba *hba) +{ + struct device *dev = &hba->ufs_device_wlun->sdev_gendev; + enum ufs_dev_pwr_mode dev_pwr_mode; + enum uic_link_state link_state; + unsigned long flags; + bool res; + + spin_lock_irqsave(&dev->power.lock, flags); + dev_pwr_mode = ufs_get_pm_lvl_to_dev_pwr_mode(hba->spm_lvl); + link_state = ufs_get_pm_lvl_to_link_pwr_state(hba->spm_lvl); + res = pm_runtime_suspended(dev) && + hba->curr_dev_pwr_mode == dev_pwr_mode && + hba->uic_link_state == link_state && + !hba->dev_info.b_rpm_dev_flush_capable; + spin_unlock_irqrestore(&dev->power.lock, flags); + + return res; +} + +int __ufshcd_suspend_prepare(struct device *dev, bool rpm_ok_for_spm) { struct ufs_hba *hba = dev_get_drvdata(dev); int ret; @@ -9634,16 +10468,31 @@ int ufshcd_suspend_prepare(struct device *dev) * if it's runtime suspended. But ufs doesn't follow that. 
* Refer ufshcd_resume_complete() */ - if (hba->sdev_ufs_device) { - ret = ufshcd_rpm_get_sync(hba); - if (ret < 0 && ret != -EACCES) { - ufshcd_rpm_put(hba); - return ret; + if (hba->ufs_device_wlun) { + /* Prevent runtime suspend */ + ufshcd_rpm_get_noresume(hba); + /* + * Check if already runtime suspended in same state as system + * suspend would be. + */ + if (!rpm_ok_for_spm || !ufshcd_rpm_ok_for_spm(hba)) { + /* RPM state is not ok for SPM, so runtime resume */ + ret = ufshcd_rpm_resume(hba); + if (ret < 0 && ret != -EACCES) { + ufshcd_rpm_put(hba); + return ret; + } } hba->complete_put = true; } return 0; } +EXPORT_SYMBOL_GPL(__ufshcd_suspend_prepare); + +int ufshcd_suspend_prepare(struct device *dev) +{ + return __ufshcd_suspend_prepare(dev, true); +} EXPORT_SYMBOL_GPL(ufshcd_suspend_prepare); #ifdef CONFIG_PM_SLEEP @@ -9735,4 +10584,3 @@ MODULE_AUTHOR("Santosh Yaragnavi "); MODULE_AUTHOR("Vinayak Holikatti "); MODULE_DESCRIPTION("Generic UFS host controller driver Core"); MODULE_LICENSE("GPL"); -MODULE_VERSION(UFSHCD_DRIVER_VERSION); diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/ufs/core/ufshpb.c similarity index 92% rename from drivers/scsi/ufs/ufshpb.c rename to drivers/ufs/core/ufshpb.c index 14300896c57f..e1e1119d3658 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/ufs/core/ufshpb.c @@ -10,11 +10,14 @@ */ #include -#include +#include +#include +#include +#include -#include "ufshcd.h" +#include "ufshcd-priv.h" #include "ufshpb.h" -#include "../sd.h" +#include "../../scsi/sd.h" #define ACTIVATION_THRESHOLD 8 /* 8 IOs */ #define READ_TO_MS 1000 @@ -91,12 +94,8 @@ static bool ufshpb_is_general_lun(int lun) static bool ufshpb_is_pinned_region(struct ufshpb_lu *hpb, int rgn_idx) { - if (hpb->lu_pinned_end != PINNED_NOT_SET && - rgn_idx >= hpb->lu_pinned_start && - rgn_idx <= hpb->lu_pinned_end) - return true; - - return false; + return hpb->lu_pinned_end != PINNED_NOT_SET && + rgn_idx >= hpb->lu_pinned_start && rgn_idx <= hpb->lu_pinned_end; } static void ufshpb_kick_map_work(struct ufshpb_lu *hpb) @@ -234,11 +233,6 @@ next_srgn: rgn = hpb->rgn_tbl + rgn_idx; srgn = rgn->srgn_tbl + srgn_idx; - if (likely(!srgn->is_last)) - bitmap_len = hpb->entries_per_srgn; - else - bitmap_len = hpb->last_srgn_entries; - if (!ufshpb_is_valid_srgn(rgn, srgn)) return true; @@ -254,6 +248,11 @@ next_srgn: return true; } + if (likely(!srgn->is_last)) + bitmap_len = hpb->entries_per_srgn; + else + bitmap_len = hpb->last_srgn_entries; + if ((srgn_offset + cnt) > bitmap_len) bit_len = bitmap_len - srgn_offset; else @@ -331,7 +330,7 @@ ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, cdb[0] = UFSHPB_READ; if (hba->dev_quirks & UFS_DEVICE_QUIRK_SWAP_L2P_ENTRY_FOR_HPB_READ) - ppn_tmp = swab64(ppn); + ppn_tmp = (__force __be64)swab64((__force u64)ppn); /* ppn value is stored as big-endian in the host memory */ memcpy(&cdb[6], &ppn_tmp, sizeof(__be64)); @@ -384,7 +383,7 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) rgn = hpb->rgn_tbl + rgn_idx; srgn = rgn->srgn_tbl + srgn_idx; - /* If command type is WRITE or DISCARD, set bitmap as drity */ + /* If command type is WRITE or DISCARD, set bitmap as dirty */ if (ufshpb_is_write_or_discard(cmd)) { ufshpb_iterate_rgn(hpb, rgn_idx, srgn_idx, srgn_offset, transfer_len, true); @@ -434,9 +433,8 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) return 0; } -static struct ufshpb_req *ufshpb_get_req(struct ufshpb_lu *hpb, - int rgn_idx, enum req_opf dir, - bool atomic) +static struct ufshpb_req 
*ufshpb_get_req(struct ufshpb_lu *hpb, int rgn_idx, + enum req_opf op, bool atomic) { struct ufshpb_req *rq; struct request *req; @@ -447,7 +445,7 @@ static struct ufshpb_req *ufshpb_get_req(struct ufshpb_lu *hpb, return NULL; retry: - req = blk_get_request(hpb->sdev_ufs_lu->request_queue, dir, + req = blk_get_request(hpb->sdev_ufs_lu->request_queue, op, BLK_MQ_REQ_NOWAIT); if (!atomic && (PTR_ERR(req) == -EWOULDBLOCK) && (--retries > 0)) { @@ -564,7 +562,7 @@ static void ufshpb_update_active_info(struct ufshpb_lu *hpb, int rgn_idx, if (list_empty(&srgn->list_act_srgn)) list_add_tail(&srgn->list_act_srgn, &hpb->lh_act_srgn); - hpb->stats.rb_active_cnt++; + hpb->stats.rcmd_active_cnt++; } static void ufshpb_update_inactive_info(struct ufshpb_lu *hpb, int rgn_idx) @@ -581,7 +579,7 @@ static void ufshpb_update_inactive_info(struct ufshpb_lu *hpb, int rgn_idx) if (list_empty(&rgn->list_inact_rgn)) list_add_tail(&rgn->list_inact_rgn, &hpb->lh_inact_rgn); - hpb->stats.rb_inactive_cnt++; + hpb->stats.rcmd_inactive_cnt++; } static void ufshpb_activate_subregion(struct ufshpb_lu *hpb, @@ -617,14 +615,14 @@ static void ufshpb_activate_subregion(struct ufshpb_lu *hpb, static void ufshpb_umap_req_compl_fn(struct request *req, blk_status_t error) { - struct ufshpb_req *umap_req = (struct ufshpb_req *)req->end_io_data; + struct ufshpb_req *umap_req = req->end_io_data; ufshpb_put_req(umap_req->hpb, umap_req); } static void ufshpb_map_req_compl_fn(struct request *req, blk_status_t error) { - struct ufshpb_req *map_req = (struct ufshpb_req *) req->end_io_data; + struct ufshpb_req *map_req = req->end_io_data; struct ufshpb_lu *hpb = map_req->hpb; struct ufshpb_subregion *srgn; unsigned long flags; @@ -667,17 +665,16 @@ static void ufshpb_execute_umap_req(struct ufshpb_lu *hpb, struct ufshpb_req *umap_req, struct ufshpb_region *rgn) { - struct request *req; - struct scsi_request *rq; + struct request *req = umap_req->req; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); - req = umap_req->req; req->timeout = 0; - req->end_io_data = (void *)umap_req; - rq = scsi_req(req); - ufshpb_set_unmap_cmd(rq->cmd, rgn); - rq->cmd_len = HPB_WRITE_BUFFER_CMD_LENGTH; + req->end_io_data = umap_req; - blk_execute_rq_nowait(NULL, req, 1, ufshpb_umap_req_compl_fn); + ufshpb_set_unmap_cmd(scmd->cmnd, rgn); + scmd->cmd_len = HPB_WRITE_BUFFER_CMD_LENGTH; + + blk_execute_rq_nowait(NULL, req, true, ufshpb_umap_req_compl_fn); hpb->stats.umap_req_cnt++; } @@ -687,7 +684,7 @@ static int ufshpb_execute_map_req(struct ufshpb_lu *hpb, { struct request_queue *q; struct request *req; - struct scsi_request *rq; + struct scsi_cmnd *scmd; int mem_size = hpb->srgn_mem_size; int ret = 0; int i; @@ -710,16 +707,15 @@ static int ufshpb_execute_map_req(struct ufshpb_lu *hpb, req->end_io_data = map_req; - rq = scsi_req(req); - if (unlikely(last)) mem_size = hpb->last_srgn_entries * HPB_ENTRY_SIZE; - ufshpb_set_read_buf_cmd(rq->cmd, map_req->rb.rgn_idx, + scmd = blk_mq_rq_to_pdu(req); + ufshpb_set_read_buf_cmd(scmd->cmnd, map_req->rb.rgn_idx, map_req->rb.srgn_idx, mem_size); - rq->cmd_len = HPB_READ_BUFFER_CMD_LENGTH; + scmd->cmd_len = HPB_READ_BUFFER_CMD_LENGTH; - blk_execute_rq_nowait(NULL, req, 1, ufshpb_map_req_compl_fn); + blk_execute_rq_nowait(NULL, req, true, ufshpb_map_req_compl_fn); hpb->stats.map_req_cnt++; return 0; @@ -933,11 +929,6 @@ static int ufshpb_issue_umap_single_req(struct ufshpb_lu *hpb, return ufshpb_issue_umap_req(hpb, rgn, true); } -static int ufshpb_issue_umap_all_req(struct ufshpb_lu *hpb) -{ - return 
ufshpb_issue_umap_req(hpb, NULL, false); -} - static void __ufshpb_evict_region(struct ufshpb_lu *hpb, struct ufshpb_region *rgn) { @@ -1145,6 +1136,39 @@ out: spin_unlock_irqrestore(&hpb->rgn_state_lock, flags); return ret; } +/** + * ufshpb_submit_region_inactive() - submit a region to be inactivated later + * @hpb: per-LU HPB instance + * @region_index: the index associated with the region that will be inactivated later + */ +static void ufshpb_submit_region_inactive(struct ufshpb_lu *hpb, int region_index) +{ + int subregion_index; + struct ufshpb_region *rgn; + struct ufshpb_subregion *srgn; + + /* + * Remove this region from the active region list and add it to the inactive list + */ + spin_lock(&hpb->rsp_list_lock); + ufshpb_update_inactive_info(hpb, region_index); + spin_unlock(&hpb->rsp_list_lock); + + rgn = hpb->rgn_tbl + region_index; + + /* + * Set the subregion state to HPB_SRGN_INVALID; there will be no HPB read on this subregion + */ + spin_lock(&hpb->rgn_state_lock); + if (rgn->rgn_state != HPB_RGN_INACTIVE) { + for (subregion_index = 0; subregion_index < rgn->srgn_cnt; subregion_index++) { + srgn = rgn->srgn_tbl + subregion_index; + if (srgn->srgn_state == HPB_SRGN_VALID) + srgn->srgn_state = HPB_SRGN_INVALID; + } + } + spin_unlock(&hpb->rgn_state_lock); +} static void ufshpb_rsp_req_region_update(struct ufshpb_lu *hpb, struct utp_hpb_rsp *rsp_field) @@ -1204,25 +1228,8 @@ static void ufshpb_rsp_req_region_update(struct ufshpb_lu *hpb, for (i = 0; i < rsp_field->inactive_rgn_cnt; i++) { rgn_i = be16_to_cpu(rsp_field->hpb_inactive_field[i]); - dev_dbg(&hpb->sdev_ufs_lu->sdev_dev, - "inactivate(%d) region %d\n", i, rgn_i); - - spin_lock(&hpb->rsp_list_lock); - ufshpb_update_inactive_info(hpb, rgn_i); - spin_unlock(&hpb->rsp_list_lock); - - rgn = hpb->rgn_tbl + rgn_i; - - spin_lock(&hpb->rgn_state_lock); - if (rgn->rgn_state != HPB_RGN_INACTIVE) { - for (srgn_i = 0; srgn_i < rgn->srgn_cnt; srgn_i++) { - srgn = rgn->srgn_tbl + srgn_i; - if (srgn->srgn_state == HPB_SRGN_VALID) - srgn->srgn_state = HPB_SRGN_INVALID; - } - } - spin_unlock(&hpb->rgn_state_lock); - + dev_dbg(&hpb->sdev_ufs_lu->sdev_dev, "inactivate(%d) region %d\n", i, rgn_i); + ufshpb_submit_region_inactive(hpb, rgn_i); } out: @@ -1233,7 +1240,10 @@ out: queue_work(ufshpb_wq, &hpb->map_work); } -static void ufshpb_dev_reset_handler(struct ufshpb_lu *hpb) +/* + * Set the flags of all active regions to RGN_FLAG_UPDATE to let the host side reload L2P entries later + */ +static void ufshpb_set_regions_update(struct ufshpb_lu *hpb) { struct victim_select_info *lru_info = &hpb->lru_info; struct ufshpb_region *rgn; @@ -1247,6 +1257,42 @@ static void ufshpb_dev_reset_handler(struct ufshpb_lu *hpb) spin_unlock_irqrestore(&hpb->rgn_state_lock, flags); } +static void ufshpb_dev_reset_handler(struct ufs_hba *hba) +{ + struct scsi_device *sdev; + struct ufshpb_lu *hpb; + + __shost_for_each_device(sdev, hba->host) { + hpb = ufshpb_get_hpb_data(sdev); + if (!hpb) + continue; + + if (hpb->is_hcm) { + /* + * For HPB host control mode, in case the device powered up and lost HPB + * information, we set the region flag to RGN_FLAG_UPDATE, which will let + * the host reload its L2P entries (reactivate regions in the UFS device). + */ + ufshpb_set_regions_update(hpb); + } else { + /* + * For HPB device control mode, if the host receives 02h:HPB Operation + * in a UPIU response, the device is recommending that the host + * inactivate all active regions. 
Here we add all active regions to the inactive + * list; they will be inactivated later in ufshpb_map_work_handler(). + */ + struct victim_select_info *lru_info = &hpb->lru_info; + struct ufshpb_region *rgn; + + list_for_each_entry(rgn, &lru_info->lh_lru_rgn, list_lru_rgn) + ufshpb_submit_region_inactive(hpb, rgn->rgn_idx); + + if (ufshpb_get_state(hpb) == HPB_PRESENT) + queue_work(ufshpb_wq, &hpb->map_work); + } + } +} + /* * This function will parse recommended active subregion information in sense * data field of response UPIU with SAM_STAT_GOOD state. */ void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) @@ -1303,7 +1349,7 @@ void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) if (!ufshpb_is_hpb_rsp_valid(hba, lrbp, rsp_field)) return; - hpb->stats.rb_noti_cnt++; + hpb->stats.rcmd_noti_cnt++; switch (rsp_field->hpb_op) { case HPB_RSP_REQ_REGION_UPDATE: @@ -1316,17 +1362,7 @@ void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) case HPB_RSP_DEV_RESET: dev_warn(&hpb->sdev_ufs_lu->sdev_dev, "UFS device lost HPB information during PM.\n"); - - if (hpb->is_hcm) { - struct scsi_device *sdev; - - __shost_for_each_device(sdev, hba->host) { - struct ufshpb_lu *h = sdev->hostdata; - - if (h) - ufshpb_dev_reset_handler(h); - } - } + ufshpb_dev_reset_handler(hba); break; default: @@ -1716,18 +1752,18 @@ static DEVICE_ATTR_RO(__name) ufshpb_sysfs_attr_show_func(hit_cnt); ufshpb_sysfs_attr_show_func(miss_cnt); -ufshpb_sysfs_attr_show_func(rb_noti_cnt); -ufshpb_sysfs_attr_show_func(rb_active_cnt); -ufshpb_sysfs_attr_show_func(rb_inactive_cnt); +ufshpb_sysfs_attr_show_func(rcmd_noti_cnt); +ufshpb_sysfs_attr_show_func(rcmd_active_cnt); +ufshpb_sysfs_attr_show_func(rcmd_inactive_cnt); ufshpb_sysfs_attr_show_func(map_req_cnt); ufshpb_sysfs_attr_show_func(umap_req_cnt); static struct attribute *hpb_dev_stat_attrs[] = { &dev_attr_hit_cnt.attr, &dev_attr_miss_cnt.attr, - &dev_attr_rb_noti_cnt.attr, - &dev_attr_rb_active_cnt.attr, - &dev_attr_rb_inactive_cnt.attr, + &dev_attr_rcmd_noti_cnt.attr, + &dev_attr_rcmd_active_cnt.attr, + &dev_attr_rcmd_inactive_cnt.attr, &dev_attr_map_req_cnt.attr, &dev_attr_umap_req_cnt.attr, NULL, @@ -2090,9 +2126,9 @@ static void ufshpb_stat_init(struct ufshpb_lu *hpb) { hpb->stats.hit_cnt = 0; hpb->stats.miss_cnt = 0; - hpb->stats.rb_noti_cnt = 0; - hpb->stats.rb_active_cnt = 0; - hpb->stats.rb_inactive_cnt = 0; + hpb->stats.rcmd_noti_cnt = 0; + hpb->stats.rcmd_active_cnt = 0; + hpb->stats.rcmd_inactive_cnt = 0; hpb->stats.map_req_cnt = 0; hpb->stats.umap_req_cnt = 0; } @@ -2247,7 +2283,7 @@ static bool ufshpb_check_hpb_reset_query(struct ufs_hba *hba) /* wait for the device to complete HPB reset query */ for (try = 0; try < HPB_RESET_REQ_RETRIES; try++) { dev_dbg(hba->dev, - "%s start flag reset polling %d times\n", + "%s: start flag reset polling %d times\n", __func__, try); /* Poll fHpbReset flag to be cleared */ @@ -2256,7 +2292,7 @@ static bool ufshpb_check_hpb_reset_query(struct ufs_hba *hba) if (err) { dev_err(hba->dev, - "%s reading fHpbReset flag failed with error %d\n", + "%s: reading fHpbReset flag failed with error %d\n", __func__, err); return flag_res; } @@ -2268,45 +2304,35 @@ static bool ufshpb_check_hpb_reset_query(struct ufs_hba *hba) } if (flag_res) { dev_err(hba->dev, - "%s fHpbReset was not cleared by the device\n", + "%s: fHpbReset was not cleared by the device\n", __func__); } out: return flag_res; } -void ufshpb_reset(struct ufs_hba *hba) +/** + * ufshpb_toggle_state - switch HPB state of all LUs + * @hba: per-adapter instance + * @src: expected current HPB state + * @dest: target 
HPB state to switch to + */ +void ufshpb_toggle_state(struct ufs_hba *hba, enum UFSHPB_STATE src, enum UFSHPB_STATE dest) { struct ufshpb_lu *hpb; struct scsi_device *sdev; shost_for_each_device(sdev, hba->host) { hpb = ufshpb_get_hpb_data(sdev); - if (!hpb) + + if (!hpb || ufshpb_get_state(hpb) != src) continue; + ufshpb_set_state(hpb, dest); - if (ufshpb_get_state(hpb) != HPB_RESET) - continue; - - ufshpb_set_state(hpb, HPB_PRESENT); - } -} - -void ufshpb_reset_host(struct ufs_hba *hba) -{ - struct ufshpb_lu *hpb; - struct scsi_device *sdev; - - shost_for_each_device(sdev, hba->host) { - hpb = ufshpb_get_hpb_data(sdev); - if (!hpb) - continue; - - if (ufshpb_get_state(hpb) != HPB_PRESENT) - continue; - ufshpb_set_state(hpb, HPB_RESET); - ufshpb_cancel_jobs(hpb); - ufshpb_discard_rsp_lists(hpb); + if (dest == HPB_RESET) { + ufshpb_cancel_jobs(hpb); + ufshpb_discard_rsp_lists(hpb); + } } } @@ -2317,11 +2343,9 @@ void ufshpb_suspend(struct ufs_hba *hba) shost_for_each_device(sdev, hba->host) { hpb = ufshpb_get_hpb_data(sdev); - if (!hpb) + if (!hpb || ufshpb_get_state(hpb) != HPB_PRESENT) continue; - if (ufshpb_get_state(hpb) != HPB_PRESENT) - continue; ufshpb_set_state(hpb, HPB_SUSPEND); ufshpb_cancel_jobs(hpb); } @@ -2334,20 +2358,15 @@ void ufshpb_resume(struct ufs_hba *hba) shost_for_each_device(sdev, hba->host) { hpb = ufshpb_get_hpb_data(sdev); - if (!hpb) + if (!hpb || ufshpb_get_state(hpb) != HPB_SUSPEND) continue; - if ((ufshpb_get_state(hpb) != HPB_PRESENT) && - (ufshpb_get_state(hpb) != HPB_SUSPEND)) - continue; ufshpb_set_state(hpb, HPB_PRESENT); ufshpb_kick_map_work(hpb); if (hpb->is_hcm) { - unsigned int poll = - hpb->params.timeout_polling_interval_ms; + unsigned int poll = hpb->params.timeout_polling_interval_ms; - schedule_delayed_work(&hpb->ufshpb_read_to_work, - msecs_to_jiffies(poll)); + schedule_delayed_work(&hpb->ufshpb_read_to_work, msecs_to_jiffies(poll)); } } } @@ -2357,12 +2376,10 @@ static int ufshpb_get_lu_info(struct ufs_hba *hba, int lun, { u16 max_active_rgns; u8 lu_enable; - int size; + int size = QUERY_DESC_MAX_SIZE; int ret; char desc_buf[QUERY_DESC_MAX_SIZE]; - ufshcd_map_desc_id_to_length(hba, QUERY_DESC_IDN_UNIT, &size); - ufshcd_rpm_get_sync(hba); ret = ufshcd_query_descriptor_retry(hba, UPIU_QUERY_OPCODE_READ_DESC, QUERY_DESC_IDN_UNIT, lun, 0, @@ -2453,8 +2470,6 @@ static void ufshpb_hpb_lu_prepared(struct ufs_hba *hba) ufshpb_set_state(hpb, HPB_PRESENT); if ((hpb->lu_pinned_end - hpb->lu_pinned_start) > 0) queue_work(ufshpb_wq, &hpb->map_work); - if (!hpb->is_hcm) - ufshpb_issue_umap_all_req(hpb); } else { dev_err(hba->dev, "destroy HPB lu %d\n", hpb->lun); ufshpb_destroy_lu(hba, sdev); diff --git a/drivers/scsi/ufs/ufshpb.h b/drivers/ufs/core/ufshpb.h similarity index 96% rename from drivers/scsi/ufs/ufshpb.h rename to drivers/ufs/core/ufshpb.h index b475dbd78988..0d6e6004d783 100644 --- a/drivers/scsi/ufs/ufshpb.h +++ b/drivers/ufs/core/ufshpb.h @@ -59,8 +59,8 @@ enum UFSHPB_MODE { }; enum UFSHPB_STATE { - HPB_INIT = 0, - HPB_PRESENT = 1, + HPB_INIT, + HPB_PRESENT, HPB_SUSPEND, HPB_FAILED, HPB_RESET, @@ -211,9 +211,9 @@ struct ufshpb_params { struct ufshpb_stats { u64 hit_cnt; u64 miss_cnt; - u64 rb_noti_cnt; - u64 rb_active_cnt; - u64 rb_inactive_cnt; + u64 rcmd_noti_cnt; + u64 rcmd_active_cnt; + u64 rcmd_inactive_cnt; u64 map_req_cnt; u64 pre_req_cnt; u64 umap_req_cnt; @@ -288,8 +288,7 @@ static int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) { return 0; static void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) {} 
static void ufshpb_resume(struct ufs_hba *hba) {} static void ufshpb_suspend(struct ufs_hba *hba) {} -static void ufshpb_reset(struct ufs_hba *hba) {} -static void ufshpb_reset_host(struct ufs_hba *hba) {} +static void ufshpb_toggle_state(struct ufs_hba *hba, enum UFSHPB_STATE src, enum UFSHPB_STATE dest) {} static void ufshpb_init(struct ufs_hba *hba) {} static void ufshpb_init_hpb_lu(struct ufs_hba *hba, struct scsi_device *sdev) {} static void ufshpb_destroy_lu(struct ufs_hba *hba, struct scsi_device *sdev) {} @@ -303,8 +302,7 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp); void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp); void ufshpb_resume(struct ufs_hba *hba); void ufshpb_suspend(struct ufs_hba *hba); -void ufshpb_reset(struct ufs_hba *hba); -void ufshpb_reset_host(struct ufs_hba *hba); +void ufshpb_toggle_state(struct ufs_hba *hba, enum UFSHPB_STATE src, enum UFSHPB_STATE dest); void ufshpb_init(struct ufs_hba *hba); void ufshpb_init_hpb_lu(struct ufs_hba *hba, struct scsi_device *sdev); void ufshpb_destroy_lu(struct ufs_hba *hba, struct scsi_device *sdev); diff --git a/drivers/ufs/host/Kconfig b/drivers/ufs/host/Kconfig new file mode 100644 index 000000000000..139064e70a34 --- /dev/null +++ b/drivers/ufs/host/Kconfig @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: GPL-2.0+ +# +# Kernel configuration file for the UFS host controller drivers. +# +# Copyright (C) 2011-2013 Samsung India Software Operations +# +# Authors: +# Santosh Yaraganavi +# Vinayak Holikatti + +config SCSI_UFSHCD_PCI + tristate "PCI bus based UFS Controller support" + depends on PCI + help + This selects the PCI UFS Host Controller Interface. Select this if + you have UFS Host Controller with PCI Interface. + + If you have a controller with this interface, say Y or M here. + + If unsure, say N. + +config SCSI_UFS_DWC_TC_PCI + tristate "DesignWare pci support using a G210 Test Chip" + depends on SCSI_UFSHCD_PCI + help + Synopsys Test Chip is a PHY for prototyping purposes. + + If unsure, say N. + +config SCSI_UFSHCD_PLATFORM + tristate "Platform bus based UFS Controller support" + depends on HAS_IOMEM + help + This selects the UFS host controller support. Select this if + you have an UFS controller on Platform bus. + + If you have a controller with this interface, say Y or M here. + + If unsure, say N. + +config SCSI_UFS_CDNS_PLATFORM + tristate "Cadence UFS Controller platform driver" + depends on SCSI_UFSHCD_PLATFORM + help + This selects the Cadence-specific additions to UFSHCD platform driver. + + If unsure, say N. + +config SCSI_UFS_DWC_TC_PLATFORM + tristate "DesignWare platform support using a G210 Test Chip" + depends on SCSI_UFSHCD_PLATFORM + help + Synopsys Test Chip is a PHY for prototyping purposes. + + If unsure, say N. + +config SCSI_UFS_QCOM + tristate "QCOM specific hooks to UFS controller platform driver" + depends on SCSI_UFSHCD_PLATFORM && ARCH_QCOM + select QCOM_SCM if SCSI_UFS_CRYPTO + select RESET_CONTROLLER + help + This selects the QCOM specific additions to UFSHCD platform driver. + UFS host on QCOM needs some vendor specific configuration before + accessing the hardware which includes PHY configuration and vendor + specific registers. + + Select this if you have UFS controller on QCOM chipset. + If unsure, say N. 
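
[Editor's note] The vendor entries in this new Kconfig all gate the same kind of component: a small platform "glue" driver that registers a struct ufs_hba_variant_ops with the UFSHCD core, exactly as the ufs-exynos.c diff later in this patch does. As a reading aid, here is a minimal sketch of such a glue driver. All my_ufs_* names and the "vendor,my-ufs" compatible string are hypothetical, and the include path assumes the drivers/ufs/host/ layout this series introduces; only ufshcd_pltfrm_init() and the vops structure are the real extension points shown by this patch.

/*
 * Minimal sketch of a vendor glue driver of the kind these Kconfig
 * entries enable; my_ufs_* identifiers are hypothetical.
 */
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <ufs/ufshcd.h>		/* core header location after this split */
#include "ufshcd-pltfrm.h"

static int my_ufs_init(struct ufs_hba *hba)
{
	/* vendor-specific PHY setup and register tweaks would go here */
	return 0;
}

static const struct ufs_hba_variant_ops my_ufs_ops = {
	.name = "my_ufs",
	.init = my_ufs_init,
};

static int my_ufs_probe(struct platform_device *pdev)
{
	/* allocates the hba, maps the MMIO region and wires in the vops */
	return ufshcd_pltfrm_init(pdev, &my_ufs_ops);
}

static const struct of_device_id my_ufs_of_match[] = {
	{ .compatible = "vendor,my-ufs" },	/* hypothetical */
	{ },
};

static struct platform_driver my_ufs_driver = {
	.probe = my_ufs_probe,
	/* a real driver also needs .remove calling ufshcd_remove() plus PM ops */
	.driver = {
		.name = "my-ufs",
		.of_match_table = my_ufs_of_match,
	},
};
module_platform_driver(my_ufs_driver);
MODULE_LICENSE("GPL");

Each Kconfig entry then only has to gate one such file through an obj-$(CONFIG_...) line in the Makefile shown further below.
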
+ +config SCSI_UFS_MEDIATEK + tristate "Mediatek specific hooks to UFS controller platform driver" + depends on SCSI_UFSHCD_PLATFORM && ARCH_MEDIATEK + select PHY_MTK_UFS + select RESET_TI_SYSCON + help + This selects the Mediatek specific additions to UFSHCD platform driver. + UFS host on Mediatek needs some vendor specific configuration before + accessing the hardware which includes PHY configuration and vendor + specific registers. + + Select this if you have UFS controller on Mediatek chipset. + + If unsure, say N. + +config SCSI_UFS_HISI + tristate "Hisilicon specific hooks to UFS controller platform driver" + depends on (ARCH_HISI || COMPILE_TEST) && SCSI_UFSHCD_PLATFORM + help + This selects the Hisilicon specific additions to UFSHCD platform driver. + + Select this if you have UFS controller on Hisilicon chipset. + If unsure, say N. + +config SCSI_UFS_RENESAS + tristate "Renesas specific hooks to UFS controller platform driver" + depends on (ARCH_RENESAS || COMPILE_TEST) && SCSI_UFSHCD_PLATFORM + help + This selects the Renesas specific additions to UFSHCD platform driver. + UFS host on Renesas needs some vendor specific configuration before + accessing the hardware. + + Select this if you have UFS controller on Renesas chipset. + + If unsure, say N. + +config SCSI_UFS_TI_J721E + tristate "TI glue layer for Cadence UFS Controller" + depends on OF && HAS_IOMEM && (ARCH_K3 || COMPILE_TEST) + help + This selects driver for TI glue layer for Cadence UFS Host + Controller IP. + + Selects this if you have TI platform with UFS controller. + If unsure, say N. + +config SCSI_UFS_EXYNOS + tristate "Exynos specific hooks to UFS controller platform driver" + depends on SCSI_UFSHCD_PLATFORM && (ARCH_EXYNOS || COMPILE_TEST) + help + This selects the Samsung Exynos SoC specific additions to UFSHCD + platform driver. UFS host on Samsung Exynos SoC includes HCI and + UNIPRO layer, and associates with UFS-PHY driver. + + Select this if you have UFS host controller on Samsung Exynos SoC. + If unsure, say N. + +config SCSI_UFS_VARIABLE_SG_ENTRY_SIZE + bool + default y if SCSI_UFS_EXYNOS && SCSI_UFS_CRYPTO + +config SCSI_UFS_SPRD + tristate "Unisoc specific hooks to UFS controller platform driver" + depends on SCSI_UFSHCD_PLATFORM && (ARCH_SPRD || COMPILE_TEST) + help + This selects the Unisoc specific additions to UFSHCD platform driver. + UFS host on Unisoc needs some vendor specific configuration before + accessing the hardware which includes PHY configuration and vendor + specific registers. + + Select this if you have UFS controller on Unisoc chipset. + If unsure, say N. diff --git a/drivers/scsi/ufs/Makefile b/drivers/ufs/host/Makefile similarity index 58% rename from drivers/scsi/ufs/Makefile rename to drivers/ufs/host/Makefile index c407da9b5171..d7c5bf7fa512 100644 --- a/drivers/scsi/ufs/Makefile +++ b/drivers/ufs/host/Makefile @@ -1,15 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# UFSHCD makefile - -# The link order is important here. ufshcd-core must initialize -# before vendor drivers. 
-obj-$(CONFIG_SCSI_UFSHCD) += ufshcd-core.o -ufshcd-core-y += ufshcd.o ufs-sysfs.o -ufshcd-core-$(CONFIG_DEBUG_FS) += ufs-debugfs.o -ufshcd-core-$(CONFIG_SCSI_UFS_BSG) += ufs_bsg.o -ufshcd-core-$(CONFIG_SCSI_UFS_CRYPTO) += ufshcd-crypto.o -ufshcd-core-$(CONFIG_SCSI_UFS_HPB) += ufshpb.o -ufshcd-core-$(CONFIG_SCSI_UFS_FAULT_INJECTION) += ufs-fault-injection.o obj-$(CONFIG_SCSI_UFS_DWC_TC_PCI) += tc-dwc-g210-pci.o ufshcd-dwc.o tc-dwc-g210.o obj-$(CONFIG_SCSI_UFS_DWC_TC_PLATFORM) += tc-dwc-g210-pltfrm.o ufshcd-dwc.o tc-dwc-g210.o @@ -22,4 +11,6 @@ obj-$(CONFIG_SCSI_UFSHCD_PCI) += ufshcd-pci.o obj-$(CONFIG_SCSI_UFSHCD_PLATFORM) += ufshcd-pltfrm.o obj-$(CONFIG_SCSI_UFS_HISI) += ufs-hisi.o obj-$(CONFIG_SCSI_UFS_MEDIATEK) += ufs-mediatek.o +obj-$(CONFIG_SCSI_UFS_RENESAS) += ufs-renesas.o +obj-$(CONFIG_SCSI_UFS_SPRD) += ufs-sprd.o obj-$(CONFIG_SCSI_UFS_TI_J721E) += ti-j721e-ufs.o diff --git a/drivers/scsi/ufs/cdns-pltfrm.c b/drivers/ufs/host/cdns-pltfrm.c similarity index 99% rename from drivers/scsi/ufs/cdns-pltfrm.c rename to drivers/ufs/host/cdns-pltfrm.c index 7da8be2f35c4..e05c0ae64eea 100644 --- a/drivers/scsi/ufs/cdns-pltfrm.c +++ b/drivers/ufs/host/cdns-pltfrm.c @@ -9,6 +9,7 @@ * */ +#include #include #include #include @@ -340,4 +341,3 @@ module_platform_driver(cdns_ufs_pltfrm_driver); MODULE_AUTHOR("Jan Kotas "); MODULE_DESCRIPTION("Cadence UFS host controller platform driver"); MODULE_LICENSE("GPL v2"); -MODULE_VERSION(UFSHCD_DRIVER_VERSION); diff --git a/drivers/scsi/ufs/tc-dwc-g210-pci.c b/drivers/ufs/host/tc-dwc-g210-pci.c similarity index 98% rename from drivers/scsi/ufs/tc-dwc-g210-pci.c rename to drivers/ufs/host/tc-dwc-g210-pci.c index 7b08e2e07cc5..92b8ad4b58fe 100644 --- a/drivers/scsi/ufs/tc-dwc-g210-pci.c +++ b/drivers/ufs/host/tc-dwc-g210-pci.c @@ -7,10 +7,11 @@ * Authors: Joao Pinto */ -#include "ufshcd.h" +#include #include "ufshcd-dwc.h" #include "tc-dwc-g210.h" +#include #include #include diff --git a/drivers/scsi/ufs/tc-dwc-g210-pltfrm.c b/drivers/ufs/host/tc-dwc-g210-pltfrm.c similarity index 98% rename from drivers/scsi/ufs/tc-dwc-g210-pltfrm.c rename to drivers/ufs/host/tc-dwc-g210-pltfrm.c index 783ec43efa78..f15a84d0c176 100644 --- a/drivers/scsi/ufs/tc-dwc-g210-pltfrm.c +++ b/drivers/ufs/host/tc-dwc-g210-pltfrm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "ufshcd-pltfrm.h" #include "ufshcd-dwc.h" diff --git a/drivers/scsi/ufs/tc-dwc-g210.c b/drivers/ufs/host/tc-dwc-g210.c similarity index 99% rename from drivers/scsi/ufs/tc-dwc-g210.c rename to drivers/ufs/host/tc-dwc-g210.c index f954a68f6b4c..deb93dbd83a4 100644 --- a/drivers/scsi/ufs/tc-dwc-g210.c +++ b/drivers/ufs/host/tc-dwc-g210.c @@ -7,8 +7,10 @@ * Authors: Joao Pinto */ -#include "ufshcd.h" -#include "unipro.h" +#include + +#include +#include #include "ufshcd-dwc.h" #include "ufshci-dwc.h" diff --git a/drivers/scsi/ufs/tc-dwc-g210.h b/drivers/ufs/host/tc-dwc-g210.h similarity index 95% rename from drivers/scsi/ufs/tc-dwc-g210.h rename to drivers/ufs/host/tc-dwc-g210.h index 5a506da03f4a..f7154012f5c7 100644 --- a/drivers/scsi/ufs/tc-dwc-g210.h +++ b/drivers/ufs/host/tc-dwc-g210.h @@ -10,6 +10,8 @@ #ifndef _TC_DWC_G210_H #define _TC_DWC_G210_H +struct ufs_hba; + int tc_dwc_g210_config_40_bit(struct ufs_hba *hba); int tc_dwc_g210_config_20_bit(struct ufs_hba *hba); diff --git a/drivers/scsi/ufs/ti-j721e-ufs.c b/drivers/ufs/host/ti-j721e-ufs.c similarity index 100% rename from drivers/scsi/ufs/ti-j721e-ufs.c rename to drivers/ufs/host/ti-j721e-ufs.c diff --git 
a/drivers/scsi/ufs/ufs-exynos.c b/drivers/ufs/host/ufs-exynos.c similarity index 68% rename from drivers/scsi/ufs/ufs-exynos.c rename to drivers/ufs/host/ufs-exynos.c index bb2dd79a1bcd..7c985fc38db1 100644 --- a/drivers/scsi/ufs/ufs-exynos.c +++ b/drivers/ufs/host/ufs-exynos.c @@ -9,16 +9,19 @@ */ #include +#include #include #include #include +#include #include #include +#include -#include "ufshcd.h" +#include #include "ufshcd-pltfrm.h" -#include "ufshci.h" -#include "unipro.h" +#include +#include #include "ufs-exynos.h" @@ -48,11 +51,13 @@ #define HCI_ERR_EN_T_LAYER 0x84 #define HCI_ERR_EN_DME_LAYER 0x88 #define HCI_CLKSTOP_CTRL 0xB0 +#define REFCLKOUT_STOP BIT(4) +#define MPHY_APBCLK_STOP BIT(3) #define REFCLK_STOP BIT(2) #define UNIPRO_MCLK_STOP BIT(1) #define UNIPRO_PCLK_STOP BIT(0) -#define CLK_STOP_MASK (REFCLK_STOP |\ - UNIPRO_MCLK_STOP |\ +#define CLK_STOP_MASK (REFCLKOUT_STOP | REFCLK_STOP |\ + UNIPRO_MCLK_STOP | MPHY_APBCLK_STOP|\ UNIPRO_PCLK_STOP) #define HCI_MISC 0xB4 #define REFCLK_CTRL_EN BIT(7) @@ -74,6 +79,52 @@ UIC_TRANSPORT_NO_CONNECTION_RX |\ UIC_TRANSPORT_BAD_TC) +/* FSYS UFS Shareability */ +#define UFS_WR_SHARABLE BIT(2) +#define UFS_RD_SHARABLE BIT(1) +#define UFS_SHARABLE (UFS_WR_SHARABLE | UFS_RD_SHARABLE) +#define UFS_SHAREABILITY_OFFSET 0x710 + +/* Multi-host registers */ +#define MHCTRL 0xC4 +#define MHCTRL_EN_VH_MASK (0xE) +#define MHCTRL_EN_VH(vh) (vh << 1) +#define PH2VH_MBOX 0xD8 + +#define MH_MSG_MASK (0xFF) + +#define MH_MSG(id, msg) ((id << 8) | (msg & 0xFF)) +#define MH_MSG_PH_READY 0x1 +#define MH_MSG_VH_READY 0x2 + +#define ALLOW_INQUIRY BIT(25) +#define ALLOW_MODE_SELECT BIT(24) +#define ALLOW_MODE_SENSE BIT(23) +#define ALLOW_PRE_FETCH GENMASK(22, 21) +#define ALLOW_READ_CMD_ALL GENMASK(20, 18) /* read_6/10/16 */ +#define ALLOW_READ_BUFFER BIT(17) +#define ALLOW_READ_CAPACITY GENMASK(16, 15) +#define ALLOW_REPORT_LUNS BIT(14) +#define ALLOW_REQUEST_SENSE BIT(13) +#define ALLOW_SYNCHRONIZE_CACHE GENMASK(8, 7) +#define ALLOW_TEST_UNIT_READY BIT(6) +#define ALLOW_UNMAP BIT(5) +#define ALLOW_VERIFY BIT(4) +#define ALLOW_WRITE_CMD_ALL GENMASK(3, 1) /* write_6/10/16 */ + +#define ALLOW_TRANS_VH_DEFAULT (ALLOW_INQUIRY | ALLOW_MODE_SELECT | \ + ALLOW_MODE_SENSE | ALLOW_PRE_FETCH | \ + ALLOW_READ_CMD_ALL | ALLOW_READ_BUFFER | \ + ALLOW_READ_CAPACITY | ALLOW_REPORT_LUNS | \ + ALLOW_REQUEST_SENSE | ALLOW_SYNCHRONIZE_CACHE | \ + ALLOW_TEST_UNIT_READY | ALLOW_UNMAP | \ + ALLOW_VERIFY | ALLOW_WRITE_CMD_ALL) + +#define HCI_MH_ALLOWABLE_TRAN_OF_VH 0x30C +#define HCI_MH_IID_IN_TASK_TAG 0X308 + +#define PH_READY_TIMEOUT_MS (5 * MSEC_PER_SEC) + enum { UNIPRO_L1_5 = 0,/* PHY Adapter */ UNIPRO_L2, /* Data Link */ @@ -85,15 +136,9 @@ enum { /* * UNIPRO registers */ -#define UNIPRO_COMP_VERSION 0x000 -#define UNIPRO_DME_PWR_REQ 0x090 -#define UNIPRO_DME_PWR_REQ_POWERMODE 0x094 -#define UNIPRO_DME_PWR_REQ_LOCALL2TIMER0 0x098 -#define UNIPRO_DME_PWR_REQ_LOCALL2TIMER1 0x09C -#define UNIPRO_DME_PWR_REQ_LOCALL2TIMER2 0x0A0 -#define UNIPRO_DME_PWR_REQ_REMOTEL2TIMER0 0x0A4 -#define UNIPRO_DME_PWR_REQ_REMOTEL2TIMER1 0x0A8 -#define UNIPRO_DME_PWR_REQ_REMOTEL2TIMER2 0x0AC +#define UNIPRO_DME_POWERMODE_REQ_REMOTEL2TIMER0 0x78B8 +#define UNIPRO_DME_POWERMODE_REQ_REMOTEL2TIMER1 0x78BC +#define UNIPRO_DME_POWERMODE_REQ_REMOTEL2TIMER2 0x78C0 /* * UFS Protector registers @@ -107,7 +152,6 @@ enum { #define CNTR_DIV_VAL 40 -static struct exynos_ufs_drv_data exynos_ufs_drvs; static void exynos_ufs_auto_ctrl_hcc(struct exynos_ufs *ufs, bool en); static void exynos_ufs_ctrl_clkstop(struct 
exynos_ufs *ufs, bool en); @@ -149,6 +193,117 @@ static int exynos7_ufs_drv_init(struct device *dev, struct exynos_ufs *ufs) return 0; } +static int exynosauto_ufs_drv_init(struct device *dev, struct exynos_ufs *ufs) +{ + struct exynos_ufs_uic_attr *attr = ufs->drv_data->uic_attr; + + /* IO Coherency setting */ + if (ufs->sysreg) { + return regmap_update_bits(ufs->sysreg, + ufs->shareability_reg_offset, + UFS_SHARABLE, UFS_SHARABLE); + } + + attr->tx_dif_p_nsec = 3200000; + + return 0; +} + +static int exynosauto_ufs_post_hce_enable(struct exynos_ufs *ufs) +{ + struct ufs_hba *hba = ufs->hba; + + /* Enable Virtual Host #1 */ + ufshcd_rmwl(hba, MHCTRL_EN_VH_MASK, MHCTRL_EN_VH(1), MHCTRL); + /* Default VH Transfer permissions */ + hci_writel(ufs, ALLOW_TRANS_VH_DEFAULT, HCI_MH_ALLOWABLE_TRAN_OF_VH); + /* IID information is replaced in TASKTAG[7:5] instead of IID in UCD */ + hci_writel(ufs, 0x1, HCI_MH_IID_IN_TASK_TAG); + + return 0; +} + +static int exynosauto_ufs_pre_link(struct exynos_ufs *ufs) +{ + struct ufs_hba *hba = ufs->hba; + int i; + u32 tx_line_reset_period, rx_line_reset_period; + + rx_line_reset_period = (RX_LINE_RESET_TIME * ufs->mclk_rate) / NSEC_PER_MSEC; + tx_line_reset_period = (TX_LINE_RESET_TIME * ufs->mclk_rate) / NSEC_PER_MSEC; + + ufshcd_dme_set(hba, UIC_ARG_MIB(0x200), 0x40); + for_each_ufs_rx_lane(ufs, i) { + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_RX_CLK_PRD, i), + DIV_ROUND_UP(NSEC_PER_SEC, ufs->mclk_rate)); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_RX_CLK_PRD_EN, i), 0x0); + + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_RX_LINERESET_VALUE2, i), + (rx_line_reset_period >> 16) & 0xFF); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_RX_LINERESET_VALUE1, i), + (rx_line_reset_period >> 8) & 0xFF); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_RX_LINERESET_VALUE0, i), + (rx_line_reset_period) & 0xFF); + + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x2f, i), 0x79); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x84, i), 0x1); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x25, i), 0xf6); + } + + for_each_ufs_tx_lane(ufs, i) { + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_TX_CLK_PRD, i), + DIV_ROUND_UP(NSEC_PER_SEC, ufs->mclk_rate)); + /* Not to affect VND_TX_LINERESET_PVALUE to VND_TX_CLK_PRD */ + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_TX_CLK_PRD_EN, i), + 0x02); + + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_TX_LINERESET_PVALUE2, i), + (tx_line_reset_period >> 16) & 0xFF); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_TX_LINERESET_PVALUE1, i), + (tx_line_reset_period >> 8) & 0xFF); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(VND_TX_LINERESET_PVALUE0, i), + (tx_line_reset_period) & 0xFF); + + /* TX PWM Gear Capability / PWM_G1_ONLY */ + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x04, i), 0x1); + } + + ufshcd_dme_set(hba, UIC_ARG_MIB(0x200), 0x0); + + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_LOCAL_TX_LCC_ENABLE), 0x0); + + ufshcd_dme_set(hba, UIC_ARG_MIB(0xa011), 0x8000); + + return 0; +} + +static int exynosauto_ufs_pre_pwr_change(struct exynos_ufs *ufs, + struct ufs_pa_layer_attr *pwr) +{ + struct ufs_hba *hba = ufs->hba; + + /* PACP_PWR_req and delivered to the remote DME */ + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA0), 12000); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA1), 32000); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA2), 16000); + + return 0; +} + +static int exynosauto_ufs_post_pwr_change(struct exynos_ufs *ufs, + struct ufs_pa_layer_attr *pwr) +{ + struct ufs_hba *hba = ufs->hba; + u32 enabled_vh; + + enabled_vh = ufshcd_readl(hba, MHCTRL) & MHCTRL_EN_VH_MASK; + + /* Send physical host ready 
message to virtual hosts */ + ufshcd_writel(hba, MH_MSG(enabled_vh, MH_MSG_PH_READY), PH2VH_MBOX); + + return 0; +} + static int exynos7_ufs_pre_link(struct exynos_ufs *ufs) { struct ufs_hba *hba = ufs->hba; @@ -496,8 +651,9 @@ static void exynos_ufs_config_phy_cap_attr(struct exynos_ufs *ufs) if (attr->rx_min_actv_time_cap) ufshcd_dme_set(hba, - UIC_ARG_MIB_SEL(RX_MIN_ACTIVATETIME_CAP, - i), attr->rx_min_actv_time_cap); + UIC_ARG_MIB_SEL( + RX_MIN_ACTIVATETIME_CAPABILITY, i), + attr->rx_min_actv_time_cap); if (attr->rx_hibern8_time_cap) ufshcd_dme_set(hba, @@ -544,7 +700,7 @@ static void exynos_ufs_establish_connt(struct exynos_ufs *ufs) /* local unipro attributes */ ufshcd_dme_set(hba, UIC_ARG_MIB(N_DEVICEID), DEV_ID); - ufshcd_dme_set(hba, UIC_ARG_MIB(N_DEVICEID_VALID), TRUE); + ufshcd_dme_set(hba, UIC_ARG_MIB(N_DEVICEID_VALID), true); ufshcd_dme_set(hba, UIC_ARG_MIB(T_PEERDEVICEID), PEER_DEV_ID); ufshcd_dme_set(hba, UIC_ARG_MIB(T_PEERCPORTID), PEER_CPORT_ID); ufshcd_dme_set(hba, UIC_ARG_MIB(T_CPORTFLAGS), CPORT_DEF_FLAGS); @@ -693,14 +849,14 @@ static int exynos_ufs_post_pwr_mode(struct ufs_hba *hba, } static void exynos_ufs_specify_nexus_t_xfer_req(struct ufs_hba *hba, - int tag, bool op) + int tag, bool is_scsi_cmd) { struct exynos_ufs *ufs = ufshcd_get_variant(hba); u32 type; type = hci_readl(ufs, HCI_UTRL_NEXUS_TYPE); - if (op) + if (is_scsi_cmd) hci_writel(ufs, type | (1 << tag), HCI_UTRL_NEXUS_TYPE); else hci_writel(ufs, type & ~(1 << tag), HCI_UTRL_NEXUS_TYPE); @@ -749,9 +905,13 @@ static int exynos_ufs_phy_init(struct exynos_ufs *ufs) if (ret) { dev_err(hba->dev, "%s: phy init failed, ret = %d\n", __func__, ret); - goto out_exit_phy; + return ret; } + ret = phy_power_on(generic_phy); + if (ret) + goto out_exit_phy; + return 0; out_exit_phy: @@ -793,6 +953,27 @@ static void exynos_ufs_config_intr(struct exynos_ufs *ufs, u32 errs, u8 index) } } +static int exynos_ufs_setup_clocks(struct ufs_hba *hba, bool on, + enum ufs_notify_change_status status) +{ + struct exynos_ufs *ufs = ufshcd_get_variant(hba); + + if (!ufs) + return 0; + + if (on && status == PRE_CHANGE) { + if (ufs->opts & EXYNOS_UFS_OPT_BROKEN_AUTO_CLK_CTRL) + exynos_ufs_disable_auto_ctrl_hcc(ufs); + exynos_ufs_ungate_clks(ufs); + } else if (!on && status == POST_CHANGE) { + exynos_ufs_gate_clks(ufs); + if (ufs->opts & EXYNOS_UFS_OPT_BROKEN_AUTO_CLK_CTRL) + exynos_ufs_enable_auto_ctrl_hcc(ufs); + } + + return 0; +} + static int exynos_ufs_pre_link(struct ufs_hba *hba) { struct exynos_ufs *ufs = ufshcd_get_variant(hba); @@ -808,8 +989,12 @@ static int exynos_ufs_pre_link(struct ufs_hba *hba) /* m-phy */ exynos_ufs_phy_init(ufs); - exynos_ufs_config_phy_time_attr(ufs); - exynos_ufs_config_phy_cap_attr(ufs); + if (!(ufs->opts & EXYNOS_UFS_OPT_SKIP_CONFIG_PHY_ATTR)) { + exynos_ufs_config_phy_time_attr(ufs); + exynos_ufs_config_phy_cap_attr(ufs); + } + + exynos_ufs_setup_clocks(hba, true, PRE_CHANGE); if (ufs->drv_data->pre_link) ufs->drv_data->pre_link(ufs); @@ -843,7 +1028,7 @@ static int exynos_ufs_post_link(struct ufs_hba *hba) if (ufs->opts & EXYNOS_UFS_OPT_SKIP_CONNECTION_ESTAB) ufshcd_dme_set(hba, - UIC_ARG_MIB(T_DBG_SKIP_INIT_HIBERN8_EXIT), TRUE); + UIC_ARG_MIB(T_DBG_SKIP_INIT_HIBERN8_EXIT), true); if (attr->pa_granularity) { exynos_ufs_enable_dbg_mode(hba); @@ -893,17 +1078,10 @@ static int exynos_ufs_post_link(struct ufs_hba *hba) static int exynos_ufs_parse_dt(struct device *dev, struct exynos_ufs *ufs) { struct device_node *np = dev->of_node; - struct exynos_ufs_drv_data *drv_data = &exynos_ufs_drvs; struct 
exynos_ufs_uic_attr *attr; int ret = 0; - while (drv_data->compatible) { - if (of_device_is_compatible(np, drv_data->compatible)) { - ufs->drv_data = drv_data; - break; - } - drv_data++; - } + ufs->drv_data = device_get_match_data(dev); if (ufs->drv_data && ufs->drv_data->uic_attr) { attr = ufs->drv_data->uic_attr; @@ -913,6 +1091,17 @@ static int exynos_ufs_parse_dt(struct device *dev, struct exynos_ufs *ufs) goto out; } + ufs->sysreg = syscon_regmap_lookup_by_phandle(np, "samsung,sysreg"); + if (IS_ERR(ufs->sysreg)) + ufs->sysreg = NULL; + else { + if (of_property_read_u32_index(np, "samsung,sysreg", 1, + &ufs->shareability_reg_offset)) { + dev_warn(dev, "can't get an offset from sysreg. Set to default value\n"); + ufs->shareability_reg_offset = UFS_SHAREABILITY_OFFSET; + } + } + ufs->pclk_avail_min = PCLK_AVAIL_MIN; ufs->pclk_avail_max = PCLK_AVAIL_MAX; @@ -927,6 +1116,18 @@ out: return ret; } +static inline void exynos_ufs_priv_init(struct ufs_hba *hba, + struct exynos_ufs *ufs) +{ + ufs->hba = hba; + ufs->opts = ufs->drv_data->opts; + ufs->rx_sel_idx = PA_MAXDATALANES; + if (ufs->opts & EXYNOS_UFS_OPT_BROKEN_RX_SEL_IDX) + ufs->rx_sel_idx = 0; + hba->priv = (void *)ufs; + hba->quirks = ufs->drv_data->quirks; +} + static int exynos_ufs_init(struct ufs_hba *hba) { struct device *dev = hba->dev; @@ -972,17 +1173,8 @@ static int exynos_ufs_init(struct ufs_hba *hba) goto out; } - ret = phy_power_on(ufs->phy); - if (ret) - goto phy_off; + exynos_ufs_priv_init(hba, ufs); - ufs->hba = hba; - ufs->opts = ufs->drv_data->opts; - ufs->rx_sel_idx = PA_MAXDATALANES; - if (ufs->opts & EXYNOS_UFS_OPT_BROKEN_RX_SEL_IDX) - ufs->rx_sel_idx = 0; - hba->priv = (void *)ufs; - hba->quirks = ufs->drv_data->quirks; if (ufs->drv_data->drv_init) { ret = ufs->drv_data->drv_init(dev, ufs); if (ret) { @@ -998,8 +1190,6 @@ static int exynos_ufs_init(struct ufs_hba *hba) exynos_ufs_config_smu(ufs); return 0; -phy_off: - phy_power_off(ufs->phy); out: hba->priv = NULL; return ret; @@ -1110,6 +1300,20 @@ static int exynos_ufs_hce_enable_notify(struct ufs_hba *hba, switch (status) { case PRE_CHANGE: + /* + * The maximum segment size must be set after scsi_host_alloc() + * has been called and before LUN scanning starts + * (ufshcd_async_scan()). Note: this callback may also be called + * from other functions than ufshcd_init(). 
+ */ + hba->host->max_segment_size = 4096; + + if (ufs->drv_data->pre_hce_enable) { + ret = ufs->drv_data->pre_hce_enable(ufs); + if (ret) + return ret; + } + ret = exynos_ufs_host_reset(hba); if (ret) return ret; @@ -1119,6 +1323,10 @@ static int exynos_ufs_hce_enable_notify(struct ufs_hba *hba, exynos_ufs_calc_pwm_clk_div(ufs); if (!(ufs->opts & EXYNOS_UFS_OPT_BROKEN_AUTO_CLK_CTRL)) exynos_ufs_enable_auto_ctrl_hcc(ufs); + + if (ufs->drv_data->post_hce_enable) + ret = ufs->drv_data->post_hce_enable(ufs); + break; } @@ -1176,10 +1384,14 @@ static void exynos_ufs_hibern8_notify(struct ufs_hba *hba, } } -static int exynos_ufs_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) +static int exynos_ufs_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, + enum ufs_notify_change_status status) { struct exynos_ufs *ufs = ufshcd_get_variant(hba); + if (status == PRE_CHANGE) + return 0; + if (!ufshcd_is_link_active(hba)) phy_power_off(ufs->phy); @@ -1198,12 +1410,170 @@ static int exynos_ufs_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) return 0; } -static struct ufs_hba_variant_ops ufs_hba_exynos_ops = { +static int exynosauto_ufs_vh_link_startup_notify(struct ufs_hba *hba, + enum ufs_notify_change_status status) +{ + if (status == POST_CHANGE) { + ufshcd_set_link_active(hba); + ufshcd_set_ufs_dev_active(hba); + } + + return 0; +} + +static int exynosauto_ufs_vh_wait_ph_ready(struct ufs_hba *hba) +{ + u32 mbox; + ktime_t start, stop; + + start = ktime_get(); + stop = ktime_add(start, ms_to_ktime(PH_READY_TIMEOUT_MS)); + + do { + mbox = ufshcd_readl(hba, PH2VH_MBOX); + /* TODO: Mailbox message protocols between the PH and VHs are + * not implemented yet. This will be supported later + */ + if ((mbox & MH_MSG_MASK) == MH_MSG_PH_READY) + return 0; + + usleep_range(40, 50); + } while (ktime_before(ktime_get(), stop)); + + return -ETIME; +} + +static int exynosauto_ufs_vh_init(struct ufs_hba *hba) +{ + struct device *dev = hba->dev; + struct platform_device *pdev = to_platform_device(dev); + struct exynos_ufs *ufs; + int ret; + + ufs = devm_kzalloc(dev, sizeof(*ufs), GFP_KERNEL); + if (!ufs) + return -ENOMEM; + + /* exynos-specific hci */ + ufs->reg_hci = devm_platform_ioremap_resource_byname(pdev, "vs_hci"); + if (IS_ERR(ufs->reg_hci)) { + dev_err(dev, "cannot ioremap for hci vendor register\n"); + return PTR_ERR(ufs->reg_hci); + } + + ret = exynosauto_ufs_vh_wait_ph_ready(hba); + if (ret) + return ret; + + ufs->drv_data = device_get_match_data(dev); + if (!ufs->drv_data) + return -ENODEV; + + exynos_ufs_priv_init(hba, ufs); + + return 0; +} + +static int fsd_ufs_pre_link(struct exynos_ufs *ufs) +{ + int i; + struct ufs_hba *hba = ufs->hba; + + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_CLK_PERIOD), + DIV_ROUND_UP(NSEC_PER_SEC, ufs->mclk_rate)); + ufshcd_dme_set(hba, UIC_ARG_MIB(0x201), 0x12); + ufshcd_dme_set(hba, UIC_ARG_MIB(0x200), 0x40); + + for_each_ufs_tx_lane(ufs, i) { + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0xAA, i), + DIV_ROUND_UP(NSEC_PER_SEC, ufs->mclk_rate)); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x8F, i), 0x3F); + } + + for_each_ufs_rx_lane(ufs, i) { + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x12, i), + DIV_ROUND_UP(NSEC_PER_SEC, ufs->mclk_rate)); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x5C, i), 0x38); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x0F, i), 0x0); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x65, i), 0x1); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x69, i), 0x1); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x21, i), 0x0); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x22, i), 0x0); + } + + ufshcd_dme_set(hba, 
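exynosauto_ufs_vh_wait_ph_ready() above is an instance of the bounded-poll idiom: compute a ktime deadline, spin with usleep_range(), give up with -ETIME. A generic sketch with the mailbox read abstracted behind a caller-supplied ready() callback (example_poll_ready() is illustrative):

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/ktime.h>

static int example_poll_ready(bool (*ready)(void *ctx), void *ctx,
			      unsigned int timeout_ms)
{
	ktime_t stop = ktime_add(ktime_get(), ms_to_ktime(timeout_ms));

	do {
		if (ready(ctx))		/* stands in for ufshcd_readl() + test */
			return 0;
		usleep_range(40, 50);	/* be kind to the bus while waiting */
	} while (ktime_before(ktime_get(), stop));

	return -ETIME;			/* deadline elapsed without success */
}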
UIC_ARG_MIB(0x200), 0x0); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_AUTOMODE_THLD), 0x4E20); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_OPTION_SUITE), 0x2e820183); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_LOCAL_TX_LCC_ENABLE), 0x0); + + exynos_ufs_establish_connt(ufs); + + return 0; +} + +static int fsd_ufs_post_link(struct exynos_ufs *ufs) +{ + int i; + struct ufs_hba *hba = ufs->hba; + u32 hw_cap_min_tactivate; + u32 peer_rx_min_actv_time_cap; + u32 max_rx_hibern8_time_cap; + + ufshcd_dme_get(hba, UIC_ARG_MIB_SEL(0x8F, 4), + &hw_cap_min_tactivate); /* HW Capability of MIN_TACTIVATE */ + ufshcd_dme_get(hba, UIC_ARG_MIB(PA_TACTIVATE), + &peer_rx_min_actv_time_cap); /* PA_TActivate */ + ufshcd_dme_get(hba, UIC_ARG_MIB(PA_HIBERN8TIME), + &max_rx_hibern8_time_cap); /* PA_Hibern8Time */ + + if (peer_rx_min_actv_time_cap >= hw_cap_min_tactivate) + ufshcd_dme_peer_set(hba, UIC_ARG_MIB(PA_TACTIVATE), + peer_rx_min_actv_time_cap + 1); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_HIBERN8TIME), max_rx_hibern8_time_cap + 1); + + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_MODE), 0x01); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_SAVECONFIGTIME), 0xFA); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_MODE), 0x00); + + ufshcd_dme_set(hba, UIC_ARG_MIB(0x200), 0x40); + + for_each_ufs_rx_lane(ufs, i) { + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x35, i), 0x05); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x73, i), 0x01); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x41, i), 0x02); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0x42, i), 0xAC); + } + + ufshcd_dme_set(hba, UIC_ARG_MIB(0x200), 0x0); + + return 0; +} + +static int fsd_ufs_pre_pwr_change(struct exynos_ufs *ufs, + struct ufs_pa_layer_attr *pwr) +{ + struct ufs_hba *hba = ufs->hba; + + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXTERMINATION), 0x1); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXTERMINATION), 0x1); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA0), 12000); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA1), 32000); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA2), 16000); + + unipro_writel(ufs, 12000, UNIPRO_DME_POWERMODE_REQ_REMOTEL2TIMER0); + unipro_writel(ufs, 32000, UNIPRO_DME_POWERMODE_REQ_REMOTEL2TIMER1); + unipro_writel(ufs, 16000, UNIPRO_DME_POWERMODE_REQ_REMOTEL2TIMER2); + + return 0; +} + +static const struct ufs_hba_variant_ops ufs_hba_exynos_ops = { .name = "exynos_ufs", .init = exynos_ufs_init, .hce_enable_notify = exynos_ufs_hce_enable_notify, .link_startup_notify = exynos_ufs_link_startup_notify, .pwr_change_notify = exynos_ufs_pwr_change_notify, + .setup_clocks = exynos_ufs_setup_clocks, .setup_xfer_req = exynos_ufs_specify_nexus_t_xfer_req, .setup_task_mgmt = exynos_ufs_specify_nexus_t_tm_req, .hibern8_notify = exynos_ufs_hibern8_notify, @@ -1211,12 +1581,24 @@ static struct ufs_hba_variant_ops ufs_hba_exynos_ops = { .resume = exynos_ufs_resume, }; +static struct ufs_hba_variant_ops ufs_hba_exynosauto_vh_ops = { + .name = "exynosauto_ufs_vh", + .init = exynosauto_ufs_vh_init, + .link_startup_notify = exynosauto_ufs_vh_link_startup_notify, +}; + static int exynos_ufs_probe(struct platform_device *pdev) { int err; struct device *dev = &pdev->dev; + const struct ufs_hba_variant_ops *vops = &ufs_hba_exynos_ops; + const struct exynos_ufs_drv_data *drv_data = + device_get_match_data(dev); - err = ufshcd_pltfrm_init(pdev, &ufs_hba_exynos_ops); + if (drv_data && drv_data->vops) + vops = drv_data->vops; + + err = ufshcd_pltfrm_init(pdev, vops); if (err) dev_err(dev, "ufshcd_pltfrm_init() failed %d\n", err); @@ -1226,9 +1608,14 @@ static int exynos_ufs_probe(struct platform_device 
*pdev) static int exynos_ufs_remove(struct platform_device *pdev) { struct ufs_hba *hba = platform_get_drvdata(pdev); + struct exynos_ufs *ufs = ufshcd_get_variant(hba); pm_runtime_get_sync(&(pdev)->dev); ufshcd_remove(hba); + + phy_power_off(ufs->phy); + phy_exit(ufs->phy); + return 0; } @@ -1257,8 +1644,35 @@ static struct exynos_ufs_uic_attr exynos7_uic_attr = { .pa_dbg_option_suite = 0x30103, }; -static struct exynos_ufs_drv_data exynos_ufs_drvs = { - .compatible = "samsung,exynos7-ufs", +static const struct exynos_ufs_drv_data exynosauto_ufs_drvs = { + .uic_attr = &exynos7_uic_attr, + .quirks = UFSHCD_QUIRK_PRDT_BYTE_GRAN | + UFSHCI_QUIRK_SKIP_RESET_INTR_AGGR | + UFSHCD_QUIRK_BROKEN_OCS_FATAL_ERROR | + UFSHCD_QUIRK_SKIP_DEF_UNIPRO_TIMEOUT_SETTING, + .opts = EXYNOS_UFS_OPT_BROKEN_AUTO_CLK_CTRL | + EXYNOS_UFS_OPT_SKIP_CONFIG_PHY_ATTR | + EXYNOS_UFS_OPT_BROKEN_RX_SEL_IDX, + .drv_init = exynosauto_ufs_drv_init, + .post_hce_enable = exynosauto_ufs_post_hce_enable, + .pre_link = exynosauto_ufs_pre_link, + .pre_pwr_change = exynosauto_ufs_pre_pwr_change, + .post_pwr_change = exynosauto_ufs_post_pwr_change, +}; + +static const struct exynos_ufs_drv_data exynosauto_ufs_vh_drvs = { + .vops = &ufs_hba_exynosauto_vh_ops, + .quirks = UFSHCD_QUIRK_PRDT_BYTE_GRAN | + UFSHCI_QUIRK_SKIP_RESET_INTR_AGGR | + UFSHCD_QUIRK_BROKEN_OCS_FATAL_ERROR | + UFSHCI_QUIRK_BROKEN_HCE | + UFSHCD_QUIRK_BROKEN_UIC_CMD | + UFSHCD_QUIRK_SKIP_PH_CONFIGURATION | + UFSHCD_QUIRK_SKIP_DEF_UNIPRO_TIMEOUT_SETTING, + .opts = EXYNOS_UFS_OPT_BROKEN_RX_SEL_IDX, +}; + +static const struct exynos_ufs_drv_data exynos_ufs_drvs = { .uic_attr = &exynos7_uic_attr, .quirks = UFSHCD_QUIRK_PRDT_BYTE_GRAN | UFSHCI_QUIRK_BROKEN_REQ_LIST_CLR | @@ -1267,7 +1681,7 @@ static struct exynos_ufs_drv_data exynos_ufs_drvs = { UFSHCD_QUIRK_BROKEN_OCS_FATAL_ERROR | UFSHCI_QUIRK_SKIP_MANUAL_WB_FLUSH_CTRL | UFSHCD_QUIRK_SKIP_DEF_UNIPRO_TIMEOUT_SETTING | - UFSHCD_QUIRK_ALIGN_SG_WITH_PAGE_SIZE, + UFSHCD_QUIRK_4KB_DMA_ALIGNMENT, .opts = EXYNOS_UFS_OPT_HAS_APB_CLK_CTRL | EXYNOS_UFS_OPT_BROKEN_AUTO_CLK_CTRL | EXYNOS_UFS_OPT_BROKEN_RX_SEL_IDX | @@ -1280,9 +1694,56 @@ static struct exynos_ufs_drv_data exynos_ufs_drvs = { .post_pwr_change = exynos7_ufs_post_pwr_change, }; +static struct exynos_ufs_uic_attr fsd_uic_attr = { + .tx_trailingclks = 0x10, + .tx_dif_p_nsec = 3000000, /* unit: ns */ + .tx_dif_n_nsec = 1000000, /* unit: ns */ + .tx_high_z_cnt_nsec = 20000, /* unit: ns */ + .tx_base_unit_nsec = 100000, /* unit: ns */ + .tx_gran_unit_nsec = 4000, /* unit: ns */ + .tx_sleep_cnt = 1000, /* unit: ns */ + .tx_min_activatetime = 0xa, + .rx_filler_enable = 0x2, + .rx_dif_p_nsec = 1000000, /* unit: ns */ + .rx_hibern8_wait_nsec = 4000000, /* unit: ns */ + .rx_base_unit_nsec = 100000, /* unit: ns */ + .rx_gran_unit_nsec = 4000, /* unit: ns */ + .rx_sleep_cnt = 1280, /* unit: ns */ + .rx_stall_cnt = 320, /* unit: ns */ + .rx_hs_g1_sync_len_cap = SYNC_LEN_COARSE(0xf), + .rx_hs_g2_sync_len_cap = SYNC_LEN_COARSE(0xf), + .rx_hs_g3_sync_len_cap = SYNC_LEN_COARSE(0xf), + .rx_hs_g1_prep_sync_len_cap = PREP_LEN(0xf), + .rx_hs_g2_prep_sync_len_cap = PREP_LEN(0xf), + .rx_hs_g3_prep_sync_len_cap = PREP_LEN(0xf), + .pa_dbg_option_suite = 0x2E820183, +}; + +static const struct exynos_ufs_drv_data fsd_ufs_drvs = { + .uic_attr = &fsd_uic_attr, + .quirks = UFSHCD_QUIRK_PRDT_BYTE_GRAN | + UFSHCI_QUIRK_BROKEN_REQ_LIST_CLR | + UFSHCD_QUIRK_BROKEN_OCS_FATAL_ERROR | + UFSHCD_QUIRK_SKIP_DEF_UNIPRO_TIMEOUT_SETTING | + UFSHCI_QUIRK_SKIP_RESET_INTR_AGGR, + .opts = 
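The remove() hunk above makes teardown mirror init(): detach the UFS core first, then power off and exit the PHY it had brought up. The same calls, restated with comments as a reading aid rather than new behavior:

static int example_remove(struct platform_device *pdev)
{
	struct ufs_hba *hba = platform_get_drvdata(pdev);
	struct exynos_ufs *ufs = ufshcd_get_variant(hba);

	pm_runtime_get_sync(&pdev->dev);	/* keep hardware accessible */
	ufshcd_remove(hba);			/* core stops using the link */

	phy_power_off(ufs->phy);		/* now the PHY can go down */
	phy_exit(ufs->phy);
	return 0;
}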
EXYNOS_UFS_OPT_HAS_APB_CLK_CTRL | + EXYNOS_UFS_OPT_BROKEN_AUTO_CLK_CTRL | + EXYNOS_UFS_OPT_SKIP_CONFIG_PHY_ATTR | + EXYNOS_UFS_OPT_BROKEN_RX_SEL_IDX, + .pre_link = fsd_ufs_pre_link, + .post_link = fsd_ufs_post_link, + .pre_pwr_change = fsd_ufs_pre_pwr_change, +}; + static const struct of_device_id exynos_ufs_of_match[] = { { .compatible = "samsung,exynos7-ufs", .data = &exynos_ufs_drvs }, + { .compatible = "samsung,exynosautov9-ufs", + .data = &exynosauto_ufs_drvs }, + { .compatible = "samsung,exynosautov9-ufs-vh", + .data = &exynosauto_ufs_vh_drvs }, + { .compatible = "tesla,fsd-ufs", + .data = &fsd_ufs_drvs }, {}, }; diff --git a/drivers/scsi/ufs/ufs-exynos.h b/drivers/ufs/host/ufs-exynos.h similarity index 86% rename from drivers/scsi/ufs/ufs-exynos.h rename to drivers/ufs/host/ufs-exynos.h index dadf4fd10dd8..a4bd6646d7f1 100644 --- a/drivers/scsi/ufs/ufs-exynos.h +++ b/drivers/ufs/host/ufs-exynos.h @@ -22,6 +22,7 @@ #define PA_DBG_RXPHY_CFGUPDT 0x9519 #define PA_DBG_MODE 0x9529 #define PA_DBG_SKIP_RESET_PHY 0x9539 +#define PA_DBG_AUTOMODE_THLD 0x9536 #define PA_DBG_OV_TM 0x9540 #define PA_DBG_SKIP_LINE_RESET 0x9541 #define PA_DBG_LINE_RESET_REQ 0x9543 @@ -56,6 +57,22 @@ #define TX_GRAN_NVAL_10_08 0x0296 #define TX_GRAN_NVAL_H(v) (((v) >> 8) & 0x3) +#define VND_TX_CLK_PRD 0xAA +#define VND_TX_CLK_PRD_EN 0xA9 +#define VND_TX_LINERESET_PVALUE0 0xAD +#define VND_TX_LINERESET_PVALUE1 0xAC +#define VND_TX_LINERESET_PVALUE2 0xAB + +#define TX_LINE_RESET_TIME 3200 + +#define VND_RX_CLK_PRD 0x12 +#define VND_RX_CLK_PRD_EN 0x11 +#define VND_RX_LINERESET_VALUE0 0x1D +#define VND_RX_LINERESET_VALUE1 0x1C +#define VND_RX_LINERESET_VALUE2 0x1B + +#define RX_LINE_RESET_TIME 1000 + #define RX_FILLER_ENABLE 0x0316 #define RX_FILLER_EN (1 << 1) #define RX_LINERESET_VAL 0x0317 @@ -99,7 +116,7 @@ struct exynos_ufs; #define PA_HIBERN8TIME_VAL 0x20 #define PCLK_AVAIL_MIN 70000000 -#define PCLK_AVAIL_MAX 133000000 +#define PCLK_AVAIL_MAX 167000000 struct exynos_ufs_uic_attr { /* TX Attributes */ @@ -142,7 +159,7 @@ struct exynos_ufs_uic_attr { }; struct exynos_ufs_drv_data { - char *compatible; + const struct ufs_hba_variant_ops *vops; struct exynos_ufs_uic_attr *uic_attr; unsigned int quirks; unsigned int opts; @@ -154,6 +171,8 @@ struct exynos_ufs_drv_data { struct ufs_pa_layer_attr *pwr); int (*post_pwr_change)(struct exynos_ufs *ufs, struct ufs_pa_layer_attr *pwr); + int (*pre_hce_enable)(struct exynos_ufs *ufs); + int (*post_hce_enable)(struct exynos_ufs *ufs); }; struct ufs_phy_time_cfg { @@ -191,7 +210,9 @@ struct exynos_ufs { struct ufs_pa_layer_attr dev_req_params; struct ufs_phy_time_cfg t_cfg; ktime_t entry_hibern8_t; - struct exynos_ufs_drv_data *drv_data; + const struct exynos_ufs_drv_data *drv_data; + struct regmap *sysreg; + u32 shareability_reg_offset; u32 opts; #define EXYNOS_UFS_OPT_HAS_APB_CLK_CTRL BIT(0) @@ -199,6 +220,7 @@ struct exynos_ufs { #define EXYNOS_UFS_OPT_BROKEN_AUTO_CLK_CTRL BIT(2) #define EXYNOS_UFS_OPT_BROKEN_RX_SEL_IDX BIT(3) #define EXYNOS_UFS_OPT_USE_SW_HIBERN8_TIMER BIT(4) +#define EXYNOS_UFS_OPT_SKIP_CONFIG_PHY_ATTR BIT(5) }; #define for_each_ufs_rx_lane(ufs, i) \ @@ -227,22 +249,22 @@ long exynos_ufs_calc_time_cntr(struct exynos_ufs *, long); static inline void exynos_ufs_enable_ov_tm(struct ufs_hba *hba) { - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_OV_TM), TRUE); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_OV_TM), true); } static inline void exynos_ufs_disable_ov_tm(struct ufs_hba *hba) { - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_OV_TM), FALSE); + ufshcd_dme_set(hba, 
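The header now grows a sysreg regmap and a shareability_reg_offset, which parse_dt() fills from the "samsung,sysreg" phandle. How the offset is consumed is not shown in this hunk; the sketch below is only a plausible use under that assumption, and EXAMPLE_SHARE_MASK is invented for illustration:

#include <linux/mfd/syscon.h>
#include <linux/regmap.h>

#define EXAMPLE_SHARE_MASK	GENMASK(1, 0)	/* illustrative mask only */

static int example_set_shareability(struct regmap *sysreg, u32 offset)
{
	if (!sysreg)
		return 0;	/* property absent: nothing to configure */

	/* Read-modify-write a field in the system controller register
	 * that the second cell of "samsung,sysreg" points at.
	 */
	return regmap_update_bits(sysreg, offset,
				  EXAMPLE_SHARE_MASK, EXAMPLE_SHARE_MASK);
}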
UIC_ARG_MIB(PA_DBG_OV_TM), false); } static inline void exynos_ufs_enable_dbg_mode(struct ufs_hba *hba) { - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_MODE), TRUE); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_MODE), true); } static inline void exynos_ufs_disable_dbg_mode(struct ufs_hba *hba) { - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_MODE), FALSE); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_DBG_MODE), false); } #endif /* _UFS_EXYNOS_H_ */ diff --git a/drivers/scsi/ufs/ufs-hisi.c b/drivers/ufs/host/ufs-hisi.c similarity index 97% rename from drivers/scsi/ufs/ufs-hisi.c rename to drivers/ufs/host/ufs-hisi.c index 6b706de8354b..2eed13bc82ca 100644 --- a/drivers/scsi/ufs/ufs-hisi.c +++ b/drivers/ufs/host/ufs-hisi.c @@ -7,18 +7,20 @@ */ #include +#include +#include #include #include #include #include #include -#include "ufshcd.h" +#include #include "ufshcd-pltfrm.h" -#include "unipro.h" +#include #include "ufs-hisi.h" -#include "ufshci.h" -#include "ufs_quirks.h" +#include +#include static int ufs_hisi_check_hibern8(struct ufs_hba *hba) { @@ -396,10 +398,20 @@ out: return ret; } -static int ufs_hisi_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) +static int ufs_hisi_suspend_prepare(struct device *dev) +{ + /* RPM and SPM are different. Refer ufs_hisi_suspend() */ + return __ufshcd_suspend_prepare(dev, false); +} + +static int ufs_hisi_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, + enum ufs_notify_change_status status) { struct ufs_hisi_host *host = ufshcd_get_variant(hba); + if (status == PRE_CHANGE) + return 0; + if (pm_op == UFS_RUNTIME_PM) return 0; @@ -574,7 +586,7 @@ static int ufs_hisi_remove(struct platform_device *pdev) static const struct dev_pm_ops ufs_hisi_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(ufshcd_system_suspend, ufshcd_system_resume) SET_RUNTIME_PM_OPS(ufshcd_runtime_suspend, ufshcd_runtime_resume, NULL) - .prepare = ufshcd_suspend_prepare, + .prepare = ufs_hisi_suspend_prepare, .complete = ufshcd_resume_complete, }; diff --git a/drivers/scsi/ufs/ufs-hisi.h b/drivers/ufs/host/ufs-hisi.h similarity index 100% rename from drivers/scsi/ufs/ufs-hisi.h rename to drivers/ufs/host/ufs-hisi.h diff --git a/drivers/scsi/ufs/ufs-mediatek-trace.h b/drivers/ufs/host/ufs-mediatek-trace.h similarity index 54% rename from drivers/scsi/ufs/ufs-mediatek-trace.h rename to drivers/ufs/host/ufs-mediatek-trace.h index 895e82ea6ece..b5f2ec314074 100644 --- a/drivers/scsi/ufs/ufs-mediatek-trace.h +++ b/drivers/ufs/host/ufs-mediatek-trace.h @@ -24,13 +24,36 @@ TRACE_EVENT(ufs_mtk_event, __entry->data = data; ), - TP_printk("ufs:event=%u data=%u", + TP_printk("ufs: event=%u data=%u", __entry->type, __entry->data) - ); +); + +TRACE_EVENT(ufs_mtk_clk_scale, + TP_PROTO(const char *name, bool scale_up, unsigned long clk_rate), + TP_ARGS(name, scale_up, clk_rate), + + TP_STRUCT__entry( + __field(const char*, name) + __field(bool, scale_up) + __field(unsigned long, clk_rate) + ), + + TP_fast_assign( + __entry->name = name; + __entry->scale_up = scale_up; + __entry->clk_rate = clk_rate; + ), + + TP_printk("ufs: clk (%s) scaled %s @ %lu", + __entry->name, + __entry->scale_up ? 
"up" : "down", + __entry->clk_rate) +); + #endif #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_PATH ../../drivers/scsi/ufs/ +#define TRACE_INCLUDE_PATH ../../drivers/ufs/host #define TRACE_INCLUDE_FILE ufs-mediatek-trace #include diff --git a/drivers/scsi/ufs/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c similarity index 64% rename from drivers/scsi/ufs/ufs-mediatek.c rename to drivers/ufs/host/ufs-mediatek.c index a9ddb50d593c..21d9b047539f 100644 --- a/drivers/scsi/ufs/ufs-mediatek.c +++ b/drivers/ufs/host/ufs-mediatek.c @@ -8,47 +8,37 @@ #include #include +#include +#include +#include #include #include #include #include #include +#include #include #include #include -#include "ufshcd.h" -#include "ufshcd-crypto.h" +#include #include "ufshcd-pltfrm.h" -#include "ufs_quirks.h" -#include "unipro.h" +#include +#include #include "ufs-mediatek.h" #define CREATE_TRACE_POINTS #include "ufs-mediatek-trace.h" -#define ufs_mtk_smc(cmd, val, res) \ - arm_smccc_smc(MTK_SIP_UFS_CONTROL, \ - cmd, val, 0, 0, 0, 0, 0, &(res)) - -#define ufs_mtk_va09_pwr_ctrl(res, on) \ - ufs_mtk_smc(UFS_MTK_SIP_VA09_PWR_CTRL, on, res) - -#define ufs_mtk_crypto_ctrl(res, enable) \ - ufs_mtk_smc(UFS_MTK_SIP_CRYPTO_CTRL, enable, res) - -#define ufs_mtk_ref_clk_notify(on, res) \ - ufs_mtk_smc(UFS_MTK_SIP_REF_CLK_NOTIFICATION, on, res) - -#define ufs_mtk_device_reset_ctrl(high, res) \ - ufs_mtk_smc(UFS_MTK_SIP_DEVICE_RESET, high, res) - -static struct ufs_dev_fix ufs_mtk_dev_fixups[] = { - UFS_FIX(UFS_VENDOR_MICRON, UFS_ANY_MODEL, - UFS_DEVICE_QUIRK_DELAY_AFTER_LPM), - UFS_FIX(UFS_VENDOR_SKHYNIX, "H9HQ21AFAMZDAR", - UFS_DEVICE_QUIRK_SUPPORT_EXTENDED_FEATURES), - END_FIX +static const struct ufs_dev_quirk ufs_mtk_dev_fixups[] = { + { .wmanufacturerid = UFS_ANY_VENDOR, + .model = UFS_ANY_MODEL, + .quirk = UFS_DEVICE_QUIRK_DELAY_AFTER_LPM | + UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM }, + { .wmanufacturerid = UFS_VENDOR_SKHYNIX, + .model = "H9HQ21AFAMZDAR", + .quirk = UFS_DEVICE_QUIRK_SUPPORT_EXTENDED_FEATURES }, + {} }; static const struct of_device_id ufs_mtk_of_match[] = { @@ -56,6 +46,44 @@ static const struct of_device_id ufs_mtk_of_match[] = { {}, }; +/* + * Details of UIC Errors + */ +static const char *const ufs_uic_err_str[] = { + "PHY Adapter Layer", + "Data Link Layer", + "Network Link Layer", + "Transport Link Layer", + "DME" +}; + +static const char *const ufs_uic_pa_err_str[] = { + "PHY error on Lane 0", + "PHY error on Lane 1", + "PHY error on Lane 2", + "PHY error on Lane 3", + "Generic PHY Adapter Error. 
This should be the LINERESET indication" +}; + +static const char *const ufs_uic_dl_err_str[] = { + "NAC_RECEIVED", + "TCx_REPLAY_TIMER_EXPIRED", + "AFCx_REQUEST_TIMER_EXPIRED", + "FCx_PROTECTION_TIMER_EXPIRED", + "CRC_ERROR", + "RX_BUFFER_OVERFLOW", + "MAX_FRAME_LENGTH_EXCEEDED", + "WRONG_SEQUENCE_NUMBER", + "AFC_FRAME_SYNTAX_ERROR", + "NAC_FRAME_SYNTAX_ERROR", + "EOF_SYNTAX_ERROR", + "FRAME_SYNTAX_ERROR", + "BAD_CTRL_SYMBOL_TYPE", + "PA_INIT_ERROR", + "PA_ERROR_IND_RECEIVED", + "PA_INIT" +}; + static bool ufs_mtk_is_boost_crypt_enabled(struct ufs_hba *hba) { struct ufs_mtk_host *host = ufshcd_get_variant(hba); @@ -77,6 +105,13 @@ static bool ufs_mtk_is_broken_vcc(struct ufs_hba *hba) return !!(host->caps & UFS_MTK_CAP_BROKEN_VCC); } +static bool ufs_mtk_is_pmc_via_fastauto(struct ufs_hba *hba) +{ + struct ufs_mtk_host *host = ufshcd_get_variant(hba); + + return !!(host->caps & UFS_MTK_CAP_PMC_VIA_FASTAUTO); +} + static void ufs_mtk_cfg_unipro_cg(struct ufs_hba *hba, bool enable) { u32 tmp; @@ -168,7 +203,6 @@ static int ufs_mtk_hce_enable_notify(struct ufs_hba *hba, enum ufs_notify_change_status status) { struct ufs_mtk_host *host = ufshcd_get_variant(hba); - unsigned long flags; if (status == PRE_CHANGE) { if (host->unipro_lpm) { @@ -182,15 +216,19 @@ static int ufs_mtk_hce_enable_notify(struct ufs_hba *hba, ufs_mtk_crypto_enable(hba); if (host->caps & UFS_MTK_CAP_DISABLE_AH8) { - spin_lock_irqsave(hba->host->host_lock, flags); ufshcd_writel(hba, 0, REG_AUTO_HIBERNATE_IDLE_TIMER); - spin_unlock_irqrestore(hba->host->host_lock, - flags); - hba->capabilities &= ~MASK_AUTO_HIBERN8_SUPPORT; hba->ahit = 0; } + + /* + * Turn on CLK_CG early to bypass abnormal ERR_CHK signal + * to prevent host hang issue + */ + ufshcd_writel(hba, + ufshcd_readl(hba, REG_UFS_XOUFS_CTRL) | 0x80, + REG_UFS_XOUFS_CTRL); } return 0; @@ -244,11 +282,12 @@ static int ufs_mtk_setup_ref_clk(struct ufs_hba *hba, bool on) if (host->ref_clk_enabled == on) return 0; + ufs_mtk_ref_clk_notify(on, PRE_CHANGE, res); + if (on) { - ufs_mtk_ref_clk_notify(on, res); - ufshcd_delay_us(host->ref_clk_ungating_wait_us, 10); ufshcd_writel(hba, REFCLK_REQUEST, REG_UFS_REFCLK_CTRL); } else { + ufshcd_delay_us(host->ref_clk_gating_wait_us, 10); ufshcd_writel(hba, REFCLK_RELEASE, REG_UFS_REFCLK_CTRL); } @@ -267,22 +306,22 @@ static int ufs_mtk_setup_ref_clk(struct ufs_hba *hba, bool on) dev_err(hba->dev, "missing ack of refclk req, reg: 0x%x\n", value); - ufs_mtk_ref_clk_notify(host->ref_clk_enabled, res); + ufs_mtk_ref_clk_notify(host->ref_clk_enabled, POST_CHANGE, res); return -ETIMEDOUT; out: host->ref_clk_enabled = on; - if (!on) { - ufshcd_delay_us(host->ref_clk_gating_wait_us, 10); - ufs_mtk_ref_clk_notify(on, res); - } + if (on) + ufshcd_delay_us(host->ref_clk_ungating_wait_us, 10); + + ufs_mtk_ref_clk_notify(on, POST_CHANGE, res); return 0; } static void ufs_mtk_setup_ref_clk_wait_us(struct ufs_hba *hba, - u16 gating_us, u16 ungating_us) + u16 gating_us) { struct ufs_mtk_host *host = ufshcd_get_variant(hba); @@ -293,7 +332,62 @@ static void ufs_mtk_setup_ref_clk_wait_us(struct ufs_hba *hba, host->ref_clk_gating_wait_us = gating_us; } - host->ref_clk_ungating_wait_us = ungating_us; + host->ref_clk_ungating_wait_us = REFCLK_DEFAULT_WAIT_US; +} + +static void ufs_mtk_dbg_sel(struct ufs_hba *hba) +{ + struct ufs_mtk_host *host = ufshcd_get_variant(hba); + + if (((host->ip_ver >> 16) & 0xFF) >= 0x36) { + ufshcd_writel(hba, 0x820820, REG_UFS_DEBUG_SEL); + ufshcd_writel(hba, 0x0, REG_UFS_DEBUG_SEL_B0); + ufshcd_writel(hba, 0x55555555, 
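These string tables are consumed later in ufs_mtk_event_notify() by walking the set bits of the reported error value. A minimal sketch of that decode; example_decode_err() is an illustrative helper:

#include <linux/bitops.h>
#include <linux/printk.h>

static void example_decode_err(unsigned long reg,
			       const char *const *strs, unsigned int nr)
{
	unsigned int bit;

	/* Visit only the bits that are set; strs[] maps bit -> text. */
	for_each_set_bit(bit, &reg, nr)
		pr_info("%s\n", strs[bit]);
}

A caller would pass, e.g., example_decode_err(val, ufs_uic_dl_err_str, ARRAY_SIZE(ufs_uic_dl_err_str)).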
REG_UFS_DEBUG_SEL_B1); + ufshcd_writel(hba, 0xaaaaaaaa, REG_UFS_DEBUG_SEL_B2); + ufshcd_writel(hba, 0xffffffff, REG_UFS_DEBUG_SEL_B3); + } else { + ufshcd_writel(hba, 0x20, REG_UFS_DEBUG_SEL); + } +} + +static void ufs_mtk_wait_idle_state(struct ufs_hba *hba, + unsigned long retry_ms) +{ + u64 timeout, time_checked; + u32 val, sm; + bool wait_idle; + + /* cannot use plain ktime_get() in suspend */ + timeout = ktime_get_mono_fast_ns() + retry_ms * 1000000UL; + + /* wait a specific time after check base */ + udelay(10); + wait_idle = false; + + do { + time_checked = ktime_get_mono_fast_ns(); + ufs_mtk_dbg_sel(hba); + val = ufshcd_readl(hba, REG_UFS_PROBE); + + sm = val & 0x1f; + + /* + * if state is in H8 enter and H8 enter confirm + * wait until return to idle state. + */ + if ((sm >= VS_HIB_ENTER) && (sm <= VS_HIB_EXIT)) { + wait_idle = true; + udelay(50); + continue; + } else if (!wait_idle) + break; + + if (wait_idle && (sm == VS_HCE_BASE)) + break; + } while (time_checked < timeout); + + if (wait_idle && sm != VS_HCE_BASE) + dev_info(hba->dev, "wait idle tmo: 0x%x\n", val); } static int ufs_mtk_wait_link_state(struct ufs_hba *hba, u32 state, @@ -305,7 +399,7 @@ static int ufs_mtk_wait_link_state(struct ufs_hba *hba, u32 state, timeout = ktime_add_ms(ktime_get(), max_wait_ms); do { time_checked = ktime_get(); - ufshcd_writel(hba, 0x20, REG_UFS_DEBUG_SEL); + ufs_mtk_dbg_sel(hba); val = ufshcd_readl(hba, REG_UFS_PROBE); val = val >> 28; @@ -347,8 +441,6 @@ static int ufs_mtk_mphy_power_on(struct ufs_hba *hba, bool on) if (ufs_mtk_is_va09_supported(hba)) { ufs_mtk_va09_pwr_ctrl(res, 0); ret = regulator_disable(host->reg_va09); - if (ret < 0) - goto out; } } out: @@ -524,20 +616,44 @@ static void ufs_mtk_init_host_caps(struct ufs_hba *hba) if (of_property_read_bool(np, "mediatek,ufs-broken-vcc")) host->caps |= UFS_MTK_CAP_BROKEN_VCC; + if (of_property_read_bool(np, "mediatek,ufs-pmc-via-fastauto")) + host->caps |= UFS_MTK_CAP_PMC_VIA_FASTAUTO; + dev_info(hba->dev, "caps: 0x%x", host->caps); } -static void ufs_mtk_scale_perf(struct ufs_hba *hba, bool up) +static void ufs_mtk_boost_pm_qos(struct ufs_hba *hba, bool boost) { struct ufs_mtk_host *host = ufshcd_get_variant(hba); - ufs_mtk_boost_crypt(hba, up); - ufs_mtk_setup_ref_clk(hba, up); + if (!host || !host->pm_qos_init) + return; - if (up) + cpu_latency_qos_update_request(&host->pm_qos_req, + boost ? 
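ufs_mtk_boost_pm_qos() above toggles a CPU latency constraint so deep idle states do not add wakeup latency while UFS traffic is hot. A sketch of the request lifecycle, with an illustrative static request object:

#include <linux/pm_qos.h>

static struct pm_qos_request example_req;

static void example_qos_init(void)
{
	/* Register with no constraint; must precede any update. */
	cpu_latency_qos_add_request(&example_req, PM_QOS_DEFAULT_VALUE);
}

static void example_qos_boost(bool boost)
{
	/* 0 pins CPUs out of deep idle; DEFAULT relaxes it again. */
	cpu_latency_qos_update_request(&example_req,
				       boost ? 0 : PM_QOS_DEFAULT_VALUE);
}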
0 : PM_QOS_DEFAULT_VALUE); +} + +static void ufs_mtk_scale_perf(struct ufs_hba *hba, bool scale_up) +{ + ufs_mtk_boost_crypt(hba, scale_up); + ufs_mtk_boost_pm_qos(hba, scale_up); +} + +static void ufs_mtk_pwr_ctrl(struct ufs_hba *hba, bool on) +{ + struct ufs_mtk_host *host = ufshcd_get_variant(hba); + + if (on) { phy_power_on(host->mphy); - else + ufs_mtk_setup_ref_clk(hba, on); + if (!ufshcd_is_clkscaling_supported(hba)) + ufs_mtk_scale_perf(hba, on); + } else { + if (!ufshcd_is_clkscaling_supported(hba)) + ufs_mtk_scale_perf(hba, on); + ufs_mtk_setup_ref_clk(hba, on); phy_power_off(host->mphy); + } } /** @@ -582,9 +698,9 @@ static int ufs_mtk_setup_clocks(struct ufs_hba *hba, bool on, } if (clk_pwr_off) - ufs_mtk_scale_perf(hba, false); + ufs_mtk_pwr_ctrl(hba, false); } else if (on && status == POST_CHANGE) { - ufs_mtk_scale_perf(hba, true); + ufs_mtk_pwr_ctrl(hba, true); } return ret; @@ -620,6 +736,113 @@ static u32 ufs_mtk_get_ufs_hci_version(struct ufs_hba *hba) return hba->ufs_version; } +/** + * ufs_mtk_init_clocks - Init mtk driver private clocks + * + * @hba: per adapter instance + */ +static void ufs_mtk_init_clocks(struct ufs_hba *hba) +{ + struct ufs_mtk_host *host = ufshcd_get_variant(hba); + struct list_head *head = &hba->clk_list_head; + struct ufs_mtk_clk *mclk = &host->mclk; + struct ufs_clk_info *clki, *clki_tmp; + + /* + * Find private clocks and store them in struct ufs_mtk_clk. + * Remove "ufs_sel_min_src" and "ufs_sel_min_src" from list to avoid + * being switched on/off in clock gating. + */ + list_for_each_entry_safe(clki, clki_tmp, head, list) { + if (!strcmp(clki->name, "ufs_sel")) { + host->mclk.ufs_sel_clki = clki; + } else if (!strcmp(clki->name, "ufs_sel_max_src")) { + host->mclk.ufs_sel_max_clki = clki; + clk_disable_unprepare(clki->clk); + list_del(&clki->list); + } else if (!strcmp(clki->name, "ufs_sel_min_src")) { + host->mclk.ufs_sel_min_clki = clki; + clk_disable_unprepare(clki->clk); + list_del(&clki->list); + } + } + + if (!mclk->ufs_sel_clki || !mclk->ufs_sel_max_clki || + !mclk->ufs_sel_min_clki) { + hba->caps &= ~UFSHCD_CAP_CLK_SCALING; + dev_info(hba->dev, + "%s: Clk-scaling not ready. 
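ufs_mtk_init_clocks() above deletes list nodes while iterating, which is only safe with the _safe iterator because it caches the next element before the loop body runs. A minimal sketch with placeholder example_* types:

#include <linux/list.h>
#include <linux/string.h>

struct example_clk {
	const char *name;
	struct list_head list;
};

static void example_prune(struct list_head *head, const char *victim)
{
	struct example_clk *clki, *tmp;

	/* tmp holds the successor, so list_del() on clki is safe. */
	list_for_each_entry_safe(clki, tmp, head, list) {
		if (!strcmp(clki->name, victim))
			list_del(&clki->list);
	}
}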
Feature disabled.", + __func__); + } +} + +#define MAX_VCC_NAME 30 +static int ufs_mtk_vreg_fix_vcc(struct ufs_hba *hba) +{ + struct ufs_vreg_info *info = &hba->vreg_info; + struct device_node *np = hba->dev->of_node; + struct device *dev = hba->dev; + char vcc_name[MAX_VCC_NAME]; + struct arm_smccc_res res; + int err, ver; + + if (hba->vreg_info.vcc) + return 0; + + if (of_property_read_bool(np, "mediatek,ufs-vcc-by-num")) { + ufs_mtk_get_vcc_num(res); + if (res.a1 > UFS_VCC_NONE && res.a1 < UFS_VCC_MAX) + snprintf(vcc_name, MAX_VCC_NAME, "vcc-opt%lu", res.a1); + else + return -ENODEV; + } else if (of_property_read_bool(np, "mediatek,ufs-vcc-by-ver")) { + ver = (hba->dev_info.wspecversion & 0xF00) >> 8; + snprintf(vcc_name, MAX_VCC_NAME, "vcc-ufs%u", ver); + } else { + return 0; + } + + err = ufshcd_populate_vreg(dev, vcc_name, &info->vcc); + if (err) + return err; + + err = ufshcd_get_vreg(dev, info->vcc); + if (err) + return err; + + err = regulator_enable(info->vcc->reg); + if (!err) { + info->vcc->enabled = true; + dev_info(dev, "%s: %s enabled\n", __func__, vcc_name); + } + + return err; +} + +static void ufs_mtk_vreg_fix_vccqx(struct ufs_hba *hba) +{ + struct ufs_vreg_info *info = &hba->vreg_info; + struct ufs_vreg **vreg_on, **vreg_off; + + if (hba->dev_info.wspecversion >= 0x0300) { + vreg_on = &info->vccq; + vreg_off = &info->vccq2; + } else { + vreg_on = &info->vccq2; + vreg_off = &info->vccq; + } + + if (*vreg_on) + (*vreg_on)->always_on = true; + + if (*vreg_off) { + regulator_disable((*vreg_off)->reg); + devm_kfree(hba->dev, (*vreg_off)->name); + devm_kfree(hba->dev, *vreg_off); + *vreg_off = NULL; + } +} + /** * ufs_mtk_init - find other essential mmio bases * @hba: host controller instance @@ -673,12 +896,18 @@ static int ufs_mtk_init(struct ufs_hba *hba) /* Enable WriteBooster */ hba->caps |= UFSHCD_CAP_WB_EN; + + /* Enable clk scaling*/ + hba->caps |= UFSHCD_CAP_CLK_SCALING; + hba->quirks |= UFSHCI_QUIRK_SKIP_MANUAL_WB_FLUSH_CTRL; hba->vps->wb_flush_threshold = UFS_WB_BUF_REMAIN_PERCENT(80); if (host->caps & UFS_MTK_CAP_DISABLE_AH8) hba->caps |= UFSHCD_CAP_HIBERN8_WITH_CLK_GATING; + ufs_mtk_init_clocks(hba); + /* * ufshcd_vops_init() is invoked after * ufshcd_setup_clock(true) in ufshcd_hba_init() thus @@ -689,6 +918,12 @@ static int ufs_mtk_init(struct ufs_hba *hba) ufs_mtk_mphy_power_on(hba, true); ufs_mtk_setup_clocks(hba, true, POST_CHANGE); + host->ip_ver = ufshcd_readl(hba, REG_UFS_MTK_IP_VER); + + /* Initialize pm-qos request */ + cpu_latency_qos_add_request(&host->pm_qos_req, PM_QOS_DEFAULT_VALUE); + host->pm_qos_init = true; + goto out; out_variant_clear: @@ -697,6 +932,26 @@ out: return err; } +static bool ufs_mtk_pmc_via_fastauto(struct ufs_hba *hba, + struct ufs_pa_layer_attr *dev_req_params) +{ + if (!ufs_mtk_is_pmc_via_fastauto(hba)) + return false; + + if (dev_req_params->hs_rate == hba->pwr_info.hs_rate) + return false; + + if (dev_req_params->pwr_tx != FAST_MODE && + dev_req_params->gear_tx < UFS_HS_G4) + return false; + + if (dev_req_params->pwr_rx != FAST_MODE && + dev_req_params->gear_rx < UFS_HS_G4) + return false; + + return true; +} + static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, struct ufs_pa_layer_attr *dev_max_params, struct ufs_pa_layer_attr *dev_req_params) @@ -706,8 +961,8 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, int ret; ufshcd_init_pwr_dev_param(&host_cap); - host_cap.hs_rx_gear = UFS_HS_G4; - host_cap.hs_tx_gear = UFS_HS_G4; + host_cap.hs_rx_gear = UFS_HS_G5; + host_cap.hs_tx_gear = UFS_HS_G5; ret = 
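ufs_mtk_vreg_fix_vcc() above derives a supply name from the device's wspecversion word. A worked example of the decode; the buffer handling mirrors the hunk, and the version values are illustrative:

#include <linux/kernel.h>

/* wspecversion is BCD-like: 0x0300 means UFS 3.0, so the major
 * version lives in bits [11:8].
 *
 *   (0x0300 & 0xF00) >> 8 = 3  ->  "vcc-ufs3"
 *   (0x0210 & 0xF00) >> 8 = 2  ->  "vcc-ufs2"
 */
static void example_vcc_name(char *buf, size_t len, u16 wspecversion)
{
	unsigned int ver = (wspecversion & 0xF00) >> 8;

	snprintf(buf, len, "vcc-ufs%u", ver);
}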
ufshcd_get_pwr_dev_param(&host_cap, dev_max_params, @@ -717,6 +972,32 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, __func__); } + if (ufs_mtk_pmc_via_fastauto(hba, dev_req_params)) { + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXTERMINATION), true); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXGEAR), UFS_HS_G1); + + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXTERMINATION), true); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXGEAR), UFS_HS_G1); + + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_ACTIVETXDATALANES), + dev_req_params->lane_tx); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_ACTIVERXDATALANES), + dev_req_params->lane_rx); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_HSSERIES), + dev_req_params->hs_rate); + + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXHSADAPTTYPE), + PA_NO_ADAPT); + + ret = ufshcd_uic_change_pwr_mode(hba, + FASTAUTO_MODE << 4 | FASTAUTO_MODE); + + if (ret) { + dev_err(hba->dev, "%s: HSG1B FASTAUTO failed ret=%d\n", + __func__, ret); + } + } + if (host->hw_ver.major >= 3) { ret = ufshcd_dme_configure_adapt(hba, dev_req_params->gear_tx, @@ -802,7 +1083,6 @@ static int ufs_mtk_pre_link(struct ufs_hba *hba) static void ufs_mtk_setup_clk_gating(struct ufs_hba *hba) { - unsigned long flags; u32 ah_ms; if (ufshcd_is_clkgating_allowed(hba)) { @@ -811,13 +1091,11 @@ static void ufs_mtk_setup_clk_gating(struct ufs_hba *hba) hba->ahit); else ah_ms = 10; - spin_lock_irqsave(hba->host->host_lock, flags); - hba->clk_gating.delay_ms = ah_ms + 5; - spin_unlock_irqrestore(hba->host->host_lock, flags); + ufshcd_clkgate_delay_set(hba->dev, ah_ms + 5); } } -static int ufs_mtk_post_link(struct ufs_hba *hba) +static void ufs_mtk_post_link(struct ufs_hba *hba) { /* enable unipro clock gating feature */ ufs_mtk_cfg_unipro_cg(hba, true); @@ -828,8 +1106,6 @@ static int ufs_mtk_post_link(struct ufs_hba *hba) FIELD_PREP(UFSHCI_AHIBERN8_SCALE_MASK, 3); ufs_mtk_setup_clk_gating(hba); - - return 0; } static int ufs_mtk_link_startup_notify(struct ufs_hba *hba, @@ -842,7 +1118,7 @@ static int ufs_mtk_link_startup_notify(struct ufs_hba *hba, ret = ufs_mtk_pre_link(hba); break; case POST_CHANGE: - ret = ufs_mtk_post_link(hba); + ufs_mtk_post_link(hba); break; default: ret = -EINVAL; @@ -909,6 +1185,11 @@ static int ufs_mtk_link_set_lpm(struct ufs_hba *hba) { int err; + /* Disable reset confirm feature by UniPro */ + ufshcd_writel(hba, + (ufshcd_readl(hba, REG_UFS_XOUFS_CTRL) & ~0x100), + REG_UFS_XOUFS_CTRL); + err = ufs_mtk_unipro_set_lpm(hba, true); if (err) { /* Resume UniPro state for following error recovery */ @@ -919,24 +1200,81 @@ static int ufs_mtk_link_set_lpm(struct ufs_hba *hba) return 0; } -static void ufs_mtk_vreg_set_lpm(struct ufs_hba *hba, bool lpm) +static void ufs_mtk_vccqx_set_lpm(struct ufs_hba *hba, bool lpm) { - if (!hba->vreg_info.vccq2 || !hba->vreg_info.vcc) - return; + struct ufs_vreg *vccqx = NULL; - if (lpm && !hba->vreg_info.vcc->enabled) - regulator_set_mode(hba->vreg_info.vccq2->reg, - REGULATOR_MODE_IDLE); - else if (!lpm) - regulator_set_mode(hba->vreg_info.vccq2->reg, - REGULATOR_MODE_NORMAL); + if (hba->vreg_info.vccq) + vccqx = hba->vreg_info.vccq; + else + vccqx = hba->vreg_info.vccq2; + + regulator_set_mode(vccqx->reg, + lpm ? 
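The FASTAUTO request above packs two nibbles into the mode argument of ufshcd_uic_change_pwr_mode(); going by how the core composes this value (pwr_rx << 4 | pwr_tx), RX sits in bits [7:4] and TX in bits [3:0]. A one-line helper spelling that out; example_pack_pwr_mode() is illustrative:

static inline u32 example_pack_pwr_mode(u32 rx_mode, u32 tx_mode)
{
	/* FASTAUTO_MODE << 4 | FASTAUTO_MODE thus requests FASTAUTO
	 * in both directions at once.
	 */
	return (rx_mode << 4) | tx_mode;
}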
REGULATOR_MODE_IDLE : REGULATOR_MODE_NORMAL); } -static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) +static void ufs_mtk_vsx_set_lpm(struct ufs_hba *hba, bool lpm) +{ + struct arm_smccc_res res; + + ufs_mtk_device_pwr_ctrl(!lpm, + (unsigned long)hba->dev_info.wspecversion, + res); +} + +static void ufs_mtk_dev_vreg_set_lpm(struct ufs_hba *hba, bool lpm) +{ + if (!hba->vreg_info.vccq && !hba->vreg_info.vccq2) + return; + + /* Skip if VCC is assumed always-on */ + if (!hba->vreg_info.vcc) + return; + + /* Bypass LPM when device is still active */ + if (lpm && ufshcd_is_ufs_dev_active(hba)) + return; + + /* Bypass LPM if VCC is enabled */ + if (lpm && hba->vreg_info.vcc->enabled) + return; + + if (lpm) { + ufs_mtk_vccqx_set_lpm(hba, lpm); + ufs_mtk_vsx_set_lpm(hba, lpm); + } else { + ufs_mtk_vsx_set_lpm(hba, lpm); + ufs_mtk_vccqx_set_lpm(hba, lpm); + } +} + +static void ufs_mtk_auto_hibern8_disable(struct ufs_hba *hba) +{ + int ret; + + /* disable auto-hibern8 */ + ufshcd_writel(hba, 0, REG_AUTO_HIBERNATE_IDLE_TIMER); + + /* wait host return to idle state when auto-hibern8 off */ + ufs_mtk_wait_idle_state(hba, 5); + + ret = ufs_mtk_wait_link_state(hba, VS_LINK_UP, 100); + if (ret) + dev_warn(hba->dev, "exit h8 state fail, ret=%d\n", ret); +} + +static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, + enum ufs_notify_change_status status) { int err; struct arm_smccc_res res; + if (status == PRE_CHANGE) { + if (ufshcd_is_auto_hibern8_supported(hba)) + ufs_mtk_auto_hibern8_disable(hba); + return 0; + } + if (ufshcd_is_link_hibern8(hba)) { err = ufs_mtk_link_set_lpm(hba); if (err) @@ -957,6 +1295,8 @@ static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) if (ufshcd_is_link_off(hba)) ufs_mtk_device_reset_ctrl(0, res); + ufs_mtk_host_pwr_ctrl(HOST_PWR_HCI, false, res); + return 0; fail: /* @@ -971,9 +1311,12 @@ fail: static int ufs_mtk_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) { int err; + struct arm_smccc_res res; if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) - ufs_mtk_vreg_set_lpm(hba, false); + ufs_mtk_dev_vreg_set_lpm(hba, false); + + ufs_mtk_host_pwr_ctrl(HOST_PWR_HCI, true, res); err = ufs_mtk_mphy_power_on(hba, true); if (err) @@ -992,16 +1335,19 @@ fail: static void ufs_mtk_dbg_register_dump(struct ufs_hba *hba) { - ufshcd_dump_regs(hba, REG_UFS_REFCLK_CTRL, 0x4, "Ref-Clk Ctrl "); + /* Dump ufshci register 0x140 ~ 0x14C */ + ufshcd_dump_regs(hba, REG_UFS_XOUFS_CTRL, 0x10, + "XOUFS Ctrl (0x140): "); ufshcd_dump_regs(hba, REG_UFS_EXTREG, 0x4, "Ext Reg "); + /* Dump ufshci register 0x2200 ~ 0x22AC */ ufshcd_dump_regs(hba, REG_UFS_MPHYCTRL, REG_UFS_REJECT_MON - REG_UFS_MPHYCTRL + 4, - "MPHY Ctrl "); + "MPHY Ctrl (0x2200): "); /* Direct debugging information to REG_MTK_PROBE */ - ufshcd_writel(hba, 0x20, REG_UFS_DEBUG_SEL); + ufs_mtk_dbg_sel(hba); ufshcd_dump_regs(hba, REG_UFS_PROBE, 0x4, "Debug Probe "); } @@ -1010,8 +1356,10 @@ static int ufs_mtk_apply_dev_quirks(struct ufs_hba *hba) struct ufs_dev_info *dev_info = &hba->dev_info; u16 mid = dev_info->wmanufacturerid; - if (mid == UFS_VENDOR_SAMSUNG) + if (mid == UFS_VENDOR_SAMSUNG) { ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TACTIVATE), 6); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_HIBERN8TIME), 10); + } /* * Decide waiting time before gating reference clock and @@ -1019,12 +1367,14 @@ static int ufs_mtk_apply_dev_quirks(struct ufs_hba *hba) * requirements. 
*/ if (mid == UFS_VENDOR_SAMSUNG) - ufs_mtk_setup_ref_clk_wait_us(hba, 1, 1); + ufs_mtk_setup_ref_clk_wait_us(hba, 1); else if (mid == UFS_VENDOR_SKHYNIX) - ufs_mtk_setup_ref_clk_wait_us(hba, 30, 30); + ufs_mtk_setup_ref_clk_wait_us(hba, 30); else if (mid == UFS_VENDOR_TOSHIBA) - ufs_mtk_setup_ref_clk_wait_us(hba, 100, 32); - + ufs_mtk_setup_ref_clk_wait_us(hba, 100); + else + ufs_mtk_setup_ref_clk_wait_us(hba, + REFCLK_DEFAULT_WAIT_US); return 0; } @@ -1042,14 +1392,110 @@ static void ufs_mtk_fixup_dev_quirks(struct ufs_hba *hba) hba->dev_quirks &= ~(UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM | UFS_DEVICE_QUIRK_DELAY_AFTER_LPM); } + + ufs_mtk_vreg_fix_vcc(hba); + ufs_mtk_vreg_fix_vccqx(hba); } static void ufs_mtk_event_notify(struct ufs_hba *hba, enum ufs_event_type evt, void *data) { unsigned int val = *(u32 *)data; + unsigned long reg; + u8 bit; trace_ufs_mtk_event(evt, val); + + /* Print details of UIC Errors */ + if (evt <= UFS_EVT_DME_ERR) { + dev_info(hba->dev, + "Host UIC Error Code (%s): %08x\n", + ufs_uic_err_str[evt], val); + reg = val; + } + + if (evt == UFS_EVT_PA_ERR) { + for_each_set_bit(bit, ®, ARRAY_SIZE(ufs_uic_pa_err_str)) + dev_info(hba->dev, "%s\n", ufs_uic_pa_err_str[bit]); + } + + if (evt == UFS_EVT_DL_ERR) { + for_each_set_bit(bit, ®, ARRAY_SIZE(ufs_uic_dl_err_str)) + dev_info(hba->dev, "%s\n", ufs_uic_dl_err_str[bit]); + } +} + +static void ufs_mtk_config_scaling_param(struct ufs_hba *hba, + struct devfreq_dev_profile *profile, + struct devfreq_simple_ondemand_data *data) +{ + /* Customize min gear in clk scaling */ + hba->clk_scaling.min_gear = UFS_HS_G4; + + hba->vps->devfreq_profile.polling_ms = 200; + hba->vps->ondemand_data.upthreshold = 50; + hba->vps->ondemand_data.downdifferential = 20; +} + +/** + * ufs_mtk_clk_scale - Internal clk scaling operation + * + * MTK platform supports clk scaling by switching parent of ufs_sel(mux). + * The ufs_sel downstream to ufs_ck which feeds directly to UFS hardware. + * Max and min clocks rate of ufs_sel defined in dts should match rate of + * "ufs_sel_max_src" and "ufs_sel_min_src" respectively. + * This prevent changing rate of pll clock that is shared between modules. 
+ * + * @hba: per adapter instance + * @scale_up: True for scaling up and false for scaling down + */ +static void ufs_mtk_clk_scale(struct ufs_hba *hba, bool scale_up) +{ + struct ufs_mtk_host *host = ufshcd_get_variant(hba); + struct ufs_mtk_clk *mclk = &host->mclk; + struct ufs_clk_info *clki = mclk->ufs_sel_clki; + int ret = 0; + + ret = clk_prepare_enable(clki->clk); + if (ret) { + dev_info(hba->dev, + "clk_prepare_enable() fail, ret: %d\n", ret); + return; + } + + if (scale_up) { + ret = clk_set_parent(clki->clk, mclk->ufs_sel_max_clki->clk); + clki->curr_freq = clki->max_freq; + } else { + ret = clk_set_parent(clki->clk, mclk->ufs_sel_min_clki->clk); + clki->curr_freq = clki->min_freq; + } + + if (ret) { + dev_info(hba->dev, + "Failed to set ufs_sel_clki, ret: %d\n", ret); + } + + clk_disable_unprepare(clki->clk); + + trace_ufs_mtk_clk_scale(clki->name, scale_up, clk_get_rate(clki->clk)); +} + +static int ufs_mtk_clk_scale_notify(struct ufs_hba *hba, bool scale_up, + enum ufs_notify_change_status status) +{ + if (!ufshcd_is_clkscaling_supported(hba)) + return 0; + + if (status == PRE_CHANGE) { + /* Switch parent before clk_set_rate() */ + ufs_mtk_clk_scale(hba, scale_up); + } else { + /* Request interrupt latency QoS accordingly */ + ufs_mtk_scale_perf(hba, scale_up); + } + + return 0; } /* @@ -1073,6 +1519,8 @@ static const struct ufs_hba_variant_ops ufs_hba_mtk_vops = { .dbg_register_dump = ufs_mtk_dbg_register_dump, .device_reset = ufs_mtk_device_reset, .event_notify = ufs_mtk_event_notify, + .config_scaling_param = ufs_mtk_config_scaling_param, + .clk_scale_notify = ufs_mtk_clk_scale_notify, }; /** @@ -1102,6 +1550,7 @@ static int ufs_mtk_probe(struct platform_device *pdev) } link = device_link_add(dev, &reset_pdev->dev, DL_FLAG_AUTOPROBE_CONSUMER); + put_device(&reset_pdev->dev); if (!link) { dev_notice(dev, "add reset device_link fail\n"); goto skip_reset; @@ -1140,7 +1589,7 @@ static int ufs_mtk_remove(struct platform_device *pdev) } #ifdef CONFIG_PM_SLEEP -int ufs_mtk_system_suspend(struct device *dev) +static int ufs_mtk_system_suspend(struct device *dev) { struct ufs_hba *hba = dev_get_drvdata(dev); int ret; @@ -1149,22 +1598,22 @@ int ufs_mtk_system_suspend(struct device *dev) if (ret) return ret; - ufs_mtk_vreg_set_lpm(hba, true); + ufs_mtk_dev_vreg_set_lpm(hba, true); return 0; } -int ufs_mtk_system_resume(struct device *dev) +static int ufs_mtk_system_resume(struct device *dev) { struct ufs_hba *hba = dev_get_drvdata(dev); - ufs_mtk_vreg_set_lpm(hba, false); + ufs_mtk_dev_vreg_set_lpm(hba, false); return ufshcd_system_resume(dev); } #endif -int ufs_mtk_runtime_suspend(struct device *dev) +static int ufs_mtk_runtime_suspend(struct device *dev) { struct ufs_hba *hba = dev_get_drvdata(dev); int ret = 0; @@ -1173,16 +1622,16 @@ int ufs_mtk_runtime_suspend(struct device *dev) if (ret) return ret; - ufs_mtk_vreg_set_lpm(hba, true); + ufs_mtk_dev_vreg_set_lpm(hba, true); return 0; } -int ufs_mtk_runtime_resume(struct device *dev) +static int ufs_mtk_runtime_resume(struct device *dev) { struct ufs_hba *hba = dev_get_drvdata(dev); - ufs_mtk_vreg_set_lpm(hba, false); + ufs_mtk_dev_vreg_set_lpm(hba, false); return ufshcd_runtime_resume(dev); } diff --git a/drivers/ufs/host/ufs-mediatek.h b/drivers/ufs/host/ufs-mediatek.h new file mode 100644 index 000000000000..2fc6d7b87694 --- /dev/null +++ b/drivers/ufs/host/ufs-mediatek.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 MediaTek Inc. 
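The probe hunk above adds put_device() right after device_link_add(), dropping the reference the earlier device lookup took once the link holds its own. A sketch of the full pattern; the of_find_device_by_node() lookup is an assumption here, since only the put side appears in this hunk:

#include <linux/device.h>
#include <linux/of_platform.h>

static int example_link_reset_dev(struct device *consumer,
				  struct device_node *np)
{
	struct platform_device *pdev = of_find_device_by_node(np);
	struct device_link *link;

	if (!pdev)
		return -ENODEV;

	link = device_link_add(consumer, &pdev->dev,
			       DL_FLAG_AUTOPROBE_CONSUMER);
	put_device(&pdev->dev);	/* the link, if created, pins the supplier */

	return link ? 0 : -EINVAL;
}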
+ */ + +#ifndef _UFS_MEDIATEK_H +#define _UFS_MEDIATEK_H + +#include +#include +#include + +/* + * Vendor specific UFSHCI Registers + */ +#define REG_UFS_XOUFS_CTRL 0x140 +#define REG_UFS_REFCLK_CTRL 0x144 +#define REG_UFS_EXTREG 0x2100 +#define REG_UFS_MPHYCTRL 0x2200 +#define REG_UFS_MTK_IP_VER 0x2240 +#define REG_UFS_REJECT_MON 0x22AC +#define REG_UFS_DEBUG_SEL 0x22C0 +#define REG_UFS_PROBE 0x22C8 +#define REG_UFS_DEBUG_SEL_B0 0x22D0 +#define REG_UFS_DEBUG_SEL_B1 0x22D4 +#define REG_UFS_DEBUG_SEL_B2 0x22D8 +#define REG_UFS_DEBUG_SEL_B3 0x22DC + +/* + * Ref-clk control + * + * Values for register REG_UFS_REFCLK_CTRL + */ +#define REFCLK_RELEASE 0x0 +#define REFCLK_REQUEST BIT(0) +#define REFCLK_ACK BIT(1) + +#define REFCLK_REQ_TIMEOUT_US 3000 +#define REFCLK_DEFAULT_WAIT_US 32 + +/* + * Other attributes + */ +#define VS_DEBUGCLOCKENABLE 0xD0A1 +#define VS_SAVEPOWERCONTROL 0xD0A6 +#define VS_UNIPROPOWERDOWNCONTROL 0xD0A8 + +/* + * Vendor specific link state + */ +enum { + VS_LINK_DISABLED = 0, + VS_LINK_DOWN = 1, + VS_LINK_UP = 2, + VS_LINK_HIBERN8 = 3, + VS_LINK_LOST = 4, + VS_LINK_CFG = 5, +}; + +/* + * Vendor specific host controller state + */ +enum { + VS_HCE_RESET = 0, + VS_HCE_BASE = 1, + VS_HCE_OOCPR_WAIT = 2, + VS_HCE_DME_RESET = 3, + VS_HCE_MIDDLE = 4, + VS_HCE_DME_ENABLE = 5, + VS_HCE_DEFAULTS = 6, + VS_HIB_IDLEEN = 7, + VS_HIB_ENTER = 8, + VS_HIB_ENTER_CONF = 9, + VS_HIB_MIDDLE = 10, + VS_HIB_WAITTIMER = 11, + VS_HIB_EXIT_CONF = 12, + VS_HIB_EXIT = 13, +}; + +/* + * SiP commands + */ +#define MTK_SIP_UFS_CONTROL MTK_SIP_SMC_CMD(0x276) +#define UFS_MTK_SIP_VA09_PWR_CTRL BIT(0) +#define UFS_MTK_SIP_DEVICE_RESET BIT(1) +#define UFS_MTK_SIP_CRYPTO_CTRL BIT(2) +#define UFS_MTK_SIP_REF_CLK_NOTIFICATION BIT(3) +#define UFS_MTK_SIP_HOST_PWR_CTRL BIT(5) +#define UFS_MTK_SIP_GET_VCC_NUM BIT(6) +#define UFS_MTK_SIP_DEVICE_PWR_CTRL BIT(7) + +/* + * VS_DEBUGCLOCKENABLE + */ +enum { + TX_SYMBOL_CLK_REQ_FORCE = 5, +}; + +/* + * VS_SAVEPOWERCONTROL + */ +enum { + RX_SYMBOL_CLK_GATE_EN = 0, + SYS_CLK_GATE_EN = 2, + TX_CLK_GATE_EN = 3, +}; + +/* + * Host capability + */ +enum ufs_mtk_host_caps { + UFS_MTK_CAP_BOOST_CRYPT_ENGINE = 1 << 0, + UFS_MTK_CAP_VA09_PWR_CTRL = 1 << 1, + UFS_MTK_CAP_DISABLE_AH8 = 1 << 2, + UFS_MTK_CAP_BROKEN_VCC = 1 << 3, + UFS_MTK_CAP_PMC_VIA_FASTAUTO = 1 << 6, +}; + +struct ufs_mtk_crypt_cfg { + struct regulator *reg_vcore; + struct clk *clk_crypt_perf; + struct clk *clk_crypt_mux; + struct clk *clk_crypt_lp; + int vcore_volt; +}; + +struct ufs_mtk_clk { + struct ufs_clk_info *ufs_sel_clki; /* Mux */ + struct ufs_clk_info *ufs_sel_max_clki; /* Max src */ + struct ufs_clk_info *ufs_sel_min_clki; /* Min src */ +}; + +struct ufs_mtk_hw_ver { + u8 step; + u8 minor; + u8 major; +}; + +struct ufs_mtk_host { + struct phy *mphy; + struct pm_qos_request pm_qos_req; + struct regulator *reg_va09; + struct reset_control *hci_reset; + struct reset_control *unipro_reset; + struct reset_control *crypto_reset; + struct ufs_hba *hba; + struct ufs_mtk_crypt_cfg *crypt; + struct ufs_mtk_clk mclk; + struct ufs_mtk_hw_ver hw_ver; + enum ufs_mtk_host_caps caps; + bool mphy_powered_on; + bool pm_qos_init; + bool unipro_lpm; + bool ref_clk_enabled; + u16 ref_clk_ungating_wait_us; + u16 ref_clk_gating_wait_us; + u32 ip_ver; +}; + +/* + * Multi-VCC by Numbering + */ +enum ufs_mtk_vcc_num { + UFS_VCC_NONE = 0, + UFS_VCC_1, + UFS_VCC_2, + UFS_VCC_MAX +}; + +/* + * Host Power Control options + */ +enum { + HOST_PWR_HCI = 0, + HOST_PWR_MPHY +}; + +/* + * SMC call wrapper function + */ +struct 
ufs_mtk_smc_arg { + unsigned long cmd; + struct arm_smccc_res *res; + unsigned long v1; + unsigned long v2; + unsigned long v3; + unsigned long v4; + unsigned long v5; + unsigned long v6; + unsigned long v7; +}; + +static void _ufs_mtk_smc(struct ufs_mtk_smc_arg s) +{ + arm_smccc_smc(MTK_SIP_UFS_CONTROL, + s.cmd, s.v1, s.v2, s.v3, s.v4, s.v5, s.v6, s.res); +} + +#define ufs_mtk_smc(...) \ + _ufs_mtk_smc((struct ufs_mtk_smc_arg) {__VA_ARGS__}) + +/* + * SMC call interface + */ +#define ufs_mtk_va09_pwr_ctrl(res, on) \ + ufs_mtk_smc(UFS_MTK_SIP_VA09_PWR_CTRL, &(res), on) + +#define ufs_mtk_crypto_ctrl(res, enable) \ + ufs_mtk_smc(UFS_MTK_SIP_CRYPTO_CTRL, &(res), enable) + +#define ufs_mtk_ref_clk_notify(on, stage, res) \ + ufs_mtk_smc(UFS_MTK_SIP_REF_CLK_NOTIFICATION, &(res), on, stage) + +#define ufs_mtk_device_reset_ctrl(high, res) \ + ufs_mtk_smc(UFS_MTK_SIP_DEVICE_RESET, &(res), high) + +#define ufs_mtk_host_pwr_ctrl(opt, on, res) \ + ufs_mtk_smc(UFS_MTK_SIP_HOST_PWR_CTRL, &(res), opt, on) + +#define ufs_mtk_get_vcc_num(res) \ + ufs_mtk_smc(UFS_MTK_SIP_GET_VCC_NUM, &(res)) + +#define ufs_mtk_device_pwr_ctrl(on, ufs_ver, res) \ + ufs_mtk_smc(UFS_MTK_SIP_DEVICE_PWR_CTRL, &(res), on, ufs_ver) + +#endif /* !_UFS_MEDIATEK_H */ diff --git a/drivers/scsi/ufs/ufs-qcom-ice.c b/drivers/ufs/host/ufs-qcom-ice.c similarity index 98% rename from drivers/scsi/ufs/ufs-qcom-ice.c rename to drivers/ufs/host/ufs-qcom-ice.c index bbb0ad7590ec..62387ccd5b30 100644 --- a/drivers/scsi/ufs/ufs-qcom-ice.c +++ b/drivers/ufs/host/ufs-qcom-ice.c @@ -6,10 +6,10 @@ * Copyright 2019 Google LLC */ +#include #include #include -#include "ufshcd-crypto.h" #include "ufs-qcom.h" #define AES_256_XTS_KEY_SIZE 64 @@ -118,7 +118,6 @@ int ufs_qcom_ice_init(struct ufs_qcom_host *host) host->ice_mmio = devm_ioremap_resource(dev, res); if (IS_ERR(host->ice_mmio)) { err = PTR_ERR(host->ice_mmio); - dev_err(dev, "Failed to map ICE registers; err=%d\n", err); return err; } diff --git a/drivers/scsi/ufs/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c similarity index 82% rename from drivers/scsi/ufs/ufs-qcom.c rename to drivers/ufs/host/ufs-qcom.c index f810b99ef5c5..152fff78b46f 100644 --- a/drivers/scsi/ufs/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -5,6 +5,9 @@ #include #include +#include +#include +#include #include #include #include @@ -12,14 +15,18 @@ #include #include -#include "ufshcd.h" +#include #include "ufshcd-pltfrm.h" -#include "unipro.h" +#include #include "ufs-qcom.h" -#include "ufshci.h" -#include "ufs_quirks.h" -#define UFS_QCOM_DEFAULT_DBG_PRINT_EN \ - (UFS_QCOM_DBG_PRINT_REGS_EN | UFS_QCOM_DBG_PRINT_TEST_BUS_EN) +#include +#include + +#define MCQ_QCFGPTR_MASK GENMASK(7, 0) +#define MCQ_QCFGPTR_UNIT 0x200 +#define MCQ_SQATTR_OFFSET(c) \ + ((((c) >> 16) & MCQ_QCFGPTR_MASK) * MCQ_QCFGPTR_UNIT) +#define MCQ_QCFG_SIZE 0x40 enum { TSTBUS_UAWM, @@ -48,25 +55,6 @@ static struct ufs_qcom_host *rcdev_to_ufs_host(struct reset_controller_dev *rcd) return container_of(rcd, struct ufs_qcom_host, rcdev); } -static void ufs_qcom_dump_regs_wrapper(struct ufs_hba *hba, int offset, int len, - const char *prefix, void *priv) -{ - ufshcd_dump_regs(hba, offset, len * 4, prefix); -} - -static int ufs_qcom_get_connected_tx_lanes(struct ufs_hba *hba, u32 *tx_lanes) -{ - int err = 0; - - err = ufshcd_dme_get(hba, - UIC_ARG_MIB(PA_CONNECTEDTXDATALANES), tx_lanes); - if (err) - dev_err(hba->dev, "%s: couldn't read PA_CONNECTEDTXDATALANES %d\n", - __func__, err); - - return err; -} - static int ufs_qcom_host_clk_get(struct device *dev, const char 
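The ufs_mtk_smc() wrapper above leans on a C99 compound literal plus __VA_ARGS__: members not named in the literal are zero-initialized, so trailing SMC arguments default to zero. A standalone miniature of the trick with illustrative example_* names:

#include <linux/printk.h>

struct example_arg {
	unsigned long cmd;
	unsigned long v1;
	unsigned long v2;
};

static void _example_call(struct example_arg a)
{
	pr_info("cmd=%lu v1=%lu v2=%lu\n", a.cmd, a.v1, a.v2);
}

/* Callers may pass any prefix of the argument list:
 *
 *   example_call(1);		// cmd=1, v1=0, v2=0
 *   example_call(1, 2, 3);	// cmd=1, v1=2, v2=3
 */
#define example_call(...) \
	_example_call((struct example_arg){ __VA_ARGS__ })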
*name, struct clk **clk_out, bool optional) { @@ -119,7 +107,7 @@ static void ufs_qcom_disable_lane_clks(struct ufs_qcom_host *host) static int ufs_qcom_enable_lane_clks(struct ufs_qcom_host *host) { - int err = 0; + int err; struct device *dev = host->hba->dev; if (host->is_lane_clks_enabled) @@ -128,7 +116,7 @@ static int ufs_qcom_enable_lane_clks(struct ufs_qcom_host *host) err = ufs_qcom_host_clk_enable(dev, "rx_lane0_sync_clk", host->rx_l0_sync_clk); if (err) - goto out; + return err; err = ufs_qcom_host_clk_enable(dev, "tx_lane0_sync_clk", host->tx_l0_sync_clk); @@ -146,7 +134,8 @@ static int ufs_qcom_enable_lane_clks(struct ufs_qcom_host *host) goto disable_rx_l1; host->is_lane_clks_enabled = true; - goto out; + + return 0; disable_rx_l1: clk_disable_unprepare(host->rx_l1_sync_clk); @@ -154,7 +143,7 @@ disable_tx_l0: clk_disable_unprepare(host->tx_l0_sync_clk); disable_rx_l0: clk_disable_unprepare(host->rx_l0_sync_clk); -out: + return err; } @@ -169,32 +158,25 @@ static int ufs_qcom_init_lane_clks(struct ufs_qcom_host *host) err = ufs_qcom_host_clk_get(dev, "rx_lane0_sync_clk", &host->rx_l0_sync_clk, false); if (err) - goto out; + return err; err = ufs_qcom_host_clk_get(dev, "tx_lane0_sync_clk", &host->tx_l0_sync_clk, false); if (err) - goto out; + return err; /* In case of single lane per direction, don't read lane1 clocks */ if (host->hba->lanes_per_direction > 1) { err = ufs_qcom_host_clk_get(dev, "rx_lane1_sync_clk", &host->rx_l1_sync_clk, false); if (err) - goto out; + return err; err = ufs_qcom_host_clk_get(dev, "tx_lane1_sync_clk", &host->tx_l1_sync_clk, true); } -out: - return err; -} -static int ufs_qcom_link_startup_post_change(struct ufs_hba *hba) -{ - u32 tx_lanes; - - return ufs_qcom_get_connected_tx_lanes(hba, &tx_lanes); + return 0; } static int ufs_qcom_check_hibern8(struct ufs_hba *hba) @@ -242,6 +224,10 @@ static void ufs_qcom_select_unipro_mode(struct ufs_qcom_host *host) ufshcd_rmwl(host->hba, QUNIPRO_SEL, ufs_qcom_cap_qunipro(host) ? QUNIPRO_SEL : 0, REG_UFS_CFG1); + + if (host->hw_ver.major == 0x05) + ufshcd_rmwl(host->hba, QUNIPRO_G4_SEL, 0, REG_UFS_CFG0); + /* make sure above configuration is applied before we return */ mb(); } @@ -257,7 +243,7 @@ static int ufs_qcom_host_reset(struct ufs_hba *hba) if (!host->core_reset) { dev_warn(hba->dev, "%s: reset control not set\n", __func__); - goto out; + return 0; } reenable_intr = hba->is_irq_enabled; @@ -268,7 +254,7 @@ static int ufs_qcom_host_reset(struct ufs_hba *hba) if (ret) { dev_err(hba->dev, "%s: core_reset assert failed, err = %d\n", __func__, ret); - goto out; + return ret; } /* @@ -290,17 +276,34 @@ static int ufs_qcom_host_reset(struct ufs_hba *hba) hba->is_irq_enabled = true; } -out: - return ret; + return 0; +} + +static u32 ufs_qcom_get_hs_gear(struct ufs_hba *hba) +{ + struct ufs_qcom_host *host = ufshcd_get_variant(hba); + + if (host->hw_ver.major == 0x1) { + /* + * HS-G3 operations may not reliably work on legacy QCOM + * UFS host controller hardware even though capability + * exchange during link startup phase may end up + * negotiating maximum supported gear as G3. + * Hence downgrade the maximum supported gear to HS-G2. + */ + return UFS_HS_G2; + } + + /* Default is HS-G3 */ + return UFS_HS_G3; } static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); struct phy *phy = host->generic_phy; - int ret = 0; - bool is_rate_B = (UFS_QCOM_LIMIT_HS_RATE == PA_HS_MODE_B) - ? 
true : false; + int ret; + bool is_rate_B = UFS_QCOM_LIMIT_HS_RATE == PA_HS_MODE_B; /* Reset UFS Host Controller and PHY */ ret = ufs_qcom_host_reset(hba); @@ -316,7 +319,7 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) if (ret) { dev_err(hba->dev, "%s: phy init failed, ret = %d\n", __func__, ret); - goto out; + return ret; } /* power on phy - start serdes and phy's power and clocks */ @@ -333,7 +336,7 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) out_disable_phy: phy_exit(phy); -out: + return ret; } @@ -391,7 +394,6 @@ static int ufs_qcom_hce_enable_notify(struct ufs_hba *hba, static int ufs_qcom_cfg_timers(struct ufs_hba *hba, u32 gear, u32 hs, u32 rate, bool update_link_startup_timer) { - int ret = 0; struct ufs_qcom_host *host = ufshcd_get_variant(hba); struct ufs_clk_info *clki; u32 core_clk_period_in_ns; @@ -426,11 +428,11 @@ static int ufs_qcom_cfg_timers(struct ufs_hba *hba, u32 gear, * Aggregation logic. */ if (ufs_qcom_cap_qunipro(host) && !ufshcd_is_intr_aggr_allowed(hba)) - goto out; + return 0; if (gear == 0) { dev_err(hba->dev, "%s: invalid gear = %d\n", __func__, gear); - goto out_error; + return -EINVAL; } list_for_each_entry(clki, &hba->clk_list_head, list) { @@ -453,7 +455,7 @@ static int ufs_qcom_cfg_timers(struct ufs_hba *hba, u32 gear, } if (ufs_qcom_cap_qunipro(host)) - goto out; + return 0; core_clk_period_in_ns = NSEC_PER_SEC / core_clk_rate; core_clk_period_in_ns <<= OFFSET_CLK_NS_REG; @@ -468,7 +470,7 @@ static int ufs_qcom_cfg_timers(struct ufs_hba *hba, u32 gear, "%s: index %d exceeds table size %zu\n", __func__, gear, ARRAY_SIZE(hs_fr_table_rA)); - goto out_error; + return -EINVAL; } tx_clk_cycles_per_us = hs_fr_table_rA[gear-1][1]; } else if (rate == PA_HS_MODE_B) { @@ -477,13 +479,13 @@ static int ufs_qcom_cfg_timers(struct ufs_hba *hba, u32 gear, "%s: index %d exceeds table size %zu\n", __func__, gear, ARRAY_SIZE(hs_fr_table_rB)); - goto out_error; + return -EINVAL; } tx_clk_cycles_per_us = hs_fr_table_rB[gear-1][1]; } else { dev_err(hba->dev, "%s: invalid rate = %d\n", __func__, rate); - goto out_error; + return -EINVAL; } break; case SLOWAUTO_MODE: @@ -493,14 +495,14 @@ static int ufs_qcom_cfg_timers(struct ufs_hba *hba, u32 gear, "%s: index %d exceeds table size %zu\n", __func__, gear, ARRAY_SIZE(pwm_fr_table)); - goto out_error; + return -EINVAL; } tx_clk_cycles_per_us = pwm_fr_table[gear-1][1]; break; case UNCHANGED: default: dev_err(hba->dev, "%s: invalid mode = %d\n", __func__, hs); - goto out_error; + return -EINVAL; } if (ufshcd_readl(hba, REG_UFS_TX_SYMBOL_CLK_NS_US) != @@ -515,21 +517,17 @@ static int ufs_qcom_cfg_timers(struct ufs_hba *hba, u32 gear, mb(); } - if (update_link_startup_timer) { + if (update_link_startup_timer && host->hw_ver.major != 0x5) { ufshcd_writel(hba, ((core_clk_rate / MSEC_PER_SEC) * 100), - REG_UFS_PA_LINK_STARTUP_TIMER); + REG_UFS_CFG0); /* * make sure that this configuration is applied before * we return */ mb(); } - goto out; -out_error: - ret = -EINVAL; -out: - return ret; + return 0; } static int ufs_qcom_link_startup_notify(struct ufs_hba *hba, @@ -544,8 +542,7 @@ static int ufs_qcom_link_startup_notify(struct ufs_hba *hba, 0, true)) { dev_err(hba->dev, "%s: ufs_qcom_cfg_timers() failed\n", __func__); - err = -EINVAL; - goto out; + return -EINVAL; } if (ufs_qcom_cap_qunipro(host)) @@ -566,15 +563,11 @@ static int ufs_qcom_link_startup_notify(struct ufs_hba *hba, if (ufshcd_get_local_unipro_ver(hba) != UFS_UNIPRO_VER_1_41) err = ufshcd_disable_host_tx_lcc(hba); - break; - case 
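For the timer math in ufs_qcom_cfg_timers() above, a worked example with an assumed 150 MHz core clock:

/* core_clk_period_in_ns = NSEC_PER_SEC / core_clk_rate
 *                       = 1,000,000,000 / 150,000,000
 *                       = 6 ns (integer division truncates 6.67)
 *
 * The value is then shifted left by OFFSET_CLK_NS_REG so it lands in
 * the register's clock-period field before being masked and written.
 */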
POST_CHANGE: - ufs_qcom_link_startup_post_change(hba); break; default: break; } -out: return err; } @@ -589,11 +582,15 @@ static void ufs_qcom_device_reset_ctrl(struct ufs_hba *hba, bool asserted) gpiod_set_value_cansleep(host->device_reset, asserted); } -static int ufs_qcom_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) +static int ufs_qcom_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, + enum ufs_notify_change_status status) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); struct phy *phy = host->generic_phy; + if (status == PRE_CHANGE) + return 0; + if (ufs_qcom_is_link_off(hba)) { /* * Disable the tx/rx lane symbol clocks before PHY is @@ -707,8 +704,7 @@ static int ufs_qcom_pwr_change_notify(struct ufs_hba *hba, if (!dev_req_params) { pr_err("%s: incoming dev_req_params is NULL\n", __func__); - ret = -EINVAL; - goto out; + return -EINVAL; } switch (status) { @@ -716,27 +712,16 @@ static int ufs_qcom_pwr_change_notify(struct ufs_hba *hba, ufshcd_init_pwr_dev_param(&ufs_qcom_cap); ufs_qcom_cap.hs_rate = UFS_QCOM_LIMIT_HS_RATE; - if (host->hw_ver.major == 0x1) { - /* - * HS-G3 operations may not reliably work on legacy QCOM - * UFS host controller hardware even though capability - * exchange during link startup phase may end up - * negotiating maximum supported gear as G3. - * Hence downgrade the maximum supported gear to HS-G2. - */ - if (ufs_qcom_cap.hs_tx_gear > UFS_HS_G2) - ufs_qcom_cap.hs_tx_gear = UFS_HS_G2; - if (ufs_qcom_cap.hs_rx_gear > UFS_HS_G2) - ufs_qcom_cap.hs_rx_gear = UFS_HS_G2; - } + /* This driver only supports symmetic gear setting i.e., hs_tx_gear == hs_rx_gear */ + ufs_qcom_cap.hs_tx_gear = ufs_qcom_cap.hs_rx_gear = ufs_qcom_get_hs_gear(hba); ret = ufshcd_get_pwr_dev_param(&ufs_qcom_cap, dev_max_params, dev_req_params); if (ret) { - pr_err("%s: failed to determine capabilities\n", + dev_err(hba->dev, "%s: failed to determine capabilities\n", __func__); - goto out; + return ret; } /* enable the device ref clock before changing to HS mode */ @@ -777,7 +762,7 @@ static int ufs_qcom_pwr_change_notify(struct ufs_hba *hba, ret = -EINVAL; break; } -out: + return ret; } @@ -789,14 +774,11 @@ static int ufs_qcom_quirk_host_pa_saveconfigtime(struct ufs_hba *hba) err = ufshcd_dme_get(hba, UIC_ARG_MIB(PA_VS_CONFIG_REG1), &pa_vs_config_reg1); if (err) - goto out; + return err; /* Allow extension of MSB bits of PA_SaveConfigTime attribute */ - err = ufshcd_dme_set(hba, UIC_ARG_MIB(PA_VS_CONFIG_REG1), + return ufshcd_dme_set(hba, UIC_ARG_MIB(PA_VS_CONFIG_REG1), (pa_vs_config_reg1 | (1 << 12))); - -out: - return err; } static int ufs_qcom_apply_dev_quirks(struct ufs_hba *hba) @@ -862,11 +844,12 @@ static void ufs_qcom_set_caps(struct ufs_hba *hba) struct ufs_qcom_host *host = ufshcd_get_variant(hba); hba->caps |= UFSHCD_CAP_CLK_GATING | UFSHCD_CAP_HIBERN8_WITH_CLK_GATING; - hba->caps |= UFSHCD_CAP_CLK_SCALING; + hba->caps |= UFSHCD_CAP_CLK_SCALING | UFSHCD_CAP_WB_WITH_CLK_SCALING; hba->caps |= UFSHCD_CAP_AUTO_BKOPS_SUSPEND; hba->caps |= UFSHCD_CAP_WB_EN; hba->caps |= UFSHCD_CAP_CRYPTO; hba->caps |= UFSHCD_CAP_AGGR_POWER_COLLAPSE; + hba->caps |= UFSHCD_CAP_RPM_AUTOSUSPEND; if (host->hw_ver.major >= 0x2) { host->caps = UFS_QCOM_CAP_QUNIPRO | @@ -886,7 +869,6 @@ static int ufs_qcom_setup_clocks(struct ufs_hba *hba, bool on, enum ufs_notify_change_status status) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); - int err = 0; /* * In case ufs_qcom_init() is not yet done, simply ignore. 
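Most of the ufs-qcom.c hunks in this patch apply one mechanical cleanup: error paths that used to funnel through a single "out:" label now return at the failure site, and the pre-initialized "int err = 0;" becomes a plain declaration or disappears entirely. A minimal before/after sketch of the pattern, where get_clk_a() and get_clk_b() are hypothetical stand-ins for the driver's real helpers, not functions from this file:

	/* Before: single exit point, err pre-initialized */
	static int init_clks_old(struct device *dev)
	{
		int err = 0;

		err = get_clk_a(dev);	/* hypothetical helper */
		if (err)
			goto out;

		err = get_clk_b(dev);	/* hypothetical helper */
	out:
		return err;
	}

	/* After: return directly where the failure happens */
	static int init_clks_new(struct device *dev)
	{
		int err;

		err = get_clk_a(dev);
		if (err)
			return err;

		return get_clk_b(dev);
	}

The same shape accounts for the "return 0;" conversions in ufs_qcom_host_reset() and ufs_qcom_cfg_timers() above, and in ufs_qcom_setup_clocks() just below.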
@@ -914,7 +896,7 @@ static int ufs_qcom_setup_clocks(struct ufs_hba *hba, bool on, break; } - return err; + return 0; } static int @@ -922,8 +904,6 @@ ufs_qcom_reset_assert(struct reset_controller_dev *rcdev, unsigned long id) { struct ufs_qcom_host *host = rcdev_to_ufs_host(rcdev); - /* Currently this code only knows about a single reset. */ - WARN_ON(id); ufs_qcom_assert_reset(host->hba); /* provide 1ms delay to let the reset pulse propagate. */ usleep_range(1000, 1100); @@ -935,8 +915,6 @@ ufs_qcom_reset_deassert(struct reset_controller_dev *rcdev, unsigned long id) { struct ufs_qcom_host *host = rcdev_to_ufs_host(rcdev); - /* Currently this code only knows about a single reset. */ - WARN_ON(id); ufs_qcom_deassert_reset(host->hba); /* @@ -952,18 +930,6 @@ static const struct reset_control_ops ufs_qcom_reset_ops = { .deassert = ufs_qcom_reset_deassert, }; -#define ANDROID_BOOT_DEV_MAX 30 -static char android_boot_dev[ANDROID_BOOT_DEV_MAX]; - -#ifndef MODULE -static int __init get_android_boot_dev(char *str) -{ - strlcpy(android_boot_dev, str, ANDROID_BOOT_DEV_MAX); - return 1; -} -__setup("androidboot.bootdevice=", get_android_boot_dev); -#endif - /** * ufs_qcom_init - bind phy with controller * @hba: host controller instance @@ -983,27 +949,22 @@ static int ufs_qcom_init(struct ufs_hba *hba) struct resource *res; struct ufs_clk_info *clki; - if (strlen(android_boot_dev) && strcmp(android_boot_dev, dev_name(dev))) - return -ENODEV; - host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL); if (!host) { - err = -ENOMEM; dev_err(dev, "%s: no memory for qcom ufs host\n", __func__); - goto out; + return -ENOMEM; } /* Make a two way bind between the qcom host and the hba */ host->hba = hba; ufshcd_set_variant(hba, host); - /* Setup the reset control of HCI */ - host->core_reset = devm_reset_control_get(hba->dev, "rst"); + /* Setup the optional reset control of HCI */ + host->core_reset = devm_reset_control_get_optional(hba->dev, "rst"); if (IS_ERR(host->core_reset)) { - err = PTR_ERR(host->core_reset); - dev_warn(dev, "Failed to get reset control %d\n", err); - host->core_reset = NULL; - err = 0; + err = dev_err_probe(dev, PTR_ERR(host->core_reset), + "Failed to get reset control\n"); + goto out_variant_clear; } /* Fire up the reset controller. Failure here is non-fatal. */ @@ -1012,33 +973,13 @@ static int ufs_qcom_init(struct ufs_hba *hba) host->rcdev.owner = dev->driver->owner; host->rcdev.nr_resets = 1; err = devm_reset_controller_register(dev, &host->rcdev); - if (err) { + if (err) dev_warn(dev, "Failed to register reset controller\n"); - err = 0; - } - /* - * voting/devoting device ref_clk source is time consuming hence - * skip devoting it during aggressive clock gating. This clock - * will still be gated off during runtime suspend. - */ - host->generic_phy = devm_phy_get(dev, "ufsphy"); - - if (host->generic_phy == ERR_PTR(-EPROBE_DEFER)) { - /* - * UFS driver might be probed before the phy driver does. - * In that case we would like to return EPROBE_DEFER code. - */ - err = -EPROBE_DEFER; - dev_warn(dev, "%s: required phy device. hasn't probed yet. 
err = %d\n", - __func__, err); - goto out_variant_clear; - } else if (IS_ERR(host->generic_phy)) { - if (has_acpi_companion(dev)) { - host->generic_phy = NULL; - } else { - err = PTR_ERR(host->generic_phy); - dev_err(dev, "%s: PHY get failed %d\n", __func__, err); + if (!has_acpi_companion(dev)) { + host->generic_phy = devm_phy_get(dev, "ufsphy"); + if (IS_ERR(host->generic_phy)) { + err = dev_err_probe(dev, PTR_ERR(host->generic_phy), "Failed to get PHY\n"); goto out_variant_clear; } } @@ -1096,20 +1037,18 @@ static int ufs_qcom_init(struct ufs_hba *hba) if (hba->dev->id < MAX_UFS_QCOM_HOSTS) ufs_qcom_hosts[hba->dev->id] = host; - host->dbg_print_en |= UFS_QCOM_DEFAULT_DBG_PRINT_EN; ufs_qcom_get_default_testbus_cfg(host); err = ufs_qcom_testbus_config(host); - if (err) { + if (err) + /* Failure is non-fatal */ dev_warn(dev, "%s: failed to configure the testbus %d\n", __func__, err); - err = 0; - } - goto out; + return 0; out_variant_clear: ufshcd_set_variant(hba, NULL); -out: + return err; } @@ -1135,7 +1074,7 @@ static int ufs_qcom_set_dme_vs_core_clk_ctrl_clear_div(struct ufs_hba *hba, UIC_ARG_MIB(DME_VS_CORE_CLK_CTRL), &core_clk_ctrl_reg); if (err) - goto out; + return err; core_clk_ctrl_reg &= ~DME_VS_CORE_CLK_CTRL_MAX_CORE_CLK_1US_CYCLES_MASK; core_clk_ctrl_reg |= clk_cycles; @@ -1143,11 +1082,9 @@ static int ufs_qcom_set_dme_vs_core_clk_ctrl_clear_div(struct ufs_hba *hba, /* Clear CORE_CLK_DIV_EN */ core_clk_ctrl_reg &= ~DME_VS_CORE_CLK_CTRL_CORE_CLK_DIV_EN_BIT; - err = ufshcd_dme_set(hba, + return ufshcd_dme_set(hba, UIC_ARG_MIB(DME_VS_CORE_CLK_CTRL), core_clk_ctrl_reg); -out: - return err; } static int ufs_qcom_clk_scale_up_pre_change(struct ufs_hba *hba) @@ -1211,101 +1148,44 @@ static int ufs_qcom_clk_scale_notify(struct ufs_hba *hba, int err = 0; if (status == PRE_CHANGE) { + err = ufshcd_uic_hibern8_enter(hba); + if (err) + return err; if (scale_up) err = ufs_qcom_clk_scale_up_pre_change(hba); else err = ufs_qcom_clk_scale_down_pre_change(hba); + if (err) + ufshcd_uic_hibern8_exit(hba); + } else { if (scale_up) err = ufs_qcom_clk_scale_up_post_change(hba); else err = ufs_qcom_clk_scale_down_post_change(hba); - if (err || !dev_req_params) - goto out; + + if (err || !dev_req_params) { + ufshcd_uic_hibern8_exit(hba); + return err; + } ufs_qcom_cfg_timers(hba, dev_req_params->gear_rx, dev_req_params->pwr_rx, dev_req_params->hs_rate, false); + ufshcd_uic_hibern8_exit(hba); } -out: - return err; -} - -static void ufs_qcom_print_hw_debug_reg_all(struct ufs_hba *hba, - void *priv, void (*print_fn)(struct ufs_hba *hba, - int offset, int num_regs, const char *str, void *priv)) -{ - u32 reg; - struct ufs_qcom_host *host; - - if (unlikely(!hba)) { - pr_err("%s: hba is NULL\n", __func__); - return; - } - if (unlikely(!print_fn)) { - dev_err(hba->dev, "%s: print_fn is NULL\n", __func__); - return; - } - - host = ufshcd_get_variant(hba); - if (!(host->dbg_print_en & UFS_QCOM_DBG_PRINT_REGS_EN)) - return; - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_UFS_DBG_RD_REG_OCSC); - print_fn(hba, reg, 44, "UFS_UFS_DBG_RD_REG_OCSC ", priv); - - reg = ufshcd_readl(hba, REG_UFS_CFG1); - reg |= UTP_DBG_RAMS_EN; - ufshcd_writel(hba, reg, REG_UFS_CFG1); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_UFS_DBG_RD_EDTL_RAM); - print_fn(hba, reg, 32, "UFS_UFS_DBG_RD_EDTL_RAM ", priv); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_UFS_DBG_RD_DESC_RAM); - print_fn(hba, reg, 128, "UFS_UFS_DBG_RD_DESC_RAM ", priv); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_UFS_DBG_RD_PRDT_RAM); - print_fn(hba, 
reg, 64, "UFS_UFS_DBG_RD_PRDT_RAM ", priv); - - /* clear bit 17 - UTP_DBG_RAMS_EN */ - ufshcd_rmwl(hba, UTP_DBG_RAMS_EN, 0, REG_UFS_CFG1); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_UAWM); - print_fn(hba, reg, 4, "UFS_DBG_RD_REG_UAWM ", priv); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_UARM); - print_fn(hba, reg, 4, "UFS_DBG_RD_REG_UARM ", priv); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_TXUC); - print_fn(hba, reg, 48, "UFS_DBG_RD_REG_TXUC ", priv); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_RXUC); - print_fn(hba, reg, 27, "UFS_DBG_RD_REG_RXUC ", priv); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_DFC); - print_fn(hba, reg, 19, "UFS_DBG_RD_REG_DFC ", priv); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_TRLUT); - print_fn(hba, reg, 34, "UFS_DBG_RD_REG_TRLUT ", priv); - - reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_TMRLUT); - print_fn(hba, reg, 9, "UFS_DBG_RD_REG_TMRLUT ", priv); + return 0; } static void ufs_qcom_enable_test_bus(struct ufs_qcom_host *host) { - if (host->dbg_print_en & UFS_QCOM_DBG_PRINT_TEST_BUS_EN) { - ufshcd_rmwl(host->hba, UFS_REG_TEST_BUS_EN, - UFS_REG_TEST_BUS_EN, REG_UFS_CFG1); - ufshcd_rmwl(host->hba, TEST_BUS_EN, TEST_BUS_EN, REG_UFS_CFG1); - } else { - ufshcd_rmwl(host->hba, UFS_REG_TEST_BUS_EN, 0, REG_UFS_CFG1); - ufshcd_rmwl(host->hba, TEST_BUS_EN, 0, REG_UFS_CFG1); - } + ufshcd_rmwl(host->hba, UFS_REG_TEST_BUS_EN, + UFS_REG_TEST_BUS_EN, REG_UFS_CFG1); + ufshcd_rmwl(host->hba, TEST_BUS_EN, TEST_BUS_EN, REG_UFS_CFG1); } static void ufs_qcom_get_default_testbus_cfg(struct ufs_qcom_host *host) @@ -1414,10 +1294,53 @@ int ufs_qcom_testbus_config(struct ufs_qcom_host *host) static void ufs_qcom_dump_dbg_regs(struct ufs_hba *hba) { + u32 reg; + struct ufs_qcom_host *host; + + host = ufshcd_get_variant(hba); + ufshcd_dump_regs(hba, REG_UFS_SYS1CLK_1US, 16 * 4, "HCI Vendor Specific Registers "); - ufs_qcom_print_hw_debug_reg_all(hba, NULL, ufs_qcom_dump_regs_wrapper); + reg = ufs_qcom_get_debug_reg_offset(host, UFS_UFS_DBG_RD_REG_OCSC); + ufshcd_dump_regs(hba, reg, 44 * 4, "UFS_UFS_DBG_RD_REG_OCSC "); + + reg = ufshcd_readl(hba, REG_UFS_CFG1); + reg |= UTP_DBG_RAMS_EN; + ufshcd_writel(hba, reg, REG_UFS_CFG1); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_UFS_DBG_RD_EDTL_RAM); + ufshcd_dump_regs(hba, reg, 32 * 4, "UFS_UFS_DBG_RD_EDTL_RAM "); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_UFS_DBG_RD_DESC_RAM); + ufshcd_dump_regs(hba, reg, 128 * 4, "UFS_UFS_DBG_RD_DESC_RAM "); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_UFS_DBG_RD_PRDT_RAM); + ufshcd_dump_regs(hba, reg, 64 * 4, "UFS_UFS_DBG_RD_PRDT_RAM "); + + /* clear bit 17 - UTP_DBG_RAMS_EN */ + ufshcd_rmwl(hba, UTP_DBG_RAMS_EN, 0, REG_UFS_CFG1); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_UAWM); + ufshcd_dump_regs(hba, reg, 4 * 4, "UFS_DBG_RD_REG_UAWM "); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_UARM); + ufshcd_dump_regs(hba, reg, 4 * 4, "UFS_DBG_RD_REG_UARM "); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_TXUC); + ufshcd_dump_regs(hba, reg, 48 * 4, "UFS_DBG_RD_REG_TXUC "); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_RXUC); + ufshcd_dump_regs(hba, reg, 27 * 4, "UFS_DBG_RD_REG_RXUC "); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_DFC); + ufshcd_dump_regs(hba, reg, 19 * 4, "UFS_DBG_RD_REG_DFC "); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_TRLUT); + ufshcd_dump_regs(hba, reg, 34 * 4, 
"UFS_DBG_RD_REG_TRLUT "); + + reg = ufs_qcom_get_debug_reg_offset(host, UFS_DBG_RD_REG_TMRLUT); + ufshcd_dump_regs(hba, reg, 9 * 4, "UFS_DBG_RD_REG_TMRLUT "); } /** @@ -1449,27 +1372,157 @@ static int ufs_qcom_device_reset(struct ufs_hba *hba) #if IS_ENABLED(CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND) static void ufs_qcom_config_scaling_param(struct ufs_hba *hba, - struct devfreq_dev_profile *p, - void *data) + struct devfreq_dev_profile *p, + struct devfreq_simple_ondemand_data *d) { - static struct devfreq_simple_ondemand_data *d; - - if (!data) - return; - - d = (struct devfreq_simple_ondemand_data *)data; p->polling_ms = 60; d->upthreshold = 70; d->downdifferential = 5; } #else static void ufs_qcom_config_scaling_param(struct ufs_hba *hba, - struct devfreq_dev_profile *p, - void *data) + struct devfreq_dev_profile *p, + struct devfreq_simple_ondemand_data *data) { } #endif +/* Resources */ +static const struct ufshcd_res_info ufs_res_info[RES_MAX] = { + {.name = "ufs_mem",}, + {.name = "mcq",}, + /* Submission Queue DAO */ + {.name = "mcq_sqd",}, + /* Submission Queue Interrupt Status */ + {.name = "mcq_sqis",}, + /* Completion Queue DAO */ + {.name = "mcq_cqd",}, + /* Completion Queue Interrupt Status */ + {.name = "mcq_cqis",}, + /* MCQ vendor specific */ + {.name = "mcq_vs",}, +}; + +static int ufs_qcom_mcq_config_resource(struct ufs_hba *hba) +{ + struct platform_device *pdev = to_platform_device(hba->dev); + struct ufshcd_res_info *res; + struct resource *res_mem, *res_mcq; + int i, ret = 0; + + memcpy(hba->res, ufs_res_info, sizeof(ufs_res_info)); + + for (i = 0; i < RES_MAX; i++) { + res = &hba->res[i]; + res->resource = platform_get_resource_byname(pdev, + IORESOURCE_MEM, + res->name); + if (!res->resource) { + dev_info(hba->dev, "Resource %s not provided\n", res->name); + if (i == RES_UFS) + return -ENOMEM; + continue; + } else if (i == RES_UFS) { + res_mem = res->resource; + res->base = hba->mmio_base; + continue; + } + + res->base = devm_ioremap_resource(hba->dev, res->resource); + if (IS_ERR(res->base)) { + dev_err(hba->dev, "Failed to map res %s, err=%d\n", + res->name, (int)PTR_ERR(res->base)); + res->base = NULL; + ret = PTR_ERR(res->base); + return ret; + } + } + + /* MCQ resource provided in DT */ + res = &hba->res[RES_MCQ]; + /* Bail if MCQ resource is provided */ + if (res->base) + goto out; + + /* Explicitly allocate MCQ resource from ufs_mem */ + res_mcq = devm_kzalloc(hba->dev, sizeof(*res_mcq), GFP_KERNEL); + if (!res_mcq) + return ret; + + res_mcq->start = res_mem->start + + MCQ_SQATTR_OFFSET(hba->mcq_capabilities); + res_mcq->end = res_mcq->start + hba->nr_hw_queues * MCQ_QCFG_SIZE - 1; + res_mcq->flags = res_mem->flags; + res_mcq->name = "mcq"; + + ret = insert_resource(&iomem_resource, res_mcq); + if (ret) { + dev_err(hba->dev, "Failed to insert MCQ resource, err=%d\n", + ret); + goto insert_res_err; + } + + res->base = devm_ioremap_resource(hba->dev, res_mcq); + if (IS_ERR(res->base)) { + dev_err(hba->dev, "MCQ registers mapping failed, err=%d\n", + (int)PTR_ERR(res->base)); + ret = PTR_ERR(res->base); + goto ioremap_err; + } + +out: + hba->mcq_base = res->base; + return 0; +ioremap_err: + res->base = NULL; + remove_resource(res_mcq); +insert_res_err: + devm_kfree(hba->dev, res_mcq); + return ret; +} + +static int ufs_qcom_op_runtime_config(struct ufs_hba *hba) +{ + struct ufshcd_res_info *mem_res, *sqdao_res; + struct ufshcd_mcq_opr_info_t *opr; + int i; + + mem_res = &hba->res[RES_UFS]; + sqdao_res = &hba->res[RES_MCQ_SQD]; + + if (!mem_res->base || 
!sqdao_res->base) + return -EINVAL; + + for (i = 0; i < OPR_MAX; i++) { + opr = &hba->mcq_opr[i]; + opr->offset = sqdao_res->resource->start - + mem_res->resource->start + 0x40 * i; + opr->stride = 0x100; + opr->base = sqdao_res->base + 0x40 * i; + } + + return 0; +} + +static int ufs_qcom_get_hba_mac(struct ufs_hba *hba) +{ + /* Qualcomm HC supports up to 64 */ + return MAX_SUPP_MAC; +} + +static int ufs_qcom_get_outstanding_cqs(struct ufs_hba *hba, + unsigned long *ocqs) +{ + struct ufshcd_res_info *mcq_vs_res = &hba->res[RES_MCQ_VS]; + + if (!mcq_vs_res->base) + return -EINVAL; + + *ocqs = readl(mcq_vs_res->base + UFS_MEM_CQIS_VS); + + return 0; +} + /* * struct ufs_hba_qcom_vops - UFS QCOM specific variant operations * @@ -1493,6 +1546,10 @@ static const struct ufs_hba_variant_ops ufs_hba_qcom_vops = { .device_reset = ufs_qcom_device_reset, .config_scaling_param = ufs_qcom_config_scaling_param, .program_key = ufs_qcom_ice_program_key, + .mcq_config_resource = ufs_qcom_mcq_config_resource, + .get_hba_mac = ufs_qcom_get_hba_mac, + .op_runtime_config = ufs_qcom_op_runtime_config, + .get_outstanding_cqs = ufs_qcom_get_outstanding_cqs, }; /** @@ -1509,9 +1566,9 @@ static int ufs_qcom_probe(struct platform_device *pdev) /* Perform generic probe */ err = ufshcd_pltfrm_init(pdev, &ufs_hba_qcom_vops); if (err) - dev_err(dev, "ufshcd_pltfrm_init() failed %d\n", err); + return dev_err_probe(dev, err, "ufshcd_pltfrm_init() failed\n"); - return err; + return 0; } /** @@ -1544,10 +1601,16 @@ MODULE_DEVICE_TABLE(acpi, ufs_qcom_acpi_match); #endif static const struct dev_pm_ops ufs_qcom_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(ufshcd_system_suspend, ufshcd_system_resume) SET_RUNTIME_PM_OPS(ufshcd_runtime_suspend, ufshcd_runtime_resume, NULL) .prepare = ufshcd_suspend_prepare, .complete = ufshcd_resume_complete, +#ifdef CONFIG_PM_SLEEP + .suspend = ufshcd_system_suspend, + .resume = ufshcd_system_resume, + .freeze = ufshcd_system_freeze, + .restore = ufshcd_system_restore, + .thaw = ufshcd_system_thaw, +#endif }; static struct platform_driver ufs_qcom_pltform = { diff --git a/drivers/scsi/ufs/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h similarity index 75% rename from drivers/scsi/ufs/ufs-qcom.h rename to drivers/ufs/host/ufs-qcom.h index 8208e3a3ef59..364cd4a44d05 100644 --- a/drivers/scsi/ufs/ufs-qcom.h +++ b/drivers/ufs/host/ufs-qcom.h @@ -7,6 +7,7 @@ #include #include +#include #define MAX_UFS_QCOM_HOSTS 1 #define MAX_U32 (~(u32)0) @@ -15,13 +16,11 @@ #define HBRN8_POLL_TOUT_MS 100 #define DEFAULT_CLK_RATE_HZ 1000000 #define BUS_VECTOR_NAME_LEN 32 +#define MAX_SUPP_MAC 64 -#define UFS_HW_VER_MAJOR_SHFT (28) -#define UFS_HW_VER_MAJOR_MASK (0x000F << UFS_HW_VER_MAJOR_SHFT) -#define UFS_HW_VER_MINOR_SHFT (16) -#define UFS_HW_VER_MINOR_MASK (0x0FFF << UFS_HW_VER_MINOR_SHFT) -#define UFS_HW_VER_STEP_SHFT (0) -#define UFS_HW_VER_STEP_MASK (0xFFFF << UFS_HW_VER_STEP_SHFT) +#define UFS_HW_VER_MAJOR_MASK GENMASK(31, 28) +#define UFS_HW_VER_MINOR_MASK GENMASK(27, 16) +#define UFS_HW_VER_STEP_MASK GENMASK(15, 0) /* vendor specific pre-defined parameters */ #define SLOW 1 @@ -35,8 +34,10 @@ enum { REG_UFS_TX_SYMBOL_CLK_NS_US = 0xC4, REG_UFS_LOCAL_PORT_ID_REG = 0xC8, REG_UFS_PA_ERR_CODE = 0xCC, - REG_UFS_RETRY_TIMER_REG = 0xD0, - REG_UFS_PA_LINK_STARTUP_TIMER = 0xD8, + /* On older UFS revisions, this register is called "RETRY_TIMER_REG" */ + REG_UFS_PARAM0 = 0xD0, + /* On older UFS revisions, this register is called "REG_UFS_PA_LINK_STARTUP_TIMER" */ + REG_UFS_CFG0 = 0xD8, REG_UFS_CFG1 = 0xDC, REG_UFS_CFG2 = 0xE0, 
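The shift-and-mask #define pairs in this header collapse into single GENMASK() definitions, with FIELD_GET()/FIELD_PREP() (the kernel's bitfield helpers; the path-stripped #include added near the top of this hunk presumably pulls them in) deriving the shift from the mask at compile time; see ufs_qcom_get_controller_revision() and the reset helpers further down. A minimal sketch of the idiom against a made-up DEMO register layout, not these UFS offsets:

	#include <linux/bitfield.h>

	#define DEMO_VER_MAJOR	GENMASK(31, 28)	/* bits [31:28] */
	#define DEMO_VER_MINOR	GENMASK(27, 16)	/* bits [27:16] */

	static inline u32 demo_get_major(u32 ver)
	{
		/* Equivalent to (ver & DEMO_VER_MAJOR) >> 28 */
		return FIELD_GET(DEMO_VER_MAJOR, ver);
	}

	static inline u32 demo_set_major(u32 major)
	{
		/* Equivalent to (major << 28) & DEMO_VER_MAJOR */
		return FIELD_PREP(DEMO_VER_MAJOR, major);
	}

Because the shift is implicit in the mask, the separate *_SHFT constants can be deleted outright, which is exactly what the UFS_HW_VER_* hunk above does.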
REG_UFS_HW_VERSION = 0xE4, @@ -71,28 +72,39 @@ enum { UFS_UFS_DBG_RD_EDTL_RAM = 0x1900, }; +enum { + UFS_MEM_CQIS_VS = 0x8, +}; + #define UFS_CNTLR_2_x_x_VEN_REGS_OFFSET(x) (0x000 + x) #define UFS_CNTLR_3_x_x_VEN_REGS_OFFSET(x) (0x400 + x) +/* bit definitions for REG_UFS_CFG0 register */ +#define QUNIPRO_G4_SEL BIT(5) + /* bit definitions for REG_UFS_CFG1 register */ -#define QUNIPRO_SEL 0x1 -#define UTP_DBG_RAMS_EN 0x20000 +#define QUNIPRO_SEL BIT(0) +#define UFS_PHY_SOFT_RESET BIT(1) +#define UTP_DBG_RAMS_EN BIT(17) #define TEST_BUS_EN BIT(18) #define TEST_BUS_SEL GENMASK(22, 19) #define UFS_REG_TEST_BUS_EN BIT(30) +#define UFS_PHY_RESET_ENABLE 1 +#define UFS_PHY_RESET_DISABLE 0 + /* bit definitions for REG_UFS_CFG2 register */ -#define UAWM_HW_CGC_EN (1 << 0) -#define UARM_HW_CGC_EN (1 << 1) -#define TXUC_HW_CGC_EN (1 << 2) -#define RXUC_HW_CGC_EN (1 << 3) -#define DFC_HW_CGC_EN (1 << 4) -#define TRLUT_HW_CGC_EN (1 << 5) -#define TMRLUT_HW_CGC_EN (1 << 6) -#define OCSC_HW_CGC_EN (1 << 7) +#define UAWM_HW_CGC_EN BIT(0) +#define UARM_HW_CGC_EN BIT(1) +#define TXUC_HW_CGC_EN BIT(2) +#define RXUC_HW_CGC_EN BIT(3) +#define DFC_HW_CGC_EN BIT(4) +#define TRLUT_HW_CGC_EN BIT(5) +#define TMRLUT_HW_CGC_EN BIT(6) +#define OCSC_HW_CGC_EN BIT(7) /* bit definition for UFS_UFS_TEST_BUS_CTRL_n */ -#define TEST_BUS_SUB_SEL_MASK 0x1F /* All XXX_SEL fields are 5 bits wide */ +#define TEST_BUS_SUB_SEL_MASK GENMASK(4, 0) /* All XXX_SEL fields are 5 bits wide */ #define REG_UFS_CFG2_CGC_EN_ALL (UAWM_HW_CGC_EN | UARM_HW_CGC_EN |\ TXUC_HW_CGC_EN | RXUC_HW_CGC_EN |\ @@ -100,26 +112,11 @@ enum { TMRLUT_HW_CGC_EN | OCSC_HW_CGC_EN) /* bit offset */ -enum { - OFFSET_UFS_PHY_SOFT_RESET = 1, - OFFSET_CLK_NS_REG = 10, -}; +#define OFFSET_CLK_NS_REG 0xa /* bit masks */ -enum { - MASK_UFS_PHY_SOFT_RESET = 0x2, - MASK_TX_SYMBOL_CLK_1US_REG = 0x3FF, - MASK_CLK_NS_REG = 0xFFFC00, -}; - -/* QCOM UFS debug print bit mask */ -#define UFS_QCOM_DBG_PRINT_REGS_EN BIT(0) -#define UFS_QCOM_DBG_PRINT_ICE_REGS_EN BIT(1) -#define UFS_QCOM_DBG_PRINT_TEST_BUS_EN BIT(2) - -#define UFS_QCOM_DBG_PRINT_ALL \ - (UFS_QCOM_DBG_PRINT_REGS_EN | UFS_QCOM_DBG_PRINT_ICE_REGS_EN | \ - UFS_QCOM_DBG_PRINT_TEST_BUS_EN) +#define MASK_TX_SYMBOL_CLK_1US_REG GENMASK(9, 0) +#define MASK_CLK_NS_REG GENMASK(23, 10) /* QUniPro Vendor specific attributes */ #define PA_VS_CONFIG_REG1 0x9000 @@ -134,15 +131,15 @@ ufs_qcom_get_controller_revision(struct ufs_hba *hba, { u32 ver = ufshcd_readl(hba, REG_UFS_HW_VERSION); - *major = (ver & UFS_HW_VER_MAJOR_MASK) >> UFS_HW_VER_MAJOR_SHFT; - *minor = (ver & UFS_HW_VER_MINOR_MASK) >> UFS_HW_VER_MINOR_SHFT; - *step = (ver & UFS_HW_VER_STEP_MASK) >> UFS_HW_VER_STEP_SHFT; + *major = FIELD_GET(UFS_HW_VER_MAJOR_MASK, ver); + *minor = FIELD_GET(UFS_HW_VER_MINOR_MASK, ver); + *step = FIELD_GET(UFS_HW_VER_STEP_MASK, ver); }; static inline void ufs_qcom_assert_reset(struct ufs_hba *hba) { - ufshcd_rmwl(hba, MASK_UFS_PHY_SOFT_RESET, - 1 << OFFSET_UFS_PHY_SOFT_RESET, REG_UFS_CFG1); + ufshcd_rmwl(hba, UFS_PHY_SOFT_RESET, FIELD_PREP(UFS_PHY_SOFT_RESET, UFS_PHY_RESET_ENABLE), + REG_UFS_CFG1); /* * Make sure assertion of ufs phy reset is written to @@ -153,8 +150,8 @@ static inline void ufs_qcom_assert_reset(struct ufs_hba *hba) static inline void ufs_qcom_deassert_reset(struct ufs_hba *hba) { - ufshcd_rmwl(hba, MASK_UFS_PHY_SOFT_RESET, - 0 << OFFSET_UFS_PHY_SOFT_RESET, REG_UFS_CFG1); + ufshcd_rmwl(hba, UFS_PHY_SOFT_RESET, FIELD_PREP(UFS_PHY_SOFT_RESET, UFS_PHY_RESET_DISABLE), + REG_UFS_CFG1); /* * Make sure de-assertion of ufs phy 
reset is written to @@ -211,8 +208,6 @@ struct ufs_qcom_host { u32 dev_ref_clk_en_mask; - /* Bitmask for enabling debug prints */ - u32 dbg_print_en; struct ufs_qcom_testbus testbus; /* Reset control of HCI */ @@ -239,10 +234,7 @@ int ufs_qcom_testbus_config(struct ufs_qcom_host *host); static inline bool ufs_qcom_cap_qunipro(struct ufs_qcom_host *host) { - if (host->caps & UFS_QCOM_CAP_QUNIPRO) - return true; - else - return false; + return host->caps & UFS_QCOM_CAP_QUNIPRO; } /* ufs-qcom-ice.c */ diff --git a/drivers/ufs/host/ufs-renesas.c b/drivers/ufs/host/ufs-renesas.c new file mode 100644 index 000000000000..f8a5e79ed3b4 --- /dev/null +++ b/drivers/ufs/host/ufs-renesas.c @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Renesas UFS host controller driver + * + * Copyright (C) 2022 Renesas Electronics Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufshcd-pltfrm.h" + +struct ufs_renesas_priv { + bool initialized; /* The hardware needs initialization once */ +}; + +enum { + SET_PHY_INDEX_LO = 0, + SET_PHY_INDEX_HI, + TIMER_INDEX, + MAX_INDEX +}; + +enum ufs_renesas_init_param_mode { + MODE_RESTORE, + MODE_SET, + MODE_SAVE, + MODE_POLL, + MODE_WAIT, + MODE_WRITE, +}; + +#define PARAM_RESTORE(_reg, _index) \ + { .mode = MODE_RESTORE, .reg = _reg, .index = _index } +#define PARAM_SET(_index, _set) \ + { .mode = MODE_SET, .index = _index, .u.set = _set } +#define PARAM_SAVE(_reg, _mask, _index) \ + { .mode = MODE_SAVE, .reg = _reg, .mask = (u32)(_mask), \ + .index = _index } +#define PARAM_POLL(_reg, _expected, _mask) \ + { .mode = MODE_POLL, .reg = _reg, .u.expected = _expected, \ + .mask = (u32)(_mask) } +#define PARAM_WAIT(_delay_us) \ + { .mode = MODE_WAIT, .u.delay_us = _delay_us } + +#define PARAM_WRITE(_reg, _val) \ + { .mode = MODE_WRITE, .reg = _reg, .u.val = _val } + +#define PARAM_WRITE_D0_D4(_d0, _d4) \ + PARAM_WRITE(0xd0, _d0), PARAM_WRITE(0xd4, _d4) + +#define PARAM_WRITE_800_80C_POLL(_addr, _data_800) \ + PARAM_WRITE_D0_D4(0x0000080c, 0x00000100), \ + PARAM_WRITE_D0_D4(0x00000800, ((_data_800) << 16) | BIT(8) | (_addr)), \ + PARAM_WRITE(0xd0, 0x0000080c), \ + PARAM_POLL(0xd4, BIT(8), BIT(8)) + +#define PARAM_RESTORE_800_80C_POLL(_index) \ + PARAM_WRITE_D0_D4(0x0000080c, 0x00000100), \ + PARAM_WRITE(0xd0, 0x00000800), \ + PARAM_RESTORE(0xd4, _index), \ + PARAM_WRITE(0xd0, 0x0000080c), \ + PARAM_POLL(0xd4, BIT(8), BIT(8)) + +#define PARAM_WRITE_804_80C_POLL(_addr, _data_804) \ + PARAM_WRITE_D0_D4(0x0000080c, 0x00000100), \ + PARAM_WRITE_D0_D4(0x00000804, ((_data_804) << 16) | BIT(8) | (_addr)), \ + PARAM_WRITE(0xd0, 0x0000080c), \ + PARAM_POLL(0xd4, BIT(8), BIT(8)) + +#define PARAM_WRITE_828_82C_POLL(_data_828) \ + PARAM_WRITE_D0_D4(0x0000082c, 0x0f000000), \ + PARAM_WRITE_D0_D4(0x00000828, _data_828), \ + PARAM_WRITE(0xd0, 0x0000082c), \ + PARAM_POLL(0xd4, _data_828, _data_828) + +#define PARAM_WRITE_PHY(_addr16, _data16) \ + PARAM_WRITE(0xf0, 1), \ + PARAM_WRITE_800_80C_POLL(0x16, (_addr16) & 0xff), \ + PARAM_WRITE_800_80C_POLL(0x17, ((_addr16) >> 8) & 0xff), \ + PARAM_WRITE_800_80C_POLL(0x18, (_data16) & 0xff), \ + PARAM_WRITE_800_80C_POLL(0x19, ((_data16) >> 8) & 0xff), \ + PARAM_WRITE_800_80C_POLL(0x1c, 0x01), \ + PARAM_WRITE_828_82C_POLL(0x0f000000), \ + PARAM_WRITE(0xf0, 0) + +#define PARAM_SET_PHY(_addr16, _data16) \ + PARAM_WRITE(0xf0, 1), \ + PARAM_WRITE_800_80C_POLL(0x16, (_addr16) & 0xff), \ + PARAM_WRITE_800_80C_POLL(0x17, ((_addr16) >> 8) & 0xff), \ + 
PARAM_WRITE_800_80C_POLL(0x1c, 0x01), \ + PARAM_WRITE_828_82C_POLL(0x0f000000), \ + PARAM_WRITE_804_80C_POLL(0x1a, 0), \ + PARAM_WRITE(0xd0, 0x00000808), \ + PARAM_SAVE(0xd4, 0xff, SET_PHY_INDEX_LO), \ + PARAM_WRITE_804_80C_POLL(0x1b, 0), \ + PARAM_WRITE(0xd0, 0x00000808), \ + PARAM_SAVE(0xd4, 0xff, SET_PHY_INDEX_HI), \ + PARAM_WRITE_828_82C_POLL(0x0f000000), \ + PARAM_WRITE(0xf0, 0), \ + PARAM_WRITE(0xf0, 1), \ + PARAM_WRITE_800_80C_POLL(0x16, (_addr16) & 0xff), \ + PARAM_WRITE_800_80C_POLL(0x17, ((_addr16) >> 8) & 0xff), \ + PARAM_SET(SET_PHY_INDEX_LO, ((_data16 & 0xff) << 16) | BIT(8) | 0x18), \ + PARAM_RESTORE_800_80C_POLL(SET_PHY_INDEX_LO), \ + PARAM_SET(SET_PHY_INDEX_HI, (((_data16 >> 8) & 0xff) << 16) | BIT(8) | 0x19), \ + PARAM_RESTORE_800_80C_POLL(SET_PHY_INDEX_HI), \ + PARAM_WRITE_800_80C_POLL(0x1c, 0x01), \ + PARAM_WRITE_828_82C_POLL(0x0f000000), \ + PARAM_WRITE(0xf0, 0) + +#define PARAM_INDIRECT_WRITE(_gpio, _addr, _data_800) \ + PARAM_WRITE(0xf0, _gpio), \ + PARAM_WRITE_800_80C_POLL(_addr, _data_800), \ + PARAM_WRITE_828_82C_POLL(0x0f000000), \ + PARAM_WRITE(0xf0, 0) + +#define PARAM_INDIRECT_POLL(_gpio, _addr, _expected, _mask) \ + PARAM_WRITE(0xf0, _gpio), \ + PARAM_WRITE_800_80C_POLL(_addr, 0), \ + PARAM_WRITE(0xd0, 0x00000808), \ + PARAM_POLL(0xd4, _expected, _mask), \ + PARAM_WRITE(0xf0, 0) + +struct ufs_renesas_init_param { + enum ufs_renesas_init_param_mode mode; + u32 reg; + union { + u32 expected; + u32 delay_us; + u32 set; + u32 val; + } u; + u32 mask; + u32 index; +}; + +/* This setting is for SERIES B */ +static const struct ufs_renesas_init_param ufs_param[] = { + PARAM_WRITE(0xc0, 0x49425308), + PARAM_WRITE_D0_D4(0x00000104, 0x00000002), + PARAM_WAIT(1), + PARAM_WRITE_D0_D4(0x00000828, 0x00000200), + PARAM_WAIT(1), + PARAM_WRITE_D0_D4(0x00000828, 0x00000000), + PARAM_WRITE_D0_D4(0x00000104, 0x00000001), + PARAM_WRITE_D0_D4(0x00000940, 0x00000001), + PARAM_WAIT(1), + PARAM_WRITE_D0_D4(0x00000940, 0x00000000), + + PARAM_WRITE(0xc0, 0x49425308), + PARAM_WRITE(0xc0, 0x41584901), + + PARAM_WRITE_D0_D4(0x0000080c, 0x00000100), + PARAM_WRITE_D0_D4(0x00000804, 0x00000000), + PARAM_WRITE(0xd0, 0x0000080c), + PARAM_POLL(0xd4, BIT(8), BIT(8)), + + PARAM_WRITE(REG_CONTROLLER_ENABLE, 0x00000001), + + PARAM_WRITE(0xd0, 0x00000804), + PARAM_POLL(0xd4, BIT(8) | BIT(6) | BIT(0), BIT(8) | BIT(6) | BIT(0)), + + PARAM_WRITE(0xd0, 0x00000d00), + PARAM_SAVE(0xd4, 0x0000ffff, TIMER_INDEX), + PARAM_WRITE(0xd4, 0x00000000), + PARAM_WRITE_D0_D4(0x0000082c, 0x0f000000), + PARAM_WRITE_D0_D4(0x00000828, 0x08000000), + PARAM_WRITE(0xd0, 0x0000082c), + PARAM_POLL(0xd4, BIT(27), BIT(27)), + PARAM_WRITE(0xd0, 0x00000d2c), + PARAM_POLL(0xd4, BIT(0), BIT(0)), + + /* phy setup */ + PARAM_INDIRECT_WRITE(1, 0x01, 0x001f), + PARAM_INDIRECT_WRITE(7, 0x5d, 0x0014), + PARAM_INDIRECT_WRITE(7, 0x5e, 0x0014), + PARAM_INDIRECT_WRITE(7, 0x0d, 0x0003), + PARAM_INDIRECT_WRITE(7, 0x0e, 0x0007), + PARAM_INDIRECT_WRITE(7, 0x5f, 0x0003), + PARAM_INDIRECT_WRITE(7, 0x60, 0x0003), + PARAM_INDIRECT_WRITE(7, 0x5b, 0x00a6), + PARAM_INDIRECT_WRITE(7, 0x5c, 0x0003), + + PARAM_INDIRECT_POLL(7, 0x3c, 0, BIT(7)), + PARAM_INDIRECT_POLL(7, 0x4c, 0, BIT(4)), + + PARAM_INDIRECT_WRITE(1, 0x32, 0x0080), + PARAM_INDIRECT_WRITE(1, 0x1f, 0x0001), + PARAM_INDIRECT_WRITE(0, 0x2c, 0x0001), + PARAM_INDIRECT_WRITE(0, 0x32, 0x0087), + + PARAM_INDIRECT_WRITE(1, 0x4d, 0x0061), + PARAM_INDIRECT_WRITE(4, 0x9b, 0x0009), + PARAM_INDIRECT_WRITE(4, 0xa6, 0x0005), + PARAM_INDIRECT_WRITE(4, 0xa5, 0x0058), + PARAM_INDIRECT_WRITE(1, 0x39, 0x0027), + 
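/* Still inside the ufs_param[] table: each PARAM_* macro above expands to one or more struct ufs_renesas_init_param records, which ufs_renesas_reg_control() (defined below) replays in array order during pre-init. */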
PARAM_INDIRECT_WRITE(1, 0x47, 0x004c), + + PARAM_INDIRECT_WRITE(7, 0x0d, 0x0002), + PARAM_INDIRECT_WRITE(7, 0x0e, 0x0007), + + PARAM_WRITE_PHY(0x0028, 0x0061), + PARAM_WRITE_PHY(0x4014, 0x0061), + PARAM_SET_PHY(0x401c, BIT(2)), + PARAM_WRITE_PHY(0x4000, 0x0000), + PARAM_WRITE_PHY(0x4001, 0x0000), + + PARAM_WRITE_PHY(0x10ae, 0x0001), + PARAM_WRITE_PHY(0x10ad, 0x0000), + PARAM_WRITE_PHY(0x10af, 0x0001), + PARAM_WRITE_PHY(0x10b6, 0x0001), + PARAM_WRITE_PHY(0x10ae, 0x0000), + + PARAM_WRITE_PHY(0x10ae, 0x0001), + PARAM_WRITE_PHY(0x10ad, 0x0000), + PARAM_WRITE_PHY(0x10af, 0x0002), + PARAM_WRITE_PHY(0x10b6, 0x0001), + PARAM_WRITE_PHY(0x10ae, 0x0000), + + PARAM_WRITE_PHY(0x10ae, 0x0001), + PARAM_WRITE_PHY(0x10ad, 0x0080), + PARAM_WRITE_PHY(0x10af, 0x0000), + PARAM_WRITE_PHY(0x10b6, 0x0001), + PARAM_WRITE_PHY(0x10ae, 0x0000), + + PARAM_WRITE_PHY(0x10ae, 0x0001), + PARAM_WRITE_PHY(0x10ad, 0x0080), + PARAM_WRITE_PHY(0x10af, 0x001a), + PARAM_WRITE_PHY(0x10b6, 0x0001), + PARAM_WRITE_PHY(0x10ae, 0x0000), + + PARAM_INDIRECT_WRITE(7, 0x70, 0x0016), + PARAM_INDIRECT_WRITE(7, 0x71, 0x0016), + PARAM_INDIRECT_WRITE(7, 0x72, 0x0014), + PARAM_INDIRECT_WRITE(7, 0x73, 0x0014), + PARAM_INDIRECT_WRITE(7, 0x74, 0x0000), + PARAM_INDIRECT_WRITE(7, 0x75, 0x0000), + PARAM_INDIRECT_WRITE(7, 0x76, 0x0010), + PARAM_INDIRECT_WRITE(7, 0x77, 0x0010), + PARAM_INDIRECT_WRITE(7, 0x78, 0x00ff), + PARAM_INDIRECT_WRITE(7, 0x79, 0x0000), + + PARAM_INDIRECT_WRITE(7, 0x19, 0x0007), + + PARAM_INDIRECT_WRITE(7, 0x1a, 0x0007), + + PARAM_INDIRECT_WRITE(7, 0x24, 0x000c), + + PARAM_INDIRECT_WRITE(7, 0x25, 0x000c), + + PARAM_INDIRECT_WRITE(7, 0x62, 0x0000), + PARAM_INDIRECT_WRITE(7, 0x63, 0x0000), + PARAM_INDIRECT_WRITE(7, 0x5d, 0x0014), + PARAM_INDIRECT_WRITE(7, 0x5e, 0x0017), + PARAM_INDIRECT_WRITE(7, 0x5d, 0x0004), + PARAM_INDIRECT_WRITE(7, 0x5e, 0x0017), + PARAM_INDIRECT_POLL(7, 0x55, 0, BIT(6)), + PARAM_INDIRECT_POLL(7, 0x41, 0, BIT(7)), + /* end of phy setup */ + + PARAM_WRITE(0xf0, 0), + PARAM_WRITE(0xd0, 0x00000d00), + PARAM_RESTORE(0xd4, TIMER_INDEX), +}; + +static void ufs_renesas_dbg_register_dump(struct ufs_hba *hba) +{ + ufshcd_dump_regs(hba, 0xc0, 0x40, "regs: 0xc0 + "); +} + +static void ufs_renesas_reg_control(struct ufs_hba *hba, + const struct ufs_renesas_init_param *p) +{ + static u32 save[MAX_INDEX]; + int ret; + u32 val; + + WARN_ON(p->index >= MAX_INDEX); + + switch (p->mode) { + case MODE_RESTORE: + ufshcd_writel(hba, save[p->index], p->reg); + break; + case MODE_SET: + save[p->index] |= p->u.set; + break; + case MODE_SAVE: + save[p->index] = ufshcd_readl(hba, p->reg) & p->mask; + break; + case MODE_POLL: + ret = readl_poll_timeout_atomic(hba->mmio_base + p->reg, + val, + (val & p->mask) == p->u.expected, + 10, 1000); + if (ret) + dev_err(hba->dev, "%s: poll failed %d (%08x, %08x, %08x)\n", + __func__, ret, val, p->mask, p->u.expected); + break; + case MODE_WAIT: + if (p->u.delay_us > 1000) + mdelay(DIV_ROUND_UP(p->u.delay_us, 1000)); + else + udelay(p->u.delay_us); + break; + case MODE_WRITE: + ufshcd_writel(hba, p->u.val, p->reg); + break; + default: + break; + } +} + +static void ufs_renesas_pre_init(struct ufs_hba *hba) +{ + const struct ufs_renesas_init_param *p = ufs_param; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(ufs_param); i++) + ufs_renesas_reg_control(hba, &p[i]); +} + +static int ufs_renesas_hce_enable_notify(struct ufs_hba *hba, + enum ufs_notify_change_status status) +{ + struct ufs_renesas_priv *priv = ufshcd_get_variant(hba); + + if (priv->initialized) + return 0; + + if (status == PRE_CHANGE) + 
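/* Replay the ufs_param[] sequence exactly once, before the host controller is enabled; the initialized flag above guards against repeat runs */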
ufs_renesas_pre_init(hba); + + priv->initialized = true; + + return 0; +} + +static int ufs_renesas_setup_clocks(struct ufs_hba *hba, bool on, + enum ufs_notify_change_status status) +{ + if (on && status == PRE_CHANGE) + pm_runtime_get_sync(hba->dev); + else if (!on && status == POST_CHANGE) + pm_runtime_put(hba->dev); + + return 0; +} + +static int ufs_renesas_init(struct ufs_hba *hba) +{ + struct ufs_renesas_priv *priv; + + priv = devm_kmalloc(hba->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + ufshcd_set_variant(hba, priv); + + hba->quirks |= UFSHCD_QUIRK_BROKEN_64BIT_ADDRESS | UFSHCD_QUIRK_HIBERN_FASTAUTO; + + return 0; +} + +static const struct ufs_hba_variant_ops ufs_renesas_vops = { + .name = "renesas", + .init = ufs_renesas_init, + .setup_clocks = ufs_renesas_setup_clocks, + .hce_enable_notify = ufs_renesas_hce_enable_notify, + .dbg_register_dump = ufs_renesas_dbg_register_dump, +}; + +static const struct of_device_id __maybe_unused ufs_renesas_of_match[] = { + { .compatible = "renesas,r8a779f0-ufs" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, ufs_renesas_of_match); + +static int ufs_renesas_probe(struct platform_device *pdev) +{ + return ufshcd_pltfrm_init(pdev, &ufs_renesas_vops); +} + +static int ufs_renesas_remove(struct platform_device *pdev) +{ + struct ufs_hba *hba = platform_get_drvdata(pdev); + + ufshcd_remove(hba); + + return 0; +} + +static struct platform_driver ufs_renesas_platform = { + .probe = ufs_renesas_probe, + .remove = ufs_renesas_remove, + .driver = { + .name = "ufshcd-renesas", + .of_match_table = of_match_ptr(ufs_renesas_of_match), + }, +}; +module_platform_driver(ufs_renesas_platform); + +MODULE_AUTHOR("Yoshihiro Shimoda "); +MODULE_DESCRIPTION("Renesas UFS host controller driver"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/ufs/host/ufs-sprd.c b/drivers/ufs/host/ufs-sprd.c new file mode 100644 index 000000000000..051f3f40d92c --- /dev/null +++ b/drivers/ufs/host/ufs-sprd.c @@ -0,0 +1,458 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * UNISOC UFS Host Controller driver + * + * Copyright (C) 2022 Unisoc, Inc. 
+ * Author: Zhe Wang + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ufshcd-pltfrm.h" +#include "ufs-sprd.h" + +static const struct of_device_id ufs_sprd_of_match[]; + +static struct ufs_sprd_priv *ufs_sprd_get_priv_data(struct ufs_hba *hba) +{ + struct ufs_sprd_host *host = ufshcd_get_variant(hba); + + WARN_ON(!host->priv); + return host->priv; +} + +static void ufs_sprd_regmap_update(struct ufs_sprd_priv *priv, unsigned int index, + unsigned int reg, unsigned int bits, unsigned int val) +{ + regmap_update_bits(priv->sysci[index].regmap, reg, bits, val); +} + +static void ufs_sprd_regmap_read(struct ufs_sprd_priv *priv, unsigned int index, + unsigned int reg, unsigned int *val) +{ + regmap_read(priv->sysci[index].regmap, reg, val); +} + +static void ufs_sprd_get_unipro_ver(struct ufs_hba *hba) +{ + struct ufs_sprd_host *host = ufshcd_get_variant(hba); + + if (ufshcd_dme_get(hba, UIC_ARG_MIB(PA_LOCALVERINFO), &host->unipro_ver)) + host->unipro_ver = 0; +} + +static void ufs_sprd_ctrl_uic_compl(struct ufs_hba *hba, bool enable) +{ + u32 set = ufshcd_readl(hba, REG_INTERRUPT_ENABLE); + + if (enable == true) + set |= UIC_COMMAND_COMPL; + else + set &= ~UIC_COMMAND_COMPL; + ufshcd_writel(hba, set, REG_INTERRUPT_ENABLE); +} + +static int ufs_sprd_get_reset_ctrl(struct device *dev, struct ufs_sprd_rst *rci) +{ + rci->rc = devm_reset_control_get(dev, rci->name); + if (IS_ERR(rci->rc)) { + dev_err(dev, "failed to get reset ctrl:%s\n", rci->name); + return PTR_ERR(rci->rc); + } + + return 0; +} + +static int ufs_sprd_get_syscon_reg(struct device *dev, struct ufs_sprd_syscon *sysci) +{ + sysci->regmap = syscon_regmap_lookup_by_phandle(dev->of_node, sysci->name); + if (IS_ERR(sysci->regmap)) { + dev_err(dev, "failed to get ufs syscon:%s\n", sysci->name); + return PTR_ERR(sysci->regmap); + } + + return 0; +} + +static int ufs_sprd_get_vreg(struct device *dev, struct ufs_sprd_vreg *vregi) +{ + vregi->vreg = devm_regulator_get(dev, vregi->name); + if (IS_ERR(vregi->vreg)) { + dev_err(dev, "failed to get vreg:%s\n", vregi->name); + return PTR_ERR(vregi->vreg); + } + + return 0; +} + +static int ufs_sprd_parse_dt(struct device *dev, struct ufs_hba *hba, struct ufs_sprd_host *host) +{ + u32 i; + struct ufs_sprd_priv *priv = host->priv; + int ret = 0; + + /* Parse UFS reset ctrl info */ + for (i = 0; i < SPRD_UFS_RST_MAX; i++) { + if (!priv->rci[i].name) + continue; + ret = ufs_sprd_get_reset_ctrl(dev, &priv->rci[i]); + if (ret) + goto out; + } + + /* Parse UFS syscon reg info */ + for (i = 0; i < SPRD_UFS_SYSCON_MAX; i++) { + if (!priv->sysci[i].name) + continue; + ret = ufs_sprd_get_syscon_reg(dev, &priv->sysci[i]); + if (ret) + goto out; + } + + /* Parse UFS vreg info */ + for (i = 0; i < SPRD_UFS_VREG_MAX; i++) { + if (!priv->vregi[i].name) + continue; + ret = ufs_sprd_get_vreg(dev, &priv->vregi[i]); + if (ret) + goto out; + } + +out: + return ret; +} + +static int ufs_sprd_common_init(struct ufs_hba *hba) +{ + struct device *dev = hba->dev; + struct ufs_sprd_host *host; + struct platform_device __maybe_unused *pdev = to_platform_device(dev); + const struct of_device_id *of_id; + int ret = 0; + + host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL); + if (!host) + return -ENOMEM; + + of_id = of_match_node(ufs_sprd_of_match, pdev->dev.of_node); + if (of_id->data != NULL) + host->priv = container_of(of_id->data, struct ufs_sprd_priv, + ufs_hba_sprd_vops); + + host->hba = hba; + ufshcd_set_variant(hba, host); + + hba->caps |= UFSHCD_CAP_CLK_GATING | + 
UFSHCD_CAP_CRYPTO | + UFSHCD_CAP_WB_EN; + hba->quirks |= UFSHCD_QUIRK_DELAY_BEFORE_DME_CMDS; + + ret = ufs_sprd_parse_dt(dev, hba, host); + + return ret; +} + +static int sprd_ufs_pwr_change_notify(struct ufs_hba *hba, + enum ufs_notify_change_status status, + struct ufs_pa_layer_attr *dev_max_params, + struct ufs_pa_layer_attr *dev_req_params) +{ + struct ufs_sprd_host *host = ufshcd_get_variant(hba); + + if (status == PRE_CHANGE) { + memcpy(dev_req_params, dev_max_params, + sizeof(struct ufs_pa_layer_attr)); + if (host->unipro_ver >= UFS_UNIPRO_VER_1_8) + ufshcd_dme_configure_adapt(hba, dev_req_params->gear_tx, + PA_INITIAL_ADAPT); + } + + return 0; +} + +static int ufs_sprd_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, + enum ufs_notify_change_status status) +{ + unsigned long flags; + + if (status == PRE_CHANGE) { + if (ufshcd_is_auto_hibern8_supported(hba)) { + spin_lock_irqsave(hba->host->host_lock, flags); + ufshcd_writel(hba, 0, REG_AUTO_HIBERNATE_IDLE_TIMER); + spin_unlock_irqrestore(hba->host->host_lock, flags); + } + } + + return 0; +} + +static void ufs_sprd_n6_host_reset(struct ufs_hba *hba) +{ + struct ufs_sprd_priv *priv = ufs_sprd_get_priv_data(hba); + + dev_info(hba->dev, "ufs host reset!\n"); + + reset_control_assert(priv->rci[SPRD_UFSHCI_SOFT_RST].rc); + usleep_range(1000, 1100); + reset_control_deassert(priv->rci[SPRD_UFSHCI_SOFT_RST].rc); +} + +static int ufs_sprd_n6_device_reset(struct ufs_hba *hba) +{ + struct ufs_sprd_priv *priv = ufs_sprd_get_priv_data(hba); + + dev_info(hba->dev, "ufs device reset!\n"); + + reset_control_assert(priv->rci[SPRD_UFS_DEV_RST].rc); + usleep_range(1000, 1100); + reset_control_deassert(priv->rci[SPRD_UFS_DEV_RST].rc); + + return 0; +} + +static void ufs_sprd_n6_key_acc_enable(struct ufs_hba *hba) +{ + u32 val; + u32 retry = 10; + struct arm_smccc_res res; + +check_hce: + /* Key access only can be enabled under HCE enable */ + val = ufshcd_readl(hba, REG_CONTROLLER_ENABLE); + if (!(val & CONTROLLER_ENABLE)) { + ufs_sprd_n6_host_reset(hba); + val |= CONTROLLER_ENABLE; + ufshcd_writel(hba, val, REG_CONTROLLER_ENABLE); + usleep_range(1000, 1100); + if (retry) { + retry--; + goto check_hce; + } + goto disable_crypto; + } + + arm_smccc_smc(SPRD_SIP_SVC_STORAGE_UFS_CRYPTO_ENABLE, + 0, 0, 0, 0, 0, 0, 0, &res); + if (!res.a0) + return; + +disable_crypto: + dev_err(hba->dev, "key reg access enable fail, disable crypto\n"); + hba->caps &= ~UFSHCD_CAP_CRYPTO; +} + +static int ufs_sprd_n6_init(struct ufs_hba *hba) +{ + struct ufs_sprd_priv *priv; + int ret = 0; + + ret = ufs_sprd_common_init(hba); + if (ret != 0) + return ret; + + priv = ufs_sprd_get_priv_data(hba); + + ret = regulator_enable(priv->vregi[SPRD_UFS_VDD_MPHY].vreg); + if (ret) + return -ENODEV; + + if (hba->caps & UFSHCD_CAP_CRYPTO) + ufs_sprd_n6_key_acc_enable(hba); + + return 0; +} + +static int ufs_sprd_n6_phy_init(struct ufs_hba *hba) +{ + int ret = 0; + uint32_t val = 0; + uint32_t retry = 10; + uint32_t offset; + struct ufs_sprd_priv *priv = ufs_sprd_get_priv_data(hba); + + ufshcd_dme_set(hba, UIC_ARG_MIB(CBREFCLKCTRL2), 0x90); + ufshcd_dme_set(hba, UIC_ARG_MIB(CBCRCTRL), 0x01); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(RXSQCONTROL, + UIC_ARG_MPHY_RX_GEN_SEL_INDEX(0)), 0x01); + ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(RXSQCONTROL, + UIC_ARG_MPHY_RX_GEN_SEL_INDEX(1)), 0x01); + ufshcd_dme_set(hba, UIC_ARG_MIB(VS_MPHYCFGUPDT), 0x01); + ufshcd_dme_set(hba, UIC_ARG_MIB(CBRATESEL), 0x01); + + do { + /* phy_sram_init_done */ + ufs_sprd_regmap_read(priv, SPRD_UFS_ANLG, 0xc, &val); + if ((val 
& 0x1) == 0x1) { + for (offset = 0x40; offset < 0x42; offset++) { + /* Lane afe calibration */ + ufshcd_dme_set(hba, UIC_ARG_MIB(CBCREGADDRLSB), 0x1c); + ufshcd_dme_set(hba, UIC_ARG_MIB(CBCREGADDRMSB), offset); + ufshcd_dme_set(hba, UIC_ARG_MIB(CBCREGWRLSB), 0x04); + ufshcd_dme_set(hba, UIC_ARG_MIB(CBCREGWRMSB), 0x00); + ufshcd_dme_set(hba, UIC_ARG_MIB(CBCREGRDWRSEL), 0x01); + ufshcd_dme_set(hba, UIC_ARG_MIB(VS_MPHYCFGUPDT), 0x01); + } + + goto update_phy; + } + udelay(1000); + retry--; + } while (retry > 0); + + ret = -ETIMEDOUT; + goto out; + +update_phy: + /* phy_sram_ext_ld_done */ + ufs_sprd_regmap_update(priv, SPRD_UFS_ANLG, 0xc, 0x2, 0); + ufshcd_dme_set(hba, UIC_ARG_MIB(VS_MPHYCFGUPDT), 0x01); + ufshcd_dme_set(hba, UIC_ARG_MIB(VS_MPHYDISABLE), 0x0); +out: + return ret; +} + + +static int sprd_ufs_n6_hce_enable_notify(struct ufs_hba *hba, + enum ufs_notify_change_status status) +{ + int err = 0; + struct ufs_sprd_priv *priv = ufs_sprd_get_priv_data(hba); + + if (status == PRE_CHANGE) { + /* phy_sram_ext_ld_done */ + ufs_sprd_regmap_update(priv, SPRD_UFS_ANLG, 0xc, 0x2, 0x2); + /* phy_sram_bypass */ + ufs_sprd_regmap_update(priv, SPRD_UFS_ANLG, 0xc, 0x4, 0x4); + + ufs_sprd_n6_host_reset(hba); + + if (hba->caps & UFSHCD_CAP_CRYPTO) + ufs_sprd_n6_key_acc_enable(hba); + } + + if (status == POST_CHANGE) { + err = ufs_sprd_n6_phy_init(hba); + if (err) { + dev_err(hba->dev, "Phy setup failed (%d)\n", err); + goto out; + } + + ufs_sprd_get_unipro_ver(hba); + } +out: + return err; +} + +static void sprd_ufs_n6_h8_notify(struct ufs_hba *hba, + enum uic_cmd_dme cmd, + enum ufs_notify_change_status status) +{ + struct ufs_sprd_priv *priv = ufs_sprd_get_priv_data(hba); + + if (status == PRE_CHANGE) { + if (cmd == UIC_CMD_DME_HIBER_ENTER) + /* + * Disable UIC COMPL INTR to prevent access to UFSHCI after + * checking HCS.UPMCRS + */ + ufs_sprd_ctrl_uic_compl(hba, false); + + if (cmd == UIC_CMD_DME_HIBER_EXIT) { + ufs_sprd_regmap_update(priv, SPRD_UFS_AON_APB, APB_UFSDEV_REG, + APB_UFSDEV_REFCLK_EN, APB_UFSDEV_REFCLK_EN); + ufs_sprd_regmap_update(priv, SPRD_UFS_AON_APB, APB_USB31PLL_CTRL, + APB_USB31PLLV_REF2MPHY, APB_USB31PLLV_REF2MPHY); + } + } + + if (status == POST_CHANGE) { + if (cmd == UIC_CMD_DME_HIBER_EXIT) + ufs_sprd_ctrl_uic_compl(hba, true); + + if (cmd == UIC_CMD_DME_HIBER_ENTER) { + ufs_sprd_regmap_update(priv, SPRD_UFS_AON_APB, APB_UFSDEV_REG, + APB_UFSDEV_REFCLK_EN, 0); + ufs_sprd_regmap_update(priv, SPRD_UFS_AON_APB, APB_USB31PLL_CTRL, + APB_USB31PLLV_REF2MPHY, 0); + } + } +} + +static struct ufs_sprd_priv n6_ufs = { + .rci[SPRD_UFSHCI_SOFT_RST] = { .name = "controller", }, + .rci[SPRD_UFS_DEV_RST] = { .name = "device", }, + + .sysci[SPRD_UFS_ANLG] = { .name = "sprd,ufs-anlg-syscon", }, + .sysci[SPRD_UFS_AON_APB] = { .name = "sprd,aon-apb-syscon", }, + + .vregi[SPRD_UFS_VDD_MPHY] = { .name = "vdd-mphy", }, + + .ufs_hba_sprd_vops = { + .name = "sprd,ums9620-ufs", + .init = ufs_sprd_n6_init, + .hce_enable_notify = sprd_ufs_n6_hce_enable_notify, + .pwr_change_notify = sprd_ufs_pwr_change_notify, + .hibern8_notify = sprd_ufs_n6_h8_notify, + .device_reset = ufs_sprd_n6_device_reset, + .suspend = ufs_sprd_suspend, + }, +}; + +static const struct of_device_id __maybe_unused ufs_sprd_of_match[] = { + { .compatible = "sprd,ums9620-ufs", .data = &n6_ufs.ufs_hba_sprd_vops}, + {}, +}; +MODULE_DEVICE_TABLE(of, ufs_sprd_of_match); + +static int ufs_sprd_probe(struct platform_device *pdev) +{ + int err; + struct device *dev = &pdev->dev; + const struct of_device_id *of_id; + + of_id = 
of_match_node(ufs_sprd_of_match, dev->of_node); + err = ufshcd_pltfrm_init(pdev, of_id->data); + if (err) + dev_err(dev, "ufshcd_pltfrm_init() failed %d\n", err); + + return err; +} + +static int ufs_sprd_remove(struct platform_device *pdev) +{ + struct ufs_hba *hba = platform_get_drvdata(pdev); + + pm_runtime_get_sync(&(pdev)->dev); + ufshcd_remove(hba); + return 0; +} + +static const struct dev_pm_ops ufs_sprd_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(ufshcd_system_suspend, ufshcd_system_resume) + SET_RUNTIME_PM_OPS(ufshcd_runtime_suspend, ufshcd_runtime_resume, NULL) + .prepare = ufshcd_suspend_prepare, + .complete = ufshcd_resume_complete, +}; + +static struct platform_driver ufs_sprd_pltform = { + .probe = ufs_sprd_probe, + .remove = ufs_sprd_remove, + .shutdown = ufshcd_pltfrm_shutdown, + .driver = { + .name = "ufshcd-sprd", + .pm = &ufs_sprd_pm_ops, + .of_match_table = of_match_ptr(ufs_sprd_of_match), + }, +}; +module_platform_driver(ufs_sprd_pltform); + +MODULE_AUTHOR("Zhe Wang "); +MODULE_DESCRIPTION("Unisoc UFS Host Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/ufs/host/ufs-sprd.h b/drivers/ufs/host/ufs-sprd.h new file mode 100644 index 000000000000..26ad5c3af4c1 --- /dev/null +++ b/drivers/ufs/host/ufs-sprd.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * UNISOC UFS Host Controller driver + * + * Copyright (C) 2022 Unisoc, Inc. + * Author: Zhe Wang + */ + +#ifndef _UFS_SPRD_H_ +#define _UFS_SPRD_H_ + +/* Vendor specific attributes */ +#define RXSQCONTROL 0x8009 +#define CBRATESEL 0x8114 +#define CBCREGADDRLSB 0x8116 +#define CBCREGADDRMSB 0x8117 +#define CBCREGWRLSB 0x8118 +#define CBCREGWRMSB 0x8119 +#define CBCREGRDWRSEL 0x811C +#define CBCRCTRL 0x811F +#define CBREFCLKCTRL2 0x8132 +#define VS_MPHYDISABLE 0xD0C1 + +#define APB_UFSDEV_REG 0xCE8 +#define APB_UFSDEV_REFCLK_EN 0x2 +#define APB_USB31PLL_CTRL 0xCFC +#define APB_USB31PLLV_REF2MPHY 0x1 + +#define SPRD_SIP_SVC_STORAGE_UFS_CRYPTO_ENABLE \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, \ + 0x0301) + +enum SPRD_UFS_RST_INDEX { + SPRD_UFSHCI_SOFT_RST, + SPRD_UFS_DEV_RST, + + SPRD_UFS_RST_MAX +}; + +enum SPRD_UFS_SYSCON_INDEX { + SPRD_UFS_ANLG, + SPRD_UFS_AON_APB, + + SPRD_UFS_SYSCON_MAX +}; + +enum SPRD_UFS_VREG_INDEX { + SPRD_UFS_VDD_MPHY, + + SPRD_UFS_VREG_MAX +}; + +struct ufs_sprd_rst { + const char *name; + struct reset_control *rc; +}; + +struct ufs_sprd_syscon { + const char *name; + struct regmap *regmap; +}; + +struct ufs_sprd_vreg { + const char *name; + struct regulator *vreg; +}; + +struct ufs_sprd_priv { + struct ufs_sprd_rst rci[SPRD_UFS_RST_MAX]; + struct ufs_sprd_syscon sysci[SPRD_UFS_SYSCON_MAX]; + struct ufs_sprd_vreg vregi[SPRD_UFS_VREG_MAX]; + const struct ufs_hba_variant_ops ufs_hba_sprd_vops; +}; + +struct ufs_sprd_host { + struct ufs_hba *hba; + struct ufs_sprd_priv *priv; + void __iomem *ufs_dbg_mmio; + + enum ufs_unipro_ver unipro_ver; +}; + +#endif /* _UFS_SPRD_H_ */ diff --git a/drivers/scsi/ufs/ufshcd-dwc.c b/drivers/ufs/host/ufshcd-dwc.c similarity index 98% rename from drivers/scsi/ufs/ufshcd-dwc.c rename to drivers/ufs/host/ufshcd-dwc.c index 5bb9d3a88795..e28a67e1e314 100644 --- a/drivers/scsi/ufs/ufshcd-dwc.c +++ b/drivers/ufs/host/ufshcd-dwc.c @@ -7,8 +7,10 @@ * Authors: Joao Pinto */ -#include "ufshcd.h" -#include "unipro.h" +#include + +#include +#include #include "ufshcd-dwc.h" #include "ufshci-dwc.h" diff --git a/drivers/scsi/ufs/ufshcd-dwc.h b/drivers/ufs/host/ufshcd-dwc.h similarity index 95% rename from 
drivers/scsi/ufs/ufshcd-dwc.h rename to drivers/ufs/host/ufshcd-dwc.h index 4268ca2eb64c..ad91ea56662c 100644 --- a/drivers/scsi/ufs/ufshcd-dwc.h +++ b/drivers/ufs/host/ufshcd-dwc.h @@ -10,6 +10,8 @@ #ifndef _UFSHCD_DWC_H #define _UFSHCD_DWC_H +#include + struct ufshcd_dme_attr_val { u32 attr_sel; u32 mib_val; diff --git a/drivers/scsi/ufs/ufshcd-pci.c b/drivers/ufs/host/ufshcd-pci.c similarity index 96% rename from drivers/scsi/ufs/ufshcd-pci.c rename to drivers/ufs/host/ufshcd-pci.c index e892b9feffb1..1c91f43e15c8 100644 --- a/drivers/scsi/ufs/ufshcd-pci.c +++ b/drivers/ufs/host/ufshcd-pci.c @@ -2,7 +2,6 @@ /* * Universal Flash Storage Host controller PCI glue driver * - * This code is based on drivers/scsi/ufs/ufshcd-pci.c * Copyright (C) 2011-2013 Samsung India Software Operations * * Authors: @@ -10,7 +9,9 @@ * Vinayak Holikatti */ -#include "ufshcd.h" +#include +#include +#include #include #include #include @@ -23,7 +24,7 @@ struct ufs_host { void (*late_init)(struct ufs_hba *hba); }; -enum { +enum intel_ufs_dsm_func_id { INTEL_DSM_FNS = 0, INTEL_DSM_RESET = 1, }; @@ -41,6 +42,15 @@ static const guid_t intel_dsm_guid = GUID_INIT(0x1A4832A0, 0x7D03, 0x43CA, 0xB0, 0x20, 0xF6, 0xDC, 0xD1, 0x2A, 0x19, 0x50); +static bool __intel_dsm_supported(struct intel_host *host, + enum intel_ufs_dsm_func_id fn) +{ + return fn < 32 && fn >= 0 && (host->dsm_fns & (1u << fn)); +} + +#define INTEL_DSM_SUPPORTED(host, name) \ + __intel_dsm_supported(host, INTEL_DSM_##name) + static int __intel_dsm(struct intel_host *intel_host, struct device *dev, unsigned int fn, u32 *result) { @@ -70,7 +80,7 @@ out: static int intel_dsm(struct intel_host *intel_host, struct device *dev, unsigned int fn, u32 *result) { - if (fn > 31 || !(intel_host->dsm_fns & (1 << fn))) + if (!__intel_dsm_supported(intel_host, fn)) return -EOPNOTSUPP; return __intel_dsm(intel_host, dev, fn, result); @@ -299,7 +309,7 @@ static int ufs_intel_device_reset(struct ufs_hba *hba) { struct intel_host *host = ufshcd_get_variant(hba); - if (host->dsm_fns & INTEL_DSM_RESET) { + if (INTEL_DSM_SUPPORTED(host, RESET)) { u32 result = 0; int err; @@ -341,7 +351,7 @@ static int ufs_intel_common_init(struct ufs_hba *hba) return -ENOMEM; ufshcd_set_variant(hba, host); intel_dsm_init(host, hba->dev); - if (host->dsm_fns & INTEL_DSM_RESET) { + if (INTEL_DSM_SUPPORTED(host, RESET)) { if (hba->vops->device_reset) hba->caps |= UFSHCD_CAP_DEEPSLEEP; } else { @@ -425,6 +435,7 @@ static int ufs_intel_adl_init(struct ufs_hba *hba) { hba->nop_out_timeout = 200; hba->quirks |= UFSHCD_QUIRK_BROKEN_AUTO_HIBERN8; + hba->caps |= UFSHCD_CAP_WB_EN; return ufs_intel_common_init(hba); } @@ -618,4 +629,3 @@ MODULE_AUTHOR("Santosh Yaragnavi "); MODULE_AUTHOR("Vinayak Holikatti "); MODULE_DESCRIPTION("UFS host controller PCI glue driver"); MODULE_LICENSE("GPL"); -MODULE_VERSION(UFSHCD_DRIVER_VERSION); diff --git a/drivers/scsi/ufs/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c similarity index 89% rename from drivers/scsi/ufs/ufshcd-pltfrm.c rename to drivers/ufs/host/ufshcd-pltfrm.c index adc302b1a57a..5739ff007828 100644 --- a/drivers/scsi/ufs/ufshcd-pltfrm.c +++ b/drivers/ufs/host/ufshcd-pltfrm.c @@ -8,13 +8,14 @@ * Vinayak Holikatti */ +#include #include #include #include -#include "ufshcd.h" +#include #include "ufshcd-pltfrm.h" -#include "unipro.h" +#include #define UFSHCD_DEFAULT_LANES_PER_DIRECTION 2 @@ -25,7 +26,7 @@ static int ufshcd_parse_clock_info(struct ufs_hba *hba) int i; struct device *dev = hba->dev; struct device_node *np = dev->of_node; - char *name; + 
const char *name; u32 *clkfreq = NULL; struct ufs_clk_info *clki; int len = 0; @@ -78,8 +79,8 @@ static int ufshcd_parse_clock_info(struct ufs_hba *hba) } for (i = 0; i < sz; i += 2) { - ret = of_property_read_string_index(np, - "clock-names", i/2, (const char **)&name); + ret = of_property_read_string_index(np, "clock-names", i/2, + &name); if (ret) goto out; @@ -119,8 +120,8 @@ static bool phandle_exists(const struct device_node *np, } #define MAX_PROP_SIZE 32 -static int ufshcd_populate_vreg(struct device *dev, const char *name, - struct ufs_vreg **out_vreg) +int ufshcd_populate_vreg(struct device *dev, const char *name, + struct ufs_vreg **out_vreg) { char prop_name[MAX_PROP_SIZE]; struct ufs_vreg *vreg = NULL; @@ -155,6 +156,7 @@ out: *out_vreg = vreg; return 0; } +EXPORT_SYMBOL_GPL(ufshcd_populate_vreg); /** * ufshcd_parse_regulator_info - get regulator info from device tree @@ -218,8 +220,8 @@ static void ufshcd_init_lanes_per_dir(struct ufs_hba *hba) * * Returns 0 on success, non-zero value on failure */ -int ufshcd_get_pwr_dev_param(struct ufs_dev_params *pltfrm_param, - struct ufs_pa_layer_attr *dev_max, +int ufshcd_get_pwr_dev_param(const struct ufs_dev_params *pltfrm_param, + const struct ufs_pa_layer_attr *dev_max, struct ufs_pa_layer_attr *agreed_pwr) { int min_pltfrm_gear; @@ -308,18 +310,20 @@ EXPORT_SYMBOL_GPL(ufshcd_get_pwr_dev_param); void ufshcd_init_pwr_dev_param(struct ufs_dev_params *dev_param) { - dev_param->tx_lanes = 2; - dev_param->rx_lanes = 2; - dev_param->hs_rx_gear = UFS_HS_G3; - dev_param->hs_tx_gear = UFS_HS_G3; - dev_param->pwm_rx_gear = UFS_PWM_G4; - dev_param->pwm_tx_gear = UFS_PWM_G4; - dev_param->rx_pwr_pwm = SLOW_MODE; - dev_param->tx_pwr_pwm = SLOW_MODE; - dev_param->rx_pwr_hs = FAST_MODE; - dev_param->tx_pwr_hs = FAST_MODE; - dev_param->hs_rate = PA_HS_MODE_B; - dev_param->desired_working_mode = UFS_HS_MODE; + *dev_param = (struct ufs_dev_params){ + .tx_lanes = 2, + .rx_lanes = 2, + .hs_rx_gear = UFS_HS_G3, + .hs_tx_gear = UFS_HS_G3, + .pwm_rx_gear = UFS_PWM_G4, + .pwm_tx_gear = UFS_PWM_G4, + .rx_pwr_pwm = SLOW_MODE, + .tx_pwr_pwm = SLOW_MODE, + .rx_pwr_hs = FAST_MODE, + .tx_pwr_hs = FAST_MODE, + .hs_rate = PA_HS_MODE_B, + .desired_working_mode = UFS_HS_MODE, + }; } EXPORT_SYMBOL_GPL(ufshcd_init_pwr_dev_param); @@ -352,7 +356,7 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, err = ufshcd_alloc_host(dev, &hba); if (err) { - dev_err(&pdev->dev, "Allocation failed\n"); + dev_err(dev, "Allocation failed\n"); goto out; } @@ -360,13 +364,13 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, err = ufshcd_parse_clock_info(hba); if (err) { - dev_err(&pdev->dev, "%s: clock parse failed %d\n", + dev_err(dev, "%s: clock parse failed %d\n", __func__, err); goto dealloc_host; } err = ufshcd_parse_regulator_info(hba); if (err) { - dev_err(&pdev->dev, "%s: regulator init failed %d\n", + dev_err(dev, "%s: regulator init failed %d\n", __func__, err); goto dealloc_host; } @@ -379,8 +383,8 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, goto dealloc_host; } - pm_runtime_set_active(&pdev->dev); - pm_runtime_enable(&pdev->dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); return 0; @@ -395,4 +399,3 @@ MODULE_AUTHOR("Santosh Yaragnavi "); MODULE_AUTHOR("Vinayak Holikatti "); MODULE_DESCRIPTION("UFS host controller Platform bus based glue driver"); MODULE_LICENSE("GPL"); -MODULE_VERSION(UFSHCD_DRIVER_VERSION); diff --git a/drivers/scsi/ufs/ufshcd-pltfrm.h b/drivers/ufs/host/ufshcd-pltfrm.h similarity index 82% rename from 
drivers/scsi/ufs/ufshcd-pltfrm.h rename to drivers/ufs/host/ufshcd-pltfrm.h index c33e28ac6ef6..2e4ba2bfbcad 100644 --- a/drivers/scsi/ufs/ufshcd-pltfrm.h +++ b/drivers/ufs/host/ufshcd-pltfrm.h @@ -5,7 +5,7 @@ #ifndef UFSHCD_PLTFRM_H_ #define UFSHCD_PLTFRM_H_ -#include "ufshcd.h" +#include <ufs/ufshcd.h> #define UFS_PWM_MODE 1 #define UFS_HS_MODE 2 @@ -25,12 +25,14 @@ struct ufs_dev_params { u32 desired_working_mode; }; -int ufshcd_get_pwr_dev_param(struct ufs_dev_params *dev_param, - struct ufs_pa_layer_attr *dev_max, +int ufshcd_get_pwr_dev_param(const struct ufs_dev_params *dev_param, + const struct ufs_pa_layer_attr *dev_max, struct ufs_pa_layer_attr *agreed_pwr); void ufshcd_init_pwr_dev_param(struct ufs_dev_params *dev_param); int ufshcd_pltfrm_init(struct platform_device *pdev, const struct ufs_hba_variant_ops *vops); void ufshcd_pltfrm_shutdown(struct platform_device *pdev); +int ufshcd_populate_vreg(struct device *dev, const char *name, + struct ufs_vreg **out_vreg); #endif /* UFSHCD_PLTFRM_H_ */ diff --git a/drivers/scsi/ufs/ufshci-dwc.h b/drivers/ufs/host/ufshci-dwc.h similarity index 100% rename from drivers/scsi/ufs/ufshci-dwc.h rename to drivers/ufs/host/ufshci-dwc.h diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c index 33ae03ac13a6..02d93b732498 100644 --- a/drivers/usb/atm/usbatm.c +++ b/drivers/usb/atm/usbatm.c @@ -1024,7 +1024,7 @@ int usbatm_usb_probe(struct usb_interface *intf, const struct usb_device_id *id, /* public fields */ instance->driver = driver; - strlcpy(instance->driver_name, driver->driver_name, + strscpy(instance->driver_name, driver->driver_name, sizeof(instance->driver_name)); instance->usb_dev = usb_dev; diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 656ba91c3283..676723231526 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1422,7 +1422,7 @@ static int proc_getdriver(struct usb_dev_state *ps, void __user *arg) if (!intf || !intf->dev.driver) ret = -ENODATA; else { - strlcpy(gd.driver, intf->dev.driver->name, + strscpy(gd.driver, intf->dev.driver->name, sizeof(gd.driver)); ret = (copy_to_user(arg, &gd, sizeof(gd)) ?
-EFAULT : 0); } diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 504476aa43cc..075c1ae68602 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -3155,8 +3155,6 @@ EXPORT_SYMBOL_GPL(usb_hcd_setup_local_mem); /*-------------------------------------------------------------------------*/ -#if IS_ENABLED(CONFIG_USB_MON) - const struct usb_mon_operations *mon_ops; /* @@ -3190,5 +3188,3 @@ void usb_mon_deregister (void) mb(); } EXPORT_SYMBOL_GPL (usb_mon_deregister); - -#endif /* CONFIG_USB_MON || CONFIG_USB_MON_MODULE */ diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index c2fa6eea3a33..f07ed6397c30 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -34,6 +34,7 @@ #include #include #include +#include <trace/hooks/usb.h> #include "hub.h" #include "otg_productlist.h" @@ -2542,6 +2543,11 @@ int usb_new_device(struct usb_device *udev) dev_dbg(&udev->dev, "udev %d, busnum %d, minor = %d\n", udev->devnum, udev->bus->busnum, (((udev->bus->busnum-1) * 128) + (udev->devnum-1))); + + trace_android_vh_usb_new_device_added(udev, &err); + if (err) + goto fail; + /* export the usbdev device-node for libusb */ udev->dev.devt = MKDEV(USB_DEVICE_MAJOR, (((udev->bus->busnum-1) * 128) + (udev->devnum-1))); diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 7258e640e9ee..0b7a92b96817 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -305,6 +305,7 @@ int dwc3_core_soft_reset(struct dwc3 *dwc) udelay(1); } while (--retries); + dev_warn(dwc->dev, "DWC3 controller soft reset failed.\n"); return -ETIMEDOUT; done: @@ -937,6 +938,44 @@ static void dwc3_set_incr_burst_type(struct dwc3 *dwc) dwc3_writel(dwc->regs, DWC3_GSBUSCFG0, cfg); } +static void dwc3_set_power_down_clk_scale(struct dwc3 *dwc) +{ + struct device *dev = dwc->dev; + struct device_node *node = dev->of_node; + struct clk *suspend_clk; + u32 scale; + u32 reg; + + if (dwc->num_clks == 0) + return; + + /* + * The power down scale field specifies how many suspend_clk + * periods fit into a 16KHz clock period. When performing + * the division, round up the remainder. + * + * The power down scale value is calculated using the fastest + * frequency of the suspend_clk. If it isn't fixed (but within + * the accuracy requirement), the driver may not know the max + * rate of the suspend_clk, so only update the power down scale + * if the default is less than the calculated value from + * clk_get_rate() or if the default is questionably high + * (3x or more) to be within the requirement.
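+ * + * As a worked illustration (example clock rate only, not taken from + * the databook): a 32768 Hz suspend_clk gives + * DIV_ROUND_UP(32768, 16000) = 3, so the code below programs a scale + * of 3 unless the reset default already falls between 3 and 3*3 = 9.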
+ */ + suspend_clk = of_clk_get_by_name(node, "suspend"); + if (IS_ERR(suspend_clk)) + return; + + scale = DIV_ROUND_UP(clk_get_rate(suspend_clk), 16000); + reg = dwc3_readl(dwc->regs, DWC3_GCTL); + if ((reg & DWC3_GCTL_PWRDNSCALE_MASK) < DWC3_GCTL_PWRDNSCALE(scale) || + (reg & DWC3_GCTL_PWRDNSCALE_MASK) > DWC3_GCTL_PWRDNSCALE(scale*3)) { + reg &= ~(DWC3_GCTL_PWRDNSCALE_MASK); + reg |= DWC3_GCTL_PWRDNSCALE(scale); + dwc3_writel(dwc->regs, DWC3_GCTL, reg); + } +} + /** * dwc3_core_init - Low-level initialization of DWC3 Core * @dwc: Pointer to our controller context structure @@ -1018,6 +1057,9 @@ static int dwc3_core_init(struct dwc3 *dwc) if (ret) goto err1; + /* Set power down scale of suspend_clk */ + dwc3_set_power_down_clk_scale(dwc); + /* Adjust Frame Length */ dwc3_frame_length_adjustment(dwc); diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index e82e4cbe4ec7..00c58b8c2a72 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -22,6 +22,7 @@ #include #include #include +#include <linux/android_kabi.h> #include #include @@ -230,6 +231,7 @@ /* Global Configuration Register */ #define DWC3_GCTL_PWRDNSCALE(n) ((n) << 19) +#define DWC3_GCTL_PWRDNSCALE_MASK DWC3_GCTL_PWRDNSCALE(0x1fff) #define DWC3_GCTL_U2RSTECN BIT(16) #define DWC3_GCTL_RAMCLKSEL(x) (((x) & DWC3_GCTL_CLK_MASK) << 6) #define DWC3_GCTL_CLK_BUS (0) @@ -661,6 +663,8 @@ struct dwc3_event_buffer { dma_addr_t dma; struct dwc3 *dwc; + + ANDROID_KABI_RESERVE(1); }; #define DWC3_EP_FLAG_STALLED BIT(0) @@ -725,6 +729,7 @@ struct dwc3_ep { #define DWC3_EP_FIRST_STREAM_PRIMED BIT(10) #define DWC3_EP_PENDING_CLEAR_STALL BIT(11) #define DWC3_EP_TXFIFO_RESIZED BIT(12) +#define DWC3_EP_DELAY_STOP BIT(13) /* This last one is specific to EP0 */ #define DWC3_EP0_DIR_IN BIT(31) @@ -755,6 +760,9 @@ struct dwc3_ep { /* For isochronous START TRANSFER workaround only */ u8 combo_num; int start_cmd_status; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; enum dwc3_phy { @@ -866,6 +874,9 @@ struct dwc3_hwparams { u32 hwparams7; u32 hwparams8; u32 hwparams9; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /* HWPARAMS0 */ @@ -934,6 +945,9 @@ struct dwc3_request { unsigned int needs_extra_trb:1; unsigned int direction:1; unsigned int mapped:1; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /* @@ -1303,6 +1317,11 @@ struct dwc3 { int max_cfg_eps; int last_fifo_depth; int num_ep_resized; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define INCRX_BURST_MODE 0 diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 658739410992..9b6ebc3c902d 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -218,7 +218,7 @@ out: return ret; } -static void dwc3_ep0_stall_and_restart(struct dwc3 *dwc) +void dwc3_ep0_stall_and_restart(struct dwc3 *dwc) { struct dwc3_ep *dep; @@ -271,6 +271,7 @@ void dwc3_ep0_out_start(struct dwc3 *dwc) { struct dwc3_ep *dep; int ret; + int i; complete(&dwc->ep0_in_setup); @@ -279,6 +280,19 @@ void dwc3_ep0_out_start(struct dwc3 *dwc) DWC3_TRBCTL_CONTROL_SETUP, false); ret = dwc3_ep0_start_trans(dep); WARN_ON(ret < 0); + for (i = 2; i < DWC3_ENDPOINTS_NUM; i++) { + struct dwc3_ep *dwc3_ep; + + dwc3_ep = dwc->eps[i]; + if (!dwc3_ep) + continue; + + if (!(dwc3_ep->flags & DWC3_EP_DELAY_STOP)) + continue; + + dwc3_ep->flags &= ~DWC3_EP_DELAY_STOP; + dwc3_stop_active_transfer(dwc3_ep, true, true); + } } static struct dwc3_ep *dwc3_wIndex_to_dep(struct dwc3 *dwc, __le16 wIndex_le) @@ -1073,13 +1087,18 @@ void
dwc3_ep0_send_delayed_status(struct dwc3 *dwc) __dwc3_ep0_do_control_status(dwc, dwc->eps[direction]); } -static void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep) +void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep) { struct dwc3_gadget_ep_cmd_params params; u32 cmd; int ret; - if (!dep->resource_index) + /* + * For status/DATA OUT stage, TRB will be queued on ep0 out + * endpoint for which resource index is zero. Hence allow + * queuing ENDXFER command for ep0 out endpoint. + */ + if (!dep->resource_index && dep->number) return; cmd = DWC3_DEPCMD_ENDTRANSFER; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 4812ba4bbedd..aaa57e1d9e53 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -640,9 +640,6 @@ static int dwc3_gadget_set_ep_config(struct dwc3_ep *dep, unsigned int action) return dwc3_send_gadget_ep_cmd(dep, DWC3_DEPCMD_SETEPCONFIG, ¶ms); } -static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, - bool interrupt); - /** * dwc3_gadget_calc_tx_fifo_size - calculates the txfifo size value * @dwc: pointer to the DWC3 context @@ -871,12 +868,13 @@ static int __dwc3_gadget_ep_enable(struct dwc3_ep *dep, unsigned int action) reg |= DWC3_DALEPENA_EP(dep->number); dwc3_writel(dwc->regs, DWC3_DALEPENA, reg); + dep->trb_dequeue = 0; + dep->trb_enqueue = 0; + if (usb_endpoint_xfer_control(desc)) goto out; /* Initialize the TRB ring */ - dep->trb_dequeue = 0; - dep->trb_enqueue = 0; memset(dep->trb_pool, 0, sizeof(struct dwc3_trb) * DWC3_TRB_NUM); @@ -1832,7 +1830,7 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep *dep, struct dwc3_request *req) struct dwc3 *dwc = dep->dwc; if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected) { - dev_err(dwc->dev, "%s: can't queue to disabled endpoint\n", + dev_dbg(dwc->dev, "%s: can't queue to disabled endpoint\n", dep->name); return -ESHUTDOWN; } @@ -1865,6 +1863,7 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep *dep, struct dwc3_request *req) */ if ((dep->flags & DWC3_EP_END_TRANSFER_PENDING) || (dep->flags & DWC3_EP_WEDGE) || + (dep->flags & DWC3_EP_DELAY_STOP) || (dep->flags & DWC3_EP_STALL)) { dep->flags |= DWC3_EP_DELAY_START; return 0; @@ -2005,6 +2004,16 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, if (r == req) { struct dwc3_request *t; + /* + * If a Setup packet is received but yet to DMA out, the controller will + * not process the End Transfer command of any endpoint. Polling of its + * DEPCMD.CmdAct may block setting up TRB for Setup packet, causing a + * timeout. Delay issuing the End Transfer command until the Setup TRB is + * prepared. 
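+ * + * (The DWC3_EP_DELAY_STOP flag set just below records the deferral; + * dwc3_ep0_out_start() clears it and issues the pending + * dwc3_stop_active_transfer() once the next Setup TRB is in place.)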
+ */ + if (dwc->ep0state != EP0_SETUP_PHASE && !dwc->delayed_status) + dep->flags |= DWC3_EP_DELAY_STOP; + /* wait until it is processed */ dwc3_stop_active_transfer(dep, true, true); @@ -2088,7 +2097,8 @@ int __dwc3_gadget_ep_set_halt(struct dwc3_ep *dep, int value, int protocol) list_for_each_entry_safe(req, tmp, &dep->started_list, list) dwc3_gadget_move_cancelled_request(req, DWC3_REQUEST_STATUS_STALLED); - if (dep->flags & DWC3_EP_END_TRANSFER_PENDING) { + if (dep->flags & DWC3_EP_END_TRANSFER_PENDING || + (dep->flags & DWC3_EP_DELAY_STOP)) { dep->flags |= DWC3_EP_PENDING_CLEAR_STALL; return 0; } @@ -2670,6 +2680,7 @@ static int __dwc3_gadget_start(struct dwc3 *dwc) /* begin to receive SETUP packets */ dwc->ep0state = EP0_SETUP_PHASE; + dwc->ep0_bounced = false; dwc->link_state = DWC3_LINK_STATE_SS_DIS; dwc->delayed_status = false; dwc3_ep0_out_start(dwc); @@ -3604,7 +3615,7 @@ static void dwc3_reset_gadget(struct dwc3 *dwc) } } -static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, +void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, bool interrupt) { struct dwc3_gadget_ep_cmd_params params; @@ -3612,6 +3623,7 @@ static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, int ret; if (!(dep->flags & DWC3_EP_TRANSFER_STARTED) || + (dep->flags & DWC3_EP_DELAY_STOP) || (dep->flags & DWC3_EP_END_TRANSFER_PENDING)) return; @@ -3744,6 +3756,27 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 *dwc) } dwc3_reset_gadget(dwc); + + /* + * From SNPS databook section 8.1.2, the EP0 should be in setup + * phase. So ensure that EP0 is in setup phase by issuing a stall + * and restart if EP0 is not in setup phase. + */ + if (dwc->ep0state != EP0_SETUP_PHASE) { + unsigned int dir; + + dir = !!dwc->ep0_expect_in; + if (dwc->ep0state == EP0_DATA_PHASE) + dwc3_ep0_end_control_data(dwc, dwc->eps[dir]); + else + dwc3_ep0_end_control_data(dwc, dwc->eps[!dir]); + + dwc->eps[0]->trb_enqueue = 0; + dwc->eps[1]->trb_enqueue = 0; + + dwc3_ep0_stall_and_restart(dwc); + } + /* * In the Synopsis DesignWare Cores USB3 Databook Rev. 
3.30a * Section 4.1.2 Table 4-2, it states that during a USB reset, the SW diff --git a/drivers/usb/dwc3/gadget.h b/drivers/usb/dwc3/gadget.h index 77df4b6d6c13..55a56cf67d73 100644 --- a/drivers/usb/dwc3/gadget.h +++ b/drivers/usb/dwc3/gadget.h @@ -110,12 +110,15 @@ void dwc3_gadget_giveback(struct dwc3_ep *dep, struct dwc3_request *req, void dwc3_ep0_interrupt(struct dwc3 *dwc, const struct dwc3_event_depevt *event); void dwc3_ep0_out_start(struct dwc3 *dwc); +void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep); +void dwc3_ep0_stall_and_restart(struct dwc3 *dwc); int __dwc3_gadget_ep0_set_halt(struct usb_ep *ep, int value); int dwc3_gadget_ep0_set_halt(struct usb_ep *ep, int value); int dwc3_gadget_ep0_queue(struct usb_ep *ep, struct usb_request *request, gfp_t gfp_flags); int __dwc3_gadget_ep_set_halt(struct dwc3_ep *dep, int value, int protocol); void dwc3_ep0_send_delayed_status(struct dwc3 *dwc); +void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, bool interrupt); /** * dwc3_gadget_ep_get_transfer_index - Gets transfer index from HW diff --git a/drivers/usb/dwc3/trace.c b/drivers/usb/dwc3/trace.c index 088995885678..31d0355780df 100644 --- a/drivers/usb/dwc3/trace.c +++ b/drivers/usb/dwc3/trace.c @@ -9,3 +9,10 @@ #define CREATE_TRACE_POINTS #include "trace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(dwc3_event); +EXPORT_TRACEPOINT_SYMBOL_GPL(dwc3_ctrl_req); +EXPORT_TRACEPOINT_SYMBOL_GPL(dwc3_complete_trb); +EXPORT_TRACEPOINT_SYMBOL_GPL(dwc3_ep_queue); +EXPORT_TRACEPOINT_SYMBOL_GPL(dwc3_readl); +EXPORT_TRACEPOINT_SYMBOL_GPL(dwc3_writel); diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index dd58094f0b85..bc2cfb4ab23d 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -216,6 +216,12 @@ config USB_F_PRINTER config USB_F_TCM tristate +config USB_F_ACC + tristate + +config USB_F_AUDIO_SRC + tristate + # this first set of drivers all depend on bulk-capable hardware. config USB_CONFIGFS @@ -230,6 +236,14 @@ config USB_CONFIGFS appropriate symbolic links. For more information see Documentation/usb/gadget_configfs.rst. +config USB_CONFIGFS_UEVENT + bool "Uevent notification of Gadget state" + depends on USB_CONFIGFS + help + Enable uevent notifications to userspace when the gadget + state changes. The gadget can be in any of the following + three states: "CONNECTED/DISCONNECTED/CONFIGURED" + config USB_CONFIGFS_SERIAL bool "Generic serial bulk in/out" depends on USB_CONFIGFS @@ -371,6 +385,23 @@ config USB_CONFIGFS_F_FS implemented in kernel space (for instance Ethernet, serial or mass storage) and other are implemented in user space. 
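As a quick userspace illustration of the interface the USB_CONFIGFS_UEVENT option above exposes (a hypothetical sketch, not part of this patch): the configfs changes further down create an "android_usb" class device named "android%d" with a read-only "state" attribute, so the first gadget's state can be read like this.

#include <stdio.h>

/* Print the gadget state reported by state_show(): "DISCONNECTED",
 * "CONNECTED" or "CONFIGURED". The android0 name assumes the first
 * gadget instance created through configfs.
 */
int main(void)
{
	FILE *f = fopen("/sys/class/android_usb/android0/state", "r");
	char state[32];

	if (!f)
		return 1;
	if (fgets(state, sizeof(state), f))
		printf("gadget state: %s", state);
	fclose(f);
	return 0;
}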
+config USB_CONFIGFS_F_ACC + bool "Accessory gadget" + depends on USB_CONFIGFS + depends on HID=y + select USB_F_ACC + help + USB gadget Accessory support + +config USB_CONFIGFS_F_AUDIO_SRC + bool "Audio Source gadget" + depends on USB_CONFIGFS + depends on SND + select SND_PCM + select USB_F_AUDIO_SRC + help + USB gadget Audio Source support + config USB_CONFIGFS_F_UAC1 bool "Audio Class 1.0" depends on USB_CONFIGFS diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 553382ce3837..9315313108c9 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -159,6 +159,8 @@ int config_ep_by_speed_and_alt(struct usb_gadget *g, int want_comp_desc = 0; struct usb_descriptor_header **d_spd; /* cursor for speed desc */ + struct usb_composite_dev *cdev; + bool incomplete_desc = false; if (!g || !f || !_ep) return -EIO; @@ -167,28 +169,43 @@ int config_ep_by_speed_and_alt(struct usb_gadget *g, switch (g->speed) { case USB_SPEED_SUPER_PLUS: if (gadget_is_superspeed_plus(g)) { - speed_desc = f->ssp_descriptors; - want_comp_desc = 1; - break; + if (f->ssp_descriptors) { + speed_desc = f->ssp_descriptors; + want_comp_desc = 1; + break; + } + incomplete_desc = true; } fallthrough; case USB_SPEED_SUPER: if (gadget_is_superspeed(g)) { - speed_desc = f->ss_descriptors; - want_comp_desc = 1; - break; + if (f->ss_descriptors) { + speed_desc = f->ss_descriptors; + want_comp_desc = 1; + break; + } + incomplete_desc = true; } fallthrough; case USB_SPEED_HIGH: if (gadget_is_dualspeed(g)) { - speed_desc = f->hs_descriptors; - break; + if (f->hs_descriptors) { + speed_desc = f->hs_descriptors; + break; + } + incomplete_desc = true; } fallthrough; default: speed_desc = f->fs_descriptors; } + cdev = get_gadget_data(g); + if (incomplete_desc) + WARNING(cdev, + "%s doesn't hold the descriptors for current speed\n", + f->name); + /* find correct alternate setting descriptor */ for_each_desc(speed_desc, d_spd, USB_DT_INTERFACE) { int_desc = (struct usb_interface_descriptor *)*d_spd; @@ -244,12 +261,8 @@ ep_found: _ep->maxburst = comp_desc->bMaxBurst + 1; break; default: - if (comp_desc->bMaxBurst != 0) { - struct usb_composite_dev *cdev; - - cdev = get_gadget_data(g); + if (comp_desc->bMaxBurst != 0) ERROR(cdev, "ep0 bMaxBurst must be 0\n"); - } _ep->maxburst = 1; break; } diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 5ade844db404..66885e28a195 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -10,6 +10,32 @@ #include "u_f.h" #include "u_os_desc.h" +#ifdef CONFIG_USB_CONFIGFS_UEVENT +#include +#include +#include + +#ifdef CONFIG_USB_CONFIGFS_F_ACC +extern int acc_ctrlrequest_composite(struct usb_composite_dev *cdev, + const struct usb_ctrlrequest *ctrl); +void acc_disconnect(void); +#endif +static struct class *android_class; +static struct device *android_device; +static int index; +static int gadget_index; + +struct device *create_function_device(char *name) +{ + if (android_device && !IS_ERR(android_device)) + return device_create(android_class, android_device, + MKDEV(0, index++), NULL, name); + else + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL_GPL(create_function_device); +#endif + int check_user_usb_string(const char *name, struct usb_gadget_strings *stringtab_dev) { @@ -51,6 +77,12 @@ struct gadget_info { char qw_sign[OS_STRING_QW_SIGN_LEN]; spinlock_t spinlock; bool unbind; +#ifdef CONFIG_USB_CONFIGFS_UEVENT + bool connected; + bool sw_connected; + struct work_struct work; + struct device *dev; 
+#endif }; static inline struct gadget_info *to_gadget_info(struct config_item *item) @@ -272,7 +304,7 @@ static ssize_t gadget_dev_desc_UDC_store(struct config_item *item, mutex_lock(&gi->lock); - if (!strlen(name)) { + if (!strlen(name) || strcmp(name, "none") == 0) { ret = unregister_gadget(gi); if (ret) goto err; @@ -1426,6 +1458,57 @@ err_comp_cleanup: return ret; } +#ifdef CONFIG_USB_CONFIGFS_UEVENT +static void android_work(struct work_struct *data) +{ + struct gadget_info *gi = container_of(data, struct gadget_info, work); + struct usb_composite_dev *cdev = &gi->cdev; + char *disconnected[2] = { "USB_STATE=DISCONNECTED", NULL }; + char *connected[2] = { "USB_STATE=CONNECTED", NULL }; + char *configured[2] = { "USB_STATE=CONFIGURED", NULL }; + /* 0-connected 1-configured 2-disconnected*/ + bool status[3] = { false, false, false }; + unsigned long flags; + bool uevent_sent = false; + + spin_lock_irqsave(&cdev->lock, flags); + if (cdev->config) + status[1] = true; + + if (gi->connected != gi->sw_connected) { + if (gi->connected) + status[0] = true; + else + status[2] = true; + gi->sw_connected = gi->connected; + } + spin_unlock_irqrestore(&cdev->lock, flags); + + if (status[0]) { + kobject_uevent_env(&gi->dev->kobj, KOBJ_CHANGE, connected); + pr_info("%s: sent uevent %s\n", __func__, connected[0]); + uevent_sent = true; + } + + if (status[1]) { + kobject_uevent_env(&gi->dev->kobj, KOBJ_CHANGE, configured); + pr_info("%s: sent uevent %s\n", __func__, configured[0]); + uevent_sent = true; + } + + if (status[2]) { + kobject_uevent_env(&gi->dev->kobj, KOBJ_CHANGE, disconnected); + pr_info("%s: sent uevent %s\n", __func__, disconnected[0]); + uevent_sent = true; + } + + if (!uevent_sent) { + pr_info("%s: did not send uevent (%d %d %p)\n", __func__, + gi->connected, gi->sw_connected, cdev->config); + } +} +#endif + static void configfs_composite_unbind(struct usb_gadget *gadget) { struct usb_composite_dev *cdev; @@ -1453,6 +1536,50 @@ static void configfs_composite_unbind(struct usb_gadget *gadget) spin_unlock_irqrestore(&gi->spinlock, flags); } +#ifdef CONFIG_USB_CONFIGFS_UEVENT +static int android_setup(struct usb_gadget *gadget, + const struct usb_ctrlrequest *c) +{ + struct usb_composite_dev *cdev = get_gadget_data(gadget); + unsigned long flags; + struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev); + int value = -EOPNOTSUPP; + struct usb_function_instance *fi; + + spin_lock_irqsave(&cdev->lock, flags); + if (!gi->connected) { + gi->connected = 1; + schedule_work(&gi->work); + } + spin_unlock_irqrestore(&cdev->lock, flags); + list_for_each_entry(fi, &gi->available_func, cfs_list) { + if (fi != NULL && fi->f != NULL && fi->f->setup != NULL) { + value = fi->f->setup(fi->f, c); + if (value >= 0) + break; + } + } + +#ifdef CONFIG_USB_CONFIGFS_F_ACC + if (value < 0) + value = acc_ctrlrequest_composite(cdev, c); +#endif + + if (value < 0) + value = composite_setup(gadget, c); + + spin_lock_irqsave(&cdev->lock, flags); + if (c->bRequest == USB_REQ_SET_CONFIGURATION && + cdev->config) { + schedule_work(&gi->work); + } + spin_unlock_irqrestore(&cdev->lock, flags); + + return value; +} + +#else // CONFIG_USB_CONFIGFS_UEVENT + static int configfs_composite_setup(struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) { @@ -1478,6 +1605,8 @@ static int configfs_composite_setup(struct usb_gadget *gadget, return ret; } +#endif // CONFIG_USB_CONFIGFS_UEVENT + static void configfs_composite_disconnect(struct usb_gadget *gadget) { struct usb_composite_dev *cdev; @@ -1488,6 
+1617,14 @@ static void configfs_composite_disconnect(struct usb_gadget *gadget) if (!cdev) return; +#ifdef CONFIG_USB_CONFIGFS_F_ACC + /* + * accessory HID support can be active while the + * accessory function is not actually enabled, + * so we need to inform it when we are disconnected. + */ + acc_disconnect(); +#endif gi = container_of(cdev, struct gadget_info, cdev); spin_lock_irqsave(&gi->spinlock, flags); cdev = get_gadget_data(gadget); @@ -1496,6 +1633,10 @@ static void configfs_composite_disconnect(struct usb_gadget *gadget) return; } +#ifdef CONFIG_USB_CONFIGFS_UEVENT + gi->connected = 0; + schedule_work(&gi->work); +#endif composite_disconnect(gadget); spin_unlock_irqrestore(&gi->spinlock, flags); } @@ -1570,10 +1711,13 @@ static const struct usb_gadget_driver configfs_driver_template = { .bind = configfs_composite_bind, .unbind = configfs_composite_unbind, +#ifdef CONFIG_USB_CONFIGFS_UEVENT + .setup = android_setup, +#else .setup = configfs_composite_setup, +#endif .reset = configfs_composite_reset, .disconnect = configfs_composite_disconnect, - .suspend = configfs_composite_suspend, .resume = configfs_composite_resume, @@ -1585,6 +1729,91 @@ static const struct usb_gadget_driver configfs_driver_template = { .match_existing_only = 1, }; +#ifdef CONFIG_USB_CONFIGFS_UEVENT +static ssize_t state_show(struct device *pdev, struct device_attribute *attr, + char *buf) +{ + struct gadget_info *dev = dev_get_drvdata(pdev); + struct usb_composite_dev *cdev; + char *state = "DISCONNECTED"; + unsigned long flags; + + if (!dev) + goto out; + + cdev = &dev->cdev; + + if (!cdev) + goto out; + + spin_lock_irqsave(&cdev->lock, flags); + if (cdev->config) + state = "CONFIGURED"; + else if (dev->connected) + state = "CONNECTED"; + spin_unlock_irqrestore(&cdev->lock, flags); +out: + return sprintf(buf, "%s\n", state); +} + +static DEVICE_ATTR(state, S_IRUGO, state_show, NULL); + +static struct device_attribute *android_usb_attributes[] = { + &dev_attr_state, + NULL +}; + +static int android_device_create(struct gadget_info *gi) +{ + struct device_attribute **attrs; + struct device_attribute *attr; + + INIT_WORK(&gi->work, android_work); + gi->dev = device_create(android_class, NULL, + MKDEV(0, 0), NULL, "android%d", gadget_index++); + if (IS_ERR(gi->dev)) + return PTR_ERR(gi->dev); + + dev_set_drvdata(gi->dev, gi); + if (!android_device) + android_device = gi->dev; + + attrs = android_usb_attributes; + while ((attr = *attrs++)) { + int err; + + err = device_create_file(gi->dev, attr); + if (err) { + device_destroy(gi->dev->class, + gi->dev->devt); + return err; + } + } + + return 0; +} + +static void android_device_destroy(struct gadget_info *gi) +{ + struct device_attribute **attrs; + struct device_attribute *attr; + + attrs = android_usb_attributes; + while ((attr = *attrs++)) + device_remove_file(gi->dev, attr); + device_destroy(gi->dev->class, gi->dev->devt); +} +#else +static inline int android_device_create(struct gadget_info *gi) +{ + return 0; +} + +static inline void android_device_destroy(struct gadget_info *gi) +{ +} +#endif + static struct config_group *gadgets_make( struct config_group *group, const char *name) @@ -1637,7 +1866,11 @@ static struct config_group *gadgets_make( if (!gi->composite.gadget_driver.function) goto err; + if (android_device_create(gi) < 0) + goto err; + return &gi->group; + err: kfree(gi); return ERR_PTR(-ENOMEM); @@ -1645,7 +1878,11 @@ err: static void gadgets_drop(struct config_group *group, struct config_item *item) { + struct gadget_info *gi; + + gi = 
container_of(to_config_group(item), struct gadget_info, group); config_item_put(item); + android_device_destroy(gi); } static struct configfs_group_operations gadgets_ops = { @@ -1685,6 +1922,13 @@ static int __init gadget_cfs_init(void) config_group_init(&gadget_subsys.su_group); ret = configfs_register_subsystem(&gadget_subsys); + +#ifdef CONFIG_USB_CONFIGFS_UEVENT + android_class = class_create(THIS_MODULE, "android_usb"); + if (IS_ERR(android_class)) + return PTR_ERR(android_class); +#endif + return ret; } module_init(gadget_cfs_init); @@ -1692,5 +1936,10 @@ module_init(gadget_cfs_init); static void __exit gadget_cfs_exit(void) { configfs_unregister_subsystem(&gadget_subsys); +#ifdef CONFIG_USB_CONFIGFS_UEVENT + if (!IS_ERR(android_class)) + class_destroy(android_class); +#endif + } module_exit(gadget_cfs_exit); diff --git a/drivers/usb/gadget/function/Makefile b/drivers/usb/gadget/function/Makefile index 5d3a6cf02218..dd33a1243342 100644 --- a/drivers/usb/gadget/function/Makefile +++ b/drivers/usb/gadget/function/Makefile @@ -50,3 +50,7 @@ usb_f_printer-y := f_printer.o obj-$(CONFIG_USB_F_PRINTER) += usb_f_printer.o usb_f_tcm-y := f_tcm.o obj-$(CONFIG_USB_F_TCM) += usb_f_tcm.o +usb_f_accessory-y := f_accessory.o +obj-$(CONFIG_USB_F_ACC) += usb_f_accessory.o +usb_f_audio_source-y := f_audio_source.o +obj-$(CONFIG_USB_F_AUDIO_SRC) += usb_f_audio_source.o diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c new file mode 100644 index 000000000000..a00fd58a494c --- /dev/null +++ b/drivers/usb/gadget/function/f_accessory.c @@ -0,0 +1,1563 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Gadget Function Driver for Android USB accessories + * + * Copyright (C) 2011 Google, Inc. + * Author: Mike Lockwood + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +/* #define DEBUG */ +/* #define VERBOSE_DEBUG */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#define MAX_INST_NAME_LEN 40 +#define BULK_BUFFER_SIZE 16384 +#define ACC_STRING_SIZE 256 + +#define PROTOCOL_VERSION 2 + +/* String IDs */ +#define INTERFACE_STRING_INDEX 0 + +/* number of tx and rx requests to allocate */ +#define TX_REQ_MAX 4 +#define RX_REQ_MAX 2 + +struct acc_hid_dev { + struct list_head list; + struct hid_device *hid; + struct acc_dev *dev; + /* accessory defined ID */ + int id; + /* HID report descriptor */ + u8 *report_desc; + /* length of HID report descriptor */ + int report_desc_len; + /* number of bytes of report_desc we have received so far */ + int report_desc_offset; +}; + +struct acc_dev { + struct usb_function function; + struct usb_composite_dev *cdev; + spinlock_t lock; + struct acc_dev_ref *ref; + + struct usb_ep *ep_in; + struct usb_ep *ep_out; + + /* online indicates state of function_set_alt & function_unbind + * set to 1 when we connect + */ + int online; + + /* disconnected indicates state of open & release + * Set to 1 when we disconnect. 
+ * Not cleared until our file is closed. + */ + int disconnected; + + /* strings sent by the host */ + char manufacturer[ACC_STRING_SIZE]; + char model[ACC_STRING_SIZE]; + char description[ACC_STRING_SIZE]; + char version[ACC_STRING_SIZE]; + char uri[ACC_STRING_SIZE]; + char serial[ACC_STRING_SIZE]; + + /* for acc_complete_set_string */ + int string_index; + + /* set to 1 if we have a pending start request */ + int start_requested; + + int audio_mode; + + /* synchronize access to our device file */ + atomic_t open_excl; + + struct list_head tx_idle; + + wait_queue_head_t read_wq; + wait_queue_head_t write_wq; + struct usb_request *rx_req[RX_REQ_MAX]; + int rx_done; + + /* delayed work for handling ACCESSORY_START */ + struct delayed_work start_work; + + /* work for handling ACCESSORY GET PROTOCOL */ + struct work_struct getprotocol_work; + + /* work for handling ACCESSORY SEND STRING */ + struct work_struct sendstring_work; + + /* worker for registering and unregistering hid devices */ + struct work_struct hid_work; + + /* list of active HID devices */ + struct list_head hid_list; + + /* list of new HID devices to register */ + struct list_head new_hid_list; + + /* list of dead HID devices to unregister */ + struct list_head dead_hid_list; +}; + +static struct usb_interface_descriptor acc_interface_desc = { + .bLength = USB_DT_INTERFACE_SIZE, + .bDescriptorType = USB_DT_INTERFACE, + .bInterfaceNumber = 0, + .bNumEndpoints = 2, + .bInterfaceClass = USB_CLASS_VENDOR_SPEC, + .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC, + .bInterfaceProtocol = 0, +}; + +static struct usb_endpoint_descriptor acc_superspeedplus_in_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_IN, + .bmAttributes = USB_ENDPOINT_XFER_BULK, + .wMaxPacketSize = cpu_to_le16(1024), +}; + +static struct usb_endpoint_descriptor acc_superspeedplus_out_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_OUT, + .bmAttributes = USB_ENDPOINT_XFER_BULK, + .wMaxPacketSize = cpu_to_le16(1024), +}; + +static struct usb_ss_ep_comp_descriptor acc_superspeedplus_comp_desc = { + .bLength = sizeof(acc_superspeedplus_comp_desc), + .bDescriptorType = USB_DT_SS_ENDPOINT_COMP, + + /* the following 2 values can be tweaked if necessary */ + /* .bMaxBurst = 0, */ + /* .bmAttributes = 0, */ +}; + +static struct usb_endpoint_descriptor acc_superspeed_in_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_IN, + .bmAttributes = USB_ENDPOINT_XFER_BULK, + .wMaxPacketSize = cpu_to_le16(1024), +}; + +static struct usb_endpoint_descriptor acc_superspeed_out_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_OUT, + .bmAttributes = USB_ENDPOINT_XFER_BULK, + .wMaxPacketSize = cpu_to_le16(1024), +}; + +static struct usb_ss_ep_comp_descriptor acc_superspeed_comp_desc = { + .bLength = sizeof(acc_superspeed_comp_desc), + .bDescriptorType = USB_DT_SS_ENDPOINT_COMP, + + /* the following 2 values can be tweaked if necessary */ + /* .bMaxBurst = 0, */ + /* .bmAttributes = 0, */ +}; + +static struct usb_endpoint_descriptor acc_highspeed_in_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_IN, + .bmAttributes = USB_ENDPOINT_XFER_BULK, + .wMaxPacketSize = cpu_to_le16(512), +}; + +static struct usb_endpoint_descriptor acc_highspeed_out_desc = { + .bLength = 
USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_OUT, + .bmAttributes = USB_ENDPOINT_XFER_BULK, + .wMaxPacketSize = cpu_to_le16(512), +}; + +static struct usb_endpoint_descriptor acc_fullspeed_in_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_IN, + .bmAttributes = USB_ENDPOINT_XFER_BULK, +}; + +static struct usb_endpoint_descriptor acc_fullspeed_out_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_OUT, + .bmAttributes = USB_ENDPOINT_XFER_BULK, +}; + +static struct usb_descriptor_header *fs_acc_descs[] = { + (struct usb_descriptor_header *) &acc_interface_desc, + (struct usb_descriptor_header *) &acc_fullspeed_in_desc, + (struct usb_descriptor_header *) &acc_fullspeed_out_desc, + NULL, +}; + +static struct usb_descriptor_header *hs_acc_descs[] = { + (struct usb_descriptor_header *) &acc_interface_desc, + (struct usb_descriptor_header *) &acc_highspeed_in_desc, + (struct usb_descriptor_header *) &acc_highspeed_out_desc, + NULL, +}; + +static struct usb_descriptor_header *ss_acc_descs[] = { + (struct usb_descriptor_header *) &acc_interface_desc, + (struct usb_descriptor_header *) &acc_superspeed_in_desc, + (struct usb_descriptor_header *) &acc_superspeed_comp_desc, + (struct usb_descriptor_header *) &acc_superspeed_out_desc, + (struct usb_descriptor_header *) &acc_superspeed_comp_desc, + NULL, +}; + +static struct usb_descriptor_header *ssp_acc_descs[] = { + (struct usb_descriptor_header *) &acc_interface_desc, + (struct usb_descriptor_header *) &acc_superspeedplus_in_desc, + (struct usb_descriptor_header *) &acc_superspeedplus_comp_desc, + (struct usb_descriptor_header *) &acc_superspeedplus_out_desc, + (struct usb_descriptor_header *) &acc_superspeedplus_comp_desc, + NULL, +}; + +static struct usb_string acc_string_defs[] = { + [INTERFACE_STRING_INDEX].s = "Android Accessory Interface", + { }, /* end of list */ +}; + +static struct usb_gadget_strings acc_string_table = { + .language = 0x0409, /* en-US */ + .strings = acc_string_defs, +}; + +static struct usb_gadget_strings *acc_strings[] = { + &acc_string_table, + NULL, +}; + +struct acc_dev_ref { + struct kref kref; + struct acc_dev *acc_dev; +}; + +static struct acc_dev_ref _acc_dev_ref = { + .kref = KREF_INIT(0), +}; + +struct acc_instance { + struct usb_function_instance func_inst; + const char *name; +}; + +static struct acc_dev *get_acc_dev(void) +{ + struct acc_dev_ref *ref = &_acc_dev_ref; + + return kref_get_unless_zero(&ref->kref) ? 
ref->acc_dev : NULL; +} + +static void __put_acc_dev(struct kref *kref) +{ + struct acc_dev_ref *ref = container_of(kref, struct acc_dev_ref, kref); + struct acc_dev *dev = ref->acc_dev; + + /* Cancel any async work */ + cancel_delayed_work_sync(&dev->start_work); + cancel_work_sync(&dev->getprotocol_work); + cancel_work_sync(&dev->sendstring_work); + cancel_work_sync(&dev->hid_work); + + ref->acc_dev = NULL; + kfree(dev); +} + +static void put_acc_dev(struct acc_dev *dev) +{ + struct acc_dev_ref *ref = dev->ref; + + WARN_ON(ref->acc_dev != dev); + kref_put(&ref->kref, __put_acc_dev); +} + +static inline struct acc_dev *func_to_dev(struct usb_function *f) +{ + return container_of(f, struct acc_dev, function); +} + +static struct usb_request *acc_request_new(struct usb_ep *ep, int buffer_size) +{ + struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + + if (!req) + return NULL; + + /* now allocate buffers for the requests */ + req->buf = kmalloc(buffer_size, GFP_KERNEL); + if (!req->buf) { + usb_ep_free_request(ep, req); + return NULL; + } + + return req; +} + +static void acc_request_free(struct usb_request *req, struct usb_ep *ep) +{ + if (req) { + kfree(req->buf); + usb_ep_free_request(ep, req); + } +} + +/* add a request to the tail of a list */ +static void req_put(struct acc_dev *dev, struct list_head *head, + struct usb_request *req) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + list_add_tail(&req->list, head); + spin_unlock_irqrestore(&dev->lock, flags); +} + +/* remove a request from the head of a list */ +static struct usb_request *req_get(struct acc_dev *dev, struct list_head *head) +{ + unsigned long flags; + struct usb_request *req; + + spin_lock_irqsave(&dev->lock, flags); + if (list_empty(head)) { + req = 0; + } else { + req = list_first_entry(head, struct usb_request, list); + list_del(&req->list); + } + spin_unlock_irqrestore(&dev->lock, flags); + return req; +} + +static void acc_set_disconnected(struct acc_dev *dev) +{ + dev->disconnected = 1; +} + +static void acc_complete_in(struct usb_ep *ep, struct usb_request *req) +{ + struct acc_dev *dev = get_acc_dev(); + + if (!dev) + return; + + if (req->status == -ESHUTDOWN) { + pr_debug("acc_complete_in set disconnected"); + acc_set_disconnected(dev); + } + + req_put(dev, &dev->tx_idle, req); + + wake_up(&dev->write_wq); + put_acc_dev(dev); +} + +static void acc_complete_out(struct usb_ep *ep, struct usb_request *req) +{ + struct acc_dev *dev = get_acc_dev(); + + if (!dev) + return; + + dev->rx_done = 1; + if (req->status == -ESHUTDOWN) { + pr_debug("acc_complete_out set disconnected"); + acc_set_disconnected(dev); + } + + wake_up(&dev->read_wq); + put_acc_dev(dev); +} + +static void acc_complete_set_string(struct usb_ep *ep, struct usb_request *req) +{ + struct acc_dev *dev = ep->driver_data; + char *string_dest = NULL; + int length = req->actual; + + if (req->status != 0) { + pr_err("acc_complete_set_string, err %d\n", req->status); + return; + } + + switch (dev->string_index) { + case ACCESSORY_STRING_MANUFACTURER: + string_dest = dev->manufacturer; + break; + case ACCESSORY_STRING_MODEL: + string_dest = dev->model; + break; + case ACCESSORY_STRING_DESCRIPTION: + string_dest = dev->description; + break; + case ACCESSORY_STRING_VERSION: + string_dest = dev->version; + break; + case ACCESSORY_STRING_URI: + string_dest = dev->uri; + break; + case ACCESSORY_STRING_SERIAL: + string_dest = dev->serial; + break; + } + if (string_dest) { + unsigned long flags; + + if (length >= ACC_STRING_SIZE) + 
length = ACC_STRING_SIZE - 1; + + spin_lock_irqsave(&dev->lock, flags); + memcpy(string_dest, req->buf, length); + /* ensure zero termination */ + string_dest[length] = 0; + spin_unlock_irqrestore(&dev->lock, flags); + } else { + pr_err("unknown accessory string index %d\n", + dev->string_index); + } +} + +static void acc_complete_set_hid_report_desc(struct usb_ep *ep, + struct usb_request *req) +{ + struct acc_hid_dev *hid = req->context; + struct acc_dev *dev = hid->dev; + int length = req->actual; + + if (req->status != 0) { + pr_err("acc_complete_set_hid_report_desc, err %d\n", + req->status); + return; + } + + memcpy(hid->report_desc + hid->report_desc_offset, req->buf, length); + hid->report_desc_offset += length; + if (hid->report_desc_offset == hid->report_desc_len) { + /* After we have received the entire report descriptor + * we schedule work to initialize the HID device + */ + schedule_work(&dev->hid_work); + } +} + +static void acc_complete_send_hid_event(struct usb_ep *ep, + struct usb_request *req) +{ + struct acc_hid_dev *hid = req->context; + int length = req->actual; + + if (req->status != 0) { + pr_err("acc_complete_send_hid_event, err %d\n", req->status); + return; + } + + hid_report_raw_event(hid->hid, HID_INPUT_REPORT, req->buf, length, 1); +} + +static int acc_hid_parse(struct hid_device *hid) +{ + struct acc_hid_dev *hdev = hid->driver_data; + + hid_parse_report(hid, hdev->report_desc, hdev->report_desc_len); + return 0; +} + +static int acc_hid_start(struct hid_device *hid) +{ + return 0; +} + +static void acc_hid_stop(struct hid_device *hid) +{ +} + +static int acc_hid_open(struct hid_device *hid) +{ + return 0; +} + +static void acc_hid_close(struct hid_device *hid) +{ +} + +static int acc_hid_raw_request(struct hid_device *hid, unsigned char reportnum, + __u8 *buf, size_t len, unsigned char rtype, int reqtype) +{ + return 0; +} + +static struct hid_ll_driver acc_hid_ll_driver = { + .parse = acc_hid_parse, + .start = acc_hid_start, + .stop = acc_hid_stop, + .open = acc_hid_open, + .close = acc_hid_close, + .raw_request = acc_hid_raw_request, +}; + +static struct acc_hid_dev *acc_hid_new(struct acc_dev *dev, + int id, int desc_len) +{ + struct acc_hid_dev *hdev; + + hdev = kzalloc(sizeof(*hdev), GFP_ATOMIC); + if (!hdev) + return NULL; + hdev->report_desc = kzalloc(desc_len, GFP_ATOMIC); + if (!hdev->report_desc) { + kfree(hdev); + return NULL; + } + hdev->dev = dev; + hdev->id = id; + hdev->report_desc_len = desc_len; + + return hdev; +} + +static struct acc_hid_dev *acc_hid_get(struct list_head *list, int id) +{ + struct acc_hid_dev *hid; + + list_for_each_entry(hid, list, list) { + if (hid->id == id) + return hid; + } + return NULL; +} + +static int acc_register_hid(struct acc_dev *dev, int id, int desc_length) +{ + struct acc_hid_dev *hid; + unsigned long flags; + + /* report descriptor length must be > 0 */ + if (desc_length <= 0) + return -EINVAL; + + spin_lock_irqsave(&dev->lock, flags); + /* replace HID if one already exists with this ID */ + hid = acc_hid_get(&dev->hid_list, id); + if (!hid) + hid = acc_hid_get(&dev->new_hid_list, id); + if (hid) + list_move(&hid->list, &dev->dead_hid_list); + + hid = acc_hid_new(dev, id, desc_length); + if (!hid) { + spin_unlock_irqrestore(&dev->lock, flags); + return -ENOMEM; + } + + list_add(&hid->list, &dev->new_hid_list); + spin_unlock_irqrestore(&dev->lock, flags); + + /* schedule work to register the HID device */ + schedule_work(&dev->hid_work); + return 0; +} + +static int acc_unregister_hid(struct acc_dev *dev, 
int id) +{ + struct acc_hid_dev *hid; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + hid = acc_hid_get(&dev->hid_list, id); + if (!hid) + hid = acc_hid_get(&dev->new_hid_list, id); + if (!hid) { + spin_unlock_irqrestore(&dev->lock, flags); + return -EINVAL; + } + + list_move(&hid->list, &dev->dead_hid_list); + spin_unlock_irqrestore(&dev->lock, flags); + + schedule_work(&dev->hid_work); + return 0; +} + +static int create_bulk_endpoints(struct acc_dev *dev, + struct usb_endpoint_descriptor *in_desc, + struct usb_endpoint_descriptor *out_desc) +{ + struct usb_composite_dev *cdev = dev->cdev; + struct usb_request *req; + struct usb_ep *ep; + int i; + + DBG(cdev, "create_bulk_endpoints dev: %p\n", dev); + + ep = usb_ep_autoconfig(cdev->gadget, in_desc); + if (!ep) { + DBG(cdev, "usb_ep_autoconfig for ep_in failed\n"); + return -ENODEV; + } + DBG(cdev, "usb_ep_autoconfig for ep_in got %s\n", ep->name); + ep->driver_data = dev; /* claim the endpoint */ + dev->ep_in = ep; + + ep = usb_ep_autoconfig(cdev->gadget, out_desc); + if (!ep) { + DBG(cdev, "usb_ep_autoconfig for ep_out failed\n"); + return -ENODEV; + } + DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name); + ep->driver_data = dev; /* claim the endpoint */ + dev->ep_out = ep; + + /* now allocate requests for our endpoints */ + for (i = 0; i < TX_REQ_MAX; i++) { + req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE); + if (!req) + goto fail; + req->complete = acc_complete_in; + req_put(dev, &dev->tx_idle, req); + } + for (i = 0; i < RX_REQ_MAX; i++) { + req = acc_request_new(dev->ep_out, BULK_BUFFER_SIZE); + if (!req) + goto fail; + req->complete = acc_complete_out; + dev->rx_req[i] = req; + } + + return 0; + +fail: + pr_err("acc_bind() could not allocate requests\n"); + while ((req = req_get(dev, &dev->tx_idle))) + acc_request_free(req, dev->ep_in); + for (i = 0; i < RX_REQ_MAX; i++) + acc_request_free(dev->rx_req[i], dev->ep_out); + return -1; +} + +static ssize_t acc_read(struct file *fp, char __user *buf, + size_t count, loff_t *pos) +{ + struct acc_dev *dev = fp->private_data; + struct usb_request *req; + ssize_t r = count; + ssize_t data_length; + unsigned xfer; + int ret = 0; + + pr_debug("acc_read(%zu)\n", count); + + if (dev->disconnected) { + pr_debug("acc_read disconnected"); + return -ENODEV; + } + + if (count > BULK_BUFFER_SIZE) + count = BULK_BUFFER_SIZE; + + /* we will block until we're online */ + pr_debug("acc_read: waiting for online\n"); + ret = wait_event_interruptible(dev->read_wq, dev->online); + if (ret < 0) { + r = ret; + goto done; + } + + /* + * Calculate the data length by considering termination character. + * Then compansite the difference of rounding up to + * integer multiple of maxpacket size. + */ + data_length = count; + data_length += dev->ep_out->maxpacket - 1; + data_length -= data_length % dev->ep_out->maxpacket; + + if (dev->rx_done) { + // last req cancelled. try to get it. + req = dev->rx_req[0]; + goto copy_data; + } + +requeue_req: + /* queue a request */ + req = dev->rx_req[0]; + req->length = data_length; + dev->rx_done = 0; + ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL); + if (ret < 0) { + r = -EIO; + goto done; + } else { + pr_debug("rx %p queue\n", req); + } + + /* wait for a request to complete */ + ret = wait_event_interruptible(dev->read_wq, dev->rx_done); + if (ret < 0) { + r = ret; + ret = usb_ep_dequeue(dev->ep_out, req); + if (ret != 0) { + // cancel failed. There can be a data already received. + // it will be retrieved in the next read. 
+ pr_debug("acc_read: cancelling failed %d", ret); + } + goto done; + } + +copy_data: + dev->rx_done = 0; + if (dev->online) { + /* If we got a 0-len packet, throw it back and try again. */ + if (req->actual == 0) + goto requeue_req; + + pr_debug("rx %p %u\n", req, req->actual); + xfer = (req->actual < count) ? req->actual : count; + r = xfer; + if (copy_to_user(buf, req->buf, xfer)) + r = -EFAULT; + } else + r = -EIO; + +done: + pr_debug("acc_read returning %zd\n", r); + return r; +} + +static ssize_t acc_write(struct file *fp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct acc_dev *dev = fp->private_data; + struct usb_request *req = 0; + ssize_t r = count; + unsigned xfer; + int ret; + + pr_debug("acc_write(%zu)\n", count); + + if (!dev->online || dev->disconnected) { + pr_debug("acc_write disconnected or not online"); + return -ENODEV; + } + + while (count > 0) { + /* get an idle tx request to use */ + req = 0; + ret = wait_event_interruptible(dev->write_wq, + ((req = req_get(dev, &dev->tx_idle)) || !dev->online)); + if (!dev->online || dev->disconnected) { + pr_debug("acc_write dev->error\n"); + r = -EIO; + break; + } + + if (!req) { + r = ret; + break; + } + + if (count > BULK_BUFFER_SIZE) { + xfer = BULK_BUFFER_SIZE; + /* ZLP, They will be more TX requests so not yet. */ + req->zero = 0; + } else { + xfer = count; + /* If the data length is a multple of the + * maxpacket size then send a zero length packet(ZLP). + */ + req->zero = ((xfer % dev->ep_in->maxpacket) == 0); + } + if (copy_from_user(req->buf, buf, xfer)) { + r = -EFAULT; + break; + } + + req->length = xfer; + ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL); + if (ret < 0) { + pr_debug("acc_write: xfer error %d\n", ret); + r = -EIO; + break; + } + + buf += xfer; + count -= xfer; + + /* zero this so we don't try to free it on error exit */ + req = 0; + } + + if (req) + req_put(dev, &dev->tx_idle, req); + + pr_debug("acc_write returning %zd\n", r); + return r; +} + +static long acc_ioctl(struct file *fp, unsigned code, unsigned long value) +{ + struct acc_dev *dev = fp->private_data; + char *src = NULL; + int ret; + + switch (code) { + case ACCESSORY_GET_STRING_MANUFACTURER: + src = dev->manufacturer; + break; + case ACCESSORY_GET_STRING_MODEL: + src = dev->model; + break; + case ACCESSORY_GET_STRING_DESCRIPTION: + src = dev->description; + break; + case ACCESSORY_GET_STRING_VERSION: + src = dev->version; + break; + case ACCESSORY_GET_STRING_URI: + src = dev->uri; + break; + case ACCESSORY_GET_STRING_SERIAL: + src = dev->serial; + break; + case ACCESSORY_IS_START_REQUESTED: + return dev->start_requested; + case ACCESSORY_GET_AUDIO_MODE: + return dev->audio_mode; + } + if (!src) + return -EINVAL; + + ret = strlen(src) + 1; + if (copy_to_user((void __user *)value, src, ret)) + ret = -EFAULT; + return ret; +} + +static int acc_open(struct inode *ip, struct file *fp) +{ + struct acc_dev *dev = get_acc_dev(); + + if (!dev) + return -ENODEV; + + if (atomic_xchg(&dev->open_excl, 1)) { + put_acc_dev(dev); + return -EBUSY; + } + + dev->disconnected = 0; + fp->private_data = dev; + return 0; +} + +static int acc_release(struct inode *ip, struct file *fp) +{ + struct acc_dev *dev = fp->private_data; + + if (!dev) + return -ENOENT; + + /* indicate that we are disconnected + * still could be online so don't touch online flag + */ + dev->disconnected = 1; + + fp->private_data = NULL; + WARN_ON(!atomic_xchg(&dev->open_excl, 0)); + put_acc_dev(dev); + return 0; +} + +/* file operations for /dev/usb_accessory */ +static 
const struct file_operations acc_fops = { + .owner = THIS_MODULE, + .read = acc_read, + .write = acc_write, + .unlocked_ioctl = acc_ioctl, + .compat_ioctl = acc_ioctl, + .open = acc_open, + .release = acc_release, +}; + +static int acc_hid_probe(struct hid_device *hdev, + const struct hid_device_id *id) +{ + int ret; + + ret = hid_parse(hdev); + if (ret) + return ret; + return hid_hw_start(hdev, HID_CONNECT_DEFAULT); +} + +static struct miscdevice acc_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "usb_accessory", + .fops = &acc_fops, +}; + +static const struct hid_device_id acc_hid_table[] = { + { HID_USB_DEVICE(HID_ANY_ID, HID_ANY_ID) }, + { } +}; + +static struct hid_driver acc_hid_driver = { + .name = "USB accessory", + .id_table = acc_hid_table, + .probe = acc_hid_probe, +}; + +static void acc_complete_setup_noop(struct usb_ep *ep, struct usb_request *req) +{ + /* + * Default no-op function when nothing needs to be done for the + * setup request + */ +} + +int acc_ctrlrequest(struct usb_composite_dev *cdev, + const struct usb_ctrlrequest *ctrl) +{ + struct acc_dev *dev = get_acc_dev(); + int value = -EOPNOTSUPP; + struct acc_hid_dev *hid; + int offset; + u8 b_requestType = ctrl->bRequestType; + u8 b_request = ctrl->bRequest; + u16 w_index = le16_to_cpu(ctrl->wIndex); + u16 w_value = le16_to_cpu(ctrl->wValue); + u16 w_length = le16_to_cpu(ctrl->wLength); + unsigned long flags; + + /* + * If instance is not created which is the case in power off charging + * mode, dev will be NULL. Hence return error if it is the case. + */ + if (!dev) + return -ENODEV; + + if (b_requestType == (USB_DIR_OUT | USB_TYPE_VENDOR)) { + if (b_request == ACCESSORY_START) { + dev->start_requested = 1; + schedule_delayed_work( + &dev->start_work, msecs_to_jiffies(10)); + value = 0; + cdev->req->complete = acc_complete_setup_noop; + } else if (b_request == ACCESSORY_SEND_STRING) { + schedule_work(&dev->sendstring_work); + dev->string_index = w_index; + cdev->gadget->ep0->driver_data = dev; + cdev->req->complete = acc_complete_set_string; + value = w_length; + } else if (b_request == ACCESSORY_SET_AUDIO_MODE && + w_index == 0 && w_length == 0) { + dev->audio_mode = w_value; + cdev->req->complete = acc_complete_setup_noop; + value = 0; + } else if (b_request == ACCESSORY_REGISTER_HID) { + cdev->req->complete = acc_complete_setup_noop; + value = acc_register_hid(dev, w_value, w_index); + } else if (b_request == ACCESSORY_UNREGISTER_HID) { + cdev->req->complete = acc_complete_setup_noop; + value = acc_unregister_hid(dev, w_value); + } else if (b_request == ACCESSORY_SET_HID_REPORT_DESC) { + spin_lock_irqsave(&dev->lock, flags); + hid = acc_hid_get(&dev->new_hid_list, w_value); + spin_unlock_irqrestore(&dev->lock, flags); + if (!hid) { + value = -EINVAL; + goto err; + } + offset = w_index; + if (offset != hid->report_desc_offset + || offset + w_length > hid->report_desc_len) { + value = -EINVAL; + goto err; + } + cdev->req->context = hid; + cdev->req->complete = acc_complete_set_hid_report_desc; + value = w_length; + } else if (b_request == ACCESSORY_SEND_HID_EVENT) { + spin_lock_irqsave(&dev->lock, flags); + hid = acc_hid_get(&dev->hid_list, w_value); + spin_unlock_irqrestore(&dev->lock, flags); + if (!hid) { + value = -EINVAL; + goto err; + } + cdev->req->context = hid; + cdev->req->complete = acc_complete_send_hid_event; + value = w_length; + } + } else if (b_requestType == (USB_DIR_IN | USB_TYPE_VENDOR)) { + if (b_request == ACCESSORY_GET_PROTOCOL) { + schedule_work(&dev->getprotocol_work); + *((u16 
*)cdev->req->buf) = PROTOCOL_VERSION; + value = sizeof(u16); + cdev->req->complete = acc_complete_setup_noop; + /* clear any string left over from a previous session */ + memset(dev->manufacturer, 0, sizeof(dev->manufacturer)); + memset(dev->model, 0, sizeof(dev->model)); + memset(dev->description, 0, sizeof(dev->description)); + memset(dev->version, 0, sizeof(dev->version)); + memset(dev->uri, 0, sizeof(dev->uri)); + memset(dev->serial, 0, sizeof(dev->serial)); + dev->start_requested = 0; + dev->audio_mode = 0; + } + } + + if (value >= 0) { + cdev->req->zero = 0; + cdev->req->length = value; + value = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC); + if (value < 0) + ERROR(cdev, "%s setup response queue error\n", + __func__); + } + +err: + if (value == -EOPNOTSUPP) + VDBG(cdev, + "unknown class-specific control req " + "%02x.%02x v%04x i%04x l%u\n", + ctrl->bRequestType, ctrl->bRequest, + w_value, w_index, w_length); + put_acc_dev(dev); + return value; +} +EXPORT_SYMBOL_GPL(acc_ctrlrequest); + +int acc_ctrlrequest_composite(struct usb_composite_dev *cdev, + const struct usb_ctrlrequest *ctrl) +{ + u16 w_length = le16_to_cpu(ctrl->wLength); + + if (w_length > USB_COMP_EP0_BUFSIZ) { + if (ctrl->bRequestType & USB_DIR_IN) { + /* Cast away the const, we are going to overwrite on purpose. */ + __le16 *temp = (__le16 *)&ctrl->wLength; + + *temp = cpu_to_le16(USB_COMP_EP0_BUFSIZ); + w_length = USB_COMP_EP0_BUFSIZ; + } else { + return -EINVAL; + } + } + return acc_ctrlrequest(cdev, ctrl); +} +EXPORT_SYMBOL_GPL(acc_ctrlrequest_composite); + +static int +__acc_function_bind(struct usb_configuration *c, + struct usb_function *f, bool configfs) +{ + struct usb_composite_dev *cdev = c->cdev; + struct acc_dev *dev = func_to_dev(f); + int id; + int ret; + + DBG(cdev, "acc_function_bind dev: %p\n", dev); + + if (configfs) { + if (acc_string_defs[INTERFACE_STRING_INDEX].id == 0) { + ret = usb_string_id(c->cdev); + if (ret < 0) + return ret; + acc_string_defs[INTERFACE_STRING_INDEX].id = ret; + acc_interface_desc.iInterface = ret; + } + dev->cdev = c->cdev; + } + ret = hid_register_driver(&acc_hid_driver); + if (ret) + return ret; + + dev->start_requested = 0; + + /* allocate interface ID(s) */ + id = usb_interface_id(c, f); + if (id < 0) + return id; + acc_interface_desc.bInterfaceNumber = id; + + /* allocate endpoints */ + ret = create_bulk_endpoints(dev, &acc_fullspeed_in_desc, + &acc_fullspeed_out_desc); + if (ret) + return ret; + + /* support high speed hardware */ + acc_highspeed_in_desc.bEndpointAddress = + acc_fullspeed_in_desc.bEndpointAddress; + acc_highspeed_out_desc.bEndpointAddress = + acc_fullspeed_out_desc.bEndpointAddress; + + /* support super speed hardware */ + acc_superspeed_in_desc.bEndpointAddress = + acc_fullspeed_in_desc.bEndpointAddress; + acc_superspeed_out_desc.bEndpointAddress = + acc_fullspeed_out_desc.bEndpointAddress; + + /* support super speed plus hardware */ + acc_superspeedplus_in_desc.bEndpointAddress = + acc_fullspeed_in_desc.bEndpointAddress; + acc_superspeedplus_out_desc.bEndpointAddress = + acc_fullspeed_out_desc.bEndpointAddress; + + DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n", + gadget_is_dualspeed(c->cdev->gadget) ? 
"dual" : "full", + f->name, dev->ep_in->name, dev->ep_out->name); + return 0; +} + +static int +acc_function_bind_configfs(struct usb_configuration *c, + struct usb_function *f) { + return __acc_function_bind(c, f, true); +} + +static void +kill_all_hid_devices(struct acc_dev *dev) +{ + struct acc_hid_dev *hid; + struct list_head *entry, *temp; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + list_for_each_safe(entry, temp, &dev->hid_list) { + hid = list_entry(entry, struct acc_hid_dev, list); + list_del(&hid->list); + list_add(&hid->list, &dev->dead_hid_list); + } + list_for_each_safe(entry, temp, &dev->new_hid_list) { + hid = list_entry(entry, struct acc_hid_dev, list); + list_del(&hid->list); + list_add(&hid->list, &dev->dead_hid_list); + } + spin_unlock_irqrestore(&dev->lock, flags); + + schedule_work(&dev->hid_work); +} + +static void +acc_hid_unbind(struct acc_dev *dev) +{ + hid_unregister_driver(&acc_hid_driver); + kill_all_hid_devices(dev); +} + +static void +acc_function_unbind(struct usb_configuration *c, struct usb_function *f) +{ + struct acc_dev *dev = func_to_dev(f); + struct usb_request *req; + int i; + + dev->online = 0; /* clear online flag */ + wake_up(&dev->read_wq); /* unblock reads on closure */ + wake_up(&dev->write_wq); /* likewise for writes */ + + while ((req = req_get(dev, &dev->tx_idle))) + acc_request_free(req, dev->ep_in); + for (i = 0; i < RX_REQ_MAX; i++) + acc_request_free(dev->rx_req[i], dev->ep_out); + + acc_hid_unbind(dev); +} + +static void acc_getprotocol_work(struct work_struct *data) +{ + char *envp[2] = { "ACCESSORY=GETPROTOCOL", NULL }; + + kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp); +} + +static void acc_sendstring_work(struct work_struct *data) +{ + char *envp[2] = { "ACCESSORY=SENDSTRING", NULL }; + + kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp); +} + +static void acc_start_work(struct work_struct *data) +{ + char *envp[2] = { "ACCESSORY=START", NULL }; + + kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp); +} + +static int acc_hid_init(struct acc_hid_dev *hdev) +{ + struct hid_device *hid; + int ret; + + hid = hid_allocate_device(); + if (IS_ERR(hid)) + return PTR_ERR(hid); + + hid->ll_driver = &acc_hid_ll_driver; + hid->dev.parent = acc_device.this_device; + + hid->bus = BUS_USB; + hid->vendor = HID_ANY_ID; + hid->product = HID_ANY_ID; + hid->driver_data = hdev; + ret = hid_add_device(hid); + if (ret) { + pr_err("can't add hid device: %d\n", ret); + hid_destroy_device(hid); + return ret; + } + + hdev->hid = hid; + return 0; +} + +static void acc_hid_delete(struct acc_hid_dev *hid) +{ + kfree(hid->report_desc); + kfree(hid); +} + +static void acc_hid_work(struct work_struct *data) +{ + struct acc_dev *dev = get_acc_dev(); + struct list_head *entry, *temp; + struct acc_hid_dev *hid; + struct list_head new_list, dead_list; + unsigned long flags; + + if (!dev) + return; + + INIT_LIST_HEAD(&new_list); + + spin_lock_irqsave(&dev->lock, flags); + + /* copy hids that are ready for initialization to new_list */ + list_for_each_safe(entry, temp, &dev->new_hid_list) { + hid = list_entry(entry, struct acc_hid_dev, list); + if (hid->report_desc_offset == hid->report_desc_len) + list_move(&hid->list, &new_list); + } + + if (list_empty(&dev->dead_hid_list)) { + INIT_LIST_HEAD(&dead_list); + } else { + /* move all of dev->dead_hid_list to dead_list */ + dead_list.prev = dev->dead_hid_list.prev; + dead_list.next = dev->dead_hid_list.next; + dead_list.next->prev = &dead_list; 
+ dead_list.prev->next = &dead_list; + INIT_LIST_HEAD(&dev->dead_hid_list); + } + + spin_unlock_irqrestore(&dev->lock, flags); + + /* register new HID devices */ + list_for_each_safe(entry, temp, &new_list) { + hid = list_entry(entry, struct acc_hid_dev, list); + if (acc_hid_init(hid)) { + pr_err("can't add HID device %p\n", hid); + acc_hid_delete(hid); + } else { + spin_lock_irqsave(&dev->lock, flags); + list_move(&hid->list, &dev->hid_list); + spin_unlock_irqrestore(&dev->lock, flags); + } + } + + /* remove dead HID devices */ + list_for_each_safe(entry, temp, &dead_list) { + hid = list_entry(entry, struct acc_hid_dev, list); + list_del(&hid->list); + if (hid->hid) + hid_destroy_device(hid->hid); + acc_hid_delete(hid); + } + + put_acc_dev(dev); +} + +static int acc_function_set_alt(struct usb_function *f, + unsigned intf, unsigned alt) +{ + struct acc_dev *dev = func_to_dev(f); + struct usb_composite_dev *cdev = f->config->cdev; + int ret; + + DBG(cdev, "acc_function_set_alt intf: %d alt: %d\n", intf, alt); + + ret = config_ep_by_speed(cdev->gadget, f, dev->ep_in); + if (ret) + return ret; + + ret = usb_ep_enable(dev->ep_in); + if (ret) + return ret; + + ret = config_ep_by_speed(cdev->gadget, f, dev->ep_out); + if (ret) + return ret; + + ret = usb_ep_enable(dev->ep_out); + if (ret) { + usb_ep_disable(dev->ep_in); + return ret; + } + + dev->online = 1; + dev->disconnected = 0; /* if online then not disconnected */ + + /* readers may be blocked waiting for us to go online */ + wake_up(&dev->read_wq); + return 0; +} + +static void acc_function_disable(struct usb_function *f) +{ + struct acc_dev *dev = func_to_dev(f); + struct usb_composite_dev *cdev = dev->cdev; + + DBG(cdev, "acc_function_disable\n"); + acc_set_disconnected(dev); /* this now only sets disconnected */ + dev->online = 0; /* so now need to clear online flag here too */ + usb_ep_disable(dev->ep_in); + usb_ep_disable(dev->ep_out); + + /* readers may be blocked waiting for us to go online */ + wake_up(&dev->read_wq); + + VDBG(cdev, "%s disabled\n", dev->function.name); +} + +static int acc_setup(void) +{ + struct acc_dev_ref *ref = &_acc_dev_ref; + struct acc_dev *dev; + int ret; + + if (kref_read(&ref->kref)) + return -EBUSY; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + spin_lock_init(&dev->lock); + init_waitqueue_head(&dev->read_wq); + init_waitqueue_head(&dev->write_wq); + atomic_set(&dev->open_excl, 0); + INIT_LIST_HEAD(&dev->tx_idle); + INIT_LIST_HEAD(&dev->hid_list); + INIT_LIST_HEAD(&dev->new_hid_list); + INIT_LIST_HEAD(&dev->dead_hid_list); + INIT_DELAYED_WORK(&dev->start_work, acc_start_work); + INIT_WORK(&dev->hid_work, acc_hid_work); + INIT_WORK(&dev->getprotocol_work, acc_getprotocol_work); + INIT_WORK(&dev->sendstring_work, acc_sendstring_work); + + dev->ref = ref; + if (cmpxchg_relaxed(&ref->acc_dev, NULL, dev)) { + ret = -EBUSY; + goto err_free_dev; + } + + ret = misc_register(&acc_device); + if (ret) + goto err_zap_ptr; + + kref_init(&ref->kref); + return 0; + +err_zap_ptr: + ref->acc_dev = NULL; +err_free_dev: + kfree(dev); + pr_err("USB accessory gadget driver failed to initialize\n"); + return ret; +} + +void acc_disconnect(void) +{ + struct acc_dev *dev = get_acc_dev(); + + if (!dev) + return; + + /* unregister all HID devices if USB is disconnected */ + kill_all_hid_devices(dev); + put_acc_dev(dev); +} +EXPORT_SYMBOL_GPL(acc_disconnect); + +static void acc_cleanup(void) +{ + struct acc_dev *dev = get_acc_dev(); + + misc_deregister(&acc_device); + put_acc_dev(dev); + 
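+ /* The put above pairs with get_acc_dev() at the top of this function; the one below drops the initial kref taken in acc_setup(), freeing the device. */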
put_acc_dev(dev); /* Pairs with kref_init() in acc_setup() */ +} +static struct acc_instance *to_acc_instance(struct config_item *item) +{ + return container_of(to_config_group(item), struct acc_instance, + func_inst.group); +} + +static void acc_attr_release(struct config_item *item) +{ + struct acc_instance *fi_acc = to_acc_instance(item); + + usb_put_function_instance(&fi_acc->func_inst); +} + +static struct configfs_item_operations acc_item_ops = { + .release = acc_attr_release, +}; + +static struct config_item_type acc_func_type = { + .ct_item_ops = &acc_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct acc_instance *to_fi_acc(struct usb_function_instance *fi) +{ + return container_of(fi, struct acc_instance, func_inst); +} + +static int acc_set_inst_name(struct usb_function_instance *fi, const char *name) +{ + struct acc_instance *fi_acc; + char *ptr; + int name_len; + + name_len = strlen(name) + 1; + if (name_len > MAX_INST_NAME_LEN) + return -ENAMETOOLONG; + + ptr = kstrndup(name, name_len, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + fi_acc = to_fi_acc(fi); + fi_acc->name = ptr; + return 0; +} + +static void acc_free_inst(struct usb_function_instance *fi) +{ + struct acc_instance *fi_acc; + + fi_acc = to_fi_acc(fi); + kfree(fi_acc->name); + acc_cleanup(); +} + +static struct usb_function_instance *acc_alloc_inst(void) +{ + struct acc_instance *fi_acc; + int err; + + fi_acc = kzalloc(sizeof(*fi_acc), GFP_KERNEL); + if (!fi_acc) + return ERR_PTR(-ENOMEM); + fi_acc->func_inst.set_inst_name = acc_set_inst_name; + fi_acc->func_inst.free_func_inst = acc_free_inst; + + err = acc_setup(); + if (err) { + kfree(fi_acc); + return ERR_PTR(err); + } + + config_group_init_type_name(&fi_acc->func_inst.group, + "", &acc_func_type); + return &fi_acc->func_inst; +} + +static void acc_free(struct usb_function *f) +{ + struct acc_dev *dev = func_to_dev(f); + + put_acc_dev(dev); +} + +int acc_ctrlrequest_configfs(struct usb_function *f, + const struct usb_ctrlrequest *ctrl) { + if (f->config != NULL && f->config->cdev != NULL) + return acc_ctrlrequest(f->config->cdev, ctrl); + else + return -1; +} + +static struct usb_function *acc_alloc(struct usb_function_instance *fi) +{ + struct acc_dev *dev = get_acc_dev(); + + dev->function.name = "accessory"; + dev->function.strings = acc_strings, + dev->function.fs_descriptors = fs_acc_descs; + dev->function.hs_descriptors = hs_acc_descs; + dev->function.ss_descriptors = ss_acc_descs; + dev->function.ssp_descriptors = ssp_acc_descs; + dev->function.bind = acc_function_bind_configfs; + dev->function.unbind = acc_function_unbind; + dev->function.set_alt = acc_function_set_alt; + dev->function.disable = acc_function_disable; + dev->function.free_func = acc_free; + dev->function.setup = acc_ctrlrequest_configfs; + + return &dev->function; +} +DECLARE_USB_FUNCTION_INIT(accessory, acc_alloc_inst, acc_alloc); +MODULE_LICENSE("GPL"); diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c new file mode 100644 index 000000000000..c768a526b315 --- /dev/null +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -0,0 +1,1071 @@ +/* + * Gadget Function Driver for USB audio source device + * + * Copyright (C) 2012 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#define SAMPLE_RATE 44100 +#define FRAMES_PER_MSEC (SAMPLE_RATE / 1000) + +#define IN_EP_MAX_PACKET_SIZE 256 + +/* Number of requests to allocate */ +#define IN_EP_REQ_COUNT 4 + +#define AUDIO_AC_INTERFACE 0 +#define AUDIO_AS_INTERFACE 1 +#define AUDIO_NUM_INTERFACES 2 +#define MAX_INST_NAME_LEN 40 + +/* B.3.1 Standard AC Interface Descriptor */ +static struct usb_interface_descriptor ac_interface_desc = { + .bLength = USB_DT_INTERFACE_SIZE, + .bDescriptorType = USB_DT_INTERFACE, + .bNumEndpoints = 0, + .bInterfaceClass = USB_CLASS_AUDIO, + .bInterfaceSubClass = USB_SUBCLASS_AUDIOCONTROL, +}; + +DECLARE_UAC_AC_HEADER_DESCRIPTOR(2); + +#define UAC_DT_AC_HEADER_LENGTH UAC_DT_AC_HEADER_SIZE(AUDIO_NUM_INTERFACES) +/* 1 input terminal, 1 output terminal and 1 feature unit */ +#define UAC_DT_TOTAL_LENGTH (UAC_DT_AC_HEADER_LENGTH \ + + UAC_DT_INPUT_TERMINAL_SIZE + UAC_DT_OUTPUT_TERMINAL_SIZE \ + + UAC_DT_FEATURE_UNIT_SIZE(0)) +/* B.3.2 Class-Specific AC Interface Descriptor */ +static struct uac1_ac_header_descriptor_2 ac_header_desc = { + .bLength = UAC_DT_AC_HEADER_LENGTH, + .bDescriptorType = USB_DT_CS_INTERFACE, + .bDescriptorSubtype = UAC_HEADER, + .bcdADC = __constant_cpu_to_le16(0x0100), + .wTotalLength = __constant_cpu_to_le16(UAC_DT_TOTAL_LENGTH), + .bInCollection = AUDIO_NUM_INTERFACES, + .baInterfaceNr = { + [0] = AUDIO_AC_INTERFACE, + [1] = AUDIO_AS_INTERFACE, + } +}; + +#define INPUT_TERMINAL_ID 1 +static struct uac_input_terminal_descriptor input_terminal_desc = { + .bLength = UAC_DT_INPUT_TERMINAL_SIZE, + .bDescriptorType = USB_DT_CS_INTERFACE, + .bDescriptorSubtype = UAC_INPUT_TERMINAL, + .bTerminalID = INPUT_TERMINAL_ID, + .wTerminalType = UAC_INPUT_TERMINAL_MICROPHONE, + .bAssocTerminal = 0, + .wChannelConfig = 0x3, +}; + +DECLARE_UAC_FEATURE_UNIT_DESCRIPTOR(0); + +#define FEATURE_UNIT_ID 2 +static struct uac_feature_unit_descriptor_0 feature_unit_desc = { + .bLength = UAC_DT_FEATURE_UNIT_SIZE(0), + .bDescriptorType = USB_DT_CS_INTERFACE, + .bDescriptorSubtype = UAC_FEATURE_UNIT, + .bUnitID = FEATURE_UNIT_ID, + .bSourceID = INPUT_TERMINAL_ID, + .bControlSize = 2, +}; + +#define OUTPUT_TERMINAL_ID 3 +static struct uac1_output_terminal_descriptor output_terminal_desc = { + .bLength = UAC_DT_OUTPUT_TERMINAL_SIZE, + .bDescriptorType = USB_DT_CS_INTERFACE, + .bDescriptorSubtype = UAC_OUTPUT_TERMINAL, + .bTerminalID = OUTPUT_TERMINAL_ID, + .wTerminalType = UAC_TERMINAL_STREAMING, + .bAssocTerminal = FEATURE_UNIT_ID, + .bSourceID = FEATURE_UNIT_ID, +}; + +/* B.4.1 Standard AS Interface Descriptor */ +static struct usb_interface_descriptor as_interface_alt_0_desc = { + .bLength = USB_DT_INTERFACE_SIZE, + .bDescriptorType = USB_DT_INTERFACE, + .bAlternateSetting = 0, + .bNumEndpoints = 0, + .bInterfaceClass = USB_CLASS_AUDIO, + .bInterfaceSubClass = USB_SUBCLASS_AUDIOSTREAMING, +}; + +static struct usb_interface_descriptor as_interface_alt_1_desc = { + .bLength = USB_DT_INTERFACE_SIZE, + .bDescriptorType = USB_DT_INTERFACE, + .bAlternateSetting = 1, + .bNumEndpoints = 1, + .bInterfaceClass = USB_CLASS_AUDIO, + .bInterfaceSubClass = USB_SUBCLASS_AUDIOSTREAMING, +}; + +/* B.4.2 
Class-Specific AS Interface Descriptor */ +static struct uac1_as_header_descriptor as_header_desc = { + .bLength = UAC_DT_AS_HEADER_SIZE, + .bDescriptorType = USB_DT_CS_INTERFACE, + .bDescriptorSubtype = UAC_AS_GENERAL, + .bTerminalLink = INPUT_TERMINAL_ID, + .bDelay = 1, + .wFormatTag = UAC_FORMAT_TYPE_I_PCM, +}; + +DECLARE_UAC_FORMAT_TYPE_I_DISCRETE_DESC(1); + +static struct uac_format_type_i_discrete_descriptor_1 as_type_i_desc = { + .bLength = UAC_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(1), + .bDescriptorType = USB_DT_CS_INTERFACE, + .bDescriptorSubtype = UAC_FORMAT_TYPE, + .bFormatType = UAC_FORMAT_TYPE_I, + .bSubframeSize = 2, + .bBitResolution = 16, + .bSamFreqType = 1, +}; + +/* Standard ISO IN Endpoint Descriptor for highspeed */ +static struct usb_endpoint_descriptor hs_as_in_ep_desc = { + .bLength = USB_DT_ENDPOINT_AUDIO_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_IN, + .bmAttributes = USB_ENDPOINT_SYNC_SYNC + | USB_ENDPOINT_XFER_ISOC, + .wMaxPacketSize = __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE), + .bInterval = 4, /* poll 1 per millisecond */ +}; + +/* Standard ISO IN Endpoint Descriptor for fullspeed */ +static struct usb_endpoint_descriptor fs_as_in_ep_desc = { + .bLength = USB_DT_ENDPOINT_AUDIO_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_IN, + .bmAttributes = USB_ENDPOINT_SYNC_SYNC + | USB_ENDPOINT_XFER_ISOC, + .wMaxPacketSize = __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE), + .bInterval = 1, /* poll 1 per millisecond */ +}; + +/* Class-specific AS ISO IN Endpoint Descriptor */ +static struct uac_iso_endpoint_descriptor as_iso_in_desc = { + .bLength = UAC_ISO_ENDPOINT_DESC_SIZE, + .bDescriptorType = USB_DT_CS_ENDPOINT, + .bDescriptorSubtype = UAC_EP_GENERAL, + .bmAttributes = 1, + .bLockDelayUnits = 1, + .wLockDelay = __constant_cpu_to_le16(1), +}; + +static struct usb_descriptor_header *hs_audio_desc[] = { + (struct usb_descriptor_header *)&ac_interface_desc, + (struct usb_descriptor_header *)&ac_header_desc, + + (struct usb_descriptor_header *)&input_terminal_desc, + (struct usb_descriptor_header *)&output_terminal_desc, + (struct usb_descriptor_header *)&feature_unit_desc, + + (struct usb_descriptor_header *)&as_interface_alt_0_desc, + (struct usb_descriptor_header *)&as_interface_alt_1_desc, + (struct usb_descriptor_header *)&as_header_desc, + + (struct usb_descriptor_header *)&as_type_i_desc, + + (struct usb_descriptor_header *)&hs_as_in_ep_desc, + (struct usb_descriptor_header *)&as_iso_in_desc, + NULL, +}; + +static struct usb_descriptor_header *fs_audio_desc[] = { + (struct usb_descriptor_header *)&ac_interface_desc, + (struct usb_descriptor_header *)&ac_header_desc, + + (struct usb_descriptor_header *)&input_terminal_desc, + (struct usb_descriptor_header *)&output_terminal_desc, + (struct usb_descriptor_header *)&feature_unit_desc, + + (struct usb_descriptor_header *)&as_interface_alt_0_desc, + (struct usb_descriptor_header *)&as_interface_alt_1_desc, + (struct usb_descriptor_header *)&as_header_desc, + + (struct usb_descriptor_header *)&as_type_i_desc, + + (struct usb_descriptor_header *)&fs_as_in_ep_desc, + (struct usb_descriptor_header *)&as_iso_in_desc, + NULL, +}; + +static struct snd_pcm_hardware audio_hw_info = { + .info = SNDRV_PCM_INFO_MMAP | + SNDRV_PCM_INFO_MMAP_VALID | + SNDRV_PCM_INFO_BATCH | + SNDRV_PCM_INFO_INTERLEAVED | + SNDRV_PCM_INFO_BLOCK_TRANSFER, + + .formats = SNDRV_PCM_FMTBIT_S16_LE, + .channels_min = 2, + .channels_max = 2, + .rate_min = SAMPLE_RATE, + .rate_max = SAMPLE_RATE, + +
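+ /* Buffer geometry below: at 44.1 kHz stereo S16 the stream consumes 176400 bytes/s, so the 1 MiB buffer ceiling holds roughly 5.9 seconds of audio. */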
.buffer_bytes_max = 1024 * 1024, + .period_bytes_min = 64, + .period_bytes_max = 512 * 1024, + .periods_min = 2, + .periods_max = 1024, +}; + +/*-------------------------------------------------------------------------*/ + +struct audio_source_config { + int card; + int device; +}; + +struct audio_dev { + struct usb_function func; + struct snd_card *card; + struct snd_pcm *pcm; + struct snd_pcm_substream *substream; + + struct list_head idle_reqs; + struct usb_ep *in_ep; + + spinlock_t lock; + + /* beginning, end and current position in our buffer */ + void *buffer_start; + void *buffer_end; + void *buffer_pos; + + /* byte size of a "period" */ + unsigned int period; + /* bytes sent since last call to snd_pcm_period_elapsed */ + unsigned int period_offset; + /* time we started playing */ + ktime_t start_time; + /* number of frames sent since start_time */ + s64 frames_sent; + struct audio_source_config *config; + /* for creating and issuing QoS requests */ + struct pm_qos_request pm_qos; +}; + +static inline struct audio_dev *func_to_audio(struct usb_function *f) +{ + return container_of(f, struct audio_dev, func); +} + +/*-------------------------------------------------------------------------*/ + +struct audio_source_instance { + struct usb_function_instance func_inst; + const char *name; + struct audio_source_config *config; + struct device *audio_device; +}; + +static void audio_source_attr_release(struct config_item *item); + +static struct configfs_item_operations audio_source_item_ops = { + .release = audio_source_attr_release, +}; + +static struct config_item_type audio_source_func_type = { + .ct_item_ops = &audio_source_item_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t audio_source_pcm_show(struct device *dev, + struct device_attribute *attr, char *buf); + +static DEVICE_ATTR(pcm, S_IRUGO, audio_source_pcm_show, NULL); + +static struct device_attribute *audio_source_function_attributes[] = { + &dev_attr_pcm, + NULL +}; + +/*--------------------------------------------------------------------------*/ + +static struct usb_request *audio_request_new(struct usb_ep *ep, int buffer_size) +{ + struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + + if (!req) + return NULL; + + req->buf = kmalloc(buffer_size, GFP_KERNEL); + if (!req->buf) { + usb_ep_free_request(ep, req); + return NULL; + } + req->length = buffer_size; + return req; +} + +static void audio_request_free(struct usb_request *req, struct usb_ep *ep) +{ + if (req) { + kfree(req->buf); + usb_ep_free_request(ep, req); + } +} + +static void audio_req_put(struct audio_dev *audio, struct usb_request *req) +{ + unsigned long flags; + + spin_lock_irqsave(&audio->lock, flags); + list_add_tail(&req->list, &audio->idle_reqs); + spin_unlock_irqrestore(&audio->lock, flags); +} + +static struct usb_request *audio_req_get(struct audio_dev *audio) +{ + unsigned long flags; + struct usb_request *req; + + spin_lock_irqsave(&audio->lock, flags); + if (list_empty(&audio->idle_reqs)) { + req = 0; + } else { + req = list_first_entry(&audio->idle_reqs, struct usb_request, + list); + list_del(&req->list); + } + spin_unlock_irqrestore(&audio->lock, flags); + return req; +} + +/* send the appropriate number of packets to match our bitrate */ +static void audio_send(struct audio_dev *audio) +{ + struct snd_pcm_runtime *runtime; + struct usb_request *req; + int length, length1, length2, ret; + s64 msecs; + s64 frames; + ktime_t now; + + /* audio->substream will be null if we have been closed */ + if (!audio->substream) + return; + 
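+ /* Pacing below: frames owed = elapsed_ms * SAMPLE_RATE / 1000; e.g. 10 ms of wall time at 44100 Hz is 441 frames (1764 bytes of stereo S16), which fits in seven packets of at most IN_EP_MAX_PACKET_SIZE bytes. */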
/* audio->buffer_pos will be null if we have been stopped */ + if (!audio->buffer_pos) + return; + + runtime = audio->substream->runtime; + + /* compute number of frames to send */ + now = ktime_get(); + msecs = div_s64((ktime_to_ns(now) - ktime_to_ns(audio->start_time)), + 1000000); + frames = div_s64((msecs * SAMPLE_RATE), 1000); + + /* Readjust our frames_sent if we fall too far behind. + * If we get too far behind it is better to drop some frames than + * to keep sending data too fast in an attempt to catch up. + */ + if (frames - audio->frames_sent > 10 * FRAMES_PER_MSEC) + audio->frames_sent = frames - FRAMES_PER_MSEC; + + frames -= audio->frames_sent; + + /* We need to send something to keep the pipeline going */ + if (frames <= 0) + frames = FRAMES_PER_MSEC; + + while (frames > 0) { + req = audio_req_get(audio); + if (!req) + break; + + length = frames_to_bytes(runtime, frames); + if (length > IN_EP_MAX_PACKET_SIZE) + length = IN_EP_MAX_PACKET_SIZE; + + if (audio->buffer_pos + length > audio->buffer_end) + length1 = audio->buffer_end - audio->buffer_pos; + else + length1 = length; + memcpy(req->buf, audio->buffer_pos, length1); + if (length1 < length) { + /* Wrap around and copy remaining length + * at beginning of buffer. + */ + length2 = length - length1; + memcpy(req->buf + length1, audio->buffer_start, + length2); + audio->buffer_pos = audio->buffer_start + length2; + } else { + audio->buffer_pos += length1; + if (audio->buffer_pos >= audio->buffer_end) + audio->buffer_pos = audio->buffer_start; + } + + req->length = length; + ret = usb_ep_queue(audio->in_ep, req, GFP_ATOMIC); + if (ret < 0) { + pr_err("usb_ep_queue failed ret: %d\n", ret); + audio_req_put(audio, req); + break; + } + + frames -= bytes_to_frames(runtime, length); + audio->frames_sent += bytes_to_frames(runtime, length); + } +} + +static void audio_control_complete(struct usb_ep *ep, struct usb_request *req) +{ + /* nothing to do here */ +} + +static void audio_data_complete(struct usb_ep *ep, struct usb_request *req) +{ + struct audio_dev *audio = req->context; + + pr_debug("audio_data_complete req->status %d req->actual %d\n", + req->status, req->actual); + + audio_req_put(audio, req); + + if (!audio->buffer_start || req->status) + return; + + audio->period_offset += req->actual; + if (audio->period_offset >= audio->period) { + snd_pcm_period_elapsed(audio->substream); + audio->period_offset = 0; + } + audio_send(audio); +} + +static int audio_set_endpoint_req(struct usb_function *f, + const struct usb_ctrlrequest *ctrl) +{ + int value = -EOPNOTSUPP; + u16 ep = le16_to_cpu(ctrl->wIndex); + u16 len = le16_to_cpu(ctrl->wLength); + u16 w_value = le16_to_cpu(ctrl->wValue); + + pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n", + ctrl->bRequest, w_value, len, ep); + + switch (ctrl->bRequest) { + case UAC_SET_CUR: + case UAC_SET_MIN: + case UAC_SET_MAX: + case UAC_SET_RES: + value = len; + break; + default: + break; + } + + return value; +} + +static int audio_get_endpoint_req(struct usb_function *f, + const struct usb_ctrlrequest *ctrl) +{ + struct usb_composite_dev *cdev = f->config->cdev; + int value = -EOPNOTSUPP; + u8 ep = ((le16_to_cpu(ctrl->wIndex) >> 8) & 0xFF); + u16 len = le16_to_cpu(ctrl->wLength); + u16 w_value = le16_to_cpu(ctrl->wValue); + u8 *buf = cdev->req->buf; + + pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n", + ctrl->bRequest, w_value, len, ep); + + if (w_value == UAC_EP_CS_ATTR_SAMPLE_RATE << 8) { + switch (ctrl->bRequest) { + case UAC_GET_CUR: + case UAC_GET_MIN: 
+ case UAC_GET_MAX: + case UAC_GET_RES: + /* return our sample rate */ + buf[0] = (u8)SAMPLE_RATE; + buf[1] = (u8)(SAMPLE_RATE >> 8); + buf[2] = (u8)(SAMPLE_RATE >> 16); + value = 3; + break; + default: + break; + } + } + + return value; +} + +static int +audio_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl) +{ + struct usb_composite_dev *cdev = f->config->cdev; + struct usb_request *req = cdev->req; + int value = -EOPNOTSUPP; + u16 w_index = le16_to_cpu(ctrl->wIndex); + u16 w_value = le16_to_cpu(ctrl->wValue); + u16 w_length = le16_to_cpu(ctrl->wLength); + + /* composite driver infrastructure handles everything; interface + * activation uses set_alt(). + */ + switch (ctrl->bRequestType) { + case USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_ENDPOINT: + value = audio_set_endpoint_req(f, ctrl); + break; + + case USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT: + value = audio_get_endpoint_req(f, ctrl); + break; + } + + /* respond with data transfer or status phase? */ + if (value >= 0) { + pr_debug("audio req%02x.%02x v%04x i%04x l%d\n", + ctrl->bRequestType, ctrl->bRequest, + w_value, w_index, w_length); + req->zero = 0; + req->length = value; + req->complete = audio_control_complete; + value = usb_ep_queue(cdev->gadget->ep0, req, GFP_ATOMIC); + if (value < 0) + pr_err("audio response on err %d\n", value); + } + + /* device either stalls (value < 0) or reports success */ + return value; +} + +static int audio_set_alt(struct usb_function *f, unsigned intf, unsigned alt) +{ + struct audio_dev *audio = func_to_audio(f); + struct usb_composite_dev *cdev = f->config->cdev; + int ret; + + pr_debug("audio_set_alt intf %d, alt %d\n", intf, alt); + + ret = config_ep_by_speed(cdev->gadget, f, audio->in_ep); + if (ret) + return ret; + + usb_ep_enable(audio->in_ep); + return 0; +} + +static void audio_disable(struct usb_function *f) +{ + struct audio_dev *audio = func_to_audio(f); + + pr_debug("audio_disable\n"); + usb_ep_disable(audio->in_ep); +} + +static void audio_free_func(struct usb_function *f) +{ + /* no-op */ +} + +/*-------------------------------------------------------------------------*/ + +static void audio_build_desc(struct audio_dev *audio) +{ + u8 *sam_freq; + int rate; + + /* Set channel numbers */ + input_terminal_desc.bNrChannels = 2; + as_type_i_desc.bNrChannels = 2; + + /* Set sample rates */ + rate = SAMPLE_RATE; + sam_freq = as_type_i_desc.tSamFreq[0]; + memcpy(sam_freq, &rate, 3); +} + + +static int snd_card_setup(struct usb_configuration *c, + struct audio_source_config *config); +static struct audio_source_instance *to_fi_audio_source( + const struct usb_function_instance *fi); + + +/* audio function driver setup/binding */ +static int +audio_bind(struct usb_configuration *c, struct usb_function *f) +{ + struct usb_composite_dev *cdev = c->cdev; + struct audio_dev *audio = func_to_audio(f); + int status; + struct usb_ep *ep; + struct usb_request *req; + int i; + int err; + + if (IS_ENABLED(CONFIG_USB_CONFIGFS)) { + struct audio_source_instance *fi_audio = + to_fi_audio_source(f->fi); + struct audio_source_config *config = + fi_audio->config; + + err = snd_card_setup(c, config); + if (err) + return err; + } + + audio_build_desc(audio); + + /* allocate instance-specific interface IDs, and patch descriptors */ + status = usb_interface_id(c, f); + if (status < 0) + goto fail; + ac_interface_desc.bInterfaceNumber = status; + + /* AUDIO_AC_INTERFACE */ + ac_header_desc.baInterfaceNr[0] = status; + + status = usb_interface_id(c, f); + if (status < 0) + goto fail; + 
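+ /* Alt setting 0 (zero bandwidth) and alt setting 1 (streaming) describe the same interface, so both descriptors share the interface number just allocated. */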
as_interface_alt_0_desc.bInterfaceNumber = status; + as_interface_alt_1_desc.bInterfaceNumber = status; + + /* AUDIO_AS_INTERFACE */ + ac_header_desc.baInterfaceNr[1] = status; + + status = -ENODEV; + + /* allocate our endpoint */ + ep = usb_ep_autoconfig(cdev->gadget, &fs_as_in_ep_desc); + if (!ep) + goto fail; + audio->in_ep = ep; + ep->driver_data = audio; /* claim */ + + if (gadget_is_dualspeed(c->cdev->gadget)) + hs_as_in_ep_desc.bEndpointAddress = + fs_as_in_ep_desc.bEndpointAddress; + + f->fs_descriptors = fs_audio_desc; + f->hs_descriptors = hs_audio_desc; + + for (i = 0, status = 0; i < IN_EP_REQ_COUNT && status == 0; i++) { + req = audio_request_new(ep, IN_EP_MAX_PACKET_SIZE); + if (req) { + req->context = audio; + req->complete = audio_data_complete; + audio_req_put(audio, req); + } else + status = -ENOMEM; + } + +fail: + return status; +} + +static void +audio_unbind(struct usb_configuration *c, struct usb_function *f) +{ + struct audio_dev *audio = func_to_audio(f); + struct usb_request *req; + + while ((req = audio_req_get(audio))) + audio_request_free(req, audio->in_ep); + + snd_card_free_when_closed(audio->card); + audio->card = NULL; + audio->pcm = NULL; + audio->substream = NULL; + audio->in_ep = NULL; + + if (IS_ENABLED(CONFIG_USB_CONFIGFS)) { + struct audio_source_instance *fi_audio = + to_fi_audio_source(f->fi); + struct audio_source_config *config = + fi_audio->config; + + config->card = -1; + config->device = -1; + } +} + +static void audio_pcm_playback_start(struct audio_dev *audio) +{ + audio->start_time = ktime_get(); + audio->frames_sent = 0; + audio_send(audio); +} + +static void audio_pcm_playback_stop(struct audio_dev *audio) +{ + unsigned long flags; + + spin_lock_irqsave(&audio->lock, flags); + audio->buffer_start = 0; + audio->buffer_end = 0; + audio->buffer_pos = 0; + spin_unlock_irqrestore(&audio->lock, flags); +} + +static int audio_pcm_open(struct snd_pcm_substream *substream) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + struct audio_dev *audio = substream->private_data; + + runtime->private_data = audio; + runtime->hw = audio_hw_info; + snd_pcm_limit_hw_rates(runtime); + runtime->hw.channels_max = 2; + + audio->substream = substream; + + /* Add the QoS request and set the latency to 0 */ + cpu_latency_qos_add_request(&audio->pm_qos, 0); + + return 0; +} + +static int audio_pcm_close(struct snd_pcm_substream *substream) +{ + struct audio_dev *audio = substream->private_data; + unsigned long flags; + + spin_lock_irqsave(&audio->lock, flags); + + /* Remove the QoS request */ + cpu_latency_qos_remove_request(&audio->pm_qos); + + audio->substream = NULL; + spin_unlock_irqrestore(&audio->lock, flags); + + return 0; +} + +static int audio_pcm_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params) +{ + unsigned int channels = params_channels(params); + unsigned int rate = params_rate(params); + + if (rate != SAMPLE_RATE) + return -EINVAL; + if (channels != 2) + return -EINVAL; + + return snd_pcm_lib_alloc_vmalloc_buffer(substream, + params_buffer_bytes(params)); +} + +static int audio_pcm_hw_free(struct snd_pcm_substream *substream) +{ + return snd_pcm_lib_free_vmalloc_buffer(substream); +} + +static int audio_pcm_prepare(struct snd_pcm_substream *substream) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + struct audio_dev *audio = runtime->private_data; + + audio->period = snd_pcm_lib_period_bytes(substream); + audio->period_offset = 0; + audio->buffer_start = runtime->dma_area; + audio->buffer_end = 
audio->buffer_start + + snd_pcm_lib_buffer_bytes(substream); + audio->buffer_pos = audio->buffer_start; + + return 0; +} + +static snd_pcm_uframes_t audio_pcm_pointer(struct snd_pcm_substream *substream) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + struct audio_dev *audio = runtime->private_data; + ssize_t bytes = audio->buffer_pos - audio->buffer_start; + + /* return offset of next frame to fill in our buffer */ + return bytes_to_frames(runtime, bytes); +} + +static int audio_pcm_playback_trigger(struct snd_pcm_substream *substream, + int cmd) +{ + struct audio_dev *audio = substream->runtime->private_data; + int ret = 0; + + switch (cmd) { + case SNDRV_PCM_TRIGGER_START: + case SNDRV_PCM_TRIGGER_RESUME: + audio_pcm_playback_start(audio); + break; + + case SNDRV_PCM_TRIGGER_STOP: + case SNDRV_PCM_TRIGGER_SUSPEND: + audio_pcm_playback_stop(audio); + break; + + default: + ret = -EINVAL; + } + + return ret; +} + +static struct audio_dev _audio_dev = { + .func = { + .name = "audio_source", + .bind = audio_bind, + .unbind = audio_unbind, + .set_alt = audio_set_alt, + .setup = audio_setup, + .disable = audio_disable, + .free_func = audio_free_func, + }, + .lock = __SPIN_LOCK_UNLOCKED(_audio_dev.lock), + .idle_reqs = LIST_HEAD_INIT(_audio_dev.idle_reqs), +}; + +static struct snd_pcm_ops audio_playback_ops = { + .open = audio_pcm_open, + .close = audio_pcm_close, + .ioctl = snd_pcm_lib_ioctl, + .hw_params = audio_pcm_hw_params, + .hw_free = audio_pcm_hw_free, + .prepare = audio_pcm_prepare, + .trigger = audio_pcm_playback_trigger, + .pointer = audio_pcm_pointer, +}; + +int audio_source_bind_config(struct usb_configuration *c, + struct audio_source_config *config) +{ + struct audio_dev *audio; + int err; + + config->card = -1; + config->device = -1; + + audio = &_audio_dev; + + err = snd_card_setup(c, config); + if (err) + return err; + + err = usb_add_function(c, &audio->func); + if (err) + goto add_fail; + + return 0; + +add_fail: + snd_card_free(audio->card); + return err; +} + +static int snd_card_setup(struct usb_configuration *c, + struct audio_source_config *config) +{ + struct audio_dev *audio; + struct snd_card *card; + struct snd_pcm *pcm; + int err; + + audio = &_audio_dev; + + err = snd_card_new(&c->cdev->gadget->dev, + SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1, + THIS_MODULE, 0, &card); + if (err) + return err; + + err = snd_pcm_new(card, "USB audio source", 0, 1, 0, &pcm); + if (err) + goto pcm_fail; + + pcm->private_data = audio; + pcm->info_flags = 0; + audio->pcm = pcm; + + strlcpy(pcm->name, "USB gadget audio", sizeof(pcm->name)); + + snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &audio_playback_ops); + snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV, + NULL, 0, 64 * 1024); + + strlcpy(card->driver, "audio_source", sizeof(card->driver)); + strlcpy(card->shortname, card->driver, sizeof(card->shortname)); + strlcpy(card->longname, "USB accessory audio source", + sizeof(card->longname)); + + err = snd_card_register(card); + if (err) + goto register_fail; + + config->card = pcm->card->number; + config->device = pcm->device; + audio->card = card; + return 0; + +register_fail: +pcm_fail: + snd_card_free(audio->card); + return err; +} + +static struct audio_source_instance *to_audio_source_instance( + struct config_item *item) +{ + return container_of(to_config_group(item), struct audio_source_instance, + func_inst.group); +} + +static struct audio_source_instance *to_fi_audio_source( + const struct usb_function_instance *fi) +{ + return container_of(fi, 
struct audio_source_instance, func_inst); +} + +static void audio_source_attr_release(struct config_item *item) +{ + struct audio_source_instance *fi_audio = to_audio_source_instance(item); + + usb_put_function_instance(&fi_audio->func_inst); +} + +static int audio_source_set_inst_name(struct usb_function_instance *fi, + const char *name) +{ + struct audio_source_instance *fi_audio; + char *ptr; + int name_len; + + name_len = strlen(name) + 1; + if (name_len > MAX_INST_NAME_LEN) + return -ENAMETOOLONG; + + ptr = kstrndup(name, name_len, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + fi_audio = to_fi_audio_source(fi); + fi_audio->name = ptr; + + return 0; +} + +static void audio_source_free_inst(struct usb_function_instance *fi) +{ + struct audio_source_instance *fi_audio; + + fi_audio = to_fi_audio_source(fi); + device_destroy(fi_audio->audio_device->class, + fi_audio->audio_device->devt); + kfree(fi_audio->name); + kfree(fi_audio->config); +} + +static ssize_t audio_source_pcm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct audio_source_instance *fi_audio = dev_get_drvdata(dev); + struct audio_source_config *config = fi_audio->config; + + /* print PCM card and device numbers */ + return sprintf(buf, "%d %d\n", config->card, config->device); +} + +struct device *create_function_device(char *name); + +static struct usb_function_instance *audio_source_alloc_inst(void) +{ + struct audio_source_instance *fi_audio; + struct device_attribute **attrs; + struct device_attribute *attr; + struct device *dev; + void *err_ptr; + int err = 0; + + fi_audio = kzalloc(sizeof(*fi_audio), GFP_KERNEL); + if (!fi_audio) + return ERR_PTR(-ENOMEM); + + fi_audio->func_inst.set_inst_name = audio_source_set_inst_name; + fi_audio->func_inst.free_func_inst = audio_source_free_inst; + + fi_audio->config = kzalloc(sizeof(struct audio_source_config), + GFP_KERNEL); + if (!fi_audio->config) { + err_ptr = ERR_PTR(-ENOMEM); + goto fail_audio; + } + + config_group_init_type_name(&fi_audio->func_inst.group, "", + &audio_source_func_type); + dev = create_function_device("f_audio_source"); + + if (IS_ERR(dev)) { + err_ptr = dev; + goto fail_audio_config; + } + + fi_audio->config->card = -1; + fi_audio->config->device = -1; + fi_audio->audio_device = dev; + + attrs = audio_source_function_attributes; + if (attrs) { + while ((attr = *attrs++) && !err) + err = device_create_file(dev, attr); + if (err) { + err_ptr = ERR_PTR(-EINVAL); + goto fail_device; + } + } + + dev_set_drvdata(dev, fi_audio); + _audio_dev.config = fi_audio->config; + + return &fi_audio->func_inst; + +fail_device: + device_destroy(dev->class, dev->devt); +fail_audio_config: + kfree(fi_audio->config); +fail_audio: + kfree(fi_audio); + return err_ptr; + +} + +static struct usb_function *audio_source_alloc(struct usb_function_instance *fi) +{ + return &_audio_dev.func; +} + +DECLARE_USB_FUNCTION_INIT(audio_source, audio_source_alloc_inst, + audio_source_alloc); +MODULE_LICENSE("GPL"); diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index c9145ee95956..5f4f52bf0349 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3707,7 +3707,7 @@ int ffs_name_dev(struct ffs_dev *dev, const char *name) existing = _ffs_do_find_dev(name); if (!existing) - strlcpy(dev->name, name, ARRAY_SIZE(dev->name)); + strscpy(dev->name, name, ARRAY_SIZE(dev->name)); else if (existing != dev) ret = -EBUSY; diff --git a/drivers/usb/gadget/function/f_mass_storage.c 
b/drivers/usb/gadget/function/f_mass_storage.c index 6ad669dde41c..1635ca2e9cdd 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -1156,25 +1156,72 @@ static int do_read_toc(struct fsg_common *common, struct fsg_buffhd *bh) int msf = common->cmnd[1] & 0x02; int start_track = common->cmnd[6]; u8 *buf = (u8 *)bh->buf; + u8 format; + int i, len; + + format = common->cmnd[2] & 0xf; if ((common->cmnd[1] & ~0x02) != 0 || /* Mask away MSF */ - start_track > 1) { + (start_track > 1 && format != 0x1)) { curlun->sense_data = SS_INVALID_FIELD_IN_CDB; return -EINVAL; } - memset(buf, 0, 20); - buf[1] = (20-2); /* TOC data length */ - buf[2] = 1; /* First track number */ - buf[3] = 1; /* Last track number */ - buf[5] = 0x16; /* Data track, copying allowed */ - buf[6] = 0x01; /* Only track is number 1 */ - store_cdrom_address(&buf[8], msf, 0); + /* + * Check if CDB is old style SFF-8020i + * i.e. format is in 2 MSBs of byte 9 + * Mac OS-X host sends us this. + */ + if (format == 0) + format = (common->cmnd[9] >> 6) & 0x3; - buf[13] = 0x16; /* Lead-out track is data */ - buf[14] = 0xAA; /* Lead-out track number */ - store_cdrom_address(&buf[16], msf, curlun->num_sectors); - return 20; + switch (format) { + case 0: /* Formatted TOC */ + case 1: /* Multi-session info */ + len = 4 + 2*8; /* 4 byte header + 2 descriptors */ + memset(buf, 0, len); + buf[1] = len - 2; /* TOC Length excludes length field */ + buf[2] = 1; /* First track number */ + buf[3] = 1; /* Last track number */ + buf[5] = 0x16; /* Data track, copying allowed */ + buf[6] = 0x01; /* Only track is number 1 */ + store_cdrom_address(&buf[8], msf, 0); + + buf[13] = 0x16; /* Lead-out track is data */ + buf[14] = 0xAA; /* Lead-out track number */ + store_cdrom_address(&buf[16], msf, curlun->num_sectors); + return len; + + case 2: + /* Raw TOC */ + len = 4 + 3*11; /* 4 byte header + 3 descriptors */ + memset(buf, 0, len); /* Header + A0, A1 & A2 descriptors */ + buf[1] = len - 2; /* TOC Length excludes length field */ + buf[2] = 1; /* First complete session */ + buf[3] = 1; /* Last complete session */ + + buf += 4; + /* fill in A0, A1 and A2 points */ + for (i = 0; i < 3; i++) { + buf[0] = 1; /* Session number */ + buf[1] = 0x16; /* Data track, copying allowed */ + /* 2 - Track number 0 -> TOC */ + buf[3] = 0xA0 + i; /* A0, A1, A2 point */ + /* 4, 5, 6 - Min, sec, frame is zero */ + buf[8] = 1; /* Pmin: last track number */ + buf += 11; /* go to next track descriptor */ + } + buf -= 11; /* go back to A2 descriptor */ + + /* For A2, 7, 8, 9, 10 - zero, Pmin, Psec, Pframe of Lead out */ + store_cdrom_address(&buf[7], msf, curlun->num_sectors); + return len; + + default: + /* PMA, ATIP, CD-TEXT not supported/required */ + curlun->sense_data = SS_INVALID_FIELD_IN_CDB; + return -EINVAL; + } } static int do_mode_sense(struct fsg_common *common, struct fsg_buffhd *bh) @@ -1901,7 +1948,7 @@ static int do_scsi_command(struct fsg_common *common) common->data_size_from_cmnd = get_unaligned_be16(&common->cmnd[7]); reply = check_command(common, 10, DATA_DIR_TO_HOST, - (7<<6) | (1<<1), 1, + (0xf<<6) | (3<<1), 1, "READ TOC"); if (reply == 0) reply = do_read_toc(common, bh); @@ -2269,6 +2316,16 @@ static void fsg_disable(struct usb_function *f) { struct fsg_dev *fsg = fsg_from_func(f); + /* Disable the endpoints */ + if (fsg->bulk_in_enabled) { + usb_ep_disable(fsg->bulk_in); + fsg->bulk_in_enabled = 0; + } + if (fsg->bulk_out_enabled) { + usb_ep_disable(fsg->bulk_out); + fsg->bulk_out_enabled = 0; + } + 
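+ /* usb_ep_disable() also dequeues any in-flight bulk requests, so the config-change exception below starts from quiesced endpoints. */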
__raise_exception(fsg->common, FSG_STATE_CONFIG_CHANGE, NULL); } @@ -3404,6 +3461,7 @@ static struct usb_function *fsg_alloc(struct usb_function_instance *fi) DECLARE_USB_FUNCTION_INIT(mass_storage, fsg_alloc_inst, fsg_alloc); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); MODULE_AUTHOR("Michal Nazarewicz"); /************************* Module parameters *************************/ diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 71a1a26e85c7..0e78da6fc091 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -1224,6 +1224,65 @@ static void f_midi_free_inst(struct usb_function_instance *f) } } +#ifdef CONFIG_USB_CONFIGFS_UEVENT +extern struct device *create_function_device(char *name); +static ssize_t alsa_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct usb_function_instance *fi_midi = dev_get_drvdata(dev); + struct f_midi *midi; + + if (!fi_midi->f) + dev_warn(dev, "f_midi: function not set\n"); + + if (fi_midi && fi_midi->f) { + midi = func_to_midi(fi_midi->f); + if (midi->rmidi && midi->card && midi->rmidi->card) + return sprintf(buf, "%d %d\n", + midi->rmidi->card->number, midi->rmidi->device); + } + + /* print PCM card and device numbers */ + return sprintf(buf, "%d %d\n", -1, -1); +} + +static DEVICE_ATTR(alsa, S_IRUGO, alsa_show, NULL); + +static struct device_attribute *alsa_function_attributes[] = { + &dev_attr_alsa, + NULL +}; + +static int create_alsa_device(struct usb_function_instance *fi) +{ + struct device *dev; + struct device_attribute **attrs; + struct device_attribute *attr; + int err = 0; + + dev = create_function_device("f_midi"); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + attrs = alsa_function_attributes; + if (attrs) { + while ((attr = *attrs++) && !err) + err = device_create_file(dev, attr); + if (err) { + device_destroy(dev->class, dev->devt); + return -EINVAL; + } + } + dev_set_drvdata(dev, fi); + return 0; +} +#else +static int create_alsa_device(struct usb_function_instance *fi) +{ + return 0; +} +#endif + static struct usb_function_instance *f_midi_alloc_inst(void) { struct f_midi_opts *opts; @@ -1242,6 +1301,11 @@ static struct usb_function_instance *f_midi_alloc_inst(void) opts->out_ports = 1; opts->refcnt = 1; + if (create_alsa_device(&opts->func_inst)) { + kfree(opts); + return ERR_PTR(-ENODEV); + } + config_group_init_type_name(&opts->func_inst.group, "", &midi_func_type); @@ -1262,6 +1326,7 @@ static void f_midi_free(struct usb_function *f) kfifo_free(&midi->in_req_fifo); kfree(midi); free = true; + opts->func_inst.f = NULL; } mutex_unlock(&opts->lock); @@ -1349,6 +1414,7 @@ static struct usb_function *f_midi_alloc(struct usb_function_instance *fi) midi->func.disable = f_midi_disable; midi->func.free_func = f_midi_free; + fi->f = &midi->func; return &midi->func; midi_free: diff --git a/drivers/usb/gadget/function/f_uac1_legacy.c b/drivers/usb/gadget/function/f_uac1_legacy.c index e2d7f69128a0..8ffd477e79e1 100644 --- a/drivers/usb/gadget/function/f_uac1_legacy.c +++ b/drivers/usb/gadget/function/f_uac1_legacy.c @@ -1015,4 +1015,5 @@ static struct usb_function *f_audio_alloc(struct usb_function_instance *fi) DECLARE_USB_FUNCTION_INIT(uac1_legacy, f_audio_alloc_inst, f_audio_alloc); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); MODULE_AUTHOR("Bryan Wu"); diff --git a/drivers/usb/gadget/function/f_uvc.c 
b/drivers/usb/gadget/function/f_uvc.c index 5df1b68e5eac..32f2c1645467 100644 --- a/drivers/usb/gadget/function/f_uvc.c +++ b/drivers/usb/gadget/function/f_uvc.c @@ -24,7 +24,6 @@ #include #include -#include "u_uvc.h" #include "uvc.h" #include "uvc_configfs.h" #include "uvc_v4l2.h" @@ -40,11 +39,8 @@ MODULE_PARM_DESC(trace, "Trace level bitmask"); /* string IDs are assigned dynamically */ -#define UVC_STRING_CONTROL_IDX 0 -#define UVC_STRING_STREAMING_IDX 1 - static struct usb_string uvc_en_us_strings[] = { - [UVC_STRING_CONTROL_IDX].s = "UVC Camera", + /* [UVC_STRING_CONTROL_IDX].s = DYNAMIC, */ [UVC_STRING_STREAMING_IDX].s = "Video Streaming", { } }; @@ -142,7 +138,8 @@ static struct usb_endpoint_descriptor uvc_fs_streaming_ep = { .bEndpointAddress = USB_DIR_IN, .bmAttributes = USB_ENDPOINT_SYNC_ASYNC | USB_ENDPOINT_XFER_ISOC, - /* The wMaxPacketSize and bInterval values will be initialized from + /* + * The wMaxPacketSize and bInterval values will be initialized from * module parameters. */ }; @@ -153,7 +150,8 @@ static struct usb_endpoint_descriptor uvc_hs_streaming_ep = { .bEndpointAddress = USB_DIR_IN, .bmAttributes = USB_ENDPOINT_SYNC_ASYNC | USB_ENDPOINT_XFER_ISOC, - /* The wMaxPacketSize and bInterval values will be initialized from + /* + * The wMaxPacketSize and bInterval values will be initialized from * module parameters. */ }; @@ -165,7 +163,8 @@ static struct usb_endpoint_descriptor uvc_ss_streaming_ep = { .bEndpointAddress = USB_DIR_IN, .bmAttributes = USB_ENDPOINT_SYNC_ASYNC | USB_ENDPOINT_XFER_ISOC, - /* The wMaxPacketSize and bInterval values will be initialized from + /* + * The wMaxPacketSize and bInterval values will be initialized from * module parameters. */ }; @@ -173,7 +172,8 @@ static struct usb_endpoint_descriptor uvc_ss_streaming_ep = { static struct usb_ss_ep_comp_descriptor uvc_ss_streaming_comp = { .bLength = sizeof(uvc_ss_streaming_comp), .bDescriptorType = USB_DT_SS_ENDPOINT_COMP, - /* The bMaxBurst, bmAttributes and wBytesPerInterval values will be + /* + * The bMaxBurst, bmAttributes and wBytesPerInterval values will be * initialized from module parameters. */ }; @@ -226,6 +226,8 @@ uvc_function_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl) struct uvc_device *uvc = to_uvc(f); struct v4l2_event v4l2_event; struct uvc_event *uvc_event = (void *)&v4l2_event.u.data; + unsigned int interface = le16_to_cpu(ctrl->wIndex) & 0xff; + struct usb_ctrlrequest *mctrl; if ((ctrl->bRequestType & USB_TYPE_MASK) != USB_TYPE_CLASS) { uvcg_info(f, "invalid request type\n"); @@ -236,7 +238,8 @@ uvc_function_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl) if (le16_to_cpu(ctrl->wLength) > UVC_MAX_REQUEST_SIZE) return -EINVAL; - /* Tell the complete callback to generate an event for the next request + /* + * Tell the complete callback to generate an event for the next request * that will be enqueued by UVCIOC_SEND_RESPONSE. 
*/ uvc->event_setup_out = !(ctrl->bRequestType & USB_DIR_IN); @@ -245,6 +248,16 @@ uvc_function_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl) memset(&v4l2_event, 0, sizeof(v4l2_event)); v4l2_event.type = UVC_EVENT_SETUP; memcpy(&uvc_event->req, ctrl, sizeof(uvc_event->req)); + + /* check for the interface number, fixup the interface number in + * the ctrl request so the userspace doesn't have to bother with + * offset and configfs parsing + */ + mctrl = &uvc_event->req; + mctrl->wIndex &= ~cpu_to_le16(0xff); + if (interface == uvc->streaming_intf) + mctrl->wIndex = cpu_to_le16(UVC_STRING_STREAMING_IDX); + v4l2_event_queue(&uvc->vdev, &v4l2_event); return 0; @@ -418,6 +431,7 @@ uvc_register_video(struct uvc_device *uvc) int ret; /* TODO reference counting. */ + memset(&uvc->vdev, 0, sizeof(uvc->vdev)); uvc->vdev.v4l2_dev = &uvc->v4l2_dev; uvc->vdev.v4l2_dev->dev = &cdev->gadget->dev; uvc->vdev.fops = &uvc_v4l2_fops; @@ -426,7 +440,7 @@ uvc_register_video(struct uvc_device *uvc) uvc->vdev.vfl_dir = VFL_DIR_TX; uvc->vdev.lock = &uvc->video.mutex; uvc->vdev.device_caps = V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_STREAMING; - strlcpy(uvc->vdev.name, cdev->gadget->name, sizeof(uvc->vdev.name)); + strscpy(uvc->vdev.name, cdev->gadget->name, sizeof(uvc->vdev.name)); video_set_drvdata(&uvc->vdev, uvc); @@ -501,7 +515,8 @@ uvc_copy_descriptors(struct uvc_device *uvc, enum usb_device_speed speed) if (!uvc_control_desc || !uvc_streaming_cls) return ERR_PTR(-ENODEV); - /* Descriptors layout + /* + * Descriptors layout * * uvc_iad * uvc_control_intf @@ -598,8 +613,7 @@ uvc_function_bind(struct usb_configuration *c, struct usb_function *f) uvcg_info(f, "%s()\n", __func__); opts = fi_to_f_uvc_opts(f->fi); - /* Sanity check the streaming endpoint module parameters. - */ + /* Sanity check the streaming endpoint module parameters. */ opts->streaming_interval = clamp(opts->streaming_interval, 1U, 16U); opts->streaming_maxpacket = clamp(opts->streaming_maxpacket, 1U, 3072U); opts->streaming_maxburst = min(opts->streaming_maxburst, 15U); @@ -612,7 +626,8 @@ uvc_function_bind(struct usb_configuration *c, struct usb_function *f) opts->streaming_maxpacket); } - /* Fill in the FS/HS/SS Video Streaming specific descriptors from the + /* + * Fill in the FS/HS/SS Video Streaming specific descriptors from the * module parameters. 
* * NOTE: We assume that the user knows what they are doing and won't @@ -676,6 +691,7 @@ uvc_function_bind(struct usb_configuration *c, struct usb_function *f) uvc_hs_streaming_ep.bEndpointAddress = uvc->video.ep->address; uvc_ss_streaming_ep.bEndpointAddress = uvc->video.ep->address; + uvc_en_us_strings[UVC_STRING_CONTROL_IDX].s = opts->function_name; us = usb_gstrings_attach(cdev, uvc_function_strings, ARRAY_SIZE(uvc_en_us_strings)); if (IS_ERR(us)) { @@ -866,6 +882,7 @@ static struct usb_function_instance *uvc_alloc_inst(void) opts->streaming_interval = 1; opts->streaming_maxpacket = 1024; + snprintf(opts->function_name, sizeof(opts->function_name), "UVC Camera"); ret = uvcg_attach_configfs(opts); if (ret < 0) { @@ -881,6 +898,7 @@ static void uvc_free(struct usb_function *f) struct uvc_device *uvc = to_uvc(f); struct f_uvc_opts *opts = container_of(f->fi, struct f_uvc_opts, func_inst); + config_item_put(&uvc->header->item); --opts->refcnt; kfree(uvc); } @@ -890,11 +908,16 @@ static void uvc_function_unbind(struct usb_configuration *c, { struct usb_composite_dev *cdev = c->cdev; struct uvc_device *uvc = to_uvc(f); + struct uvc_video *video = &uvc->video; long wait_ret = 1; uvcg_info(f, "%s()\n", __func__); - /* If we know we're connected via v4l2, then there should be a cleanup + if (video->async_wq) + destroy_workqueue(video->async_wq); + + /* + * If we know we're connected via v4l2, then there should be a cleanup * of the device from userspace either via UVC_EVENT_DISCONNECT or * though the video device removal uevent. Allow some time for the * application to close out before things get deleted. @@ -911,7 +934,8 @@ static void uvc_function_unbind(struct usb_configuration *c, v4l2_device_unregister(&uvc->v4l2_dev); if (uvc->func_connected) { - /* Wait for the release to occur to ensure there are no longer any + /* + * Wait for the release to occur to ensure there are no longer any * pending operations that may cause panics when resources are cleaned * up. 
*/ @@ -932,6 +956,7 @@ static struct usb_function *uvc_alloc(struct usb_function_instance *fi) struct uvc_device *uvc; struct f_uvc_opts *opts; struct uvc_descriptor_header **strm_cls; + struct config_item *streaming, *header, *h; uvc = kzalloc(sizeof(*uvc), GFP_KERNEL); if (uvc == NULL) @@ -964,6 +989,28 @@ static struct usb_function *uvc_alloc(struct usb_function_instance *fi) uvc->desc.fs_streaming = opts->fs_streaming; uvc->desc.hs_streaming = opts->hs_streaming; uvc->desc.ss_streaming = opts->ss_streaming; + + streaming = config_group_find_item(&opts->func_inst.group, "streaming"); + if (!streaming) + goto err_config; + + header = config_group_find_item(to_config_group(streaming), "header"); + config_item_put(streaming); + if (!header) + goto err_config; + + h = config_group_find_item(to_config_group(header), "h"); + config_item_put(header); + if (!h) + goto err_config; + + uvc->header = to_uvcg_streaming_header(h); + if (!uvc->header->linked) { + mutex_unlock(&opts->lock); + kfree(uvc); + return ERR_PTR(-EBUSY); + } + ++opts->refcnt; mutex_unlock(&opts->lock); @@ -979,6 +1026,11 @@ static struct usb_function *uvc_alloc(struct usb_function_instance *fi) uvc->func.bind_deactivated = true; return &uvc->func; + +err_config: + mutex_unlock(&opts->lock); + kfree(uvc); + return ERR_PTR(-ENOENT); } DECLARE_USB_FUNCTION_INIT(uvc, uvc_alloc_inst, uvc_alloc); diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c index e122050eebaf..93305b3b0ece 100644 --- a/drivers/usb/gadget/function/storage_common.c +++ b/drivers/usb/gadget/function/storage_common.c @@ -522,3 +522,4 @@ ssize_t fsg_store_inquiry_string(struct fsg_lun *curlun, const char *buf, EXPORT_SYMBOL_GPL(fsg_store_inquiry_string); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index ef253599dcf9..83d137b3f086 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -145,10 +145,10 @@ static void eth_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *p) { struct eth_dev *dev = netdev_priv(net); - strlcpy(p->driver, "g_ether", sizeof(p->driver)); - strlcpy(p->version, UETH__VERSION, sizeof(p->version)); - strlcpy(p->fw_version, dev->gadget->name, sizeof(p->fw_version)); - strlcpy(p->bus_info, dev_name(&dev->gadget->dev), sizeof(p->bus_info)); + strscpy(p->driver, "g_ether", sizeof(p->driver)); + strscpy(p->version, UETH__VERSION, sizeof(p->version)); + strscpy(p->fw_version, dev->gadget->name, sizeof(p->fw_version)); + strscpy(p->bus_info, dev_name(&dev->gadget->dev), sizeof(p->bus_info)); } /* REVISIT can also support: diff --git a/drivers/usb/gadget/function/u_uvc.h b/drivers/usb/gadget/function/u_uvc.h index 9a01a7d4f17f..24b8681b0d6f 100644 --- a/drivers/usb/gadget/function/u_uvc.h +++ b/drivers/usb/gadget/function/u_uvc.h @@ -27,6 +27,7 @@ struct f_uvc_opts { unsigned int control_interface; unsigned int streaming_interface; + char function_name[32]; /* * Control descriptors array pointers for full-/high-speed and diff --git a/drivers/usb/gadget/function/uvc.h b/drivers/usb/gadget/function/uvc.h index d1a4ef74742b..40226b1f7e14 100644 --- a/drivers/usb/gadget/function/uvc.h +++ b/drivers/usb/gadget/function/uvc.h @@ -69,7 +69,7 @@ extern unsigned int uvc_gadget_trace_param; #define UVC_MAX_REQUEST_SIZE 64 #define UVC_MAX_EVENTS 4 -#define UVCG_REQUEST_HEADER_LEN 2 +#define 
UVCG_REQUEST_HEADER_LEN 12 /* ------------------------------------------------------------------------ * Structures @@ -88,6 +88,7 @@ struct uvc_video { struct usb_ep *ep; struct work_struct pump; + struct workqueue_struct *async_wq; /* Frame parameters */ u8 bpp; @@ -133,6 +134,8 @@ struct uvc_device { bool func_connected; wait_queue_head_t func_connected_queue; + struct uvcg_streaming_header *header; + /* Descriptors */ struct { const struct uvc_descriptor_header * const *fs_control; diff --git a/drivers/usb/gadget/function/uvc_configfs.c b/drivers/usb/gadget/function/uvc_configfs.c index 77d64031aa9c..76cb60d13049 100644 --- a/drivers/usb/gadget/function/uvc_configfs.c +++ b/drivers/usb/gadget/function/uvc_configfs.c @@ -10,17 +10,14 @@ * Author: Andrzej Pietrasiewicz */ -#include - -#include "u_uvc.h" #include "uvc_configfs.h" +#include + /* ----------------------------------------------------------------------------- * Global Utility Structures and Macros */ -#define UVCG_STREAMING_CONTROL_SIZE 1 - #define UVC_ATTR(prefix, cname, aname) \ static struct configfs_attribute prefix##attr_##cname = { \ .ca_name = __stringify(aname), \ @@ -49,12 +46,6 @@ static int uvcg_config_compare_u32(const void *l, const void *r) return li < ri ? -1 : li == ri ? 0 : 1; } -static inline struct f_uvc_opts *to_f_uvc_opts(struct config_item *item) -{ - return container_of(to_config_group(item), struct f_uvc_opts, - func_inst.group); -} - struct uvcg_config_group_type { struct config_item_type type; const char *name; @@ -125,19 +116,6 @@ static void uvcg_config_remove_children(struct config_group *group) * control/header */ -DECLARE_UVC_HEADER_DESCRIPTOR(1); - -struct uvcg_control_header { - struct config_item item; - struct UVC_HEADER_DESCRIPTOR(1) desc; - unsigned linked; -}; - -static struct uvcg_control_header *to_uvcg_control_header(struct config_item *item) -{ - return container_of(item, struct uvcg_control_header, item); -} - #define UVCG_CTRL_HDR_ATTR(cname, aname, bits, limit) \ static ssize_t uvcg_control_header_##cname##_show( \ struct config_item *item, char *page) \ @@ -769,24 +747,6 @@ static const char * const uvcg_format_names[] = { "mjpeg", }; -enum uvcg_format_type { - UVCG_UNCOMPRESSED = 0, - UVCG_MJPEG, -}; - -struct uvcg_format { - struct config_group group; - enum uvcg_format_type type; - unsigned linked; - unsigned num_frames; - __u8 bmaControls[UVCG_STREAMING_CONTROL_SIZE]; -}; - -static struct uvcg_format *to_uvcg_format(struct config_item *item) -{ - return container_of(to_config_group(item), struct uvcg_format, group); -} - static ssize_t uvcg_format_bma_controls_show(struct uvcg_format *f, char *page) { struct f_uvc_opts *opts; @@ -845,29 +805,11 @@ end: return ret; } -struct uvcg_format_ptr { - struct uvcg_format *fmt; - struct list_head entry; -}; - /* ----------------------------------------------------------------------------- * streaming/header/ * streaming/header */ -struct uvcg_streaming_header { - struct config_item item; - struct uvc_input_header_descriptor desc; - unsigned linked; - struct list_head formats; - unsigned num_fmt; -}; - -static struct uvcg_streaming_header *to_uvcg_streaming_header(struct config_item *item) -{ - return container_of(item, struct uvcg_streaming_header, item); -} - static void uvcg_format_set_indices(struct config_group *fmt); static int uvcg_streaming_header_allow_link(struct config_item *src, @@ -1059,31 +1001,6 @@ static const struct uvcg_config_group_type uvcg_streaming_header_grp_type = { * streaming/// */ -struct uvcg_frame { - struct 
config_item item; - enum uvcg_format_type fmt_type; - struct { - u8 b_length; - u8 b_descriptor_type; - u8 b_descriptor_subtype; - u8 b_frame_index; - u8 bm_capabilities; - u16 w_width; - u16 w_height; - u32 dw_min_bit_rate; - u32 dw_max_bit_rate; - u32 dw_max_video_frame_buffer_size; - u32 dw_default_frame_interval; - u8 b_frame_interval_type; - } __attribute__((packed)) frame; - u32 *dw_frame_interval; -}; - -static struct uvcg_frame *to_uvcg_frame(struct config_item *item) -{ - return container_of(item, struct uvcg_frame, item); -} - #define UVCG_FRAME_ATTR(cname, aname, bits) \ static ssize_t uvcg_frame_##cname##_show(struct config_item *item, char *page)\ { \ @@ -1345,6 +1262,7 @@ static struct config_item *uvcg_frame_make(struct config_group *group, struct uvcg_format *fmt; struct f_uvc_opts *opts; struct config_item *opts_item; + struct uvcg_frame_ptr *frame_ptr; h = kzalloc(sizeof(*h), GFP_KERNEL); if (!h) @@ -1375,6 +1293,16 @@ static struct config_item *uvcg_frame_make(struct config_group *group, kfree(h); return ERR_PTR(-EINVAL); } + + frame_ptr = kzalloc(sizeof(*frame_ptr), GFP_KERNEL); + if (!frame_ptr) { + mutex_unlock(&opts->lock); + kfree(h); + return ERR_PTR(-ENOMEM); + } + + frame_ptr->frm = h; + list_add_tail(&frame_ptr->entry, &fmt->frames); ++fmt->num_frames; mutex_unlock(&opts->lock); @@ -1388,13 +1316,23 @@ static void uvcg_frame_drop(struct config_group *group, struct config_item *item struct uvcg_format *fmt; struct f_uvc_opts *opts; struct config_item *opts_item; + struct uvcg_frame *target_frm = NULL; + struct uvcg_frame_ptr *frame_ptr, *tmp; opts_item = group->cg_item.ci_parent->ci_parent->ci_parent; opts = to_f_uvc_opts(opts_item); mutex_lock(&opts->lock); + target_frm = container_of(item, struct uvcg_frame, item); fmt = to_uvcg_format(&group->cg_item); - --fmt->num_frames; + + list_for_each_entry_safe(frame_ptr, tmp, &fmt->frames, entry) + if (frame_ptr->frm == target_frm) { + list_del(&frame_ptr->entry); + kfree(frame_ptr); + --fmt->num_frames; + break; + } mutex_unlock(&opts->lock); config_item_put(item); @@ -1420,18 +1358,6 @@ static void uvcg_format_set_indices(struct config_group *fmt) * streaming/uncompressed/ */ -struct uvcg_uncompressed { - struct uvcg_format fmt; - struct uvc_format_uncompressed desc; -}; - -static struct uvcg_uncompressed *to_uvcg_uncompressed(struct config_item *item) -{ - return container_of( - container_of(to_config_group(item), struct uvcg_format, group), - struct uvcg_uncompressed, fmt); -} - static struct configfs_group_operations uvcg_uncompressed_group_ops = { .make_item = uvcg_frame_make, .drop_item = uvcg_frame_drop, @@ -1565,6 +1491,12 @@ uvcg_uncompressed_##cname##_store(struct config_item *item, \ if (ret) \ goto end; \ \ + /* index values in uvc are never 0 */ \ + if (!num) { \ + ret = -EINVAL; \ + goto end; \ + } \ + \ u->desc.aname = num; \ ret = len; \ end: \ @@ -1580,7 +1512,7 @@ UVCG_UNCOMPRESSED_ATTR(b_bits_per_pixel, bBitsPerPixel, 8); UVCG_UNCOMPRESSED_ATTR(b_default_frame_index, bDefaultFrameIndex, 8); UVCG_UNCOMPRESSED_ATTR_RO(b_aspect_ratio_x, bAspectRatioX, 8); UVCG_UNCOMPRESSED_ATTR_RO(b_aspect_ratio_y, bAspectRatioY, 8); -UVCG_UNCOMPRESSED_ATTR_RO(bm_interface_flags, bmInterfaceFlags, 8); +UVCG_UNCOMPRESSED_ATTR_RO(bm_interlace_flags, bmInterlaceFlags, 8); #undef UVCG_UNCOMPRESSED_ATTR #undef UVCG_UNCOMPRESSED_ATTR_RO @@ -1609,7 +1541,7 @@ static struct configfs_attribute *uvcg_uncompressed_attrs[] = { &uvcg_uncompressed_attr_b_default_frame_index, &uvcg_uncompressed_attr_b_aspect_ratio_x, 
&uvcg_uncompressed_attr_b_aspect_ratio_y, - &uvcg_uncompressed_attr_bm_interface_flags, + &uvcg_uncompressed_attr_bm_interlace_flags, &uvcg_uncompressed_attr_bma_controls, NULL, }; @@ -1642,9 +1574,10 @@ static struct config_group *uvcg_uncompressed_make(struct config_group *group, h->desc.bDefaultFrameIndex = 1; h->desc.bAspectRatioX = 0; h->desc.bAspectRatioY = 0; - h->desc.bmInterfaceFlags = 0; + h->desc.bmInterlaceFlags = 0; h->desc.bCopyProtect = 0; + INIT_LIST_HEAD(&h->fmt.frames); h->fmt.type = UVCG_UNCOMPRESSED; config_group_init_type_name(&h->fmt.group, name, &uvcg_uncompressed_type); @@ -1669,18 +1602,6 @@ static const struct uvcg_config_group_type uvcg_uncompressed_grp_type = { * streaming/mjpeg/ */ -struct uvcg_mjpeg { - struct uvcg_format fmt; - struct uvc_format_mjpeg desc; -}; - -static struct uvcg_mjpeg *to_uvcg_mjpeg(struct config_item *item) -{ - return container_of( - container_of(to_config_group(item), struct uvcg_format, group), - struct uvcg_mjpeg, fmt); -} - static struct configfs_group_operations uvcg_mjpeg_group_ops = { .make_item = uvcg_frame_make, .drop_item = uvcg_frame_drop, @@ -1758,6 +1679,12 @@ uvcg_mjpeg_##cname##_store(struct config_item *item, \ if (ret) \ goto end; \ \ + /* index values in uvc are never 0 */ \ + if (!num) { \ + ret = -EINVAL; \ + goto end; \ + } \ + \ u->desc.aname = num; \ ret = len; \ end: \ @@ -1773,7 +1700,7 @@ UVCG_MJPEG_ATTR(b_default_frame_index, bDefaultFrameIndex, 8); UVCG_MJPEG_ATTR_RO(bm_flags, bmFlags, 8); UVCG_MJPEG_ATTR_RO(b_aspect_ratio_x, bAspectRatioX, 8); UVCG_MJPEG_ATTR_RO(b_aspect_ratio_y, bAspectRatioY, 8); -UVCG_MJPEG_ATTR_RO(bm_interface_flags, bmInterfaceFlags, 8); +UVCG_MJPEG_ATTR_RO(bm_interlace_flags, bmInterlaceFlags, 8); #undef UVCG_MJPEG_ATTR #undef UVCG_MJPEG_ATTR_RO @@ -1801,7 +1728,7 @@ static struct configfs_attribute *uvcg_mjpeg_attrs[] = { &uvcg_mjpeg_attr_bm_flags, &uvcg_mjpeg_attr_b_aspect_ratio_x, &uvcg_mjpeg_attr_b_aspect_ratio_y, - &uvcg_mjpeg_attr_bm_interface_flags, + &uvcg_mjpeg_attr_bm_interlace_flags, &uvcg_mjpeg_attr_bma_controls, NULL, }; @@ -1828,9 +1755,10 @@ static struct config_group *uvcg_mjpeg_make(struct config_group *group, h->desc.bDefaultFrameIndex = 1; h->desc.bAspectRatioX = 0; h->desc.bAspectRatioY = 0; - h->desc.bmInterfaceFlags = 0; + h->desc.bmInterlaceFlags = 0; h->desc.bCopyProtect = 0; + INIT_LIST_HEAD(&h->fmt.frames); h->fmt.type = UVCG_MJPEG; config_group_init_type_name(&h->fmt.group, name, &uvcg_mjpeg_type); @@ -2425,10 +2353,53 @@ UVCG_OPTS_ATTR(streaming_maxburst, streaming_maxburst, 15); #undef UVCG_OPTS_ATTR +#define UVCG_OPTS_STRING_ATTR(cname, aname) \ +static ssize_t f_uvc_opts_string_##cname##_show(struct config_item *item,\ + char *page) \ +{ \ + struct f_uvc_opts *opts = to_f_uvc_opts(item); \ + int result; \ + \ + mutex_lock(&opts->lock); \ + result = snprintf(page, sizeof(opts->aname), "%s", opts->aname);\ + mutex_unlock(&opts->lock); \ + \ + return result; \ +} \ + \ +static ssize_t f_uvc_opts_string_##cname##_store(struct config_item *item,\ + const char *page, size_t len) \ +{ \ + struct f_uvc_opts *opts = to_f_uvc_opts(item); \ + int size = min(sizeof(opts->aname), len + 1); \ + int ret = 0; \ + \ + mutex_lock(&opts->lock); \ + if (opts->refcnt) { \ + ret = -EBUSY; \ + goto end; \ + } \ + \ + ret = strscpy(opts->aname, page, size); \ + if (ret == -E2BIG) \ + ret = size - 1; \ + \ +end: \ + mutex_unlock(&opts->lock); \ + return ret; \ +} \ + \ +UVC_ATTR(f_uvc_opts_string_, cname, aname) + +UVCG_OPTS_STRING_ATTR(function_name, function_name); + +#undef 
UVCG_OPTS_STRING_ATTR + static struct configfs_attribute *uvc_attrs[] = { &f_uvc_opts_attr_streaming_interval, &f_uvc_opts_attr_streaming_maxpacket, &f_uvc_opts_attr_streaming_maxburst, + &f_uvc_opts_string_attr_function_name, NULL, }; diff --git a/drivers/usb/gadget/function/uvc_configfs.h b/drivers/usb/gadget/function/uvc_configfs.h index 7e1d7ca29bf2..ad2ec8c4c78c 100644 --- a/drivers/usb/gadget/function/uvc_configfs.h +++ b/drivers/usb/gadget/function/uvc_configfs.h @@ -12,7 +12,125 @@ #ifndef UVC_CONFIGFS_H #define UVC_CONFIGFS_H -struct f_uvc_opts; +#include + +#include "u_uvc.h" + +static inline struct f_uvc_opts *to_f_uvc_opts(struct config_item *item) +{ + return container_of(to_config_group(item), struct f_uvc_opts, + func_inst.group); +} + +#define UVCG_STREAMING_CONTROL_SIZE 1 + +DECLARE_UVC_HEADER_DESCRIPTOR(1); + +struct uvcg_control_header { + struct config_item item; + struct UVC_HEADER_DESCRIPTOR(1) desc; + unsigned linked; +}; + +static inline struct uvcg_control_header *to_uvcg_control_header(struct config_item *item) +{ + return container_of(item, struct uvcg_control_header, item); +} + +enum uvcg_format_type { + UVCG_UNCOMPRESSED = 0, + UVCG_MJPEG, +}; + +struct uvcg_format { + struct config_group group; + enum uvcg_format_type type; + unsigned linked; + struct list_head frames; + unsigned num_frames; + __u8 bmaControls[UVCG_STREAMING_CONTROL_SIZE]; +}; + +struct uvcg_format_ptr { + struct uvcg_format *fmt; + struct list_head entry; +}; + +static inline struct uvcg_format *to_uvcg_format(struct config_item *item) +{ + return container_of(to_config_group(item), struct uvcg_format, group); +} + +struct uvcg_streaming_header { + struct config_item item; + struct uvc_input_header_descriptor desc; + unsigned linked; + struct list_head formats; + unsigned num_fmt; +}; + +static inline struct uvcg_streaming_header *to_uvcg_streaming_header(struct config_item *item) +{ + return container_of(item, struct uvcg_streaming_header, item); +} + +struct uvcg_frame_ptr { + struct uvcg_frame *frm; + struct list_head entry; +}; + +struct uvcg_frame { + struct config_item item; + enum uvcg_format_type fmt_type; + struct { + u8 b_length; + u8 b_descriptor_type; + u8 b_descriptor_subtype; + u8 b_frame_index; + u8 bm_capabilities; + u16 w_width; + u16 w_height; + u32 dw_min_bit_rate; + u32 dw_max_bit_rate; + u32 dw_max_video_frame_buffer_size; + u32 dw_default_frame_interval; + u8 b_frame_interval_type; + } __attribute__((packed)) frame; + u32 *dw_frame_interval; +}; + +static inline struct uvcg_frame *to_uvcg_frame(struct config_item *item) +{ + return container_of(item, struct uvcg_frame, item); +} + +/* ----------------------------------------------------------------------------- + * streaming/uncompressed/ + */ + +struct uvcg_uncompressed { + struct uvcg_format fmt; + struct uvc_format_uncompressed desc; +}; + +static inline struct uvcg_uncompressed *to_uvcg_uncompressed(struct config_item *item) +{ + return container_of(to_uvcg_format(item), struct uvcg_uncompressed, fmt); +} + +/* ----------------------------------------------------------------------------- + * streaming/mjpeg/ + */ + +struct uvcg_mjpeg { + struct uvcg_format fmt; + struct uvc_format_mjpeg desc; +}; + +static inline struct uvcg_mjpeg *to_uvcg_mjpeg(struct config_item *item) +{ + return container_of(to_uvcg_format(item), struct uvcg_mjpeg, fmt); +} int uvcg_attach_configfs(struct f_uvc_opts *opts); diff --git a/drivers/usb/gadget/function/uvc_queue.c b/drivers/usb/gadget/function/uvc_queue.c index 
0cc8422afe4e..0aa3d7e1f3cc 100644 --- a/drivers/usb/gadget/function/uvc_queue.c +++ b/drivers/usb/gadget/function/uvc_queue.c @@ -111,7 +111,8 @@ static void uvc_buffer_queue(struct vb2_buffer *vb) if (likely(!(queue->flags & UVC_QUEUE_DISCONNECTED))) { list_add_tail(&buf->queue, &queue->irqqueue); } else { - /* If the device is disconnected return the buffer to userspace + /* + * If the device is disconnected return the buffer to userspace * directly. The next QBUF call will fail with -ENODEV. */ buf->state = UVC_BUF_STATE_ERROR; @@ -149,7 +150,7 @@ int uvcg_queue_init(struct uvc_video_queue *queue, struct device *dev, enum v4l2 queue->queue.mem_ops = &vb2_vmalloc_memops; } - queue->queue.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC + queue->queue.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY | V4L2_BUF_FLAG_TSTAMP_SRC_EOF; queue->queue.dev = dev; @@ -192,18 +193,7 @@ int uvcg_query_buffer(struct uvc_video_queue *queue, struct v4l2_buffer *buf) int uvcg_queue_buffer(struct uvc_video_queue *queue, struct v4l2_buffer *buf) { - unsigned long flags; - int ret; - - ret = vb2_qbuf(&queue->queue, NULL, buf); - if (ret < 0) - return ret; - - spin_lock_irqsave(&queue->irqlock, flags); - ret = (queue->flags & UVC_QUEUE_PAUSED) != 0; - queue->flags &= ~UVC_QUEUE_PAUSED; - spin_unlock_irqrestore(&queue->irqlock, flags); - return ret; + return vb2_qbuf(&queue->queue, NULL, buf); } /* @@ -273,7 +263,8 @@ void uvcg_queue_cancel(struct uvc_video_queue *queue, int disconnect) } queue->buf_used = 0; - /* This must be protected by the irqlock spinlock to avoid race + /* + * This must be protected by the irqlock spinlock to avoid race * conditions between uvc_queue_buffer and the disconnection event that * could result in an interruptible wait in uvc_dequeue_buffer. 
Do not * blindly replace this logic by checking for the UVC_DEV_DISCONNECTED @@ -362,8 +353,6 @@ struct uvc_buffer *uvcg_queue_head(struct uvc_video_queue *queue) if (!list_empty(&queue->irqqueue)) buf = list_first_entry(&queue->irqqueue, struct uvc_buffer, queue); - else - queue->flags |= UVC_QUEUE_PAUSED; return buf; } diff --git a/drivers/usb/gadget/function/uvc_queue.h b/drivers/usb/gadget/function/uvc_queue.h index b668927b5d2c..41f87b917f6b 100644 --- a/drivers/usb/gadget/function/uvc_queue.h +++ b/drivers/usb/gadget/function/uvc_queue.h @@ -43,7 +43,6 @@ struct uvc_buffer { #define UVC_QUEUE_DISCONNECTED (1 << 0) #define UVC_QUEUE_DROP_INCOMPLETE (1 << 1) -#define UVC_QUEUE_PAUSED (1 << 2) struct uvc_video_queue { struct vb2_queue queue; diff --git a/drivers/usb/gadget/function/uvc_v4l2.c b/drivers/usb/gadget/function/uvc_v4l2.c index 65abd55ce234..a189b08bba80 100644 --- a/drivers/usb/gadget/function/uvc_v4l2.c +++ b/drivers/usb/gadget/function/uvc_v4l2.c @@ -18,12 +18,161 @@ #include #include #include +#include #include "f_uvc.h" #include "uvc.h" #include "uvc_queue.h" #include "uvc_video.h" #include "uvc_v4l2.h" +#include "uvc_configfs.h" + +static struct uvc_format_desc *to_uvc_format(struct uvcg_format *uformat) +{ + char guid[16] = UVC_GUID_FORMAT_MJPEG; + struct uvc_format_desc *format; + struct uvcg_uncompressed *unc; + + if (uformat->type == UVCG_UNCOMPRESSED) { + unc = to_uvcg_uncompressed(&uformat->group.cg_item); + if (!unc) + return ERR_PTR(-EINVAL); + + memcpy(guid, unc->desc.guidFormat, sizeof(guid)); + } + + format = uvc_format_by_guid(guid); + if (!format) + return ERR_PTR(-EINVAL); + + return format; +} + +static int uvc_v4l2_get_bytesperline(struct uvcg_format *uformat, + struct uvcg_frame *uframe) +{ + struct uvcg_uncompressed *u; + + if (uformat->type == UVCG_UNCOMPRESSED) { + u = to_uvcg_uncompressed(&uformat->group.cg_item); + if (!u) + return 0; + + return u->desc.bBitsPerPixel * uframe->frame.w_width / 8; + } + + return 0; +} + +static int uvc_get_frame_size(struct uvcg_format *uformat, + struct uvcg_frame *uframe) +{ + unsigned int bpl = uvc_v4l2_get_bytesperline(uformat, uframe); + + return bpl ? 
bpl * uframe->frame.w_height : + uframe->frame.dw_max_video_frame_buffer_size; +} + +static struct uvcg_format *find_format_by_index(struct uvc_device *uvc, int index) +{ + struct uvcg_format_ptr *format; + struct uvcg_format *uformat = NULL; + int i = 1; + + list_for_each_entry(format, &uvc->header->formats, entry) { + if (index == i) { + uformat = format->fmt; + break; + } + i++; + } + + return uformat; +} + +static struct uvcg_frame *find_frame_by_index(struct uvc_device *uvc, + struct uvcg_format *uformat, + int index) +{ + struct uvcg_format_ptr *format; + struct uvcg_frame_ptr *frame; + struct uvcg_frame *uframe = NULL; + + list_for_each_entry(format, &uvc->header->formats, entry) { + if (format->fmt->type != uformat->type) + continue; + list_for_each_entry(frame, &format->fmt->frames, entry) { + if (index == frame->frm->frame.b_frame_index) { + uframe = frame->frm; + break; + } + } + } + + return uframe; +} + +static struct uvcg_format *find_format_by_pix(struct uvc_device *uvc, + u32 pixelformat) +{ + struct uvcg_format_ptr *format; + struct uvcg_format *uformat = NULL; + + list_for_each_entry(format, &uvc->header->formats, entry) { + struct uvc_format_desc *fmtdesc = to_uvc_format(format->fmt); + + if (fmtdesc->fcc == pixelformat) { + uformat = format->fmt; + break; + } + } + + return uformat; +} + +static struct uvcg_frame *find_closest_frame_by_size(struct uvc_device *uvc, + struct uvcg_format *uformat, + u16 rw, u16 rh) +{ + struct uvc_video *video = &uvc->video; + struct uvcg_format_ptr *format; + struct uvcg_frame_ptr *frame; + struct uvcg_frame *uframe = NULL; + unsigned int d, maxd; + + /* Find the closest image size. The distance between image sizes is + * the size in pixels of the non-overlapping regions between the + * requested size and the frame-specified size. 
+ */ + maxd = (unsigned int)-1; + + list_for_each_entry(format, &uvc->header->formats, entry) { + if (format->fmt->type != uformat->type) + continue; + + list_for_each_entry(frame, &format->fmt->frames, entry) { + u16 w, h; + + w = frame->frm->frame.w_width; + h = frame->frm->frame.w_height; + + d = min(w, rw) * min(h, rh); + d = w*h + rw*rh - 2*d; + if (d < maxd) { + maxd = d; + uframe = frame->frm; + } + + if (maxd == 0) + break; + } + } + + if (!uframe) + uvcg_dbg(&video->uvc->func, "Unsupported size %ux%u\n", rw, rh); + + return uframe; +} /* -------------------------------------------------------------------------- * Requests handling @@ -50,16 +199,6 @@ uvc_send_response(struct uvc_device *uvc, struct uvc_request_data *data) * V4L2 ioctls */ -struct uvc_format { - u8 bpp; - u32 fcc; -}; - -static struct uvc_format uvc_formats[] = { - { 16, V4L2_PIX_FMT_YUYV }, - { 0, V4L2_PIX_FMT_MJPEG }, -}; - static int uvc_v4l2_querycap(struct file *file, void *fh, struct v4l2_capability *cap) { @@ -67,9 +206,9 @@ uvc_v4l2_querycap(struct file *file, void *fh, struct v4l2_capability *cap) struct uvc_device *uvc = video_get_drvdata(vdev); struct usb_composite_dev *cdev = uvc->func.config->cdev; - strlcpy(cap->driver, "g_uvc", sizeof(cap->driver)); - strlcpy(cap->card, cdev->gadget->name, sizeof(cap->card)); - strlcpy(cap->bus_info, dev_name(&cdev->gadget->dev), + strscpy(cap->driver, "g_uvc", sizeof(cap->driver)); + strscpy(cap->card, cdev->gadget->name, sizeof(cap->card)); + strscpy(cap->bus_info, dev_name(&cdev->gadget->dev), sizeof(cap->bus_info)); return 0; } @@ -93,43 +232,156 @@ uvc_v4l2_get_format(struct file *file, void *fh, struct v4l2_format *fmt) return 0; } +static int +uvc_v4l2_try_format(struct file *file, void *fh, struct v4l2_format *fmt) +{ + struct video_device *vdev = video_devdata(file); + struct uvc_device *uvc = video_get_drvdata(vdev); + struct uvc_video *video = &uvc->video; + struct uvcg_format *uformat; + struct uvcg_frame *uframe; + u8 *fcc; + + if (fmt->type != video->queue.queue.type) + return -EINVAL; + + fcc = (u8 *)&fmt->fmt.pix.pixelformat; + uvcg_dbg(&uvc->func, "Trying format 0x%08x (%c%c%c%c): %ux%u\n", + fmt->fmt.pix.pixelformat, + fcc[0], fcc[1], fcc[2], fcc[3], + fmt->fmt.pix.width, fmt->fmt.pix.height); + + uformat = find_format_by_pix(uvc, fmt->fmt.pix.pixelformat); + if (!uformat) + return -EINVAL; + + uframe = find_closest_frame_by_size(uvc, uformat, + fmt->fmt.pix.width, fmt->fmt.pix.height); + if (!uframe) + return -EINVAL; + + fmt->fmt.pix.width = uframe->frame.w_width; + fmt->fmt.pix.height = uframe->frame.w_height; + fmt->fmt.pix.field = V4L2_FIELD_NONE; + fmt->fmt.pix.bytesperline = uvc_v4l2_get_bytesperline(uformat, uframe); + fmt->fmt.pix.sizeimage = uvc_get_frame_size(uformat, uframe); + fmt->fmt.pix.pixelformat = to_uvc_format(uformat)->fcc; + fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; + fmt->fmt.pix.priv = 0; + + return 0; +} + static int uvc_v4l2_set_format(struct file *file, void *fh, struct v4l2_format *fmt) { struct video_device *vdev = video_devdata(file); struct uvc_device *uvc = video_get_drvdata(vdev); struct uvc_video *video = &uvc->video; - struct uvc_format *format; - unsigned int imagesize; - unsigned int bpl; - unsigned int i; + int ret; - for (i = 0; i < ARRAY_SIZE(uvc_formats); ++i) { - format = &uvc_formats[i]; - if (format->fcc == fmt->fmt.pix.pixelformat) - break; - } + ret = uvc_v4l2_try_format(file, fh, fmt); + if (ret) + return ret; - if (i == ARRAY_SIZE(uvc_formats)) { - uvcg_info(&uvc->func, "Unsupported format 
0x%08x.\n", - fmt->fmt.pix.pixelformat); - return -EINVAL; - } - - bpl = format->bpp * fmt->fmt.pix.width / 8; - imagesize = bpl ? bpl * fmt->fmt.pix.height : fmt->fmt.pix.sizeimage; - - video->fcc = format->fcc; - video->bpp = format->bpp; + video->fcc = fmt->fmt.pix.pixelformat; + video->bpp = fmt->fmt.pix.bytesperline * 8 / video->width; video->width = fmt->fmt.pix.width; video->height = fmt->fmt.pix.height; - video->imagesize = imagesize; + video->imagesize = fmt->fmt.pix.sizeimage; - fmt->fmt.pix.field = V4L2_FIELD_NONE; - fmt->fmt.pix.bytesperline = bpl; - fmt->fmt.pix.sizeimage = imagesize; - fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; - fmt->fmt.pix.priv = 0; + return ret; +} + +static int +uvc_v4l2_enum_frameintervals(struct file *file, void *fh, + struct v4l2_frmivalenum *fival) +{ + struct video_device *vdev = video_devdata(file); + struct uvc_device *uvc = video_get_drvdata(vdev); + struct uvcg_format *uformat = NULL; + struct uvcg_frame *uframe = NULL; + struct uvcg_frame_ptr *frame; + + uformat = find_format_by_pix(uvc, fival->pixel_format); + if (!uformat) + return -EINVAL; + + list_for_each_entry(frame, &uformat->frames, entry) { + if (frame->frm->frame.w_width == fival->width && + frame->frm->frame.w_height == fival->height) { + uframe = frame->frm; + break; + } + } + if (!uframe) + return -EINVAL; + + if (fival->index >= uframe->frame.b_frame_interval_type) + return -EINVAL; + + fival->discrete.numerator = + uframe->dw_frame_interval[fival->index]; + + /* TODO: handle V4L2_FRMIVAL_TYPE_STEPWISE */ + fival->type = V4L2_FRMIVAL_TYPE_DISCRETE; + fival->discrete.denominator = 10000000; + v4l2_simplify_fraction(&fival->discrete.numerator, + &fival->discrete.denominator, 8, 333); + + return 0; +} + +static int +uvc_v4l2_enum_framesizes(struct file *file, void *fh, + struct v4l2_frmsizeenum *fsize) +{ + struct video_device *vdev = video_devdata(file); + struct uvc_device *uvc = video_get_drvdata(vdev); + struct uvcg_format *uformat = NULL; + struct uvcg_frame *uframe = NULL; + + uformat = find_format_by_pix(uvc, fsize->pixel_format); + if (!uformat) + return -EINVAL; + + if (fsize->index >= uformat->num_frames) + return -EINVAL; + + uframe = find_frame_by_index(uvc, uformat, fsize->index + 1); + if (!uframe) + return -EINVAL; + + fsize->type = V4L2_FRMSIZE_TYPE_DISCRETE; + fsize->discrete.width = uframe->frame.w_width; + fsize->discrete.height = uframe->frame.w_height; + + return 0; +} + +static int +uvc_v4l2_enum_format(struct file *file, void *fh, struct v4l2_fmtdesc *f) +{ + struct video_device *vdev = video_devdata(file); + struct uvc_device *uvc = video_get_drvdata(vdev); + struct uvc_format_desc *fmtdesc; + struct uvcg_format *uformat; + + if (f->index >= uvc->header->num_fmt) + return -EINVAL; + + uformat = find_format_by_index(uvc, f->index + 1); + if (!uformat) + return -EINVAL; + + if (uformat->type != UVCG_UNCOMPRESSED) + f->flags |= V4L2_FMT_FLAG_COMPRESSED; + + fmtdesc = to_uvc_format(uformat); + f->pixelformat = fmtdesc->fcc; + + strscpy(f->description, fmtdesc->name, sizeof(f->description)); + f->description[strlen(fmtdesc->name) - 1] = 0; return 0; } @@ -169,7 +421,8 @@ uvc_v4l2_qbuf(struct file *file, void *fh, struct v4l2_buffer *b) if (ret < 0) return ret; - schedule_work(&video->pump); + if (uvc->state == UVC_STATE_STREAMING) + queue_work(video->async_wq, &video->pump); return ret; } @@ -297,8 +550,12 @@ uvc_v4l2_ioctl_default(struct file *file, void *fh, bool valid_prio, const struct v4l2_ioctl_ops uvc_v4l2_ioctl_ops = { .vidioc_querycap = 
uvc_v4l2_querycap, + .vidioc_try_fmt_vid_out = uvc_v4l2_try_format, .vidioc_g_fmt_vid_out = uvc_v4l2_get_format, .vidioc_s_fmt_vid_out = uvc_v4l2_set_format, + .vidioc_enum_frameintervals = uvc_v4l2_enum_frameintervals, + .vidioc_enum_framesizes = uvc_v4l2_enum_framesizes, + .vidioc_enum_fmt_vid_out = uvc_v4l2_enum_format, .vidioc_reqbufs = uvc_v4l2_reqbufs, .vidioc_querybuf = uvc_v4l2_querybuf, .vidioc_qbuf = uvc_v4l2_qbuf, diff --git a/drivers/usb/gadget/function/uvc_video.c b/drivers/usb/gadget/function/uvc_video.c index 0de7f11d1425..dd1c6b2ca7c6 100644 --- a/drivers/usb/gadget/function/uvc_video.c +++ b/drivers/usb/gadget/function/uvc_video.c @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -27,13 +28,41 @@ static int uvc_video_encode_header(struct uvc_video *video, struct uvc_buffer *buf, u8 *data, int len) { - data[0] = UVCG_REQUEST_HEADER_LEN; + struct uvc_device *uvc = container_of(video, struct uvc_device, video); + struct usb_composite_dev *cdev = uvc->func.config->cdev; + struct timespec64 ts = ns_to_timespec64(buf->buf.vb2_buf.timestamp); + int pos = 2; + data[1] = UVC_STREAM_EOH | video->fid; - if (buf->bytesused - video->queue.buf_used <= len - UVCG_REQUEST_HEADER_LEN) + if (video->queue.buf_used == 0 && ts.tv_sec) { + /* dwClockFrequency is 48 MHz */ + u32 pts = ((u64)ts.tv_sec * USEC_PER_SEC + ts.tv_nsec / NSEC_PER_USEC) * 48; + + data[1] |= UVC_STREAM_PTS; + put_unaligned_le32(pts, &data[pos]); + pos += 4; + } + + if (cdev->gadget->ops->get_frame) { + u32 sof, stc; + + sof = usb_gadget_frame_number(cdev->gadget); + ktime_get_ts64(&ts); + stc = ((u64)ts.tv_sec * USEC_PER_SEC + ts.tv_nsec / NSEC_PER_USEC) * 48; + + data[1] |= UVC_STREAM_SCR; + put_unaligned_le32(stc, &data[pos]); + put_unaligned_le16(sof, &data[pos+4]); + pos += 6; + } + + data[0] = pos; + + if (buf->bytesused - video->queue.buf_used <= len - pos) data[1] |= UVC_STREAM_EOF; - return UVCG_REQUEST_HEADER_LEN; + return pos; } static int @@ -207,9 +236,12 @@ static int uvcg_video_ep_queue(struct uvc_video *video, struct usb_request *req) uvcg_err(&video->uvc->func, "Failed to queue request (%d).\n", ret); - /* Isochronous endpoints can't be halted. */ - if (usb_endpoint_xfer_bulk(video->ep->desc)) - usb_ep_set_halt(video->ep); + /* If the endpoint is disabled the descriptor may be NULL. */ + if (video->ep->desc) { + /* Isochronous endpoints can't be halted. 
*/ + if (usb_endpoint_xfer_bulk(video->ep->desc)) + usb_ep_set_halt(video->ep); + } } return ret; @@ -221,6 +253,7 @@ uvc_video_complete(struct usb_ep *ep, struct usb_request *req) struct uvc_request *ureq = req->context; struct uvc_video *video = ureq->video; struct uvc_video_queue *queue = &video->queue; + struct uvc_device *uvc = video->uvc; unsigned long flags; switch (req->status) { @@ -253,7 +286,8 @@ uvc_video_complete(struct usb_ep *ep, struct usb_request *req) list_add_tail(&req->list, &video->req_free); spin_unlock_irqrestore(&video->req_lock, flags); - schedule_work(&video->pump); + if (uvc->state == UVC_STATE_STREAMING) + queue_work(video->async_wq, &video->pump); } static int @@ -348,13 +382,14 @@ static void uvcg_video_pump(struct work_struct *work) { struct uvc_video *video = container_of(work, struct uvc_video, pump); struct uvc_video_queue *queue = &video->queue; - struct usb_request *req; + struct usb_request *req = NULL; struct uvc_buffer *buf; unsigned long flags; int ret; - while (1) { - /* Retrieve the first available USB request, protected by the + while (video->ep->enabled) { + /* + * Retrieve the first available USB request, protected by the * request lock. */ spin_lock_irqsave(&video->req_lock, flags); @@ -367,7 +402,8 @@ static void uvcg_video_pump(struct work_struct *work) list_del(&req->list); spin_unlock_irqrestore(&video->req_lock, flags); - /* Retrieve the first available video buffer and fill the + /* + * Retrieve the first available video buffer and fill the * request, protected by the video queue irqlock. */ spin_lock_irqsave(&queue->irqlock, flags); @@ -379,9 +415,11 @@ static void uvcg_video_pump(struct work_struct *work) video->encode(req, video, buf); - /* With usb3 we have more requests. This will decrease the + /* + * With usb3 we have more requests. This will decrease the * interrupt load to a quarter but also catches the corner - * cases, which needs to be handled */ + * cases, which needs to be handled. + */ if (list_empty(&video->req_free) || buf->state == UVC_BUF_STATE_DONE || !(video->req_int_count % @@ -400,9 +438,16 @@ static void uvcg_video_pump(struct work_struct *work) uvcg_queue_cancel(queue, 0); break; } - video->req_int_count++; + + /* Endpoint now owns the request */ + req = NULL; + if (buf->state != UVC_BUF_STATE_DONE) + video->req_int_count++; } + if (!req) + return; + spin_lock_irqsave(&video->req_lock, flags); list_add_tail(&req->list, &video->req_free); spin_unlock_irqrestore(&video->req_lock, flags); @@ -451,7 +496,7 @@ int uvcg_video_enable(struct uvc_video *video, int enable) video->req_int_count = 0; - schedule_work(&video->pump); + queue_work(video->async_wq, &video->pump); return ret; } @@ -465,6 +510,11 @@ int uvcg_video_init(struct uvc_video *video, struct uvc_device *uvc) spin_lock_init(&video->req_lock); INIT_WORK(&video->pump, uvcg_video_pump); + /* Allocate a work queue for asynchronous video pump handler. 
*/ + video->async_wq = alloc_workqueue("uvcgadget", WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!video->async_wq) + return -EINVAL; + video->uvc = uvc; video->fcc = V4L2_PIX_FMT_YUYV; video->bpp = 16; diff --git a/drivers/usb/gadget/legacy/webcam.c b/drivers/usb/gadget/legacy/webcam.c index e9b5846b2322..c06dd1af7a0c 100644 --- a/drivers/usb/gadget/legacy/webcam.c +++ b/drivers/usb/gadget/legacy/webcam.c @@ -171,7 +171,7 @@ static const struct uvc_format_uncompressed uvc_format_yuv = { .bDefaultFrameIndex = 1, .bAspectRatioX = 0, .bAspectRatioY = 0, - .bmInterfaceFlags = 0, + .bmInterlaceFlags = 0, .bCopyProtect = 0, }; @@ -222,7 +222,7 @@ static const struct uvc_format_mjpeg uvc_format_mjpg = { .bDefaultFrameIndex = 1, .bAspectRatioX = 0, .bAspectRatioY = 0, - .bmInterfaceFlags = 0, + .bmInterlaceFlags = 0, .bCopyProtect = 0, }; diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 61099f2d057d..eb1be4fd55ec 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(usb_ep_set_maxpacket_limit); * configurable, with more generic names like "ep-a". (remember that for * USB, "in" means "towards the USB host".) * - * This routine must be called in process context. + * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. */ @@ -134,7 +134,7 @@ EXPORT_SYMBOL_GPL(usb_ep_enable); * gadget drivers must call usb_ep_enable() again before queueing * requests to the endpoint. * - * This routine must be called in process context. + * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. */ diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c index 494da00398d7..1183842db835 100644 --- a/drivers/usb/gadget/udc/omap_udc.c +++ b/drivers/usb/gadget/udc/omap_udc.c @@ -2553,7 +2553,7 @@ omap_ep_setup(char *name, u8 addr, u8 type, /* set up driver data structures */ BUG_ON(strlen(name) >= sizeof ep->name); - strlcpy(ep->name, name, sizeof ep->name); + strscpy(ep->name, name, sizeof(ep->name)); INIT_LIST_HEAD(&ep->queue); INIT_LIST_HEAD(&ep->iso); ep->bEndpointAddress = addr; diff --git a/drivers/usb/gadget/udc/trace.c b/drivers/usb/gadget/udc/trace.c index 19e837de2a20..1b984207d186 100644 --- a/drivers/usb/gadget/udc/trace.c +++ b/drivers/usb/gadget/udc/trace.c @@ -8,3 +8,6 @@ #define CREATE_TRACE_POINTS #include "trace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(usb_gadget_connect); +EXPORT_TRACEPOINT_SYMBOL_GPL(usb_gadget_disconnect); diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index ddd321d9a890..966c445614fd 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -295,6 +295,7 @@ void xhci_ring_free(struct xhci_hcd *xhci, struct xhci_ring *ring) kfree(ring); } +EXPORT_SYMBOL_GPL(xhci_ring_free); void xhci_initialize_ring_info(struct xhci_ring *ring, unsigned int cycle_state) @@ -423,6 +424,7 @@ fail: kfree(ring); return NULL; } +EXPORT_SYMBOL_GPL(xhci_ring_alloc); void xhci_free_endpoint_ring(struct xhci_hcd *xhci, struct xhci_virt_device *virt_dev, @@ -1808,6 +1810,7 @@ struct xhci_command *xhci_alloc_command(struct xhci_hcd *xhci, INIT_LIST_HEAD(&command->cmd_list); return command; } +EXPORT_SYMBOL_GPL(xhci_alloc_command); struct xhci_command *xhci_alloc_command_with_ctx(struct xhci_hcd *xhci, bool allocate_completion, gfp_t mem_flags) @@ -1841,6 +1844,7 @@ void xhci_free_command(struct xhci_hcd *xhci, kfree(command->completion); kfree(command); } 
+EXPORT_SYMBOL_GPL(xhci_free_command); int xhci_alloc_erst(struct xhci_hcd *xhci, struct xhci_ring *evt_ring, @@ -1871,6 +1875,7 @@ int xhci_alloc_erst(struct xhci_hcd *xhci, return 0; } +EXPORT_SYMBOL_GPL(xhci_alloc_erst); void xhci_free_erst(struct xhci_hcd *xhci, struct xhci_erst *erst) { @@ -1884,6 +1889,7 @@ void xhci_free_erst(struct xhci_hcd *xhci, struct xhci_erst *erst) erst->erst_dma_addr); erst->entries = NULL; } +EXPORT_SYMBOL_GPL(xhci_free_erst); void xhci_mem_cleanup(struct xhci_hcd *xhci) { diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 6e5f83b4e1d8..85da106c818b 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -79,6 +79,7 @@ dma_addr_t xhci_trb_virt_to_dma(struct xhci_segment *seg, return 0; return seg->dma + (segment_offset * sizeof(*trb)); } +EXPORT_SYMBOL_GPL(xhci_trb_virt_to_dma); static bool trb_is_noop(union xhci_trb *trb) { @@ -320,6 +321,7 @@ void xhci_ring_cmd_db(struct xhci_hcd *xhci) /* Flush PCI posted writes */ readl(&xhci->dba->doorbell[0]); } +EXPORT_SYMBOL_GPL(xhci_ring_cmd_db); static bool xhci_mod_cmd_timer(struct xhci_hcd *xhci, unsigned long delay) { @@ -4499,6 +4501,7 @@ int xhci_queue_stop_endpoint(struct xhci_hcd *xhci, struct xhci_command *cmd, return queue_command(xhci, cmd, 0, 0, 0, trb_slot_id | trb_ep_index | type | trb_suspend, false); } +EXPORT_SYMBOL_GPL(xhci_queue_stop_endpoint); int xhci_queue_reset_ep(struct xhci_hcd *xhci, struct xhci_command *cmd, int slot_id, unsigned int ep_index, diff --git a/drivers/usb/host/xhci-trace.c b/drivers/usb/host/xhci-trace.c index d0070814d1ea..c4178357bd92 100644 --- a/drivers/usb/host/xhci-trace.c +++ b/drivers/usb/host/xhci-trace.c @@ -12,3 +12,6 @@ #include "xhci-trace.h" EXPORT_TRACEPOINT_SYMBOL_GPL(xhci_dbg_quirks); +EXPORT_TRACEPOINT_SYMBOL_GPL(xhci_urb_enqueue); +EXPORT_TRACEPOINT_SYMBOL_GPL(xhci_handle_transfer); +EXPORT_TRACEPOINT_SYMBOL_GPL(xhci_urb_giveback); diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index dceca1b758ca..c7d67e7d1c0f 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -17,6 +17,7 @@ #include #include #include +#include /* Code sharing between pci-quirks and xhci hcd */ #include "xhci-ext-caps.h" @@ -815,6 +816,9 @@ struct xhci_command { struct completion *completion; union xhci_trb *command_trb; struct list_head cmd_list; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /* drop context bitmasks */ @@ -1550,6 +1554,8 @@ struct xhci_segment { void *bounce_buf; unsigned int bounce_offs; unsigned int bounce_len; + + ANDROID_KABI_RESERVE(1); }; enum xhci_cancelled_td_status { @@ -1639,6 +1645,9 @@ struct xhci_ring { enum xhci_ring_type type; bool last_td_was_short; struct radix_tree_root *trb_address_map; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct xhci_erst_entry { @@ -1656,6 +1665,8 @@ struct xhci_erst { dma_addr_t erst_dma_addr; /* Num entries the ERST can contain */ unsigned int erst_size; + + ANDROID_KABI_RESERVE(1); }; struct xhci_scratchpad { @@ -1939,6 +1950,12 @@ struct xhci_hcd { struct list_head regset_list; void *dbc; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + /* platform-specific data -- must come last */ unsigned long priv[] __aligned(sizeof(s64)); }; diff --git a/drivers/usb/misc/usb251xb.c b/drivers/usb/misc/usb251xb.c index 507deef1f709..ccab87009a6e 100644 --- a/drivers/usb/misc/usb251xb.c +++ b/drivers/usb/misc/usb251xb.c @@ -544,7 +544,7 @@ static int 
usb251xb_get_ofdata(struct usb251xb *hub, hub->lang_id = USB251XB_DEF_LANGUAGE_ID; cproperty_char = of_get_property(np, "manufacturer", NULL); - strlcpy(str, cproperty_char ? : USB251XB_DEF_MANUFACTURER_STRING, + strscpy(str, cproperty_char ? : USB251XB_DEF_MANUFACTURER_STRING, sizeof(str)); hub->manufacturer_len = strlen(str) & 0xFF; memset(hub->manufacturer, 0, USB251XB_STRING_BUFSIZE); @@ -554,7 +554,7 @@ static int usb251xb_get_ofdata(struct usb251xb *hub, USB251XB_STRING_BUFSIZE); cproperty_char = of_get_property(np, "product", NULL); - strlcpy(str, cproperty_char ? : data->product_str, sizeof(str)); + strscpy(str, cproperty_char ? : data->product_str, sizeof(str)); hub->product_len = strlen(str) & 0xFF; memset(hub->product, 0, USB251XB_STRING_BUFSIZE); len = min_t(size_t, USB251XB_STRING_BUFSIZE / 2, strlen(str)); @@ -563,7 +563,7 @@ static int usb251xb_get_ofdata(struct usb251xb *hub, USB251XB_STRING_BUFSIZE); cproperty_char = of_get_property(np, "serial", NULL); - strlcpy(str, cproperty_char ? : USB251XB_DEF_SERIAL_STRING, + strscpy(str, cproperty_char ? : USB251XB_DEF_SERIAL_STRING, sizeof(str)); hub->serial_len = strlen(str) & 0xFF; memset(hub->serial, 0, USB251XB_STRING_BUFSIZE); diff --git a/drivers/usb/storage/onetouch.c b/drivers/usb/storage/onetouch.c index a989fe930e21..3ebd88b7aa2b 100644 --- a/drivers/usb/storage/onetouch.c +++ b/drivers/usb/storage/onetouch.c @@ -201,7 +201,7 @@ static int onetouch_connect_input(struct us_data *ss) onetouch->dev = input_dev; if (udev->manufacturer) - strlcpy(onetouch->name, udev->manufacturer, + strscpy(onetouch->name, udev->manufacturer, sizeof(onetouch->name)); if (udev->product) { if (udev->manufacturer) diff --git a/drivers/usb/typec/altmodes/displayport.c b/drivers/usb/typec/altmodes/displayport.c index ff3038047d38..ef0a92737d8b 100644 --- a/drivers/usb/typec/altmodes/displayport.c +++ b/drivers/usb/typec/altmodes/displayport.c @@ -57,6 +57,7 @@ struct dp_altmode { struct typec_displayport_data data; enum dp_state state; + bool hpd; struct mutex lock; /* device lock */ struct work_struct work; @@ -66,10 +67,17 @@ struct dp_altmode { static int dp_altmode_notify(struct dp_altmode *dp) { - u8 state = get_count_order(DP_CONF_GET_PIN_ASSIGN(dp->data.conf)); + unsigned long conf; + u8 state; - return typec_altmode_notify(dp->alt, TYPEC_MODAL_STATE(state), - &dp->data); + if (dp->data.conf) { + state = get_count_order(DP_CONF_GET_PIN_ASSIGN(dp->data.conf)); + conf = TYPEC_MODAL_STATE(state); + } else { + conf = TYPEC_STATE_USB; + } + + return typec_altmode_notify(dp->alt, conf, &dp->data); } static int dp_altmode_configure(struct dp_altmode *dp, u8 con) @@ -118,6 +126,7 @@ static int dp_altmode_configure(struct dp_altmode *dp, u8 con) static int dp_altmode_status_update(struct dp_altmode *dp) { bool configured = !!DP_CONF_GET_PIN_ASSIGN(dp->data.conf); + bool hpd = !!(dp->data.status & DP_STATUS_HPD_STATE); u8 con = DP_STATUS_CONNECTION(dp->data.status); int ret = 0; @@ -130,6 +139,11 @@ static int dp_altmode_status_update(struct dp_altmode *dp) ret = dp_altmode_configure(dp, con); if (!ret) dp->state = DP_STATE_CONFIGURE; + } else { + if (dp->hpd != hpd) { + dp->hpd = hpd; + sysfs_notify(&dp->alt->dev.kobj, "displayport", "hpd"); + } } return ret; @@ -137,21 +151,10 @@ static int dp_altmode_status_update(struct dp_altmode *dp) static int dp_altmode_configured(struct dp_altmode *dp) { - int ret; - sysfs_notify(&dp->alt->dev.kobj, "displayport", "configuration"); - - if (!dp->data.conf) - return typec_altmode_notify(dp->alt, 
TYPEC_STATE_USB, - &dp->data); - - ret = dp_altmode_notify(dp); - if (ret) - return ret; - sysfs_notify(&dp->alt->dev.kobj, "displayport", "pin_assignment"); - return 0; + return dp_altmode_notify(dp); } static int dp_altmode_configure_vdm(struct dp_altmode *dp, u32 conf) @@ -172,13 +175,8 @@ static int dp_altmode_configure_vdm(struct dp_altmode *dp, u32 conf) } ret = typec_altmode_vdm(dp->alt, header, &conf, 2); - if (ret) { - if (DP_CONF_GET_PIN_ASSIGN(dp->data.conf)) - dp_altmode_notify(dp); - else - typec_altmode_notify(dp->alt, TYPEC_STATE_USB, - &dp->data); - } + if (ret) + dp_altmode_notify(dp); return ret; } @@ -513,9 +511,18 @@ static ssize_t pin_assignment_show(struct device *dev, } static DEVICE_ATTR_RW(pin_assignment); +static ssize_t hpd_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct dp_altmode *dp = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", dp->hpd); +} +static DEVICE_ATTR_RO(hpd); + static struct attribute *dp_altmode_attrs[] = { &dev_attr_configuration.attr, &dev_attr_pin_assignment.attr, + &dev_attr_hpd.attr, NULL }; diff --git a/drivers/usb/typec/bus.h b/drivers/usb/typec/bus.h index 56dec268d4dd..fa021ed928a7 100644 --- a/drivers/usb/typec/bus.h +++ b/drivers/usb/typec/bus.h @@ -4,6 +4,7 @@ #define __USB_TYPEC_ALTMODE_H__ #include +#include struct bus_type; struct typec_mux; @@ -22,6 +23,7 @@ struct altmode { struct altmode *partner; struct altmode *plug[2]; + ANDROID_KABI_RESERVE(1); }; #define to_altmode(d) container_of(d, struct altmode, adev) diff --git a/drivers/usb/typec/class.h b/drivers/usb/typec/class.h index aef03eb7e152..fa08d5c00c8d 100644 --- a/drivers/usb/typec/class.h +++ b/drivers/usb/typec/class.h @@ -5,6 +5,7 @@ #include #include +#include struct typec_mux; struct typec_switch; @@ -14,6 +15,7 @@ struct typec_plug { enum typec_plug_index index; struct ida mode_ids; int num_altmodes; + ANDROID_KABI_RESERVE(1); }; struct typec_cable { @@ -22,6 +24,7 @@ struct typec_cable { struct usb_pd_identity *identity; unsigned int active:1; u16 pd_revision; /* 0300H = "3.0" */ + ANDROID_KABI_RESERVE(1); }; struct typec_partner { @@ -33,6 +36,7 @@ struct typec_partner { int num_altmodes; u16 pd_revision; /* 0300H = "3.0" */ enum usb_pd_svdm_ver svdm_version; + ANDROID_KABI_RESERVE(1); }; struct typec_port { @@ -59,6 +63,7 @@ struct typec_port { struct mutex port_list_lock; /* Port list lock */ void *pld; + ANDROID_KABI_RESERVE(1); }; #define to_typec_port(_dev_) container_of(_dev_, struct typec_port, dev) diff --git a/drivers/usb/typec/mux.h b/drivers/usb/typec/mux.h index b1d6e837cb74..a15b926edfc5 100644 --- a/drivers/usb/typec/mux.h +++ b/drivers/usb/typec/mux.h @@ -4,15 +4,18 @@ #define __USB_TYPEC_MUX__ #include +#include struct typec_switch { struct device dev; typec_switch_set_fn_t set; + ANDROID_KABI_RESERVE(1); }; struct typec_mux { struct device dev; typec_mux_set_fn_t set; + ANDROID_KABI_RESERVE(1); }; #define to_typec_switch(_dev_) container_of(_dev_, struct typec_switch, dev) diff --git a/drivers/usb/typec/tcpm/fusb302.c b/drivers/usb/typec/tcpm/fusb302.c index 96c55eaf3f80..ab89c014606e 100644 --- a/drivers/usb/typec/tcpm/fusb302.c +++ b/drivers/usb/typec/tcpm/fusb302.c @@ -151,7 +151,7 @@ static void _fusb302_log(struct fusb302_chip *chip, const char *fmt, if (fusb302_log_full(chip)) { chip->logbuffer_head = max(chip->logbuffer_head - 1, 0); - strlcpy(tmpbuffer, "overflow", sizeof(tmpbuffer)); + strscpy(tmpbuffer, "overflow", sizeof(tmpbuffer)); } if (chip->logbuffer_head < 0 || diff --git 
a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index ee461d314927..c420da474fa1 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -475,6 +475,10 @@ struct tcpm_port { * SNK_READY for non-pd link. */ bool slow_charger_loop; + + /* Port is still in tCCDebounce */ + bool debouncing; + #ifdef CONFIG_DEBUG_FS struct dentry *dentry; struct mutex logbuffer_lock; /* log buffer access lock */ @@ -977,6 +981,21 @@ static int tcpm_set_vconn(struct tcpm_port *port, bool enable) return ret; } +bool tcpm_is_debouncing(struct tcpm_port *port) +{ + bool debounce; + + if (!port) + return false; + + mutex_lock(&port->lock); + debounce = port->debouncing; + mutex_unlock(&port->lock); + + return debounce; +} +EXPORT_SYMBOL_GPL(tcpm_is_debouncing); + static u32 tcpm_get_current_limit(struct tcpm_port *port) { enum typec_cc_status cc; @@ -3604,6 +3623,7 @@ static int tcpm_src_attach(struct tcpm_port *port) port->partner = NULL; port->attached = true; + port->debouncing = false; port->send_discover = true; return 0; @@ -3730,6 +3750,7 @@ static int tcpm_snk_attach(struct tcpm_port *port) port->partner = NULL; port->attached = true; + port->debouncing = false; port->send_discover = true; return 0; @@ -3757,6 +3778,7 @@ static int tcpm_acc_attach(struct tcpm_port *port) tcpm_typec_connect(port); port->attached = true; + port->debouncing = false; return 0; } @@ -3846,6 +3868,15 @@ static void run_state_machine(struct tcpm_port *port) if (!port->non_pd_role_swap) tcpm_swap_complete(port, -ENOTCONN); tcpm_src_detach(port); + if (port->debouncing) { + port->debouncing = false; + if (port->tcpc->check_contaminant && + port->tcpc->check_contaminant(port->tcpc)) { + /* Contaminant detection would handle toggling */ + tcpm_set_state(port, TOGGLING, 0); + break; + } + } if (tcpm_start_toggling(port, tcpm_rp_cc(port))) { tcpm_set_state(port, TOGGLING, 0); break; @@ -3855,6 +3886,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_state(port, SNK_UNATTACHED, PD_T_DRP_SNK); break; case SRC_ATTACH_WAIT: + port->debouncing = true; if (tcpm_port_is_debug(port)) tcpm_set_state(port, DEBUG_ACC_ATTACHED, PD_T_CC_DEBOUNCE); @@ -3869,6 +3901,7 @@ static void run_state_machine(struct tcpm_port *port) break; case SNK_TRY: + port->debouncing = false; port->try_snk_count++; /* * Requirements: @@ -4083,6 +4116,15 @@ static void run_state_machine(struct tcpm_port *port) tcpm_swap_complete(port, -ENOTCONN); tcpm_pps_complete(port, -ENOTCONN); tcpm_snk_detach(port); + if (port->debouncing) { + port->debouncing = false; + if (port->tcpc->check_contaminant && + port->tcpc->check_contaminant(port->tcpc)) { + /* Contaminant detection would handle toggling */ + tcpm_set_state(port, TOGGLING, 0); + break; + } + } if (tcpm_start_toggling(port, TYPEC_CC_RD)) { tcpm_set_state(port, TOGGLING, 0); break; @@ -4092,6 +4134,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_state(port, SRC_UNATTACHED, PD_T_DRP_SRC); break; case SNK_ATTACH_WAIT: + port->debouncing = true; if ((port->cc1 == TYPEC_CC_OPEN && port->cc2 != TYPEC_CC_OPEN) || (port->cc1 != TYPEC_CC_OPEN && @@ -4103,14 +4146,18 @@ static void run_state_machine(struct tcpm_port *port) PD_T_PD_DEBOUNCE); break; case SNK_DEBOUNCED: - if (tcpm_port_is_disconnected(port)) + if (tcpm_port_is_disconnected(port)) { tcpm_set_state(port, SNK_UNATTACHED, PD_T_PD_DEBOUNCE); - else if (port->vbus_present) + } else if (port->vbus_present) { tcpm_set_state(port, tcpm_try_src(port) ? 
SRC_TRY : SNK_ATTACHED, 0); + port->debouncing = false; + } else { + port->debouncing = false; + } break; case SRC_TRY: port->try_src_count++; @@ -5156,6 +5203,7 @@ static void _tcpm_pd_vbus_off(struct tcpm_port *port) break; case SNK_ATTACH_WAIT: case SNK_DEBOUNCED: + port->debouncing = false; /* Do nothing, as TCPM is still waiting for vbus to reaach VSAFE5V to connect */ break; @@ -6045,6 +6093,49 @@ sink: return 0; } +static int tcpm_copy_pdos(u32 *dest_pdo, const u32 *src_pdo, unsigned int nr_pdo) +{ + unsigned int i; + + if (nr_pdo > PDO_MAX_OBJECTS) + nr_pdo = PDO_MAX_OBJECTS; + + for (i = 0; i < nr_pdo; i++) + dest_pdo[i] = src_pdo[i]; + + return nr_pdo; +} + +int tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, unsigned int nr_pdo, + unsigned int operating_snk_mw) +{ + if (tcpm_validate_caps(port, pdo, nr_pdo)) + return -EINVAL; + + mutex_lock(&port->lock); + port->nr_snk_pdo = tcpm_copy_pdos(port->snk_pdo, pdo, nr_pdo); + port->operating_snk_mw = operating_snk_mw; + port->update_sink_caps = true; + + switch (port->state) { + case SNK_NEGOTIATE_CAPABILITIES: + case SNK_NEGOTIATE_PPS_CAPABILITIES: + case SNK_READY: + case SNK_TRANSITION_SINK: + case SNK_TRANSITION_SINK_VBUS: + if (port->pps_data.active) + tcpm_set_state(port, SNK_NEGOTIATE_PPS_CAPABILITIES, 0); + else + tcpm_set_state(port, SNK_NEGOTIATE_CAPABILITIES, 0); + break; + default: + break; + } + mutex_unlock(&port->lock); + return 0; +} +EXPORT_SYMBOL_GPL(tcpm_update_sink_capabilities); + /* Power Supply access to expose source power information */ enum tcpm_psy_online_states { TCPM_PSY_OFFLINE = 0, diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c index 77a5b3f8736a..e8c3131a8543 100644 --- a/drivers/usb/usbip/stub_main.c +++ b/drivers/usb/usbip/stub_main.c @@ -100,7 +100,7 @@ static int add_match_busid(char *busid) for (i = 0; i < MAX_BUSID; i++) { spin_lock(&busid_table[i].busid_lock); if (!busid_table[i].name[0]) { - strlcpy(busid_table[i].name, busid, BUSID_SIZE); + strscpy(busid_table[i].name, busid, BUSID_SIZE); if ((busid_table[i].status != STUB_BUSID_ALLOC) && (busid_table[i].status != STUB_BUSID_REMOV)) busid_table[i].status = STUB_BUSID_ADDED; diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 74ac0c28fe43..47e7fec2a9fd 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -726,6 +726,10 @@ static void vhost_vsock_reset_orphans(struct sock *sk) * executing. */ + /* Only handle our own sockets */ + if (vsk->transport != &vhost_transport.transport) + return; + /* If the peer is still valid, no need to reset connection */ if (vhost_vsock_get(vsk->remote_addr.svm_cid)) return; @@ -757,8 +761,7 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file) /* Iterating over all connections for all CIDs to find orphans is * inefficient. Room for improvement here. */ - vsock_for_each_connected_socket(&vhost_transport.transport, - vhost_vsock_reset_orphans); + vsock_for_each_connected_socket(vhost_vsock_reset_orphans); /* Don't check the owner, because we are in the release path, so we * need to stop the vsock device in any case. 
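The tcpm.c hunks above add two vendor-facing entry points: tcpm_is_debouncing(), which reports whether the port is still inside tCCDebounce, and tcpm_update_sink_capabilities(), which swaps in a new sink PDO set under port->lock and, if the port is already in one of the SNK_* negotiation or ready states, immediately restarts capabilities negotiation. The sketch below shows how a caller might combine the two; it is illustrative only. The function name, the PDO values, and the existence of header declarations for both helpers are assumptions, not part of the patch.

/*
 * Illustrative sketch, not part of the patch: a hypothetical platform
 * policy module reacting to a thermal event by offering a lower sink
 * current. Only tcpm_is_debouncing() and
 * tcpm_update_sink_capabilities() come from the hunks above; all other
 * names and values are assumed for the example.
 */
#include <linux/kernel.h>
#include <linux/usb/pd.h>
#include <linux/usb/tcpm.h>

static int my_policy_throttle_sink(struct tcpm_port *port)
{
	/* Single fixed PDO: 5 V at 1.5 A, i.e. 7.5 W operating power. */
	const u32 throttled_pdo[] = {
		PDO_FIXED(5000, 1500,
			  PDO_FIXED_DUAL_ROLE | PDO_FIXED_DATA_SWAP),
	};

	/*
	 * Don't disturb the state machine while the CC lines are still
	 * being debounced; let the caller retry once attached.
	 */
	if (tcpm_is_debouncing(port))
		return -EAGAIN;

	/*
	 * Copies the new PDOs into port->snk_pdo and, when the port is
	 * in a SNK_* negotiation/ready state, re-enters
	 * SNK_NEGOTIATE_CAPABILITIES (or the PPS variant) so the new
	 * limits take effect immediately.
	 */
	return tcpm_update_sink_capabilities(port, throttled_pdo,
					     ARRAY_SIZE(throttled_pdo),
					     7500);
}

Note that both helpers take port->lock internally, so a caller shaped like this sketch must run outside tcpm's own state machine callbacks to avoid self-deadlock.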
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 96d271f36636..9bacac0a8f58 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1605,18 +1606,36 @@ static void do_remove_conflicting_framebuffers(struct apertures_struct *a, /* check all firmware fbs and kick off if the base addr overlaps */ for_each_registered_fb(i) { struct apertures_struct *gen_aper; + struct device *device; if (!(registered_fb[i]->flags & FBINFO_MISC_FIRMWARE)) continue; gen_aper = registered_fb[i]->apertures; + device = registered_fb[i]->device; if (fb_do_apertures_overlap(gen_aper, a) || (primary && gen_aper && gen_aper->count && gen_aper->ranges[0].base == VGA_FB_PHYS)) { printk(KERN_INFO "fb%d: switching to %s from %s\n", i, name, registered_fb[i]->fix.id); - do_unregister_framebuffer(registered_fb[i]); + + /* + * If we kick-out a firmware driver, we also want to remove + * the underlying platform device, such as simple-framebuffer, + * VESA, EFI, etc. A native driver will then be able to + * allocate the memory range. + * + * If it's not a platform device, at least print a warning. A + * fix would add code to remove the device from the system. + */ + if (dev_is_platform(device)) { + registered_fb[i]->forced_out = true; + platform_device_unregister(to_platform_device(device)); + } else { + pr_warn("fb%d: cannot remove device\n", i); + do_unregister_framebuffer(registered_fb[i]); + } } } } @@ -1954,9 +1973,13 @@ EXPORT_SYMBOL(register_framebuffer); void unregister_framebuffer(struct fb_info *fb_info) { - mutex_lock(®istration_lock); + bool forced_out = fb_info->forced_out; + + if (!forced_out) + mutex_lock(®istration_lock); do_unregister_framebuffer(fb_info); - mutex_unlock(®istration_lock); + if (!forced_out) + mutex_unlock(®istration_lock); } EXPORT_SYMBOL(unregister_framebuffer); diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index ce1b3f6ec325..d9b8c06392ee 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -142,7 +142,7 @@ config VIRTIO_MMIO_CMDLINE_DEVICES If unsure, say 'N'. config VIRTIO_DMA_SHARED_BUFFER - tristate + tristate "Virtio DMA shared buffer support" depends on DMA_SHARED_BUFFER help This option adds a flavor of dma buffers that are backed by diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index c2b733ef95b0..c7dc7a9a6b06 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -504,8 +504,9 @@ int virtio_device_restore(struct virtio_device *dev) goto err; } - /* Finally, tell the device we're all set */ - virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); + /* If restore didn't do it, mark device DRIVER_OK ourselves. 
*/ + if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) + virtio_device_ready(dev); virtio_config_enable(dev); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index c22ff0117b46..8691ebde8ef2 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -560,11 +560,15 @@ static int init_vqs(struct virtio_balloon *vb) virtqueue_kick(vb->stats_vq); } - if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; + virtqueue_disable_dma_api_for_buffers(vb->free_page_vq); + } - if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING]; + virtqueue_disable_dma_api_for_buffers(vb->reporting_vq); + } return 0; } @@ -1136,7 +1140,6 @@ static int virtballoon_validate(struct virtio_device *vdev) else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING); - __virtio_clear_bit(vdev, VIRTIO_F_ACCESS_PLATFORM); return 0; } diff --git a/drivers/virtio/virtio_dma_buf.c b/drivers/virtio/virtio_dma_buf.c index 5127a2f0c986..f94ab97d49b6 100644 --- a/drivers/virtio/virtio_dma_buf.c +++ b/drivers/virtio/virtio_dma_buf.c @@ -25,11 +25,14 @@ struct dma_buf *virtio_dma_buf_export const struct virtio_dma_buf_ops, ops); if (!exp_info->ops || - exp_info->ops->attach != &virtio_dma_buf_attach || !virtio_ops->get_uuid) { return ERR_PTR(-EINVAL); } + if (!(IS_ENABLED(CONFIG_CFI_CLANG) && IS_ENABLED(CONFIG_MODULES)) && + exp_info->ops->attach != &virtio_dma_buf_attach) + return ERR_PTR(-EINVAL); + return dma_buf_export(exp_info); } EXPORT_SYMBOL(virtio_dma_buf_export); @@ -60,6 +63,9 @@ EXPORT_SYMBOL(virtio_dma_buf_attach); */ bool is_virtio_dma_buf(struct dma_buf *dma_buf) { + if (IS_ENABLED(CONFIG_CFI_CLANG) && IS_ENABLED(CONFIG_MODULES)) + return true; + return dma_buf->ops->attach == &virtio_dma_buf_attach; } EXPORT_SYMBOL(is_virtio_dma_buf); diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 4624a2c3d055..f7d6864648c3 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1144,6 +1144,7 @@ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) const bool is_movable = page_zonenum(pfn_to_page(pfn)) == ZONE_MOVABLE; int rc, retry_count; + struct acr_info dummy; /* * TODO: We want an alloc_contig_range() mode that tries to allocate @@ -1154,7 +1155,7 @@ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) */ for (retry_count = 0; retry_count < 5; retry_count++) { rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, - GFP_KERNEL); + GFP_KERNEL, &dummy); if (rc == -ENOMEM) /* whoops, out of memory */ return rc; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index a274261f36d6..0a083846e1aa 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -293,7 +293,7 @@ static int virtio_pci_find_shm_cap(struct pci_dev *dev, u8 required_id, for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); pos > 0; pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { - u8 type, cap_len, id; + u8 type, cap_len, id, res_bar; u32 tmp32; u64 res_offset, res_length; @@ -315,9 +315,14 @@ static int virtio_pci_find_shm_cap(struct pci_dev *dev, u8 required_id, if (id != required_id) 
continue; - /* Type, and ID match, looks good */ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, - bar), bar); + bar), &res_bar); + if (res_bar >= PCI_STD_NUM_BARS) + continue; + + /* Type and ID match, and the BAR value isn't reserved. + * Looks good. + */ /* Read the lower 32bit of length and offset */ pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, @@ -337,6 +342,7 @@ static int virtio_pci_find_shm_cap(struct pci_dev *dev, u8 required_id, length_hi), &tmp32); res_length |= ((u64)tmp32) << 32; + *bar = res_bar; *offset = res_offset; *len = res_length; diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 9ab66e44738e..d2b1acb18e61 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -35,6 +35,13 @@ vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length), &length); + /* Check if the BAR may have changed since we requested the region. */ + if (bar >= PCI_STD_NUM_BARS || !(mdev->modern_bars & (1 << bar))) { + dev_err(&dev->dev, + "virtio_pci: bar unexpectedly changed to %u\n", bar); + return NULL; + } + if (length <= start) { dev_err(&dev->dev, "virtio_pci: bad capability len %u (>%u expected)\n", @@ -120,7 +127,7 @@ static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type, &bar); /* Ignore structures with reserved BAR values */ - if (bar > 0x5) + if (bar >= PCI_STD_NUM_BARS) continue; if (type == cfg_type) { diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 603a6f4345ef..fb5e6fd9326c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2462,4 +2462,14 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_get_vring); +/* + * Prevents use of DMA API for buffers passed via the specified virtqueue. + * DMA API may still be used for the vrings themselves. + */ +void virtqueue_disable_dma_api_for_buffers(struct virtqueue *vq) +{ + to_vvq(vq)->use_dma_api = false; +} +EXPORT_SYMBOL_GPL(virtqueue_disable_dma_api_for_buffers); + MODULE_LICENSE("GPL"); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 141067379f5e..4fb76c3a8045 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -740,3 +740,4 @@ MODULE_AUTHOR("Latchesar Ionkov "); MODULE_AUTHOR("Eric Van Hensbergen "); MODULE_AUTHOR("Ron Minnich "); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/Kconfig b/fs/Kconfig index a6313a969bc5..fd3cd36bdf46 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -127,6 +127,7 @@ source "fs/quota/Kconfig" source "fs/autofs/Kconfig" source "fs/fuse/Kconfig" source "fs/overlayfs/Kconfig" +source "fs/incfs/Kconfig" menu "Caches" diff --git a/fs/Makefile b/fs/Makefile index d504be65a210..e4adb988218c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,6 +6,8 @@ # Rewritten to use lists instead of if-statements. 
# +subdir-ccflags-y += -DANDROID_GKI_VFS_EXPORT_ONLY=VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver + obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o dcache.o inode.o \ @@ -113,6 +115,7 @@ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ obj-$(CONFIG_OVERLAY_FS) += overlayfs/ obj-$(CONFIG_ORANGEFS_FS) += orangefs/ +obj-$(CONFIG_INCREMENTAL_FS) += incfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/OWNERS b/fs/OWNERS new file mode 100644 index 000000000000..7780f6be0335 --- /dev/null +++ b/fs/OWNERS @@ -0,0 +1 @@ +per-file {crypto,verity}/**=ebiggers@google.com diff --git a/fs/adfs/super.c b/fs/adfs/super.c index bdbd26e571ed..57044e537290 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -492,3 +492,4 @@ static void __exit exit_adfs_fs(void) module_init(init_adfs_fs) module_exit(exit_adfs_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/affs/super.c b/fs/affs/super.c index c6c2a513ec92..2d2797ef7cfa 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -676,6 +676,7 @@ static void __exit exit_affs_fs(void) MODULE_DESCRIPTION("Amiga filesystem support for Linux"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(init_affs_fs) module_exit(exit_affs_fs) diff --git a/fs/afs/main.c b/fs/afs/main.c index 179004b15566..c6bd95645499 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -18,6 +18,7 @@ MODULE_DESCRIPTION("AFS Client File System"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); unsigned afs_debug; module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO); diff --git a/fs/attr.c b/fs/attr.c index f581c4d00897..3e8d5b50eddc 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -164,7 +164,7 @@ kill_priv: return 0; } -EXPORT_SYMBOL(setattr_prepare); +EXPORT_SYMBOL_NS(setattr_prepare, ANDROID_GKI_VFS_EXPORT_ONLY); /** * inode_newsize_ok - may this inode be truncated to a given size @@ -210,7 +210,7 @@ out_sig: out_big: return -EFBIG; } -EXPORT_SYMBOL(inode_newsize_ok); +EXPORT_SYMBOL_NS(inode_newsize_ok, ANDROID_GKI_VFS_EXPORT_ONLY); /** * setattr_copy - copy simple metadata updates into the generic inode @@ -435,4 +435,4 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, return error; } -EXPORT_SYMBOL(notify_change); +EXPORT_SYMBOL_NS(notify_change, ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/autofs/init.c b/fs/autofs/init.c index d3f55e874338..ba08261f4faa 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -44,3 +44,4 @@ static void __exit exit_autofs_fs(void) module_init(init_autofs_fs) module_exit(exit_autofs_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 12b8fdcc445b..649e28f48c2d 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -215,7 +215,7 @@ void make_bad_inode(struct inode *inode) inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } -EXPORT_SYMBOL(make_bad_inode); +EXPORT_SYMBOL_NS(make_bad_inode, ANDROID_GKI_VFS_EXPORT_ONLY); /* * This tests whether an inode has been flagged as bad. 
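The EXPORT_SYMBOL_NS() conversions in fs/attr.c above, together with the MODULE_IMPORT_NS() lines added to each filesystem module, gate core VFS helpers behind a symbol namespace so that only modules declaring themselves filesystems can link against them. A minimal sketch of the mechanism (demo_helper is hypothetical; exporter and importer would normally be separate modules):

#include <linux/module.h>

/* Exporter: the symbol resolves only for importers of the namespace. */
int demo_helper(void)
{
	return 0;
}
EXPORT_SYMBOL_NS(demo_helper, ANDROID_GKI_VFS_EXPORT_ONLY);

/* Importer: without this, modpost warns at build time and the module
 * loader rejects the module at runtime. */
MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY);
MODULE_LICENSE("GPL");

Because MODULE_IMPORT_NS() stringifies after macro expansion, the -D define added to fs/Makefile above makes in-tree filesystems import the expanded VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver namespace, as the configfs hunk further below shows in its spelled-out form.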
The test uses @@ -235,7 +235,7 @@ bool is_bad_inode(struct inode *inode) return (inode->i_op == &bad_inode_ops); } -EXPORT_SYMBOL(is_bad_inode); +EXPORT_SYMBOL_NS(is_bad_inode, ANDROID_GKI_VFS_EXPORT_ONLY); /** * iget_failed - Mark an under-construction inode as dead and release it diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c1ba13d19024..abb8f6bb7e39 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -34,6 +34,7 @@ MODULE_DESCRIPTION("BeOS File System (BeFS) driver"); MODULE_AUTHOR("Will Dyson"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); /* The units the vfs expects inode->i_blocks to be in */ #define VFS_BLOCK_SIZE 512 diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fd691e4815c5..293223cd4b37 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -22,6 +22,7 @@ MODULE_AUTHOR("Tigran Aivazian "); MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); #undef DEBUG diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bb202ad369d5..7c01973da6c8 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -834,3 +834,4 @@ static void __exit exit_misc_binfmt(void) core_initcall(init_misc_binfmt); module_exit(exit_misc_binfmt); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eae622ef4c6d..32cb24c3f0d4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2412,6 +2412,7 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, + .speculative = true, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4ff55457f902..ae15b6f2e418 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2789,6 +2789,7 @@ late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_SOFTDEP("pre: crc32c"); MODULE_SOFTDEP("pre: xxhash64"); MODULE_SOFTDEP("pre: sha256"); diff --git a/fs/buffer.c b/fs/buffer.c index 1960e2d43ae2..50ef77e7d485 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -51,6 +51,8 @@ #include "internal.h" +#include + static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, enum rw_hint hint, struct writeback_control *wbc); @@ -173,7 +175,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) unlock_buffer(bh); put_bh(bh); } -EXPORT_SYMBOL(end_buffer_write_sync); +EXPORT_SYMBOL_NS(end_buffer_write_sync, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Various filesystems appear to want __find_get_block to be non-blocking. @@ -419,7 +421,7 @@ void mark_buffer_async_write(struct buffer_head *bh) { mark_buffer_async_write_endio(bh, end_buffer_async_write); } -EXPORT_SYMBOL(mark_buffer_async_write); +EXPORT_SYMBOL_NS(mark_buffer_async_write, ANDROID_GKI_VFS_EXPORT_ONLY); /* @@ -649,7 +651,7 @@ int __set_page_dirty_buffers(struct page *page) return newly_dirty; } -EXPORT_SYMBOL(__set_page_dirty_buffers); +EXPORT_SYMBOL_NS(__set_page_dirty_buffers, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Write out and wait upon a list of buffers. 
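The `.speculative = true` member added to the btrfs vm_operations_struct above (and to cifs below) opts those fault handlers into the speculative page fault machinery carried in Android common kernels, which attempts the fault path without holding mmap_lock. A hedged sketch of an opted-in vm_ops; the field exists only with the SPF patches applied, and demo_page_mkwrite is hypothetical:

#include <linux/mm.h>

static vm_fault_t demo_page_mkwrite(struct vm_fault *vmf); /* hypothetical */

static const struct vm_operations_struct demo_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= demo_page_mkwrite,
	/* Declares the handlers above safe to run speculatively,
	 * i.e. without mmap_lock held (Android SPF kernels only). */
	.speculative	= true,
};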
@@ -1112,7 +1114,7 @@ void mark_buffer_dirty(struct buffer_head *bh) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } -EXPORT_SYMBOL(mark_buffer_dirty); +EXPORT_SYMBOL_NS(mark_buffer_dirty, ANDROID_GKI_VFS_EXPORT_ONLY); void mark_buffer_write_io_error(struct buffer_head *bh) { @@ -1130,7 +1132,7 @@ void mark_buffer_write_io_error(struct buffer_head *bh) errseq_set(&sb->s_wb_err, -EIO); rcu_read_unlock(); } -EXPORT_SYMBOL(mark_buffer_write_io_error); +EXPORT_SYMBOL_NS(mark_buffer_write_io_error, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Decrement a buffer_head's reference count. If all buffers against a page @@ -1147,7 +1149,7 @@ void __brelse(struct buffer_head * buf) } WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); } -EXPORT_SYMBOL(__brelse); +EXPORT_SYMBOL_NS(__brelse, ANDROID_GKI_VFS_EXPORT_ONLY); /* * bforget() is like brelse(), except it discards any @@ -1166,7 +1168,7 @@ void __bforget(struct buffer_head *bh) } __brelse(bh); } -EXPORT_SYMBOL(__bforget); +EXPORT_SYMBOL_NS(__bforget, ANDROID_GKI_VFS_EXPORT_ONLY); static struct buffer_head *__bread_slow(struct buffer_head *bh) { @@ -1233,6 +1235,7 @@ static void bh_lru_install(struct buffer_head *bh) struct buffer_head *evictee = bh; struct bh_lru *b; int i; + bool skip = false; check_irqs_on(); bh_lru_lock(); @@ -1248,6 +1251,12 @@ static void bh_lru_install(struct buffer_head *bh) return; } + trace_android_vh_bh_lru_install(bh->b_page, &skip); + if (skip) { + bh_lru_unlock(); + return; + } + b = this_cpu_ptr(&bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { swap(evictee, b->bhs[i]); @@ -1349,7 +1358,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) brelse(bh); } } -EXPORT_SYMBOL(__breadahead); +EXPORT_SYMBOL_NS(__breadahead, ANDROID_GKI_VFS_EXPORT_ONLY); void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) @@ -1384,7 +1393,7 @@ __bread_gfp(struct block_device *bdev, sector_t block, bh = __bread_slow(bh); return bh; } -EXPORT_SYMBOL(__bread_gfp); +EXPORT_SYMBOL_NS(__bread_gfp, ANDROID_GKI_VFS_EXPORT_ONLY); static void __invalidate_bh_lrus(struct bh_lru *b) { @@ -1546,7 +1555,7 @@ void block_invalidatepage(struct page *page, unsigned int offset, out: return; } -EXPORT_SYMBOL(block_invalidatepage); +EXPORT_SYMBOL_NS(block_invalidatepage, ANDROID_GKI_VFS_EXPORT_ONLY); /* @@ -1582,7 +1591,7 @@ void create_empty_buffers(struct page *page, attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } -EXPORT_SYMBOL(create_empty_buffers); +EXPORT_SYMBOL_NS(create_empty_buffers, ANDROID_GKI_VFS_EXPORT_ONLY); /** * clean_bdev_aliases: clean a range of buffers in block device @@ -1656,7 +1665,7 @@ unlock_page: break; } } -EXPORT_SYMBOL(clean_bdev_aliases); +EXPORT_SYMBOL_NS(clean_bdev_aliases, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Size is a power-of-two in the range 512..PAGE_SIZE, @@ -1914,7 +1923,7 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) bh = bh->b_this_page; } while (bh != head); } -EXPORT_SYMBOL(page_zero_new_buffers); +EXPORT_SYMBOL_NS(page_zero_new_buffers, ANDROID_GKI_VFS_EXPORT_ONLY); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, @@ -2204,7 +2213,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, mark_inode_dirty(inode); return copied; } -EXPORT_SYMBOL(generic_write_end); +EXPORT_SYMBOL_NS(generic_write_end, ANDROID_GKI_VFS_EXPORT_ONLY); /* * block_is_partially_uptodate checks whether buffers within a page are @@ -2249,7 +2258,7 @@ int 
block_is_partially_uptodate(struct page *page, unsigned long from, return ret; } -EXPORT_SYMBOL(block_is_partially_uptodate); +EXPORT_SYMBOL_NS(block_is_partially_uptodate, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Generic "read page" function for block devices that have the normal @@ -2370,7 +2379,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) out: return err; } -EXPORT_SYMBOL(generic_cont_expand_simple); +EXPORT_SYMBOL_NS(generic_cont_expand_simple, ANDROID_GKI_VFS_EXPORT_ONLY); static int cont_expand_zero(struct file *file, struct address_space *mapping, loff_t pos, loff_t *bytes) @@ -2995,7 +3004,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } -EXPORT_SYMBOL(generic_block_bmap); +EXPORT_SYMBOL_NS(generic_block_bmap, ANDROID_GKI_VFS_EXPORT_ONLY); static void end_bio_bh_io_sync(struct bio *bio) { @@ -3116,7 +3125,7 @@ void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) unlock_buffer(bh); } } -EXPORT_SYMBOL(ll_rw_block); +EXPORT_SYMBOL_NS(ll_rw_block, ANDROID_GKI_VFS_EXPORT_ONLY); void write_dirty_buffer(struct buffer_head *bh, int op_flags) { @@ -3163,13 +3172,13 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) } return ret; } -EXPORT_SYMBOL(__sync_dirty_buffer); +EXPORT_SYMBOL_NS(__sync_dirty_buffer, ANDROID_GKI_VFS_EXPORT_ONLY); int sync_dirty_buffer(struct buffer_head *bh) { return __sync_dirty_buffer(bh, REQ_SYNC); } -EXPORT_SYMBOL(sync_dirty_buffer); +EXPORT_SYMBOL_NS(sync_dirty_buffer, ANDROID_GKI_VFS_EXPORT_ONLY); /* * try_to_free_buffers() checks if all the buffers on this particular page diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 9c8d34c49b12..8c454fa3aaa8 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -28,6 +28,7 @@ MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); MODULE_DESCRIPTION("Mounted-filesystem based cache"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); struct kmem_cache *cachefiles_object_jar; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 202ddde3d62a..4417a04dff12 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1341,3 +1341,4 @@ MODULE_AUTHOR("Yehuda Sadeh "); MODULE_AUTHOR("Patience Warnick "); MODULE_DESCRIPTION("Ceph filesystem for Linux"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index fc736ced6f4a..acb413ac22a6 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1763,6 +1763,7 @@ exit_cifs(void) MODULE_AUTHOR("Steve French"); MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */ +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_DESCRIPTION ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and " "also older servers complying with the SNIA CIFS Specification)"); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index aa422348824a..059012d29ca2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4231,6 +4231,7 @@ static const struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, + .speculative = true, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 240669f51eac..897c7cb93f47 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -388,6 +388,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. 
Braam"); MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_VERSION("7.0"); static int __init init_coda(void) diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index c2d820063ec4..107c9fb15bc0 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -173,6 +173,7 @@ MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); core_initcall(configfs_init); module_exit(configfs_exit); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 2be65269a987..bc2e0d97923f 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -1008,3 +1008,4 @@ static void __exit exit_cramfs_fs(void) module_init(init_cramfs_fs) module_exit(exit_cramfs_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 652c7180ec6d..d39077502e06 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o +ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=ANDROID_GKI_VFS_EXPORT_ONLY + fscrypto-y := crypto.o \ fname.o \ hkdf.o \ diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 68a2de6b5a9b..dac786fb6f80 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -1,23 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This contains encryption functions for per-file encryption. + * Utility functions for file contents encryption/decryption on + * block device-based filesystems. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Add fscrypt_pullback_bio_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. */ #include @@ -26,18 +13,37 @@ #include #include "fscrypt_private.h" -void fscrypt_decrypt_bio(struct bio *bio) +/** + * fscrypt_decrypt_bio() - decrypt the contents of a bio + * @bio: the bio to decrypt + * + * Decrypt the contents of a "read" bio following successful completion of the + * underlying disk read. The bio must be reading a whole number of blocks of an + * encrypted file directly into the page cache. If the bio is reading the + * ciphertext into bounce pages instead of the page cache (for example, because + * the file is also compressed, so decompression is required after decryption), + * then this function isn't applicable. This function may sleep, so it must be + * called from a workqueue rather than from the bio's bi_end_io callback. + * + * Return: %true on success; %false on failure. On failure, bio->bi_status is + * also set to an error status. 
+ */ +bool fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, + int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, bv->bv_offset); - if (ret) - SetPageError(page); + + if (err) { + bio->bi_status = errno_to_blk_status(err); + return false; + } } + return true; } EXPORT_SYMBOL(fscrypt_decrypt_bio); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 4ef3f714046a..059344a8900e 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -69,6 +69,14 @@ void fscrypt_free_bounce_page(struct page *bounce_page) } EXPORT_SYMBOL(fscrypt_free_bounce_page); +/* + * Generate the IV for the given logical block number within the given file. + * For filenames encryption, lblk_num == 0. + * + * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks() + * needs to know about any IV generation methods where the low bits of IV don't + * simply contain the lblk_num (e.g., IV_INO_LBLK_32). + */ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci) { @@ -105,7 +113,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, if (WARN_ON_ONCE(len <= 0)) return -EINVAL; - if (WARN_ON_ONCE(len % FS_CRYPTO_BLOCK_SIZE != 0)) + if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0)) return -EINVAL; fscrypt_generate_iv(&iv, lblk_num, ci); @@ -205,8 +213,8 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); * fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place * @inode: The inode to which this block belongs * @page: The page containing the block to encrypt - * @len: Size of block to encrypt. Doesn't need to be a multiple of the - * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE. + * @len: Size of block to encrypt. This must be a multiple of + * FSCRYPT_CONTENTS_ALIGNMENT. * @offs: Byte offset within @page at which the block to encrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file @@ -275,8 +283,8 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks); * fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place * @inode: The inode to which this block belongs * @page: The page containing the block to decrypt - * @len: Size of block to decrypt. Doesn't need to be a multiple of the - * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE. + * @len: Size of block to decrypt. This must be a multiple of + * FSCRYPT_CONTENTS_ALIGNMENT. * @offs: Byte offset within @page at which the block to decrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index eb538c28df94..12bd61d20f69 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -18,6 +18,13 @@ #include #include "fscrypt_private.h" +/* + * The minimum message length (input and output length), in bytes, for all + * filenames encryption modes. Filenames shorter than this will be zero-padded + * before being encrypted. 
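Under the contract documented above, fscrypt_decrypt_bio() now reports failure through its return value and bio->bi_status instead of SetPageError(), and it may sleep. A minimal sketch of a caller in a filesystem's post-read workqueue; the demo_* context struct is hypothetical and page unlock/uptodate handling is elided:

#include <linux/bio.h>
#include <linux/fscrypt.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_read_ctx {		/* hypothetical per-bio context */
	struct work_struct work;
	struct bio *bio;
};

static void demo_decrypt_work(struct work_struct *work)
{
	struct demo_read_ctx *ctx =
		container_of(work, struct demo_read_ctx, work);

	/*
	 * Runs in process context, as required. On failure,
	 * fscrypt_decrypt_bio() has already set bio->bi_status,
	 * so the completion below just propagates the error.
	 */
	fscrypt_decrypt_bio(ctx->bio);
	bio_endio(ctx->bio);
	kfree(ctx);
}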
+ */ +#define FSCRYPT_FNAME_MIN_MSG_LEN 16 + /* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * @@ -79,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fscrypt_fname_encrypt() - encrypt a filename * @inode: inode of the parent directory (for regular filenames) - * or of the symlink (for symlink targets) + * or of the symlink (for symlink targets). Key must already be + * set up. * @iname: the filename to encrypt * @out: (output) the encrypted filename * @olen: size of the encrypted filename. It must be at least @iname->len. @@ -130,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, return 0; } +EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); /** * fname_decrypt() - decrypt a filename @@ -257,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) return bp - dst; } -bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret) +bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret) { int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); @@ -267,12 +276,35 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, if (orig_len > max_len) return false; - encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE); + encrypted_len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN); encrypted_len = round_up(encrypted_len, padding); *encrypted_len_ret = min(encrypted_len, max_len); return true; } +/** + * fscrypt_fname_encrypted_size() - calculate length of encrypted filename + * @inode: parent inode of dentry name being encrypted. Key must + * already be set up. + * @orig_len: length of the original filename + * @max_len: maximum length to return + * @encrypted_len_ret: where calculated length should be returned (on success) + * + * Filenames that are shorter than the maximum length may have their lengths + * increased slightly by encryption, due to padding that is applied. + * + * Return: false if the orig_len is greater than max_len. Otherwise, true and + * fill out encrypted_len_ret with the length (up to max_len). 
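The math in __fscrypt_fname_encrypted_size() above is easy to check by hand: the name length is raised to FSCRYPT_FNAME_MIN_MSG_LEN (16), rounded up to the policy's padding amount (4, 8, 16, or 32 bytes, taken from the low policy flag bits), then capped at max_len. A standalone illustration of that arithmetic (plain userspace C, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* padding = 4 << (policy flags & FSCRYPT_POLICY_FLAGS_PAD_MASK): 4..32 */
static uint32_t fname_encrypted_size(uint32_t orig_len, uint32_t padding)
{
	uint32_t len = orig_len < 16 ? 16 : orig_len; /* FSCRYPT_FNAME_MIN_MSG_LEN */

	return (len + padding - 1) & ~(padding - 1);  /* round_up */
}

int main(void)
{
	/* A 3-byte name under a pad-32 policy encrypts to 32 bytes;
	 * a 40-byte name under the same policy encrypts to 64 bytes. */
	printf("%u %u\n", fname_encrypted_size(3, 32),
	       fname_encrypted_size(40, 32));
	return 0;
}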
+ */ +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) +{ + return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, + orig_len, max_len, + encrypted_len_ret); +} +EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); + /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames * @max_encrypted_len: maximum length of encrypted filenames the buffer will be @@ -350,7 +382,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, return 0; } - if (iname->len < FS_CRYPTO_BLOCK_SIZE) + if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN) return -EUCLEAN; if (fscrypt_has_encryption_key(inode)) @@ -428,9 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (fscrypt_has_encryption_key(dir)) { - if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, - iname->len, - dir->i_sb->s_cop->max_namelen, + if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 373c434b375c..98d1901db9cb 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -20,13 +20,39 @@ #define FSCRYPT_FILE_NONCE_SIZE 16 +/* + * Minimum size of an fscrypt master key. Note: a longer key will be required + * if ciphers with a 256-bit security strength are used. This is just the + * absolute minimum, which applies when only 128-bit encryption is used. + */ #define FSCRYPT_MIN_KEY_SIZE 16 +/* Maximum size of a standard fscrypt master key */ +#define FSCRYPT_MAX_STANDARD_KEY_SIZE 64 + +/* Maximum size of a hardware-wrapped fscrypt master key */ +#define FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE + +/* + * Maximum size of an fscrypt master key across both key types. + * This should just use max(), but max() doesn't work in a struct definition. + */ +#define FSCRYPT_MAX_ANY_KEY_SIZE \ + (FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE > FSCRYPT_MAX_STANDARD_KEY_SIZE ? \ + FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE : FSCRYPT_MAX_STANDARD_KEY_SIZE) + +/* + * FSCRYPT_MAX_KEY_SIZE is defined in the UAPI header, but the addition of + * hardware-wrapped keys has made it misleading as it's only for standard keys. + * Don't use it in kernel code; use one of the above constants instead. 
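FSCRYPT_MAX_ANY_KEY_SIZE above spells out a ternary because the kernel's max() expands to a GCC statement expression with type checking, which is not an integer constant expression and therefore cannot size an array member at file scope. A small illustration of the distinction (DEMO_* names are hypothetical):

/* Plain ternary: folds to a constant, usable in a struct definition. */
#define DEMO_MAX(a, b) ((a) > (b) ? (a) : (b))

struct demo_secret {
	unsigned char raw[DEMO_MAX(64, 128)];	/* OK: array size is 128 */
};

/*
 * By contrast, kernel max() expands to roughly
 * ({ typeof(a) _a = (a); typeof(b) _b = (b); _a > _b ? _a : _b; }),
 * and a statement expression is not a constant expression, so it
 * cannot appear in a file-scope array bound.
 */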
+ */ +#undef FSCRYPT_MAX_KEY_SIZE + #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ -#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM +#define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ @@ -179,7 +205,7 @@ struct fscrypt_symlink_data { struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - struct fscrypt_blk_crypto_key *blk_key; + struct blk_crypto_key *blk_key; #endif }; @@ -292,14 +318,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, - u8 *out, unsigned int olen); -bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret); +bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); /* hkdf.c */ - struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; @@ -330,7 +353,8 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT -int fscrypt_select_encryption_impl(struct fscrypt_info *ci); +int fscrypt_select_encryption_impl(struct fscrypt_info *ci, + bool is_hw_wrapped_key); static inline bool fscrypt_using_inline_encryption(const struct fscrypt_info *ci) @@ -339,10 +363,16 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci) } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, + const u8 *raw_key, size_t raw_key_size, + bool is_hw_wrapped, const struct fscrypt_info *ci); -void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key); +void fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key); + +int fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); /* * Check whether the crypto transform or blk-crypto key has been allocated in @@ -367,7 +397,8 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ -static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci) +static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci, + bool is_hw_wrapped_key) { return 0; } @@ -380,7 +411,8 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci) static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, + const u8 *raw_key, size_t raw_key_size, + bool is_hw_wrapped, const struct fscrypt_info *ci) { WARN_ON(1); @@ -388,10 +420,20 @@ fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, } static inline void -fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { } +static inline int +fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + fscrypt_warn(NULL, "kernel doesn't support hardware-wrapped keys"); + return -EOPNOTSUPP; +} + static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_info *ci) @@ -408,16 +450,38 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key 
 *prep_key,
 struct fscrypt_master_key_secret {
 	/*
-	 * For v2 policy keys: HKDF context keyed by this master key.
-	 * For v1 policy keys: not set (hkdf.hmac_tfm == NULL).
+	 * The KDF with which subkeys of this key can be derived.
+	 *
+	 * For v1 policy keys, this isn't applicable and won't be set.
+	 * Otherwise, this KDF will be keyed by this master key if
+	 * ->is_hw_wrapped=false, or by the "software secret" that hardware
+	 * derived from this master key if ->is_hw_wrapped=true.
 	 */
 	struct fscrypt_hkdf hkdf;

-	/* Size of the raw key in bytes. Set even if ->raw isn't set. */
+	/*
+	 * True if this key is a hardware-wrapped key; false if this key is a
+	 * standard key (i.e. a "software key"). For v1 policy keys this will
+	 * always be false, as v1 policy support is a legacy feature which
+	 * doesn't support newer functionality such as hardware-wrapped keys.
+	 */
+	bool is_hw_wrapped;
+
+	/*
+	 * Size of the raw key in bytes. This remains set even if ->raw was
+	 * zeroized due to no longer being needed. I.e. we still remember the
+	 * size of the key even if we don't need to remember the key itself.
+	 */
 	u32 size;

-	/* For v1 policy keys: the raw key. Wiped for v2 policy keys. */
-	u8 raw[FSCRYPT_MAX_KEY_SIZE];
+	/*
+	 * The raw key which userspace provided, when still needed. This can be
+	 * either a standard key or a hardware-wrapped key, as indicated by
+	 * ->is_hw_wrapped. In the case of a standard, v2 policy key, there is
+	 * no need to remember the raw key separately from ->hkdf so this field
+	 * will be zeroized as soon as ->hkdf is initialized.
+	 */
+	u8 raw[FSCRYPT_MAX_ANY_KEY_SIZE];
 } __randomize_layout;

@@ -431,13 +495,7 @@ struct fscrypt_master_key {
 	/*
-	 * Back-pointer to the super_block of the filesystem to which this
-	 * master key has been added. Only valid if ->mk_active_refs > 0.
-	 */
-	struct super_block *mk_sb;
-
-	/*
-	 * Link in ->mk_sb->s_master_keys->key_hashtable.
+	 * Link in ->s_master_keys->key_hashtable.
 	 * Only valid if ->mk_active_refs > 0.
 	 */
 	struct hlist_node mk_node;
@@ -448,7 +506,7 @@ struct fscrypt_master_key {
 	/*
 	 * Active and structural reference counts. An active ref guarantees
 	 * that the struct continues to exist, continues to be in the keyring
-	 * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g.
+	 * ->s_master_keys, and that any embedded subkeys (e.g.
 	 * ->mk_direct_keys) that have been prepared continue to exist.
 	 * A structural ref only guarantees that the struct continues to exist.
* @@ -561,14 +619,15 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) void fscrypt_put_master_key(struct fscrypt_master_key *mk); -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk); +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -int fscrypt_add_test_dummy_key(struct super_block *sb, - struct fscrypt_key_specifier *key_spec); +int fscrypt_get_test_dummy_key_identifier( + u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); @@ -583,7 +642,9 @@ struct fscrypt_mode { int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ - int logged_impl_name; + int logged_cryptoapi_impl; + int logged_blk_crypto_native; + int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; @@ -592,7 +653,8 @@ extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); -void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key); +void fscrypt_destroy_prepared_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); @@ -643,6 +705,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); +int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, + struct fscrypt_key_specifier *key_spec); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 7607d18b35fc..41e7c9b05c2a 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -4,7 +4,9 @@ * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". * - * This is used to derive keys from the fscrypt master keys. + * This is used to derive keys from the fscrypt master keys (or from the + * "software secrets" which hardware derives from the fscrypt master keys, in + * the case that the fscrypt master keys are hardware-wrapped keys). * * Copyright 2019 Google LLC */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index be5c650e4957..7b8c5a1104b5 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -224,9 +224,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. 
*/ - if (!fscrypt_fname_encrypted_size(policy, len, - max_len - sizeof(struct fscrypt_symlink_data), - &disk_link->len)) + if (!__fscrypt_fname_encrypted_size(policy, len, + max_len - sizeof(struct fscrypt_symlink_data), + &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index c57bebfa48fe..64fff3895a22 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -17,29 +17,26 @@ #include #include #include +#include #include "fscrypt_private.h" -struct fscrypt_blk_crypto_key { - struct blk_crypto_key base; - int num_devs; - struct request_queue *devs[]; -}; - -static int fscrypt_get_num_devices(struct super_block *sb) +static struct block_device **fscrypt_get_devices(struct super_block *sb, + unsigned int *num_devs) { - if (sb->s_cop->get_num_devices) - return sb->s_cop->get_num_devices(sb); - return 1; -} + struct block_device **devs; -static void fscrypt_get_devices(struct super_block *sb, int num_devs, - struct request_queue **devs) -{ - if (num_devs == 1) - devs[0] = bdev_get_queue(sb->s_bdev); - else - sb->s_cop->get_devices(sb, devs); + if (sb->s_cop->get_devices) { + devs = sb->s_cop->get_devices(sb, num_devs); + if (devs) + return devs; + } + devs = kmalloc(sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); + devs[0] = sb->s_bdev; + *num_devs = 1; + return devs; } static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) @@ -63,15 +60,45 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) return DIV_ROUND_UP(lblk_bits, 8); } +/* + * Log a message when starting to use blk-crypto (native) or blk-crypto-fallback + * for an encryption mode for the first time. This is the blk-crypto + * counterpart to the message logged when starting to use the crypto API for the + * first time. A limitation is that these messages don't convey which specific + * filesystems or files are using each implementation. However, *usually* + * systems use just one implementation per mode, which makes these messages + * helpful for debugging problems where the "wrong" implementation is used. + */ +static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, + struct block_device **devs, + unsigned int num_devs, + const struct blk_crypto_config *cfg) +{ + unsigned int i; + + for (i = 0; i < num_devs; i++) { + if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || + blk_crypto_config_supported_natively(devs[i], cfg)) { + if (!xchg(&mode->logged_blk_crypto_native, 1)) + pr_info("fscrypt: %s using blk-crypto (native)\n", + mode->friendly_name); + } else if (!xchg(&mode->logged_blk_crypto_fallback, 1)) { + pr_info("fscrypt: %s using blk-crypto-fallback\n", + mode->friendly_name); + } + } +} + /* Enable inline encryption for this file if supported. 
*/ -int fscrypt_select_encryption_impl(struct fscrypt_info *ci) +int fscrypt_select_encryption_impl(struct fscrypt_info *ci, + bool is_hw_wrapped_key) { const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; struct blk_crypto_config crypto_cfg; - int num_devs; - struct request_queue **devs; - int i; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; /* The file must need contents encryption, not filenames encryption */ if (!S_ISREG(inode->i_mode)) @@ -99,23 +126,27 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) return 0; /* - * On all the filesystem's devices, blk-crypto must support the crypto - * configuration that the file would use. + * On all the filesystem's block devices, blk-crypto must support the + * crypto configuration that the file would use. */ crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; crypto_cfg.data_unit_size = sb->s_blocksize; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); - num_devs = fscrypt_get_num_devices(sb); - devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL); - if (!devs) - return -ENOMEM; - fscrypt_get_devices(sb, num_devs, devs); + crypto_cfg.key_type = + is_hw_wrapped_key ? BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : + BLK_CRYPTO_KEY_TYPE_STANDARD; + + devs = fscrypt_get_devices(sb, &num_devs); + if (IS_ERR(devs)) + return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) goto out_free_devs; } + fscrypt_log_blk_crypto_impl(ci->ci_mode, devs, num_devs, &crypto_cfg); + ci->ci_inlinecrypt = true; out_free_devs: kfree(devs); @@ -124,55 +155,50 @@ out_free_devs: } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, + const u8 *raw_key, size_t raw_key_size, + bool is_hw_wrapped, const struct fscrypt_info *ci) { const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode; - int num_devs = fscrypt_get_num_devices(sb); - int queue_refs = 0; - struct fscrypt_blk_crypto_key *blk_key; + enum blk_crypto_key_type key_type = is_hw_wrapped ? + BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : BLK_CRYPTO_KEY_TYPE_STANDARD; + struct blk_crypto_key *blk_key; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; int err; - int i; - blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL); + blk_key = kmalloc(sizeof(*blk_key), GFP_KERNEL); if (!blk_key) return -ENOMEM; - blk_key->num_devs = num_devs; - fscrypt_get_devices(sb, num_devs, blk_key->devs); - - err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode, - fscrypt_get_dun_bytes(ci), sb->s_blocksize); + err = blk_crypto_init_key(blk_key, raw_key, raw_key_size, key_type, + crypto_mode, fscrypt_get_dun_bytes(ci), + sb->s_blocksize); if (err) { fscrypt_err(inode, "error %d initializing blk-crypto key", err); goto fail; } - /* - * We have to start using blk-crypto on all the filesystem's devices. - * We also have to save all the request_queue's for later so that the - * key can be evicted from them. This is needed because some keys - * aren't destroyed until after the filesystem was already unmounted - * (namely, the per-mode keys in struct fscrypt_master_key). 
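The reworked fscrypt_select_encryption_impl() above builds a blk_crypto_config, including the new key_type field, and probes every block device of the filesystem. A hedged sketch of that probe for a single-device filesystem, assuming the Android blk-crypto API shown in this patch (per-bdev blk_crypto_config_supported() and the BLK_CRYPTO_KEY_TYPE_* constants):

#include <linux/blk-crypto.h>
#include <linux/fs.h>

static bool demo_inlinecrypt_ok(struct super_block *sb, bool hw_wrapped)
{
	struct blk_crypto_config cfg = {
		.crypto_mode	= BLK_ENCRYPTION_MODE_AES_256_XTS,
		.data_unit_size	= sb->s_blocksize,
		.dun_bytes	= 8,	/* enough for a 64-bit DUN */
		.key_type	= hw_wrapped ?
				  BLK_CRYPTO_KEY_TYPE_HW_WRAPPED :
				  BLK_CRYPTO_KEY_TYPE_STANDARD,
	};

	return blk_crypto_config_supported(sb->s_bdev, &cfg);
}

If any device lacks support, the real code falls back to fs-layer crypto for standard keys; hardware-wrapped keys have no fallback, which is what the setup_per_mode_enc_key() warnings further below enforce.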
- */ - for (i = 0; i < num_devs; i++) { - if (!blk_get_queue(blk_key->devs[i])) { - fscrypt_err(inode, "couldn't get request_queue"); - err = -EAGAIN; - goto fail; - } - queue_refs++; - - err = blk_crypto_start_using_key(&blk_key->base, - blk_key->devs[i]); - if (err) { - fscrypt_err(inode, - "error %d starting to use blk-crypto", err); - goto fail; - } + /* Start using blk-crypto on all the filesystem's block devices. */ + devs = fscrypt_get_devices(sb, &num_devs); + if (IS_ERR(devs)) { + err = PTR_ERR(devs); + goto fail; } + for (i = 0; i < num_devs; i++) { + err = blk_crypto_start_using_key(devs[i], blk_key); + if (err) + break; + } + kfree(devs); + if (err) { + fscrypt_err(inode, "error %d starting to use blk-crypto", err); + goto fail; + } + /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). * I.e., here we publish ->blk_key with a RELEASE barrier so that @@ -183,24 +209,57 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, return 0; fail: - for (i = 0; i < queue_refs; i++) - blk_put_queue(blk_key->devs[i]); kfree_sensitive(blk_key); return err; } -void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +void fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { - struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key; - int i; + struct blk_crypto_key *blk_key = prep_key->blk_key; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; - if (blk_key) { - for (i = 0; i < blk_key->num_devs; i++) { - blk_crypto_evict_key(blk_key->devs[i], &blk_key->base); - blk_put_queue(blk_key->devs[i]); - } - kfree_sensitive(blk_key); + if (!blk_key) + return; + + /* Evict the key from all the filesystem's block devices. */ + devs = fscrypt_get_devices(sb, &num_devs); + if (!IS_ERR(devs)) { + for (i = 0; i < num_devs; i++) + blk_crypto_evict_key(devs[i], blk_key); + kfree(devs); } + kfree_sensitive(blk_key); +} + +/* + * Ask the inline encryption hardware to derive the software secret from a + * hardware-wrapped key. Returns -EOPNOTSUPP if hardware-wrapped keys aren't + * supported on this filesystem or hardware. + */ +int fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + int err; + + /* The filesystem must be mounted with -o inlinecrypt. */ + if (!(sb->s_flags & SB_INLINECRYPT)) { + fscrypt_warn(NULL, + "%s: filesystem not mounted with inlinecrypt\n", + sb->s_id); + return -EOPNOTSUPP; + } + + err = blk_crypto_derive_sw_secret(sb->s_bdev, wrapped_key, + wrapped_key_size, sw_secret); + if (err == -EOPNOTSUPP) + fscrypt_warn(NULL, + "%s: block device doesn't support hardware-wrapped keys\n", + sb->s_id); + return err; } bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) @@ -238,6 +297,8 @@ static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num, * otherwise fscrypt_mergeable_bio() won't work as intended. * * The encryption context will be freed automatically when the bio is freed. + * + * This function also handles setting bi_skip_dm_default_key when needed. 
*/ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) @@ -245,12 +306,15 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, const struct fscrypt_info *ci; u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + if (fscrypt_inode_should_skip_dm_default_key(inode)) + bio_set_skip_dm_default_key(bio); + if (!fscrypt_inode_uses_inline_crypto(inode)) return; ci = inode->i_crypt_info; fscrypt_generate_dun(ci, first_lblk, dun); - bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask); + bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); } EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); @@ -315,6 +379,13 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh); * * fscrypt_set_bio_crypt_ctx() must have already been called on the bio. * + * This function isn't required in cases where crypto-mergeability is ensured in + * another way, such as I/O targeting only a single file (and thus a single key) + * combined with fscrypt_limit_io_blocks() to ensure DUN contiguity. + * + * This function also returns false if the next part of the I/O would need to + * have a different value for the bi_skip_dm_default_key flag. + * * Return: true iff the I/O is mergeable */ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, @@ -325,6 +396,9 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) return false; + if (bio_should_skip_dm_default_key(bio) != + fscrypt_inode_should_skip_dm_default_key(inode)) + return false; if (!bc) return true; @@ -333,7 +407,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. */ - if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base) + if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) return false; fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); @@ -358,8 +432,96 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio, u64 next_lblk; if (!bh_get_inode_and_lblk_num(next_bh, &inode, &next_lblk)) - return !bio->bi_crypt_context; + return !bio->bi_crypt_context && + !bio_should_skip_dm_default_key(bio); return fscrypt_mergeable_bio(bio, inode, next_lblk); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); + +/** + * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an + * inode, as far as encryption is concerned + * @inode: the inode in question + * + * Return: %true if there are no encryption constraints that prevent DIO from + * being supported; %false if DIO is unsupported. (Note that in the + * %true case, the filesystem might have other, non-encryption-related + * constraints that prevent DIO from actually being supported. Also, on + * encrypted files the filesystem is still responsible for only allowing + * DIO when requests are filesystem-block-aligned.) + */ +bool fscrypt_dio_supported(struct inode *inode) +{ + int err; + + /* If the file is unencrypted, no veto from us. */ + if (!fscrypt_needs_contents_encryption(inode)) + return true; + + /* + * We only support DIO with inline crypto, not fs-layer crypto. + * + * To determine whether the inode is using inline crypto, we have to set + * up the key if it wasn't already done. This is because in the current + * design of fscrypt, the decision of whether to use inline crypto or + * not isn't made until the inode's encryption key is being set up. 
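Putting the two helpers above together, a filesystem's submission path attaches the crypt context once per bio and splits whenever fscrypt_mergeable_bio() says the next block's DUN (or its bi_skip_dm_default_key flag) would not merge. A hedged sketch; bdev/sector/op setup and error handling are elided, and bio_alloc() is used in its 5.15-era two-argument form:

#include <linux/bio.h>
#include <linux/fscrypt.h>
#include <linux/mm.h>

static void demo_submit_pages(struct inode *inode, struct page **pages,
			      unsigned int nr, u64 first_lblk)
{
	struct bio *bio = bio_alloc(GFP_NOFS, nr);
	unsigned int i;

	fscrypt_set_bio_crypt_ctx(bio, inode, first_lblk, GFP_NOFS);
	for (i = 0; i < nr; i++) {
		u64 lblk = first_lblk + i;

		if (i && !fscrypt_mergeable_bio(bio, inode, lblk)) {
			/* DUN discontiguity: submit and start a new bio. */
			submit_bio(bio);
			bio = bio_alloc(GFP_NOFS, nr - i);
			fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
		}
		bio_add_page(bio, pages[i], PAGE_SIZE, 0);
	}
	submit_bio(bio);
}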
In + * the DIO read/write case, the key will always be set up already, since + * the file will be open. But in the case of statx(), the key might not + * be set up yet, as the file might not have been opened yet. + */ + err = fscrypt_require_key(inode); + if (err) { + /* + * Key unavailable or couldn't be set up. This edge case isn't + * worth worrying about; just report that DIO is unsupported. + */ + return false; + } + return fscrypt_inode_uses_inline_crypto(inode); +} +EXPORT_SYMBOL_GPL(fscrypt_dio_supported); + +/** + * fscrypt_limit_io_blocks() - limit I/O blocks to avoid discontiguous DUNs + * @inode: the file on which I/O is being done + * @lblk: the block at which the I/O is being started from + * @nr_blocks: the number of blocks we want to submit starting at @lblk + * + * Determine the limit to the number of blocks that can be submitted in a bio + * targeting @lblk without causing a data unit number (DUN) discontiguity. + * + * This is normally just @nr_blocks, as normally the DUNs just increment along + * with the logical blocks. (Or the file is not encrypted.) + * + * In rare cases, fscrypt can be using an IV generation method that allows the + * DUN to wrap around within logically contiguous blocks, and that wraparound + * will occur. If this happens, a value less than @nr_blocks will be returned + * so that the wraparound doesn't occur in the middle of a bio, which would + * cause encryption/decryption to produce wrong results. + * + * Return: the actual number of blocks that can be submitted + */ +u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) +{ + const struct fscrypt_info *ci; + u32 dun; + + if (!fscrypt_inode_uses_inline_crypto(inode)) + return nr_blocks; + + if (nr_blocks <= 1) + return nr_blocks; + + ci = inode->i_crypt_info; + if (!(fscrypt_policy_flags(&ci->ci_policy) & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) + return nr_blocks; + + /* With IV_INO_LBLK_32, the DUN can wrap around from U32_MAX to 0. */ + + dun = ci->ci_hashed_ino + lblk; + + return min_t(u64, nr_blocks, (u64)U32_MAX + 1 - dun); +} +EXPORT_SYMBOL_GPL(fscrypt_limit_io_blocks); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 02f8bf8bd54d..d43c7e3bfe31 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -79,10 +79,9 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk) call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk) { - struct super_block *sb = mk->mk_sb; - struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; if (!refcount_dec_and_test(&mk->mk_active_refs)) @@ -93,9 +92,9 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) * destroying any subkeys embedded in it. 
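The wraparound that fscrypt_limit_io_blocks() above guards against is concrete with IV_INO_LBLK_32: the 32-bit DUN is ci_hashed_ino + lblk, so near U32_MAX it wraps back to 0 mid-file. A standalone illustration of the limit computation, with an invented hashed-inode value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t hashed_ino = 0xfffffff0u;	/* hypothetical ci_hashed_ino */
	uint64_t lblk = 14, nr_blocks = 8;
	uint32_t dun = hashed_ino + (uint32_t)lblk;	/* 0xfffffffe */
	/* blocks remaining before the 32-bit DUN wraps back to 0 */
	uint64_t limit = (uint64_t)UINT32_MAX + 1 - dun;

	/* prints 2: the bio may cover only 2 of the requested 8 blocks */
	printf("%llu\n",
	       (unsigned long long)(nr_blocks < limit ? nr_blocks : limit));
	return 0;
}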
*/ - spin_lock(&keyring->lock); + spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); - spin_unlock(&keyring->lock); + spin_unlock(&sb->s_master_keys->lock); /* * ->mk_active_refs == 0 implies that ->mk_secret is not present and @@ -105,9 +104,12 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) WARN_ON(!list_empty(&mk->mk_decrypted_inodes)); for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { - fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]); - fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]); - fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_direct_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_iv_ino_lblk_64_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_iv_ino_lblk_32_keys[i]); } memzero_explicit(&mk->mk_ino_hash_key, sizeof(mk->mk_ino_hash_key)); @@ -128,11 +130,11 @@ static int fscrypt_user_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { /* - * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for - * each key, regardless of the exact key size. The amount of memory - * actually used is greater than the size of the raw key anyway. + * We just charge FSCRYPT_MAX_STANDARD_KEY_SIZE bytes to the user's key + * quota for each key, regardless of the exact key size. The amount of + * memory actually used is greater than the size of the raw key anyway. */ - return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE); + return key_payload_reserve(key, FSCRYPT_MAX_STANDARD_KEY_SIZE); } static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m) @@ -240,7 +242,7 @@ void fscrypt_destroy_keyring(struct super_block *sb) WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); } } kfree_sensitive(keyring); @@ -421,7 +423,6 @@ static int add_new_master_key(struct super_block *sb, if (!mk) return -ENOMEM; - mk->mk_sb = sb; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; @@ -534,20 +535,36 @@ static int add_master_key(struct super_block *sb, int err; if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { - err = fscrypt_init_hkdf(&secret->hkdf, secret->raw, - secret->size); + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]; + u8 *kdf_key = secret->raw; + unsigned int kdf_key_size = secret->size; + u8 keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER; + + /* + * For standard keys, the fscrypt master key is used directly as + * the fscrypt KDF key. For hardware-wrapped keys, we have to + * pass the master key to the hardware to derive the KDF key, + * which is then only used to derive non-file-contents subkeys. + */ + if (secret->is_hw_wrapped) { + err = fscrypt_derive_sw_secret(sb, secret->raw, + secret->size, sw_secret); + if (err) + return err; + kdf_key = sw_secret; + kdf_key_size = sizeof(sw_secret); + } + err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); + /* + * Now that the KDF context is initialized, the raw KDF key is + * no longer needed. + */ + memzero_explicit(kdf_key, kdf_key_size); if (err) return err; - /* - * Now that the HKDF context is initialized, the raw key is no - * longer needed. 
- */ - memzero_explicit(secret->raw, secret->size); - /* Calculate the key identifier */ - err = fscrypt_hkdf_expand(&secret->hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0, + err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, key_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); if (err) @@ -561,7 +578,7 @@ static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) const struct fscrypt_provisioning_key_payload *payload = prep->data; if (prep->datalen < sizeof(*payload) + FSCRYPT_MIN_KEY_SIZE || - prep->datalen > sizeof(*payload) + FSCRYPT_MAX_KEY_SIZE) + prep->datalen > sizeof(*payload) + FSCRYPT_MAX_ANY_KEY_SIZE) return -EINVAL; if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && @@ -710,15 +727,30 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) return -EACCES; memset(&secret, 0, sizeof(secret)); + + if (arg.__flags) { + if (arg.__flags & ~__FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) + return -EINVAL; + if (arg.key_spec.type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) + return -EINVAL; + secret.is_hw_wrapped = true; + } + if (arg.key_id) { if (arg.raw_size != 0) return -EINVAL; err = get_keyring_key(arg.key_id, arg.key_spec.type, &secret); if (err) goto out_wipe_secret; + err = -EINVAL; + if (secret.size > FSCRYPT_MAX_STANDARD_KEY_SIZE && + !secret.is_hw_wrapped) + goto out_wipe_secret; } else { if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE || - arg.raw_size > FSCRYPT_MAX_KEY_SIZE) + arg.raw_size > (secret.is_hw_wrapped ? + FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE : + FSCRYPT_MAX_STANDARD_KEY_SIZE)) return -EINVAL; secret.size = arg.raw_size; err = -EFAULT; @@ -743,29 +775,69 @@ out_wipe_secret: } EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); -/* - * Add the key for '-o test_dummy_encryption' to the filesystem keyring. - * - * Use a per-boot random key to prevent people from misusing this option. - */ -int fscrypt_add_test_dummy_key(struct super_block *sb, - struct fscrypt_key_specifier *key_spec) +static void +fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) +{ + static u8 test_key[FSCRYPT_MAX_STANDARD_KEY_SIZE]; + + get_random_once(test_key, sizeof(test_key)); + + memset(secret, 0, sizeof(*secret)); + secret->size = sizeof(test_key); + memcpy(secret->raw, test_key, sizeof(test_key)); +} + +int fscrypt_get_test_dummy_key_identifier( + u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { - static u8 test_key[FSCRYPT_MAX_KEY_SIZE]; struct fscrypt_master_key_secret secret; int err; - get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + fscrypt_get_test_dummy_secret(&secret); - memset(&secret, 0, sizeof(secret)); - secret.size = FSCRYPT_MAX_KEY_SIZE; - memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE); - - err = add_master_key(sb, &secret, key_spec); + err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); + if (err) + goto out; + err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER, + NULL, 0, key_identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); +out: wipe_master_key_secret(&secret); return err; } +/** + * fscrypt_add_test_dummy_key() - add the test dummy encryption key + * @sb: the filesystem instance to add the key to + * @dummy_policy: the encryption policy for test_dummy_encryption + * + * If needed, add the key for the test_dummy_encryption mount option to the + * filesystem. To prevent misuse of this mount option, a per-boot random key is + * used instead of a hardcoded one. This makes it so that any encrypted files + * created using this option won't be accessible after a reboot. 
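The ioctl changes above accept hardware-wrapped key blobs when userspace sets __FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED together with an identifier-type key specifier. A hedged userspace sketch, assuming a UAPI header that carries this Android-only flag; real code should zeroize the buffer before freeing it:

#include <linux/fscrypt.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static int demo_add_hw_wrapped_key(int mnt_fd, const void *blob, size_t len)
{
	struct fscrypt_add_key_arg *arg;
	int ret;

	arg = calloc(1, sizeof(*arg) + len);
	if (!arg)
		return -1;
	/* Hardware-wrapped keys require v2 (identifier-based) key specs. */
	arg->key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
	arg->__flags = __FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED;
	arg->raw_size = len;
	memcpy(arg->raw, blob, len);
	ret = ioctl(mnt_fd, FS_IOC_ADD_ENCRYPTION_KEY, arg);
	free(arg);
	return ret;
}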
+ * + * Return: 0 on success, -errno on failure + */ +int fscrypt_add_test_dummy_key(struct super_block *sb, + const struct fscrypt_dummy_policy *dummy_policy) +{ + const union fscrypt_policy *policy = dummy_policy->policy; + struct fscrypt_key_specifier key_spec; + struct fscrypt_master_key_secret secret; + int err; + + if (!policy) + return 0; + err = fscrypt_policy_to_key_spec(policy, &key_spec); + if (err) + return err; + fscrypt_get_test_dummy_secret(&secret); + err = add_master_key(sb, &secret, &key_spec); + wipe_master_key_secret(&secret); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_add_test_dummy_key); + /* * Verify that the current user has added a master key with the given identifier * (returns -ENOKEY if not). This is needed to prevent a user from encrypting @@ -1025,7 +1097,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) err = -ENOKEY; if (is_master_key_secret_present(&mk->mk_secret)) { wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index c3fbd594cc79..6d4ae17dbf22 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -44,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = { .security_strength = 16, .ivsize = 16, }, + [FSCRYPT_MODE_SM4_XTS] = { + .friendly_name = "SM4-XTS", + .cipher_str = "xts(sm4)", + .keysize = 32, + .security_strength = 16, + .ivsize = 16, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, + }, + [FSCRYPT_MODE_SM4_CTS] = { + .friendly_name = "SM4-CTS-CBC", + .cipher_str = "cts(cbc(sm4))", + .keysize = 16, + .security_strength = 16, + .ivsize = 16, + }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", @@ -52,6 +67,13 @@ struct fscrypt_mode fscrypt_modes[] = { .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, + [FSCRYPT_MODE_AES_256_HCTR2] = { + .friendly_name = "AES-256-HCTR2", + .cipher_str = "hctr2(aes)", + .keysize = 32, + .security_strength = 32, + .ivsize = 32, + }, }; static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); @@ -93,7 +115,7 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, mode->cipher_str, PTR_ERR(tfm)); return tfm; } - if (!xchg(&mode->logged_impl_name, 1)) { + if (!xchg(&mode->logged_cryptoapi_impl, 1)) { /* * fscrypt performance can vary greatly depending on which * crypto algorithm implementation is used. Help people debug @@ -121,8 +143,9 @@ err_free_tfm: /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the - * raw key, encryption mode, and flag indicating which encryption implementation - * (fs-layer or blk-crypto) will be used. + * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption + * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), + * and IV generation method (@ci->ci_policy.flags). 
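The mode-table entries added above make AES-256-HCTR2 (and the SM4 modes) selectable through the normal v2 policy interface; HCTR2 is a wide-block mode, so it suits short, non-block-aligned messages such as filenames. A hedged userspace sketch of selecting it, using standard fscrypt UAPI constants:

#include <linux/fscrypt.h>
#include <string.h>
#include <sys/ioctl.h>

static int demo_set_hctr2_policy(int dir_fd,
				 const __u8 id[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
	struct fscrypt_policy_v2 policy;

	memset(&policy, 0, sizeof(policy));
	policy.version = FSCRYPT_POLICY_V2;
	policy.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
	policy.filenames_encryption_mode = FSCRYPT_MODE_AES_256_HCTR2;
	policy.flags = FSCRYPT_POLICY_FLAGS_PAD_32;
	memcpy(policy.master_key_identifier, id,
	       FSCRYPT_KEY_IDENTIFIER_SIZE);
	return ioctl(dir_fd, FS_IOC_SET_ENCRYPTION_POLICY, &policy);
}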
*/ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci) @@ -130,7 +153,9 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, struct crypto_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) - return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci); + return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, + ci->ci_mode->keysize, + false, ci); tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) @@ -146,10 +171,11 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, } /* Destroy a crypto transform object and/or blk-crypto key. */ -void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key) +void fscrypt_destroy_prepared_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { crypto_free_skcipher(prep_key->tfm); - fscrypt_destroy_inline_crypt_key(prep_key); + fscrypt_destroy_inline_crypt_key(sb, prep_key); memzero_explicit(prep_key, sizeof(*prep_key)); } @@ -170,14 +196,29 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; struct fscrypt_prepared_key *prep_key; - u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; + u8 mode_key[FSCRYPT_MAX_STANDARD_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; + bool use_hw_wrapped_key = false; int err; if (WARN_ON(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; + if (mk->mk_secret.is_hw_wrapped && S_ISREG(inode->i_mode)) { + /* Using a hardware-wrapped key for file contents encryption */ + if (!fscrypt_using_inline_encryption(ci)) { + if (sb->s_flags & SB_INLINECRYPT) + fscrypt_warn(ci->ci_inode, + "Hardware-wrapped key required, but no suitable inline encryption capabilities are available"); + else + fscrypt_warn(ci->ci_inode, + "Hardware-wrapped keys require inline encryption (-o inlinecrypt)"); + return -EINVAL; + } + use_hw_wrapped_key = true; + } + prep_key = &keys[mode_num]; if (fscrypt_is_key_prepared(prep_key, ci)) { ci->ci_enc_key = *prep_key; @@ -189,6 +230,16 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, if (fscrypt_is_key_prepared(prep_key, ci)) goto done_unlock; + if (use_hw_wrapped_key) { + err = fscrypt_prepare_inline_crypt_key(prep_key, + mk->mk_secret.raw, + mk->mk_secret.size, true, + ci); + if (err) + goto out_unlock; + goto done_unlock; + } + BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); BUILD_BUG_ON(sizeof(hkdf_info) != 17); @@ -311,6 +362,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, { int err; + if (mk->mk_secret.is_hw_wrapped && + !(ci->ci_policy.v2.flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))) { + fscrypt_warn(ci->ci_inode, + "Hardware-wrapped keys are only supported with IV_INO_LBLK policies"); + return -EINVAL; + } + if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* * DIRECT_KEY: instead of deriving per-file encryption keys, the @@ -337,7 +396,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { - u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; + u8 derived_key[FSCRYPT_MAX_STANDARD_KEY_SIZE]; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY, @@ -418,33 +477,19 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk; int err; - err = 
fscrypt_select_encryption_impl(ci); + err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); if (err) return err; - switch (ci->ci_policy.version) { - case FSCRYPT_POLICY_V1: - mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; - memcpy(mk_spec.u.descriptor, - ci->ci_policy.v1.master_key_descriptor, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - break; - case FSCRYPT_POLICY_V2: - mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; - memcpy(mk_spec.u.identifier, - ci->ci_policy.v2.master_key_identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - break; - default: - WARN_ON(1); - return -EINVAL; - } - mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); if (!mk) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; + err = fscrypt_select_encryption_impl(ci, false); + if (err) + return err; + /* * As a legacy fallback for v1 policies, search for the key in * the current task's subscribed keyrings too. Don't move this @@ -466,8 +511,20 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, goto out_release_key; } + err = fscrypt_select_encryption_impl(ci, mk->mk_secret.is_hw_wrapped); + if (err) + goto out_release_key; + switch (ci->ci_policy.version) { case FSCRYPT_POLICY_V1: + if (WARN_ON(mk->mk_secret.is_hw_wrapped)) { + /* + * This should never happen, as adding a v1 policy key + * that is hardware-wrapped isn't allowed. + */ + err = -EINVAL; + goto out_release_key; + } err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); break; case FSCRYPT_POLICY_V2: @@ -500,7 +557,8 @@ static void put_crypt_info(struct fscrypt_info *ci) if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) - fscrypt_destroy_prepared_key(&ci->ci_enc_key); + fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, + &ci->ci_enc_key); mk = ci->ci_master_key; if (mk) { @@ -513,7 +571,7 @@ static void put_crypt_info(struct fscrypt_info *ci) spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 2762c5350432..ee8246382805 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -118,7 +118,8 @@ find_and_lock_process_key(const char *prefix, payload = (const struct fscrypt_key *)ukp->data; if (ukp->datalen != sizeof(struct fscrypt_key) || - payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) { + payload->size < 1 || + payload->size > FSCRYPT_MAX_STANDARD_KEY_SIZE) { fscrypt_warn(NULL, "key with description '%s' has invalid payload", key->description); @@ -143,18 +144,19 @@ invalid: /* Master key referenced by DIRECT_KEY policy */ struct fscrypt_direct_key { + struct super_block *dk_sb; struct hlist_node dk_node; refcount_t dk_refcount; const struct fscrypt_mode *dk_mode; struct fscrypt_prepared_key dk_key; u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; - u8 dk_raw[FSCRYPT_MAX_KEY_SIZE]; + u8 dk_raw[FSCRYPT_MAX_STANDARD_KEY_SIZE]; }; static void free_direct_key(struct fscrypt_direct_key *dk) { if (dk) { - fscrypt_destroy_prepared_key(&dk->dk_key); + fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key); kfree_sensitive(dk); } } @@ -231,6 +233,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) dk = kzalloc(sizeof(*dk), GFP_KERNEL); if (!dk) return ERR_PTR(-ENOMEM); + dk->dk_sb = ci->ci_inode->i_sb; refcount_set(&dk->dk_refcount, 1); 
dk->dk_mode = ci->ci_mode; err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index cad34dbe8e29..410d744cfb52 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,6 +10,7 @@ * Modified by Eric Biggers, 2019 for v2 policy support. */ +#include #include #include #include @@ -32,6 +33,26 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, + struct fscrypt_key_specifier *key_spec) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + key_spec->type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memcpy(key_spec->u.descriptor, policy->v1.master_key_descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + return 0; + case FSCRYPT_POLICY_V2: + key_spec->type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(key_spec->u.identifier, policy->v2.master_key_identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + return 0; + default: + WARN_ON(1); + return -EINVAL; + } +} + static const union fscrypt_policy * fscrypt_get_dummy_policy(struct super_block *sb) { @@ -40,7 +61,14 @@ fscrypt_get_dummy_policy(struct super_block *sb) return sb->s_cop->get_dummy_policy(sb); } -static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) +/* + * Return %true if the given combination of encryption modes is supported for v1 + * (and later) encryption policies. + * + * Do *not* add anything new here, since v1 encryption policies are deprecated. + * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only. + */ +static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_CTS) @@ -57,6 +85,19 @@ static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) return false; } +static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) +{ + if (contents_mode == FSCRYPT_MODE_AES_256_XTS && + filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) + return true; + + if (contents_mode == FSCRYPT_MODE_SM4_XTS && + filenames_mode == FSCRYPT_MODE_SM4_CTS) + return true; + + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); +} + static bool supported_direct_key_modes(const struct inode *inode, u32 contents_mode, u32 filenames_mode) { @@ -130,7 +171,7 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, const struct inode *inode) { - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", @@ -166,7 +207,7 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, { int count = 0; - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", @@ -664,6 +705,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) return fscrypt_get_dummy_policy(dir->i_sb); } +/** + * fscrypt_context_for_new_inode() - create an encryption context for a new inode + * @ctx: where context should be written + * @inode: inode from which to fetch policy and nonce + * + * 
Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode, + * generate a new context and write it to ctx. ctx _must_ be at least + * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes. + * + * Return: size of the resulting context or a negative error code. + */ +int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + BUILD_BUG_ON(sizeof(union fscrypt_context) != + FSCRYPT_SET_CONTEXT_MAX_SIZE); + + /* fscrypt_prepare_new_inode() should have set up the key already. */ + if (WARN_ON_ONCE(!ci)) + return -ENOKEY; + + return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce); +} +EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); + /** * fscrypt_set_context() - Set the fscrypt context of a new inode * @inode: a new inode @@ -680,12 +747,9 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) union fscrypt_context ctx; int ctxsize; - /* fscrypt_prepare_new_inode() should have set up the key already. */ - if (WARN_ON_ONCE(!ci)) - return -ENOKEY; - - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); + ctxsize = fscrypt_context_for_new_inode(&ctx, inode); + if (ctxsize < 0) + return ctxsize; /* * This may be the first time the inode number is available, so do any @@ -700,73 +764,45 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) EXPORT_SYMBOL_GPL(fscrypt_set_context); /** - * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' - * @sb: the filesystem on which test_dummy_encryption is being specified - * @arg: the argument to the test_dummy_encryption option. May be NULL. - * @dummy_policy: the filesystem's current dummy policy (input/output, see - * below) + * fscrypt_parse_test_dummy_encryption() - parse the test_dummy_encryption mount option + * @param: the mount option + * @dummy_policy: (input/output) the place to write the dummy policy that will + * result from parsing the option. Zero-initialize this. If a policy is + * already set here (due to test_dummy_encryption being given multiple + * times), then this function will verify that the policies are the same. * - * Handle the test_dummy_encryption mount option by creating a dummy encryption - * policy, saving it in @dummy_policy, and adding the corresponding dummy - * encryption key to the filesystem. If the @dummy_policy is already set, then - * instead validate that it matches @arg. Don't support changing it via - * remount, as that is difficult to do safely. - * - * Return: 0 on success (dummy policy set, or the same policy is already set); - * -EEXIST if a different dummy policy is already set; - * or another -errno value. + * Return: 0 on success; -EINVAL if the argument is invalid; -EEXIST if the + * argument conflicts with one already specified; or -ENOMEM. 
*/ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, - struct fscrypt_dummy_policy *dummy_policy) +int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, + struct fscrypt_dummy_policy *dummy_policy) { - struct fscrypt_key_specifier key_spec = { 0 }; - int version; - union fscrypt_policy *policy = NULL; + const char *arg = "v2"; + union fscrypt_policy *policy; int err; - if (!arg) - arg = "v2"; - - if (!strcmp(arg, "v1")) { - version = FSCRYPT_POLICY_V1; - key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; - memset(key_spec.u.descriptor, 0x42, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - } else if (!strcmp(arg, "v2")) { - version = FSCRYPT_POLICY_V2; - key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; - /* key_spec.u.identifier gets filled in when adding the key */ - } else { - err = -EINVAL; - goto out; - } + if (param->type == fs_value_is_string && *param->string) + arg = param->string; policy = kzalloc(sizeof(*policy), GFP_KERNEL); - if (!policy) { - err = -ENOMEM; - goto out; - } + if (!policy) + return -ENOMEM; - err = fscrypt_add_test_dummy_key(sb, &key_spec); - if (err) - goto out; - - policy->version = version; - switch (policy->version) { - case FSCRYPT_POLICY_V1: + if (!strcmp(arg, "v1")) { + policy->version = FSCRYPT_POLICY_V1; policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor, + memset(policy->v1.master_key_descriptor, 0x42, FSCRYPT_KEY_DESCRIPTOR_SIZE); - break; - case FSCRYPT_POLICY_V2: + } else if (!strcmp(arg, "v2")) { + policy->version = FSCRYPT_POLICY_V2; policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(policy->v2.master_key_identifier, key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - break; - default: - WARN_ON(1); + err = fscrypt_get_test_dummy_key_identifier( + policy->v2.master_key_identifier); + if (err) + goto out; + } else { err = -EINVAL; goto out; } @@ -785,6 +821,37 @@ out: kfree(policy); return err; } +EXPORT_SYMBOL_GPL(fscrypt_parse_test_dummy_encryption); + +/** + * fscrypt_dummy_policies_equal() - check whether two dummy policies are equal + * @p1: the first test dummy policy (may be unset) + * @p2: the second test dummy policy (may be unset) + * + * Return: %true if the dummy policies are both set and equal, or both unset. + */ +bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, + const struct fscrypt_dummy_policy *p2) +{ + if (!p1->policy && !p2->policy) + return true; + if (!p1->policy || !p2->policy) + return false; + return fscrypt_policies_equal(p1->policy, p2->policy); +} +EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal); + +/* Deprecated, do not use */ +int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, + struct fscrypt_dummy_policy *dummy_policy) +{ + struct fs_parameter param = { + .type = fs_value_is_string, + .string = arg ? 
(char *)arg : "", + }; + return fscrypt_parse_test_dummy_encryption(¶m, dummy_policy) ?: + fscrypt_add_test_dummy_key(sb, dummy_policy); +} EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); /** diff --git a/fs/dcache.c b/fs/dcache.c index cf871a81f4fd..2225920518ba 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2145,7 +2145,7 @@ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } -EXPORT_SYMBOL(d_obtain_alias); +EXPORT_SYMBOL_NS(d_obtain_alias, ANDROID_GKI_VFS_EXPORT_ONLY); /** * d_obtain_root - find or allocate a dentry for a given inode @@ -2219,7 +2219,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, } return found; } -EXPORT_SYMBOL(d_add_ci); +EXPORT_SYMBOL_NS(d_add_ci, ANDROID_GKI_VFS_EXPORT_ONLY); static inline bool d_same_name(const struct dentry *dentry, @@ -2950,7 +2950,7 @@ void d_move(struct dentry *dentry, struct dentry *target) __d_move(dentry, target, false); write_sequnlock(&rename_lock); } -EXPORT_SYMBOL(d_move); +EXPORT_SYMBOL_NS(d_move, ANDROID_GKI_VFS_EXPORT_ONLY); /* * d_exchange - exchange two dentries @@ -3100,7 +3100,7 @@ out: __d_add(dentry, inode); return NULL; } -EXPORT_SYMBOL(d_splice_alias); +EXPORT_SYMBOL_NS(d_splice_alias, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Test whether new_dentry is a subdirectory of old_dentry. diff --git a/fs/direct-io.c b/fs/direct-io.c index b2e86e739d7a..a13576d9f98a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1369,7 +1369,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, end_io, submit_io, flags); } -EXPORT_SYMBOL(__blockdev_direct_IO); +EXPORT_SYMBOL_NS(__blockdev_direct_IO, ANDROID_GKI_VFS_EXPORT_ONLY); static __init int dio_init(void) { diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 2dd23a82e0de..1cd7c9463b02 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -897,6 +897,7 @@ MODULE_AUTHOR("Michael A. Halcrow "); MODULE_DESCRIPTION("eCryptfs"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(ecryptfs_init) module_exit(ecryptfs_exit) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 15880a68faad..b62aefe3b4b8 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -272,6 +272,7 @@ static __exit void efivarfs_exit(void) MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr"); MODULE_DESCRIPTION("EFI Variable Filesystem"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_ALIAS_FS("efivarfs"); module_init(efivarfs_init); diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 89e73a6f0d36..8c0ecaa62de2 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -311,3 +311,4 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 14b747026742..f57255ab88ed 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -6,16 +6,22 @@ config EROFS_FS select FS_IOMAP select LIBCRC32C help - EROFS (Enhanced Read-Only File System) is a lightweight - read-only file system with modern designs (eg. page-sized - blocks, inline xattrs/data, etc.) for scenarios which need - high-performance read-only requirements, e.g. Android OS - for mobile phones and LIVECDs. + EROFS (Enhanced Read-Only File System) is a lightweight read-only + file system with modern designs (e.g. no buffer heads, inline + xattrs/data, chunk-based deduplication, multiple devices, etc.) for + scenarios which need high-performance read-only solutions, e.g. 
+ smartphones with Android OS, LiveCDs and high-density hosts with + numerous containers; - It also provides fixed-sized output compression support, - which improves storage density, keeps relatively higher - compression ratios, which is more useful to achieve high - performance for embedded devices with limited memory. + It also provides fixed-sized output compression support in order to + improve storage density as well as keep relatively higher compression + ratios and implements in-place decompression to reuse the file page + for compressed data temporarily with proper strategies, which is + quite useful to ensure guaranteed end-to-end runtime decompression + performance under extreme memory pressure without extra cost. + + See the documentation at + for more details. If unsure, say N. @@ -76,3 +82,19 @@ config EROFS_FS_ZIP Enable fixed-sized output compression for EROFS. If you don't want to enable compression feature, say N. + +config EROFS_FS_ZIP_LZMA + bool "EROFS LZMA compressed data support" + depends on EROFS_FS_ZIP + select XZ_DEC + select XZ_DEC_MICROLZMA + help + Saying Y here includes support for reading EROFS file systems + containing LZMA compressed data, specifically called microLZMA. It + gives better compression ratios than the LZ4 algorithm, at the + expense of more CPU overhead. + + LZMA support is an experimental feature for now and so most file + systems will be readable without selecting this option. + + If unsure, say N. diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 1f9aced49070..8a3317e38e5a 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o +erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 3701c72bacb2..579406504919 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -8,11 +8,6 @@ #include "internal.h" -enum { - Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, - Z_EROFS_COMPRESSION_RUNTIME_MAX -}; - struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; @@ -25,6 +20,12 @@ struct z_erofs_decompress_req { bool inplace_io, partial_decoding; }; +struct z_erofs_decompressor { + int (*decompress)(struct z_erofs_decompress_req *rq, + struct page **pagepool); + char *name; +}; + /* some special page->private (unsigned long, see below) */ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) #define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) @@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page) return true; } -static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, +static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, struct page *page) { if (!z_erofs_is_shortlived_page(page)) @@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, put_page(page); } else { /* follow the pcluster rule above. 
*/ - set_page_private(page, 0); - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); } return true; } -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} +int z_erofs_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); + +/* prototypes for specific algorithms */ +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 16a41d0db55a..0e35ef3f9f3d 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode, erofs_off_t pos; int err = 0; + map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; @@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode, map->m_flags = 0; break; default: - /* only one device is supported for now */ - if (idx->device_id) { - erofs_err(sb, "invalid device id %u @ %llu for nid %llu", - le16_to_cpu(idx->device_id), - chunknr, vi->nid); - err = -EFSCORRUPTED; - goto out_unlock; - } + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); map->m_flags = EROFS_MAP_MAPPED; break; @@ -155,11 +150,55 @@ out: return err; } +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) +{ + struct erofs_dev_context *devs = EROFS_SB(sb)->devs; + struct erofs_device_info *dif; + int id; + + /* primary device by default */ + map->m_bdev = sb->s_bdev; + map->m_daxdev = EROFS_SB(sb)->dax_dev; + + if (map->m_deviceid) { + down_read(&devs->rwsem); + dif = idr_find(&devs->tree, map->m_deviceid - 1); + if (!dif) { + up_read(&devs->rwsem); + return -ENODEV; + } + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + up_read(&devs->rwsem); + } else if (devs->extra_devices) { + down_read(&devs->rwsem); + idr_for_each_entry(&devs->tree, dif, id) { + erofs_off_t startoff, length; + + if (!dif->mapped_blkaddr) + continue; + startoff = blknr_to_addr(dif->mapped_blkaddr); + length = blknr_to_addr(dif->blocks); + + if (map->m_pa >= startoff && + map->m_pa < startoff + length) { + map->m_pa -= startoff; + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + break; + } + } + up_read(&devs->rwsem); + } + return 0; +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct erofs_map_blocks map; + struct erofs_map_dev mdev; map.m_la = offset; map.m_llen = length; @@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; - iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = EROFS_I_SB(inode)->dax_dev; + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(inode->i_sb, &mdev); + if (ret) + return ret; + + iomap->bdev = mdev.m_bdev; + iomap->dax_dev = mdev.m_daxdev; iomap->offset = map.m_la; iomap->length = map.m_llen; iomap->flags = 0; @@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_INLINE; ipage = erofs_get_meta_page(inode->i_sb, - erofs_blknr(map.m_pa)); + erofs_blknr(mdev.m_pa)); if (IS_ERR(ipage)) 
return PTR_ERR(ipage); iomap->inline_data = page_address(ipage) + - erofs_blkoff(map.m_pa); + erofs_blkoff(mdev.m_pa); iomap->private = ipage; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = mdev.m_pa; } return 0; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 8193c14bb111..49568a458f03 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -16,17 +16,6 @@ #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif -struct z_erofs_decompressor { - /* - * if destpages have sparsed pages, fill them with bounce pages. - * it also check whether destpages indicate continuous physical memory. - */ - int (*prepare_destpages)(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); - int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); - char *name; -}; - int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int size) @@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb, return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); } -static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +/* + * Fill all gaps with bounce pages if it's a sparse page list. Also check if + * all physical pages are consecutive, which can be seen for moderate CR. + */ +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nr = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -123,7 +116,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, return kaddr ? 1 : 0; } -static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq, +static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, void *inpage, unsigned int *inputmargin, int *maptype, bool support_0padding) { @@ -193,7 +186,8 @@ docopy: return src; } -static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, + u8 *out) { unsigned int inputmargin; u8 *headpage, *src; @@ -220,8 +214,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) } rq->inputsize -= inputmargin; - src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype, - support_0padding); + src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin, + &maptype, support_0padding); if (IS_ERR(src)) return PTR_ERR(src); @@ -245,6 +239,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); ret = -EIO; + } else { + ret = 0; } if (maptype == 0) { @@ -260,86 +256,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) return ret; } -static struct z_erofs_decompressor decompressors[] = { - [Z_EROFS_COMPRESSION_SHIFTED] = { - .name = "shifted" - }, - [Z_EROFS_COMPRESSION_LZ4] = { - .prepare_destpages = z_erofs_lz4_prepare_destpages, - .decompress = z_erofs_lz4_decompress, - .name = "lz4" - }, -}; - -static void copy_from_pcpubuf(struct page **out, const char *dst, - unsigned short pageofs_out, - unsigned int outputsize) -{ - const char *end = dst + outputsize; - const unsigned int righthalf = PAGE_SIZE - pageofs_out; - const char *cur = dst - pageofs_out; - - while (cur < end) { - struct page *const page = *out++; - - if (page) { - char *buf = kmap_atomic(page); - - if (cur >= dst) { - memcpy(buf, cur, 
min_t(uint, PAGE_SIZE, - end - cur)); - } else { - memcpy(buf + pageofs_out, cur + pageofs_out, - min_t(uint, righthalf, end - cur)); - } - kunmap_atomic(buf); - } - cur += PAGE_SIZE; - } -} - -static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const struct z_erofs_decompressor *alg = decompressors + rq->alg; unsigned int dst_maptype; void *dst; int ret; - /* two optimized fast paths only for non bigpcluster cases yet */ - if (rq->inputsize <= PAGE_SIZE) { - if (nrpages_out == 1 && !rq->inplace_io) { - DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); - dst_maptype = 0; - goto dstmap_out; - } - - /* - * For the case of small output size (especially much less - * than PAGE_SIZE), memcpy the decompressed data rather than - * compressed data is preferred. - */ - if (rq->outputsize <= PAGE_SIZE * 7 / 8) { - dst = erofs_get_pcpubuf(1); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - rq->inplace_io = false; - ret = alg->decompress(rq, dst); - if (!ret) - copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, - rq->outputsize); - - erofs_put_pcpubuf(dst); - return ret; - } + /* one optimized fast path only for non bigpcluster cases yet */ + if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; } /* general decoding path which can be used for all cases */ - ret = alg->prepare_destpages(rq, pagepool); + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); if (ret < 0) return ret; if (ret) { @@ -354,7 +289,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, dst_maptype = 2; dstmap_out: - ret = alg->decompress(rq, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out); if (!dst_maptype) kunmap_atomic(dst); @@ -363,8 +298,8 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -402,10 +337,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, return 0; } +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .decompress = z_erofs_shifted_transform, + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +#ifdef CONFIG_EROFS_FS_ZIP_LZMA + [Z_EROFS_COMPRESSION_LZMA] = { + .decompress = z_erofs_lzma_decompress, + .name = "lzma" + }, +#endif +}; + int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { - if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) - return z_erofs_shifted_transform(rq, pagepool); - return z_erofs_decompress_generic(rq, pagepool); + return decompressors[rq->alg].decompress(rq, pagepool); } diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c new file mode 100644 index 000000000000..50045510a1f4 --- /dev/null +++ b/fs/erofs/decompressor_lzma.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include "compress.h" + +struct z_erofs_lzma { + struct z_erofs_lzma *next; + struct 
xz_dec_microlzma *state; + struct xz_buf buf; + u8 bounce[PAGE_SIZE]; +}; + +/* considering the LZMA performance, no need to use a lockless list for now */ +static DEFINE_SPINLOCK(z_erofs_lzma_lock); +static unsigned int z_erofs_lzma_max_dictsize; +static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms; +static struct z_erofs_lzma *z_erofs_lzma_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq); + +module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444); + +void z_erofs_lzma_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_lzma_avail_strms) { + struct z_erofs_lzma *strm; + + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + DBG_BUGON(1); + return; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + while (strm) { + struct z_erofs_lzma *n = strm->next; + + if (strm->state) + xz_dec_microlzma_end(strm->state); + kfree(strm); + --z_erofs_lzma_avail_strms; + strm = n; + } + } +} + +int z_erofs_lzma_init(void) +{ + unsigned int i; + + /* by default, use # of possible CPUs instead */ + if (!z_erofs_lzma_nstrms) + z_erofs_lzma_nstrms = num_possible_cpus(); + + for (i = 0; i < z_erofs_lzma_nstrms; ++i) { + struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL); + + if (!strm) { + z_erofs_lzma_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + ++z_erofs_lzma_avail_strms; + } + return 0; +} + +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) +{ + static DEFINE_MUTEX(lzma_resize_mutex); + unsigned int dict_size, i; + struct z_erofs_lzma *strm, *head = NULL; + int err; + + if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) { + erofs_err(sb, "invalid lzma cfgs, size=%u", size); + return -EINVAL; + } + if (lzma->format) { + erofs_err(sb, "unidentified lzma format %x, please check kernel version", + le16_to_cpu(lzma->format)); + return -EINVAL; + } + dict_size = le32_to_cpu(lzma->dict_size); + if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) { + erofs_err(sb, "unsupported lzma dictionary size %u", + dict_size); + return -EINVAL; + } + + erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!"); + + /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ + mutex_lock(&lzma_resize_mutex); + + if (z_erofs_lzma_max_dictsize >= dict_size) { + mutex_unlock(&lzma_resize_mutex); + return 0; + } + + /* 1. collect/isolate all streams for the following check */ + for (i = 0; i < z_erofs_lzma_avail_strms; ++i) { + struct z_erofs_lzma *last; + +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, + READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + for (last = strm; last->next; last = last->next) + ++i; + last->next = head; + head = strm; + } + + err = 0; + /* 2. walk each isolated stream and grow max dict_size if needed */ + for (strm = head; strm; strm = strm->next) { + if (strm->state) + xz_dec_microlzma_end(strm->state); + strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size); + if (!strm->state) + err = -ENOMEM; + } + + /* 3. 
push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_lzma_lock); + DBG_BUGON(z_erofs_lzma_head); + z_erofs_lzma_head = head; + spin_unlock(&z_erofs_lzma_lock); + + z_erofs_lzma_max_dictsize = dict_size; + mutex_unlock(&lzma_resize_mutex); + return err; +} + +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + unsigned int inputmargin, inlen, outlen, pageofs; + struct z_erofs_lzma *strm; + u8 *kin; + bool bounced = false; + int no, ni, j, err = 0; + + /* 1. get the exact LZMA compressed size */ + kin = kmap(*rq->in); + inputmargin = 0; + while (!kin[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= PAGE_SIZE) { + kunmap(*rq->in); + return -EFSCORRUPTED; + } + rq->inputsize -= inputmargin; + + /* 2. get an available lzma context */ +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = strm->next; + spin_unlock(&z_erofs_lzma_lock); + + /* 3. multi-call decompress */ + inlen = rq->inputsize; + outlen = rq->outputsize; + xz_dec_microlzma_reset(strm->state, inlen, outlen, + !rq->partial_decoding); + pageofs = rq->pageofs_out; + strm->buf.in = kin + inputmargin; + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin); + inlen -= strm->buf.in_size; + strm->buf.out = NULL; + strm->buf.out_pos = 0; + strm->buf.out_size = 0; + + for (ni = 0, no = -1;;) { + enum xz_ret xz_err; + + if (strm->buf.out_pos == strm->buf.out_size) { + if (strm->buf.out) { + kunmap(rq->out[no]); + strm->buf.out = NULL; + } + + if (++no >= nrpages_out || !outlen) { + erofs_err(rq->sb, "decompressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.out_pos = 0; + strm->buf.out_size = min_t(u32, outlen, + PAGE_SIZE - pageofs); + outlen -= strm->buf.out_size; + if (rq->out[no]) + strm->buf.out = kmap(rq->out[no]) + pageofs; + pageofs = 0; + } else if (strm->buf.in_pos == strm->buf.in_size) { + kunmap(rq->in[ni]); + + if (++ni >= nrpages_in || !inlen) { + erofs_err(rq->sb, "compressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE); + inlen -= strm->buf.in_size; + kin = kmap(rq->in[ni]); + strm->buf.in = kin; + bounced = false; + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Otherwise, Use short-lived pages + * from the on-stack pagepool where pages share with the same + * request. 
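The "on-stack pagepool" referred to above is the intrusive page list this series converts EROFS to: instead of threading a list_head through page->lru, pages are chained through page->private off a plain struct page * cursor. A hedged sketch of the pop/push/drain pattern, using only the helpers added to internal.h in this patch:

```c
struct page *pagepool = NULL;	/* empty pool, lives on the stack */
struct page *page;

/* Pop a page from the pool, or allocate one if the pool is empty. */
page = erofs_allocpage(&pagepool, GFP_KERNEL | __GFP_NOFAIL);

/* ... use it as a short-lived bounce page ... */

erofs_pagepool_add(&pagepool, page);	/* push back for later reuse */
erofs_release_pages(&pagepool);		/* drain: put every chained page */
```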
+ */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->buf.in, strm->buf.in_size); + strm->buf.in = strm->bounce; + bounced = true; + } + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + xz_err = xz_dec_microlzma_run(strm->state, &strm->buf); + DBG_BUGON(strm->buf.out_pos > strm->buf.out_size); + DBG_BUGON(strm->buf.in_pos > strm->buf.in_size); + + if (xz_err != XZ_OK) { + if (xz_err == XZ_STREAM_END && !outlen) + break; + erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]", + xz_err, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + if (no < nrpages_out && strm->buf.out) + kunmap(rq->in[no]); + if (ni < nrpages_in) + kunmap(rq->in[ni]); + /* 4. push back LZMA stream context to the global list */ + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + wake_up(&z_erofs_lzma_wq); + return err; +} diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b0b23f41abc3..083997a034e5 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -21,14 +21,29 @@ #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 +#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 +#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE) + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ + EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ + EROFS_FEATURE_INCOMPAT_COMPR_HEAD2) #define EROFS_SB_EXTSLOT_SIZE 16 +struct erofs_deviceslot { + union { + u8 uuid[16]; /* used for device manager later */ + u8 userdata[64]; /* digest(sha256), etc. 
*/ + } u; + __le32 blocks; /* total fs blocks of this device */ + __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ + u8 reserved[56]; +}; +#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) + /* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ @@ -54,7 +69,9 @@ struct erofs_super_block { /* customized sliding window size instead of 64k by default */ __le16 lz4_max_distance; } __packed u1; - __u8 reserved2[42]; + __le16 extra_devices; /* # of devices besides the primary device */ + __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ + __u8 reserved2[38]; }; /* @@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 8-byte inode chunk indexes */ struct erofs_inode_chunk_index { __le16 advise; /* always 0, don't care for now */ - __le16 device_id; /* back-end storage id, always 0 for now */ + __le16 device_id; /* back-end storage id (with bits masked) */ __le32 blkaddr; /* start block address of this inode chunk */ }; @@ -247,10 +264,11 @@ struct erofs_inode_chunk_index { /* available compression algorithm types (for h_algorithmtype) */ enum { - Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZMA = 1, Z_EROFS_COMPRESSION_MAX }; -#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1)) +#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) /* 14 bytes (+ length field = 16 bytes) */ struct z_erofs_lz4_cfgs { @@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs { u8 reserved[10]; } __packed; +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lzma_cfgs { + __le32 dict_size; + __le16 format; + u8 reserved[8]; +} __packed; + +#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. 
for 4k logical cluster size, 4B if compacted 2B is off; @@ -288,35 +315,34 @@ struct z_erofs_map_header { #define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 /* - * Fixed-sized output compression ondisk Logical Extent cluster type: - * 0 - literal (uncompressed) cluster - * 1 - compressed cluster (for the head logical cluster) - * 2 - compressed cluster (for the other logical clusters) + * Fixed-sized output compression on-disk logical cluster type: + * 0 - literal (uncompressed) lcluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * * In detail, - * 0 - literal (uncompressed) cluster, + * 0 - literal (uncompressed) lcluster, * di_advise = 0 - * di_clusterofs = the literal data offset of the cluster - * di_blkaddr = the blkaddr of the literal cluster + * di_clusterofs = the literal data offset of the lcluster + * di_blkaddr = the blkaddr of the literal pcluster * - * 1 - compressed cluster (for the head logical cluster) - * di_advise = 1 - * di_clusterofs = the decompressed data offset of the cluster - * di_blkaddr = the blkaddr of the compressed cluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * di_advise = 1 or 3 + * di_clusterofs = the decompressed data offset of the lcluster + * di_blkaddr = the blkaddr of the compressed pcluster * - * 2 - compressed cluster (for the other logical clusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * di_advise = 2 * di_clusterofs = - * the decompressed data offset in its own head cluster - * di_u.delta[0] = distance to its corresponding head cluster - * di_u.delta[1] = distance to its corresponding tail cluster - * (di_advise could be 0, 1 or 2) + * the decompressed data offset in its own HEAD lcluster + * di_u.delta[0] = distance to this HEAD lcluster + * di_u.delta[1] = distance to the next HEAD lcluster */ enum { Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1, Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, - Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3, Z_EROFS_VLE_CLUSTER_TYPE_MAX }; @@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void) /* keep in sync between 2 index structures for better extendibility */ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != sizeof(struct z_erofs_vle_decompressed_index)); + BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 0c293ff6697b..3ff36514cff2 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_flags &= ~S_DAX; - if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && vi->datalayout == EROFS_INODE_FLAT_PLAIN) inode->i_flags |= S_DAX; if (!nblks) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b77acf09726c..bab167a35b64 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,7 +47,16 @@ typedef u64 erofs_off_t; /* data type for filesystem-wide blocks number */ typedef u32 erofs_blk_t; -struct erofs_fs_context { +struct erofs_device_info { + char *path; + struct block_device *bdev; + struct dax_device *dax_dev; + + u32 blocks; + u32 mapped_blkaddr; +}; + +struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed 
cache */ unsigned char cache_strategy; @@ -60,6 +69,18 @@ struct erofs_fs_context { unsigned int mount_opt; }; +struct erofs_dev_context { + struct idr tree; + struct rw_semaphore rwsem; + + unsigned int extra_devices; +}; + +struct erofs_fs_context { + struct erofs_mount_opts opt; + struct erofs_dev_context *devs; +}; + /* all filesystem-wide lz4 configurations */ struct erofs_sb_lz4_info { /* # of pages needed for EROFS lz4 rolling decompression */ @@ -69,6 +90,7 @@ struct erofs_sb_lz4_info { }; struct erofs_sb_info { + struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; @@ -85,12 +107,16 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct erofs_dev_context *devs; struct dax_device *dax_dev; - u32 blocks; + u64 total_blocks; + u32 primarydevice_blocks; + u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; #endif + u16 device_id_mask; /* valid bits of device id to be used */ /* inode slot unit size in bit shift */ unsigned char islotbits; @@ -109,7 +135,9 @@ struct erofs_sb_info { u32 feature_compat; u32 feature_incompat; - struct erofs_fs_context ctx; /* options */ + /* sysfs support */ + struct kobject s_kobj; /* /sys/fs/erofs/ */ + struct completion s_kobj_unregister; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -121,9 +149,9 @@ struct erofs_sb_info { #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 -#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) -#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) -#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option) +#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) enum { EROFS_ZIP_CACHE_DISABLED, @@ -208,6 +236,9 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) +EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) +EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -278,6 +309,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value) EROFS_I_DATALAYOUT_BITS); } +/* + * Different from grab_cache_page_nowait(), reclaiming is never triggered + * when allocating new pages. + */ +static inline +struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); +} + extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; @@ -309,7 +353,7 @@ extern const struct address_space_operations z_erofs_aops; * of the corresponding uncompressed data in the file. 
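One practical consequence of the erofs_mount_opts split earlier in this hunk: the set_opt()/clear_opt()/test_opt() macros now take the embedded opts struct rather than the whole context, so the same accessors work while parsing (on the fs_context) and afterwards (on the sb_info). A condensed, hedged sketch of the lifecycle as wired up by the super.c and inode.c hunks (fc, sb and inode are assumed to be in scope):

```c
struct erofs_fs_context *ctx = fc->fs_private;
struct erofs_sb_info *sbi = EROFS_SB(sb);

set_opt(&ctx->opt, XATTR_USER);		/* parse time: fs_context side */
sbi->opt = ctx->opt;			/* fill_super: copied once into sbi */

if (test_opt(&sbi->opt, DAX_ALWAYS))	/* steady state: tested on sbi */
	inode->i_flags |= S_DAX;
```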
*/ enum { - BH_Zipped = BH_PrivateStart, + BH_Encoded = BH_PrivateStart, BH_FullMapped, }; @@ -317,8 +361,8 @@ enum { #define EROFS_MAP_MAPPED (1 << BH_Mapped) /* Located in metadata (could be copied from bd_inode) */ #define EROFS_MAP_META (1 << BH_Meta) -/* The extent has been compressed */ -#define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The extent is encoded */ +#define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) @@ -326,6 +370,8 @@ struct erofs_map_blocks { erofs_off_t m_pa, m_la; u64 m_plen, m_llen; + unsigned short m_deviceid; + char m_algorithmformat; unsigned int m_flags; struct page *mpage; @@ -338,6 +384,13 @@ struct erofs_map_blocks { * approach instead if possible since it's more metadata lightweight.) */ #define EROFS_GET_BLOCKS_FIEMAP 0x0002 +/* Used to map the whole extent if non-negligible data is requested for LZMA */ +#define EROFS_GET_BLOCKS_READMORE 0x0004 + +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; /* zmap.c */ extern const struct iomap_ops z_erofs_iomap_report_ops; @@ -357,9 +410,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, } #endif /* !CONFIG_EROFS_FS_ZIP */ +struct erofs_map_dev { + struct block_device *m_bdev; + struct dax_device *m_daxdev; + + erofs_off_t m_pa; + unsigned int m_deviceid; +}; + /* data.c */ extern const struct file_operations erofs_file_fops; struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -413,8 +475,21 @@ int erofs_pcpubuf_growsize(unsigned int nrpages); void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); +/* sysfs.c */ +int erofs_register_sysfs(struct super_block *sb); +void erofs_unregister_sysfs(struct super_block *sb); +int __init erofs_init_sysfs(void); +void erofs_exit_sysfs(void); + /* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); +static inline void erofs_pagepool_add(struct page **pagepool, + struct page *page) +{ + set_page_private(page, (unsigned long)*pagepool); + *pagepool = page; +} +void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); @@ -454,6 +529,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +#ifdef CONFIG_EROFS_FS_ZIP_LZMA +int z_erofs_lzma_init(void); +void z_erofs_lzma_exit(void); +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size); +#else +static inline int z_erofs_lzma_init(void) { return 0; } +static inline int z_erofs_lzma_exit(void) { return 0; } +static inline int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) { + if (lzma) { + erofs_err(sb, "lzma algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c index 6c885575128a..a2efd833d1b6 100644 --- a/fs/erofs/pcpubuf.c +++ b/fs/erofs/pcpubuf.c @@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) { static 
DEFINE_MUTEX(pcb_resize_mutex); static unsigned int pcb_nrpages; - LIST_HEAD(pagepool); + struct page *pagepool = NULL; int delta, cpu, ret, i; mutex_lock(&pcb_resize_mutex); @@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) vunmap(old_ptr); free_pagearray: while (i) - list_add(&oldpages[--i]->lru, &pagepool); + erofs_pagepool_add(&pagepool, oldpages[--i]); kfree(oldpages); if (ret) break; } pcb_nrpages = nrpages; - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); out: mutex_unlock(&pcb_resize_mutex); return ret; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 11b88559f8bf..b84394da5ae5 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, case Z_EROFS_COMPRESSION_LZ4: ret = z_erofs_load_lz4_config(sb, dsb, data, size); break; + case Z_EROFS_COMPRESSION_LZMA: + ret = z_erofs_load_lzma_config(sb, dsb, data, size); + break; default: DBG_BUGON(1); ret = -EFAULT; @@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } #endif +static int erofs_init_devices(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int ondisk_extradevs; + erofs_off_t pos; + struct page *page = NULL; + struct erofs_device_info *dif; + struct erofs_deviceslot *dis; + void *ptr; + int id, err = 0; + + sbi->total_blocks = sbi->primarydevice_blocks; + if (!erofs_sb_has_device_table(sbi)) + ondisk_extradevs = 0; + else + ondisk_extradevs = le16_to_cpu(dsb->extra_devices); + + if (ondisk_extradevs != sbi->devs->extra_devices) { + erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", + ondisk_extradevs, sbi->devs->extra_devices); + return -EINVAL; + } + if (!ondisk_extradevs) + return 0; + + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; + pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; + down_read(&sbi->devs->rwsem); + idr_for_each_entry(&sbi->devs->tree, dif, id) { + erofs_blk_t blk = erofs_blknr(pos); + struct block_device *bdev; + + if (!page || page->index != blk) { + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) { + up_read(&sbi->devs->rwsem); + return PTR_ERR(page); + } + ptr = kmap(page); + } + dis = ptr + erofs_blkoff(pos); + + bdev = blkdev_get_by_path(dif->path, + FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto err_out; + } + dif->bdev = bdev; + dif->dax_dev = fs_dax_get_by_bdev(bdev); + dif->blocks = le32_to_cpu(dis->blocks); + dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + sbi->total_blocks += dif->blocks; + pos += EROFS_DEVT_SLOT_SIZE; + } +err_out: + up_read(&sbi->devs->rwsem); + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + return err; +} + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb) ret = erofs_load_compr_cfgs(sb, dsb); else ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); + if (ret < 0) + goto out; + + /* handle multiple devices */ + ret = 
erofs_init_devices(sb, dsb); out: kunmap(page); put_page(page); @@ -340,15 +421,15 @@ out: static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND; - ctx->max_sync_decompress_pages = 3; - ctx->readahead_sync_decompress = false; + ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + ctx->opt.max_sync_decompress_pages = 3; + ctx->opt.readahead_sync_decompress = false; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); #endif } @@ -358,6 +439,7 @@ enum { Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_err }; @@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), + fsparam_string("device", Opt_device), {} }; @@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(ctx, DAX_ALWAYS); - clear_opt(ctx, DAX_NEVER); + set_opt(&ctx->opt, DAX_ALWAYS); + clear_opt(&ctx->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: - set_opt(ctx, DAX_NEVER); - clear_opt(ctx, DAX_ALWAYS); + set_opt(&ctx->opt, DAX_NEVER); + clear_opt(&ctx->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); @@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx __maybe_unused = fc->fs_private; + struct erofs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + struct erofs_device_info *dif; + int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) @@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); else - clear_opt(ctx, XATTR_USER); + clear_opt(&ctx->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); else - clear_opt(ctx, POSIX_ACL); + clear_opt(&ctx->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = result.uint_32; + ctx->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif @@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc, if (!erofs_fc_set_dax_mode(fc, result.uint_32)) return -EINVAL; break; + case Opt_device: + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) + return -ENOMEM; + dif->path = kstrdup(param->string, GFP_KERNEL); + if (!dif->path) { + kfree(dif); + return -ENOMEM; + } + down_write(&ctx->devs->rwsem); + ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&ctx->devs->rwsem); + if (ret < 0) { + kfree(dif->path); + kfree(dif); + return ret; + } + ++ctx->devs->extra_devices; + break; default: return -ENOPARAM; } @@ -540,15 +643,19 @@ static int 
erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; sb->s_fs_info = sbi; + sbi->opt = ctx->opt; sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); + sbi->devs = ctx->devs; + ctx->devs = NULL; + err = erofs_read_superblock(sb); if (err) return err; - if (test_opt(ctx, DAX_ALWAYS) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); - clear_opt(ctx, DAX_ALWAYS); + clear_opt(&sbi->opt, DAX_ALWAYS); } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sb->s_xattr = erofs_xattr_handlers; - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; - #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); #endif @@ -590,6 +695,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + err = erofs_register_sysfs(sb); + if (err) + return err; + erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi)); return 0; } @@ -607,20 +716,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; + sbi->opt = ctx->opt; fc->sb_flags |= SB_RDONLY; return 0; } +static int erofs_release_device_info(int id, void *ptr, void *data) +{ + struct erofs_device_info *dif = ptr; + + fs_put_dax(dif->dax_dev); + if (dif->bdev) + blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + kfree(dif->path); + kfree(dif); + return 0; +} + +static void erofs_free_dev_context(struct erofs_dev_context *devs) +{ + if (!devs) + return; + idr_for_each(&devs->tree, &erofs_release_device_info, NULL); + idr_destroy(&devs->tree); + kfree(devs); +} + static void erofs_fc_free(struct fs_context *fc) { - kfree(fc->fs_private); + struct erofs_fs_context *ctx = fc->fs_private; + + erofs_free_dev_context(ctx->devs); + kfree(ctx); } static const struct fs_context_operations erofs_context_ops = { @@ -632,15 +765,21 @@ static const struct fs_context_operations erofs_context_ops = { static int erofs_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL); - if (!fc->fs_private) + struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + + if (!ctx) return -ENOMEM; + ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!ctx->devs) { + kfree(ctx); + return -ENOMEM; + } + fc->fs_private = ctx; - /* set default mount options */ - erofs_default_options(fc->fs_private); - + idr_init(&ctx->devs->tree); + init_rwsem(&ctx->devs->rwsem); + erofs_default_options(ctx); fc->ops = &erofs_context_ops; - return 0; } @@ -659,6 +798,8 @@ static void erofs_kill_sb(struct super_block *sb) sbi = EROFS_SB(sb); if (!sbi) return; + + erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev); kfree(sbi); sb->s_fs_info = NULL; @@ -671,6 +812,7 @@ static void erofs_put_super(struct super_block *sb) DBG_BUGON(!sbi); + erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); @@ -706,11 +848,19 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; + err = z_erofs_lzma_init(); + if 
(err) + goto lzma_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) goto zip_err; + err = erofs_init_sysfs(); + if (err) + goto sysfs_err; + err = register_filesystem(&erofs_fs_type); if (err) goto fs_err; @@ -718,8 +868,12 @@ static int __init erofs_module_init(void) return 0; fs_err: + erofs_exit_sysfs(); +sysfs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_lzma_exit(); +lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); @@ -730,11 +884,14 @@ icache_err: static void __exit erofs_module_exit(void) { unregister_filesystem(&erofs_fs_type); - z_erofs_exit_zip_subsystem(); - erofs_exit_shrinker(); - /* Ensure all RCU free inodes are safe before cache is destroyed. */ + /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); + + erofs_exit_sysfs(); + z_erofs_exit_zip_subsystem(); + z_erofs_lzma_exit(); + erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); erofs_pcpubuf_exit(); } @@ -748,7 +905,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = EROFS_BLKSIZ; - buf->f_blocks = sbi->blocks; + buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = ULLONG_MAX; @@ -763,31 +920,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx = &sbi->ctx; + struct erofs_mount_opts *opt = &sbi->opt; #ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(ctx, XATTR_USER)) + if (test_opt(opt, XATTR_USER)) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(opt, POSIX_ACL)) seq_puts(seq, ",acl"); else seq_puts(seq, ",noacl"); #endif #ifdef CONFIG_EROFS_FS_ZIP - if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED) + if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) seq_puts(seq, ",cache_strategy=disabled"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) seq_puts(seq, ",cache_strategy=readahead"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); #endif - if (test_opt(ctx, DAX_ALWAYS)) + if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); - if (test_opt(ctx, DAX_NEVER)) + if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); return 0; } @@ -806,3 +963,4 @@ module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c new file mode 100644 index 000000000000..dd122dd1c38b --- /dev/null +++ b/fs/erofs/sysfs.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd. 
+ * https://www.oppo.com/ + */ +#include <linux/sysfs.h> +#include <linux/kobject.h> + +#include "internal.h" + +enum { + attr_feature, + attr_pointer_ui, + attr_pointer_bool, +}; + +enum { + struct_erofs_sb_info, +}; + +struct erofs_attr { + struct attribute attr; + short attr_id; + int struct_type, offset; +}; + +#define EROFS_ATTR(_name, _mode, _id) \ +static struct erofs_attr erofs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} +#define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name) +#define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature) + +#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \ +static struct erofs_attr erofs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .struct_type = struct_##_struct, \ + .offset = offsetof(struct _struct, _name),\ +} + +#define EROFS_ATTR_RW(_name, _id, _struct) \ + EROFS_ATTR_OFFSET(_name, 0644, _id, _struct) + +#define EROFS_RO_ATTR(_name, _id, _struct) \ + EROFS_ATTR_OFFSET(_name, 0444, _id, _struct) + +#define EROFS_ATTR_RW_UI(_name, _struct) \ + EROFS_ATTR_RW(_name, pointer_ui, _struct) + +#define EROFS_ATTR_RW_BOOL(_name, _struct) \ + EROFS_ATTR_RW(_name, pointer_bool, _struct) + +#define ATTR_LIST(name) (&erofs_attr_##name.attr) + +static struct attribute *erofs_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(erofs); + +/* Features this copy of erofs supports */ +EROFS_ATTR_FEATURE(zero_padding); +EROFS_ATTR_FEATURE(compr_cfgs); +EROFS_ATTR_FEATURE(big_pcluster); +EROFS_ATTR_FEATURE(chunked_file); +EROFS_ATTR_FEATURE(device_table); +EROFS_ATTR_FEATURE(compr_head2); +EROFS_ATTR_FEATURE(sb_chksum); + +static struct attribute *erofs_feat_attrs[] = { + ATTR_LIST(zero_padding), + ATTR_LIST(compr_cfgs), + ATTR_LIST(big_pcluster), + ATTR_LIST(chunked_file), + ATTR_LIST(device_table), + ATTR_LIST(compr_head2), + ATTR_LIST(sb_chksum), + NULL, +}; +ATTRIBUTE_GROUPS(erofs_feat); + +static unsigned char *__struct_ptr(struct erofs_sb_info *sbi, + int struct_type, int offset) +{ + if (struct_type == struct_erofs_sb_info) + return (unsigned char *)sbi + offset; + return NULL; +} + +static ssize_t erofs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); + unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); + + switch (a->attr_id) { + case attr_feature: + return sysfs_emit(buf, "supported\n"); + case attr_pointer_ui: + if (!ptr) + return 0; + return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr); + case attr_pointer_bool: + if (!ptr) + return 0; + return sysfs_emit(buf, "%d\n", *(bool *)ptr); + } + return 0; +} + +static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); + unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); + unsigned long t; + int ret; + + switch (a->attr_id) { + case attr_pointer_ui: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t != (unsigned int)t) + return -ERANGE; + *(unsigned int *)ptr = t; + return len; + case attr_pointer_bool: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t != 0 && t != 1) + return -EINVAL; + *(bool *)ptr =
!!t; + return len; + } + return 0; +} + +static void erofs_sb_release(struct kobject *kobj) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops erofs_attr_ops = { + .show = erofs_attr_show, + .store = erofs_attr_store, +}; + +static struct kobj_type erofs_sb_ktype = { + .default_groups = erofs_groups, + .sysfs_ops = &erofs_attr_ops, + .release = erofs_sb_release, +}; + +static struct kobj_type erofs_ktype = { + .sysfs_ops = &erofs_attr_ops, +}; + +static struct kset erofs_root = { + .kobj = {.ktype = &erofs_ktype}, +}; + +static struct kobj_type erofs_feat_ktype = { + .default_groups = erofs_feat_groups, + .sysfs_ops = &erofs_attr_ops, +}; + +static struct kobject erofs_feat = { + .kset = &erofs_root, +}; + +int erofs_register_sysfs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + int err; + + sbi->s_kobj.kset = &erofs_root; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + goto put_sb_kobj; + return 0; + +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; +} + +void erofs_unregister_sysfs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (sbi->s_kobj.state_in_sysfs) { + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + } +} + +int __init erofs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&erofs_root.kobj, "erofs"); + erofs_root.kobj.parent = fs_kobj; + ret = kset_register(&erofs_root); + if (ret) + goto root_err; + + ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, + NULL, "features"); + if (ret) + goto feat_err; + return ret; + +feat_err: + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +root_err: + return ret; +} + +void erofs_exit_sysfs(void) +{ + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +} diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 3ca703cd5b24..ec9a1d780dc1 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -6,20 +6,29 @@ #include "internal.h" #include <linux/pagevec.h> -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { - struct page *page; + struct page *page = *pagepool; - if (!list_empty(pool)) { - page = lru_to_page(pool); + if (page) { DBG_BUGON(page_ref_count(page) != 1); - list_del(&page->lru); + *pagepool = (struct page *)page_private(page); } else { page = alloc_page(gfp); } return page; } +void erofs_release_pages(struct page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + put_page(page); + } +} + #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 778f2c52295d..01c581e93c5f 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) static bool erofs_xattr_user_list(struct dentry *dentry) { - return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER); + return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); } static bool erofs_xattr_trusted_list(struct dentry *dentry) @@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, switch (handler->flags) { case
EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->ctx, XATTR_USER)) + if (!test_opt(&sbi->opt, XATTR_USER)) return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index eb51df4a9f77..9a249bfc2770 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -96,16 +96,9 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * a compressed_pages[] placeholder in order to avoid - * being filled with file pages for in-place decompression. - */ -#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D) - /* how to allocate cached pages for a pcluster */ enum z_erofs_cache_alloctype { DONTALLOC, /* don't allocate any cached pages */ - DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ /* * try to use cached I/O if page allocation succeeds or fallback * to in-place I/O instead to avoid any direct reclaim. @@ -236,7 +229,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + struct page **pagepool) { struct z_erofs_pcluster *pcl = clt->pcl; bool standalone = true; @@ -267,10 +260,6 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, /* I/O is needed, no possible to decompress directly */ standalone = false; switch (type) { - case DELAYEDALLOC: - t = tagptr_init(compressed_page_t, - PAGE_UNALLOCATED); - break; case TRYALLOC: newpage = erofs_allocpage(pagepool, gfp); if (!newpage) @@ -287,12 +276,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) continue; - if (page) { + if (page) put_page(page); - } else if (newpage) { - set_page_private(newpage, 0); - list_add(&newpage->lru, pagepool); - } + else if (newpage) + erofs_pagepool_add(pagepool, newpage); } /* @@ -476,6 +463,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct erofs_workgroup *grp; int err; + if (!(map->m_flags & EROFS_MAP_ENCODED)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + /* no available pcluster, let's allocate one */ pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); if (IS_ERR(pcl)) @@ -483,16 +475,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; - + pcl->algorithmformat = map->m_algorithmformat; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | (map->m_flags & EROFS_MAP_FULL_MAPPED ? 
Z_EROFS_PCLUSTER_FULL_LENGTH : 0); - if (map->m_flags & EROFS_MAP_ZIPPED) - pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; - else - pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; clt->mode = COLLECT_PRIMARY_FOLLOWED; @@ -643,7 +630,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct list_head *pagepool) + struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -695,7 +682,7 @@ restart_now: goto err_out; /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la)) + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) cache_strategy = TRYALLOC; else cache_strategy = DONTALLOC; @@ -797,7 +784,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* Use workqueue and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); - sbi->ctx.readahead_sync_decompress = true; + sbi->opt.readahead_sync_decompress = true; return; } z_erofs_decompressqueue_work(&io->u.work); @@ -837,7 +824,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, - struct list_head *pagepool) + struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); struct z_erofs_pagevec_ctor ctor; @@ -1037,7 +1024,7 @@ out: } static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct list_head *pagepool) + struct page **pagepool) { z_erofs_next_pcluster_t owned = io->head; @@ -1061,18 +1048,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) { struct z_erofs_decompressqueue *bgq = container_of(work, struct z_erofs_decompressqueue, u.work); - LIST_HEAD(pagepool); + struct page *pagepool = NULL; DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); kvfree(bgq); } static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, - struct list_head *pagepool, + struct page **pagepool, struct address_space *mc, gfp_t gfp) { @@ -1092,15 +1079,6 @@ repeat: if (!page) goto out_allocpage; - /* - * the cached page has not been allocated and - * an placeholder is out there, prepare it now. 
- */ - if (page == PAGE_UNALLOCATED) { - tocache = true; - goto out_allocpage; - } - /* process the target tagged pointer */ t = tagptr_init(compressed_page_t, page); justfound = tagptr_unfold_tags(t); @@ -1174,7 +1152,7 @@ repeat: out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; } @@ -1258,7 +1236,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, + struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { @@ -1267,8 +1245,9 @@ static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; z_erofs_next_pcluster_t owned_head = f->clt.owned_head; - /* since bio will be NULL, no need to initialize last_index */ + /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; + struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; @@ -1280,6 +1259,7 @@ static void z_erofs_submit_queue(struct super_block *sb, q[JQ_SUBMIT]->head = owned_head; do { + struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; pgoff_t cur, end; unsigned int i = 0; @@ -1291,7 +1271,13 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - cur = pcl->obj.index; + /* no device id here, thus it will always succeed */ + mdev = (struct erofs_map_dev) { + .m_pa = blknr_to_addr(pcl->obj.index), + }; + (void)erofs_map_dev(sb, &mdev); + + cur = erofs_blknr(mdev.m_pa); end = cur + pcl->pclusterpages; /* close the main owned chain at first */ @@ -1307,7 +1293,8 @@ static void z_erofs_submit_queue(struct super_block *sb, if (!page) continue; - if (bio && cur != last_index + 1) { + if (bio && (cur != last_index + 1 || + last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); bio = NULL; @@ -1315,9 +1302,10 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); - bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); + + bio_set_dev(bio, mdev.m_bdev); + last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; @@ -1356,7 +1344,7 @@ submit_bio_retry: static void z_erofs_runqueue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, bool force_fg) + struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; @@ -1378,18 +1366,87 @@ static void z_erofs_runqueue(struct super_block *sb, z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); } +/* + * Since partial uptodate is still unimplemented for now, we have to use + * approximate readmore strategies as a start. 
+ */ +static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, + struct readahead_control *rac, + erofs_off_t end, + struct page **pagepool, + bool backmost) +{ + struct inode *inode = f->inode; + struct erofs_map_blocks *map = &f->map; + erofs_off_t cur; + int err; + + if (backmost) { + map->m_la = end; + err = z_erofs_map_blocks_iter(inode, map, + EROFS_GET_BLOCKS_READMORE); + if (err) + return; + + /* expend ra for the trailing edge if readahead */ + if (rac) { + loff_t newstart = readahead_pos(rac); + + cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); + readahead_expand(rac, newstart, cur - newstart); + return; + } + end = round_up(end, PAGE_SIZE); + } else { + end = round_up(map->m_la, PAGE_SIZE); + + if (!map->m_llen) + return; + } + + cur = map->m_la + map->m_llen - 1; + while (cur >= end) { + pgoff_t index = cur >> PAGE_SHIFT; + struct page *page; + + page = erofs_grab_cache_page_nowait(inode->i_mapping, index); + if (!page) + goto skip; + + if (PageUptodate(page)) { + unlock_page(page); + put_page(page); + goto skip; + } + + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + put_page(page); +skip: + if (cur < PAGE_SIZE) + break; + cur = (index << PAGE_SHIFT) - 1; + } +} + static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + struct page *pagepool = NULL; int err; - LIST_HEAD(pagepool); trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, + &pagepool, true); err = z_erofs_do_read_page(&f, page, &pagepool); + z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); + (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ @@ -1401,8 +1458,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) if (f.map.mpage) put_page(f.map.mpage); - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); return err; } @@ -1410,29 +1466,19 @@ static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - - unsigned int nr_pages = readahead_count(rac); - bool sync = (sbi->ctx.readahead_sync_decompress && - nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *page, *head = NULL; - LIST_HEAD(pagepool); - - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + struct page *pagepool = NULL, *head = NULL, *page; + unsigned int nr_pages; f.readahead = true; f.headoffset = readahead_pos(rac); + z_erofs_pcluster_readmore(&f, rac, f.headoffset + + readahead_length(rac) - 1, &pagepool, true); + nr_pages = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - - /* - * A pure asynchronous readahead is indicated if - * a PG_readahead marked page is hitted at first. - * Let's also do asynchronous decompression for this case. 
- */ - sync &= !(PageReadahead(page) && !head); - set_page_private(page, (unsigned long)head); head = page; } @@ -1451,16 +1497,15 @@ static void z_erofs_readahead(struct readahead_control *rac) page->index, EROFS_I(inode)->nid); put_page(page); } - + z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); - + z_erofs_runqueue(inode->i_sb, &f, &pagepool, + sbi->opt.readahead_sync_decompress && + nr_pages <= sbi->opt.max_sync_decompress_pages); if (f.map.mpage) put_page(f.map.mpage); - - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 3a008f1b9f78..4a69515dea75 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -94,13 +94,6 @@ struct z_erofs_decompressqueue { } u; }; -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) -static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, - struct page *page) -{ - return page->mapping == MNGD_MAPPING(sbi); -} - #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) @@ -186,4 +179,3 @@ static inline void z_erofs_onlinepage_endio(struct page *page) #define Z_EROFS_VMAP_GLOBAL_PAGES 2048 #endif - diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 73b86b5c1a75..24331b5283fb 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err; + int err, headnr; erofs_off_t pos; struct page *page; void *kaddr; @@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel", - vi->z_algorithmtype[0], vi->nid); + headnr = 0; + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || + vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", + headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; goto unmap_done; } @@ -111,7 +113,7 @@ struct z_erofs_maprecorder { unsigned long lcn; /* compression extent information gathered */ - u8 type; + u8 type, headtype; u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedlcs; @@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { - if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -446,9 +450,9 @@ 
static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, } return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + m->headtype = m->type; map->m_la = (lcn << lclusterbits) | m->clusterofs; break; default: @@ -471,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, int err; DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); - if (!(map->m_flags & EROFS_MAP_ZIPPED) || - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 && + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type != m->headtype); + + if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { map->m_plen = 1 << lclusterbits; return 0; } - lcn = m->lcn + 1; if (m->compressedlcs) goto out; @@ -499,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, switch (m->type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. @@ -554,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) DBG_BUGON(!m->delta[1] && m->clusterofs != 1 << lclusterbits); } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) { + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 || + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { /* go on until the next HEAD lcluster */ if (lcn != headlcn) break; @@ -609,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto unmap_out; - map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - if (endoff >= m.clusterofs) - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { + m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; break; } @@ -650,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode, map->m_llen = end - map->m_la; map->m_pa = blknr_to_addr(m.pblk); - map->m_flags |= EROFS_MAP_MAPPED; err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto out; - if (flags & EROFS_GET_BLOCKS_FIEMAP) { + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + map->m_algorithmformat = vi->z_algorithmtype[1]; + else + map->m_algorithmformat = vi->z_algorithmtype[0]; + + if ((flags & EROFS_GET_BLOCKS_FIEMAP) || + ((flags & EROFS_GET_BLOCKS_READMORE) && + map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && + map->m_llen >= EROFS_BLKSIZ)) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 
1ec197825544..df38fba17287 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -29,6 +29,7 @@ #include #include #include +#include <linux/freezer.h> #include #include @@ -1870,8 +1871,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, write_unlock_irq(&ep->lock); if (!eavail) - timed_out = !schedule_hrtimeout_range(to, slack, - HRTIMER_MODE_ABS); + timed_out = !freezable_schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* diff --git a/fs/exec.c b/fs/exec.c index 881390b44cfd..103aca5fba4c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -285,6 +285,7 @@ err: mmap_write_unlock(mm); err_free: bprm->vma = NULL; + VM_BUG_ON(vma->vm_file); vm_area_free(vma); return err; } @@ -1013,6 +1014,7 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; + lru_gen_add_mm(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1028,6 +1030,7 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); + lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 4b5d02b1df58..a3d78dc442cb 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -837,5 +837,6 @@ module_exit(exit_exfat_fs); MODULE_ALIAS_FS("exfat"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_DESCRIPTION("exFAT filesystem support"); MODULE_AUTHOR("Samsung Electronics Co., Ltd."); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 02d82f8fe85d..c8c61c718e97 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1657,5 +1657,6 @@ static void __exit exit_ext2_fs(void) MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(init_ext2_fs) module_exit(exit_ext2_fs) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 45f6d75de660..da93b001d4f2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -36,10 +36,17 @@ #include "acl.h" #include "truncate.h" -static bool ext4_dio_supported(struct inode *inode) +static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter) { - if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) - return false; + struct inode *inode = file_inode(iocb->ki_filp); + + if (IS_ENCRYPTED(inode)) { + if (!fscrypt_dio_supported(inode)) + return false; + if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), + i_blocksize(inode))) + return false; + } if (fsverity_active(inode)) return false; if (ext4_should_journal_data(inode)) @@ -61,7 +68,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) inode_lock_shared(inode); } - if (!ext4_dio_supported(inode)) { + if (!ext4_dio_supported(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on @@ -511,7 +518,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } /* Fallback to buffered I/O if the inode does not support direct I/O.
*/ - if (!ext4_dio_supported(inode)) { + if (!ext4_dio_supported(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else @@ -767,6 +774,7 @@ static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, + .speculative = true, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0a63863bc58c..e4f85fecf2ff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3458,6 +3458,13 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; out: + /* + * When inline encryption is enabled, sometimes I/O to an encrypted file + * has to be broken up to guarantee DUN contiguity. Handle this by + * limiting the length of the mapping returned. + */ + map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + ext4_set_iomap(inode, iomap, &map, offset, length); return 0; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3db923403505..6d4479962498 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -76,14 +76,10 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; - /* PG_error was set if any post_read step failed */ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } unlock_page(page); } if (bio->bi_private) @@ -97,10 +93,12 @@ static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - fscrypt_decrypt_bio(ctx->bio); - - bio_post_read_processing(ctx); + if (fscrypt_decrypt_bio(bio)) + bio_post_read_processing(ctx); + else + __read_end_io(bio); } static void verity_work(struct work_struct *work) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 802ca160d31e..ab45a1032f7a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1572,7 +1572,6 @@ static const struct fscrypt_operations ext4_cryptops = { .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, - .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; @@ -6732,6 +6731,7 @@ static void __exit ext4_exit_fs(void) MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 7eea3cfd894d..03ef087537c7 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -7,6 +7,7 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select FS_IOMAP select LZ4_COMPRESS if F2FS_FS_LZ4 select LZ4_DECOMPRESS if F2FS_FS_LZ4 select LZ4HC_COMPRESS if F2FS_FS_LZ4HC @@ -142,3 +143,10 @@ config F2FS_IOSTAT Support getting IO statistics through sysfs and printing out periodic IO statistics tracepoint events. You have to turn on "iostat_enable" sysfs node to enable this feature. 
+ +config F2FS_UNFAIR_RWSEM + bool "F2FS unfair rw_semaphore" + depends on F2FS_FS && BLK_CGROUP + help + Use unfair rw_semaphore, if system configured IO priority by block + cgroup. diff --git a/fs/f2fs/OWNERS b/fs/f2fs/OWNERS new file mode 100644 index 000000000000..6a5c01163993 --- /dev/null +++ b/fs/f2fs/OWNERS @@ -0,0 +1 @@ +jaegeuk@google.com diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 16e826e01f09..eaa240b21f07 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -204,8 +204,9 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) return __f2fs_get_acl(inode, type, NULL); } -static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, - struct posix_acl **acl) +static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, + struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) { umode_t mode = inode->i_mode; int error; @@ -218,14 +219,15 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; } -static int __f2fs_set_acl(struct inode *inode, int type, +static int __f2fs_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { int name_index; @@ -238,7 +240,8 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = f2fs_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(mnt_userns, inode, + &mode, &acl); if (error) return error; set_acl_inode(inode, mode); @@ -279,7 +282,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - return __f2fs_set_acl(inode, type, acl, NULL); + return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL); } /* @@ -419,7 +422,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { - error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl, ipage); posix_acl_release(default_acl); } else { @@ -427,7 +430,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, } if (acl) { if (!error) - error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl, ipage); posix_acl_release(acl); } else { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 02840dadde5d..15e43fd4ea58 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,12 +26,16 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) + if (!end_io) { f2fs_flush_merged_writes(sbi); + + f2fs_handle_stop(sbi, reason); + } } /* @@ -89,7 +93,7 @@ repeat: return ERR_PTR(err); } - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); lock_page(page); if (unlikely(page->mapping != 
mapping)) { @@ -98,6 +102,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { + f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -121,7 +126,7 @@ retry: if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } @@ -166,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { + if (time_to_inject(sbi, FAULT_BLKADDR)) { + f2fs_show_injection_info(sbi, FAULT_BLKADDR); + return false; + } + switch (type) { case META_NAT: break; @@ -283,25 +293,30 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, f2fs_put_page(page, err ? 1 : 0); if (!err) - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, + F2FS_BLKSIZE); } out: blk_finish_plug(&plug); return blkno - start; } -void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks) { struct page *page; bool readahead = false; + if (ra_blocks == RECOVERY_MIN_RA_BLOCKS) + return; + page = find_get_page(META_MAPPING(sbi), index); if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); if (readahead) - f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true); + f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, @@ -359,13 +374,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* if locked failed, cp will flush dirty pages instead */ - if (!down_write_trylock(&sbi->cp_global_sem)) + if (!f2fs_down_write_trylock(&sbi->cp_global_sem)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -450,8 +465,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + if (__set_page_dirty_nobuffers(page)) { inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); set_page_private_reference(page); return 1; @@ -672,7 +686,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - err = f2fs_get_node_info(sbi, ino, &ni); + err = f2fs_get_node_info(sbi, ino, &ni, false); if (err) goto err_out; @@ -713,9 +727,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. 
@@ -1009,9 +1020,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - if (!f2fs_is_volatile_file(inode)) - list_add_tail(&F2FS_I(inode)->dirty_list, - &sbi->inode_list[type]); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } @@ -1177,7 +1186,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) if (!is_journalled_quota(sbi)) return false; - if (!down_write_trylock(&sbi->quota_sem)) + if (!f2fs_down_write_trylock(&sbi->quota_sem)) return true; if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { ret = false; @@ -1189,7 +1198,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) } else if (get_pages(sbi, F2FS_DIRTY_QDATA)) { ret = true; } - up_write(&sbi->quota_sem); + f2fs_up_write(&sbi->quota_sem); return ret; } @@ -1246,10 +1255,10 @@ retry_flush_dents: * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. inode->i_blocks can be updated. */ - down_write(&sbi->node_change); + f2fs_down_write(&sbi->node_change); if (get_pages(sbi, F2FS_DIRTY_IMETA)) { - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) @@ -1259,15 +1268,15 @@ retry_flush_dents: } retry_flush_nodes: - down_write(&sbi->node_write); + f2fs_down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - up_write(&sbi->node_write); + f2fs_up_write(&sbi->node_write); atomic_inc(&sbi->wb_sync_req[NODE]); err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); atomic_dec(&sbi->wb_sync_req[NODE]); if (err) { - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); return err; } @@ -1280,13 +1289,13 @@ retry_flush_nodes: * dirty node blocks and some checkpoint values by block allocation. 
*/ __prepare_cp_block(sbi); - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); return err; } static void unblock_operations(struct f2fs_sb_info *sbi) { - up_write(&sbi->node_write); + f2fs_up_write(&sbi->node_write); f2fs_unlock_all(sbi); } @@ -1561,6 +1570,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; percpu_counter_set(&sbi->alloc_valid_block_count, 0); + percpu_counter_set(&sbi->rf_node_block_count, 0); /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); @@ -1630,7 +1640,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_warn(sbi, "Start checkpoint disabled!"); } if (cpc->reason != CP_RESIZE) - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->cp_global_sem); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || @@ -1711,7 +1721,7 @@ stop: trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: if (cpc->reason != CP_RESIZE) - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); return err; } @@ -1759,9 +1769,9 @@ static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) struct cp_control cpc = { .reason = CP_SYNC, }; int err; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); return err; } @@ -1849,9 +1859,9 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { int ret; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); ret = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); return ret; } @@ -1893,8 +1903,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(cprc->f2fs_issue_ckpt)) { + int err = PTR_ERR(cprc->f2fs_issue_ckpt); + cprc->f2fs_issue_ckpt = NULL; - return -ENOMEM; + return err; } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6adf04725954..181ecbd15db0 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -153,6 +153,7 @@ void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; + cc->valid_nr_cpages = 0; if (!reuse) cc->cluster_idx = NULL_CLUSTER; } @@ -565,10 +566,7 @@ MODULE_PARM_DESC(num_compress_pages, int f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); - if (!compress_page_pool) - return -ENOMEM; - - return 0; + return compress_page_pool ? 
0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) @@ -618,7 +616,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; - struct page **new_cpages; u32 chksum = 0; int i, ret; @@ -633,6 +630,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + cc->valid_nr_cpages = cc->nr_cpages; cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { @@ -683,13 +681,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); - /* Now we're going to cut unnecessary tail pages */ - new_cpages = page_array_alloc(cc->inode, new_nr_cpages); - if (!new_cpages) { - ret = -ENOMEM; - goto out_vunmap_cbuf; - } - /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, (new_nr_cpages * PAGE_SIZE) - @@ -699,10 +690,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc) vm_unmap_ram(cc->rbuf, cc->cluster_size); for (i = 0; i < cc->nr_cpages; i++) { - if (i < new_nr_cpages) { - new_cpages[i] = cc->cpages[i]; + if (i < new_nr_cpages) continue; - } f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -710,9 +699,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); - cc->cpages = new_cpages; - cc->nr_cpages = new_nr_cpages; + cc->valid_nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); @@ -738,14 +725,19 @@ out: return ret; } -void f2fs_decompress_cluster(struct decompress_io_ctx *dic) +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc); +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc); + +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; + bool bypass_callback = false; int ret; - int i; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); @@ -755,41 +747,10 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) goto out_end_io; } - dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); - if (!dic->tpages) { - ret = -ENOMEM; - goto out_end_io; - } - - for (i = 0; i < dic->cluster_size; i++) { - if (dic->rpages[i]) { - dic->tpages[i] = dic->rpages[i]; - continue; - } - - dic->tpages[i] = f2fs_compress_alloc_page(); - if (!dic->tpages[i]) { - ret = -ENOMEM; - goto out_end_io; - } - } - - if (cops->init_decompress_ctx) { - ret = cops->init_decompress_ctx(dic); - if (ret) - goto out_end_io; - } - - dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); - if (!dic->rbuf) { - ret = -ENOMEM; - goto out_destroy_decompress_ctx; - } - - dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); - if (!dic->cbuf) { - ret = -ENOMEM; - goto out_vunmap_rbuf; + ret = f2fs_prepare_decomp_mem(dic, false); + if (ret) { + bypass_callback = true; + goto out_release; } dic->clen = le32_to_cpu(dic->cbuf->clen); @@ -797,7 +758,8 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) if (dic->clen > PAGE_SIZE * dic->nr_cpages - 
COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; - goto out_vunmap_cbuf; + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); + goto out_release; } ret = cops->decompress_pages(dic); @@ -818,17 +780,13 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) } } -out_vunmap_cbuf: - vm_unmap_ram(dic->cbuf, dic->nr_cpages); -out_vunmap_rbuf: - vm_unmap_ram(dic->rbuf, dic->cluster_size); -out_destroy_decompress_ctx: - if (cops->destroy_decompress_ctx) - cops->destroy_decompress_ctx(dic); +out_release: + f2fs_release_decomp_mem(dic, bypass_callback, false); + out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - f2fs_decompress_end_io(dic, ret); + f2fs_decompress_end_io(dic, ret, in_task); } /* @@ -838,7 +796,7 @@ out_end_io: * (or in the case of a failure, cleans up without actually decompressing). */ void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr) + block_t blkaddr, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); @@ -848,12 +806,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, if (failed) WRITE_ONCE(dic->failed, true); - else if (blkaddr) + else if (blkaddr && in_task) f2fs_cache_compressed_page(sbi, page, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic); + f2fs_decompress_cluster(dic, in_task); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -880,6 +838,32 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, + int index, int nr_pages, bool uptodate) +{ + unsigned long pgidx = pages[index]->index; + int i = uptodate ? 0 : 1; + + /* + * when uptodate set to true, try to check all pages in cluster is + * uptodate or not. + */ + if (uptodate && (pgidx % cc->cluster_size)) + return false; + + if (nr_pages - index < cc->cluster_size) + return false; + + for (; i < cc->cluster_size; i++) { + if (pages[index + i]->index != pgidx + i) + return false; + if (uptodate && !PageUptodate(pages[index + i])) + return false; + } + + return true; +} + static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); @@ -925,17 +909,15 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) reason = "[C|*|C|*]"; goto out; } - if (compressed) { - if (!__is_valid_data_blkaddr(blkaddr)) { - if (!cluster_end) - cluster_end = i; - continue; - } - /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ - if (cluster_end) { - reason = "[C|N|N|V]"; - goto out; - } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; } } return false; @@ -965,6 +947,7 @@ static int __f2fs_cluster_blocks(struct inode *inode, if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); goto fail; } @@ -1256,7 +1239,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. 
*/ - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); } else if (!f2fs_trylock_op(sbi)) { goto out_free; } @@ -1275,7 +1258,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto out_put_dnode; @@ -1287,14 +1270,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - atomic_set(&cic->pending_pages, cc->nr_cpages); + atomic_set(&cic->pending_pages, cc->valid_nr_cpages); cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; cic->nr_rpages = cc->cluster_size; - for (i = 0; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; @@ -1339,7 +1322,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) fio.compr_blocks++; - if (i > cc->nr_cpages) { + if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -1364,8 +1347,8 @@ unlock_continue: if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); - f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); - add_compr_block_stat(inode, cc->nr_cpages); + f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); + add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) @@ -1373,7 +1356,7 @@ unlock_continue: f2fs_put_dnode(&dn); if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); @@ -1399,13 +1382,11 @@ out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); out_free: - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -1496,9 +1477,7 @@ continue_unlock: if (IS_NOQUOTA(cc->inode)) return 0; ret = 0; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } return ret; @@ -1546,16 +1525,81 @@ destroy_out: return err; } -static void f2fs_free_dic(struct decompress_io_ctx *dic); +static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, + bool pre_alloc) +{ + return pre_alloc ^ f2fs_low_mem_mode(sbi); +} + +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + int i; + + if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + return 0; + + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + if (!dic->tpages) + return -ENOMEM; + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) { + dic->tpages[i] = dic->rpages[i]; + continue; + } + + dic->tpages[i] = f2fs_compress_alloc_page(); + if (!dic->tpages[i]) + return -ENOMEM; + } + + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); + if (!dic->rbuf) + return -ENOMEM; + + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); + if 
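allow_memalloc_for_decomp() is terse; expanding the XOR against its two callers (f2fs_alloc_dic() passes pre_alloc=true, f2fs_decompress_cluster() passes pre_alloc=false) gives:

/*
 *  f2fs_low_mem_mode() | pre_alloc=true (alloc dic) | pre_alloc=false (decompress)
 *  --------------------+----------------------------+-----------------------------
 *  false               | allocate tpages/rbuf/cbuf  | no-op
 *  true                | no-op                      | allocate tpages/rbuf/cbuf
 *
 * Normal mode front-loads the decompression buffers when the read is
 * submitted; low-memory mode defers them until a cluster is actually
 * decompressed, trading I/O-path allocations for a smaller footprint.
 */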
(!dic->cbuf) + return -ENOMEM; + + if (cops->init_decompress_ctx) + return cops->init_decompress_ctx(dic); + + return 0; +} + +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + + if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + return; + + if (!bypass_destroy_callback && cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); + + if (dic->cbuf) + vm_unmap_ram(dic->cbuf, dic->nr_cpages); + + if (dic->rbuf) + vm_unmap_ram(dic->rbuf, dic->cluster_size); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); - int i; + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + int i, ret; - dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, - false, F2FS_I_SB(cc->inode)); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!dic) return ERR_PTR(-ENOMEM); @@ -1581,32 +1625,43 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->nr_rpages = cc->cluster_size; dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); - if (!dic->cpages) + if (!dic->cpages) { + ret = -ENOMEM; goto out_free; + } for (i = 0; i < dic->nr_cpages; i++) { struct page *page; page = f2fs_compress_alloc_page(); - if (!page) + if (!page) { + ret = -ENOMEM; goto out_free; + } f2fs_set_compressed_page(page, cc->inode, start_idx + i + 1, dic); dic->cpages[i] = page; } + ret = f2fs_prepare_decomp_mem(dic, true); + if (ret) + goto out_free; + return dic; out_free: - f2fs_free_dic(dic); - return ERR_PTR(-ENOMEM); + f2fs_free_dic(dic, true); + return ERR_PTR(ret); } -static void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback) { int i; + f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); + if (dic->tpages) { for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) @@ -1631,38 +1686,25 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic) kmem_cache_free(dic_entry_slab, dic); } -static void f2fs_put_dic(struct decompress_io_ctx *dic) +static void f2fs_late_free_dic(struct work_struct *work) { - if (refcount_dec_and_test(&dic->refcnt)) - f2fs_free_dic(dic); + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, free_work); + + f2fs_free_dic(dic, false); } -/* - * Update and unlock the cluster's pagecache pages, and release the reference to - * the decompress_io_ctx that was being held for I/O completion. - */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) { - int i; - - for (i = 0; i < dic->cluster_size; i++) { - struct page *rpage = dic->rpages[i]; - - if (!rpage) - continue; - - /* PG_error was set if verity failed. 
*/ - if (failed || PageError(rpage)) { - ClearPageUptodate(rpage); - /* will re-read again later */ - ClearPageError(rpage); + if (refcount_dec_and_test(&dic->refcnt)) { + if (in_task) { + f2fs_free_dic(dic, false); } else { - SetPageUptodate(rpage); + INIT_WORK(&dic->free_work, f2fs_late_free_dic); + queue_work(F2FS_I_SB(dic->inode)->post_read_wq, + &dic->free_work); } - unlock_page(rpage); } - - f2fs_put_dic(dic); } static void f2fs_verify_cluster(struct work_struct *work) @@ -1671,23 +1713,32 @@ static void f2fs_verify_cluster(struct work_struct *work) container_of(work, struct decompress_io_ctx, verity_work); int i; - /* Verify the cluster's decompressed pages with fs-verity. */ + /* Verify, update, and unlock the decompressed pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; - if (rpage && !fsverity_verify_page(rpage)) - SetPageError(rpage); + if (!rpage) + continue; + + if (fsverity_verify_page(rpage)) + SetPageUptodate(rpage); + else + ClearPageUptodate(rpage); + unlock_page(rpage); } - __f2fs_decompress_end_io(dic, false); + f2fs_put_dic(dic, true); } /* * This is called when a compressed cluster has been decompressed * (or failed to be read and/or decompressed). */ -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) { + int i; + if (!failed && dic->need_verity) { /* * Note that to avoid deadlocks, the verity work can't be done @@ -1697,9 +1748,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) */ INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); - } else { - __f2fs_decompress_end_io(dic, failed); + return; } + + /* Update and unlock the cluster's pagecache pages. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + if (failed) + ClearPageUptodate(rpage); + else + SetPageUptodate(rpage); + unlock_page(rpage); + } + + /* + * Release the reference to the decompress_io_ctx that was being held + * for I/O completion. + */ + f2fs_put_dic(dic, in_task); } /* @@ -1707,12 +1777,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) * * This is called when the page is no longer needed and can be freed. */ -void f2fs_put_page_dic(struct page *page) +void f2fs_put_page_dic(struct page *page, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); - f2fs_put_dic(dic); + f2fs_put_dic(dic, in_task); } /* @@ -1825,7 +1895,7 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->compress_inode->i_mapping; + struct address_space *mapping = COMPRESS_MAPPING(sbi); struct pagevec pvec; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); @@ -1902,6 +1972,9 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; char slab_name[32]; + if (!f2fs_sb_has_compression(sbi)) + return 0; + sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); sbi->page_array_slab_size = sizeof(struct page *) << @@ -1909,9 +1982,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); - if (!sbi->page_array_slab) - return -ENOMEM; - return 0; + return sbi->page_array_slab ? 
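The in_task flag threaded through f2fs_put_dic() exists because the final f2fs_free_dic() can sleep (vm_unmap_ram(), slab and mempool frees), which is not allowed in the softirq context where bio completion may run. The underlying pattern, reduced to a self-contained skeleton (struct obj and obj_free() are placeholder names, not from the patch):

#include <linux/refcount.h>
#include <linux/workqueue.h>

struct obj {
	refcount_t refcnt;
	struct workqueue_struct *wq;
	struct work_struct free_work;	/* worker calls the real teardown */
};

static void obj_free(struct obj *o);	/* may sleep */

static void obj_put(struct obj *o, bool in_task)
{
	if (!refcount_dec_and_test(&o->refcnt))
		return;

	if (in_task)
		obj_free(o);			/* process context: free now */
	else
		queue_work(o->wq, &o->free_work); /* softirq: defer to a worker */
}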
0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) @@ -1919,53 +1990,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) kmem_cache_destroy(sbi->page_array_slab); } -static int __init f2fs_init_cic_cache(void) +int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; - return 0; -} - -static void f2fs_destroy_cic_cache(void) -{ - kmem_cache_destroy(cic_entry_slab); -} - -static int __init f2fs_init_dic_cache(void) -{ dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) - return -ENOMEM; - return 0; -} - -static void f2fs_destroy_dic_cache(void) -{ - kmem_cache_destroy(dic_entry_slab); -} - -int __init f2fs_init_compress_cache(void) -{ - int err; - - err = f2fs_init_cic_cache(); - if (err) - goto out; - err = f2fs_init_dic_cache(); - if (err) goto free_cic; return 0; free_cic: - f2fs_destroy_cic_cache(); -out: + kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { - f2fs_destroy_dic_cache(); - f2fs_destroy_cic_cache(); + kmem_cache_destroy(dic_entry_slab); + kmem_cache_destroy(cic_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cfa6e1322e46..6ef1089df4c7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -39,10 +40,8 @@ static struct bio_set f2fs_bioset; int __init f2fs_init_bioset(void) { - if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, - 0, BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; + return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) @@ -69,8 +68,7 @@ static bool __is_cp_guaranteed(struct page *page) if (f2fs_is_compressed_page(page)) return false; - if ((S_ISREG(inode->i_mode) && - (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; @@ -117,42 +115,56 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + /* + * decompression_attempted keeps track of whether + * f2fs_end_read_compressed_page() has been called on the pages in the + * bio that belong to a compressed cluster yet. + */ + bool decompression_attempted; block_t fs_blkaddr; }; -static void f2fs_finish_read_bio(struct bio *bio) +/* + * Update and unlock a bio's pages, and free the bio. + * + * This marks pages up-to-date only if there was no error in the bio (I/O error, + * decryption error, or verity error), as indicated by bio->bi_status. + * + * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) + * aren't marked up-to-date here, as decompression is done on a per-compression- + * cluster basis rather than a per-bio basis. Instead, we only must do two + * things for each compressed page here: call f2fs_end_read_compressed_page() + * with failed=true if an error occurred before it would have normally gotten + * called (i.e., I/O error or decryption error, but *not* verity error), and + * release the bio's reference to the decompress_io_ctx of the page's cluster. 
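To make the decompression_attempted bookkeeping concrete, the two completion paths look like this (names from the patch): on success, f2fs_read_end_io() reaches f2fs_handle_step_decompress(), which sets ctx->decompression_attempted = true and calls f2fs_end_read_compressed_page(page, false, blkaddr, in_task) for each compressed page; on an early failure (I/O or decryption error), f2fs_finish_read_bio() sees decompression_attempted still false and calls f2fs_end_read_compressed_page(page, true, 0, in_task) itself, so the cluster's remaining_pages count still reaches zero and the dic is released exactly once.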
+ */ +static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; + struct bio_post_read_ctx *ctx = bio->bi_private; - /* - * Update and unlock the bio's pagecache pages, and put the - * decompression context for any compressed pages. - */ bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) { - if (bio->bi_status) - f2fs_end_read_compressed_page(page, true, 0); - f2fs_put_page_dic(page); + if (ctx && !ctx->decompression_attempted) + f2fs_end_read_compressed_page(page, true, 0, + in_task); + f2fs_put_page_dic(page, in_task); continue; } - /* PG_error was set if decryption or verity failed. */ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); + if (ctx) + mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } @@ -185,14 +197,16 @@ static void f2fs_verify_bio(struct work_struct *work) struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && - !PageError(page) && !fsverity_verify_page(page)) - SetPageError(page); + !fsverity_verify_page(page)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } } else { fsverity_verify_bio(bio); } - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, true); } /* @@ -204,7 +218,7 @@ static void f2fs_verify_bio(struct work_struct *work) * can involve reading verity metadata pages from the file, and these verity * metadata pages may be encrypted and/or compressed. */ -static void f2fs_verify_and_finish_bio(struct bio *bio) +static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { struct bio_post_read_ctx *ctx = bio->bi_private; @@ -212,7 +226,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio) INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); } else { - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, in_task); } } @@ -225,7 +239,8 @@ static void f2fs_verify_and_finish_bio(struct bio *bio) * that the bio includes at least one compressed page. The actual decompression * is done on a per-cluster basis, not a per-bio basis. */ -static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, + bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; @@ -235,16 +250,17 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; - /* PG_error was set if decryption failed. 
*/ if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, PageError(page), - blkaddr); + f2fs_end_read_compressed_page(page, false, blkaddr, + in_task); else all_compressed = false; blkaddr++; } + ctx->decompression_attempted = true; + /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully @@ -258,20 +274,24 @@ static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - if (ctx->enabled_steps & STEP_DECRYPT) - fscrypt_decrypt_bio(ctx->bio); + if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { + f2fs_finish_read_bio(bio, true); + return; + } if (ctx->enabled_steps & STEP_DECOMPRESS) - f2fs_handle_step_decompress(ctx); + f2fs_handle_step_decompress(ctx, true); - f2fs_verify_and_finish_bio(ctx->bio); + f2fs_verify_and_finish_bio(bio, true); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; + bool intask = in_task(); iostat_update_and_unbind_ctx(bio, 0); ctx = bio->bi_private; @@ -282,16 +302,29 @@ static void f2fs_read_end_io(struct bio *bio) } if (bio->bi_status) { - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, intask); return; } - if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - queue_work(ctx->sbi->post_read_wq, &ctx->work); - } else { - f2fs_verify_and_finish_bio(bio); + if (ctx) { + unsigned int enabled_steps = ctx->enabled_steps & + (STEP_DECRYPT | STEP_DECOMPRESS); + + /* + * If we have only decompression step between decompression and + * decrypt, we don't need post processing for this. 
+ */ + if (enabled_steps == STEP_DECOMPRESS && + !f2fs_low_mem_mode(sbi)) { + f2fs_handle_step_decompress(ctx, intask); + } else if (enabled_steps) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + return; + } } + + f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) @@ -318,7 +351,8 @@ static void f2fs_write_end_io(struct bio *bio) mempool_free(page, sbi->write_io_dummy); if (unlikely(bio->bi_status)) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); continue; } @@ -334,7 +368,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && @@ -354,7 +389,7 @@ static void f2fs_write_end_io(struct bio *bio) } struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) + block_t blk_addr, sector_t *sector) { struct block_device *bdev = sbi->sb->s_bdev; int i; @@ -369,10 +404,9 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } } - if (bio) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); - } + + if (sector) + *sector = SECTOR_FROM_BLOCK(blk_addr); return bdev; } @@ -389,22 +423,55 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } +static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) +{ + unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; + unsigned int fua_flag, meta_flag, io_flag; + unsigned int op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + + /* + * data/node io flag bits per temp: + * REQ_META | REQ_FUA | + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + */ + if ((1 << fio->temp) & meta_flag) + op_flags |= REQ_META; + if ((1 << fio->temp) & fua_flag) + op_flags |= REQ_FUA; + return op_flags; +} + static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; + struct block_device *bdev; + sector_t sector; struct bio *bio; + bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector); bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); - - f2fs_target_device(sbi, fio->new_blkaddr, bio); + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, fio->op, fio->op_flags | f2fs_io_flags(fio)); + bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, - fio->type, fio->temp); } iostat_alloc_and_bind_ctx(sbi, bio, NULL); @@ -425,6 +492,8 @@ static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, */ if (!fio || !fio->encrypted_page) fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); + else if (fscrypt_inode_should_skip_dm_default_key(inode)) + bio_set_skip_dm_default_key(bio); } static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, @@ -436,7 +505,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, * read/write
raw data without encryption. */ if (fio && fio->encrypted_page) - return !bio_has_crypt_ctx(bio); + return !bio_has_crypt_ctx(bio) && + (bio_should_skip_dm_default_key(bio) == + fscrypt_inode_should_skip_dm_default_key(inode)); return fscrypt_mergeable_bio(bio, inode, next_idx); } @@ -500,34 +571,6 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi, __submit_bio(sbi, bio, type); } -static void __attach_io_flag(struct f2fs_io_info *fio) -{ - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int io_flag, fua_flag, meta_flag; - - if (fio->type == DATA) - io_flag = sbi->data_io_flag; - else if (fio->type == NODE) - io_flag = sbi->node_io_flag; - else - return; - - fua_flag = io_flag & temp_mask; - meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; - - /* - * data/node io flag bits per temp: - * REQ_META | REQ_FUA | - * 5 | 4 | 3 | 2 | 1 | 0 | - * Cold | Warm | Hot | Cold | Warm | Hot | - */ - if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; - if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; -} - static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -535,9 +578,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - __attach_io_flag(fio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); - if (is_read_io(fio->op)) trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); else @@ -584,24 +624,51 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return false; } +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - down_write(&io->io_rwsem); + f2fs_down_write(&io->io_rwsem); /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; + io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); - up_write(&io->io_rwsem); + f2fs_up_write(&io->io_rwsem); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, @@ -616,9 +683,9 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - down_read(&io->io_rwsem); + f2fs_down_read(&io->io_rwsem); ret = __has_merged_page(io->bio, inode, page, ino); - up_read(&io->io_rwsem); + f2fs_up_read(&io->io_rwsem); } if (ret) __f2fs_submit_merged_write(sbi, type, temp); @@ -660,8 +727,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if 
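A worked example of the f2fs_io_flags() bit layout (the same layout as the __attach_io_flag() helper being removed above), using a hypothetical sysfs setting: data_io_flag = 0x0c = 0b001100 yields fua_flag = 0b100 and meta_flag = 0b001, so COLD data writes pick up REQ_FUA ((1 << COLD) & fua_flag is non-zero), HOT data writes pick up REQ_META ((1 << HOT) & meta_flag is non-zero), and WARM data writes get neither flag.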
(!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? - META_GENERIC : DATA_GENERIC_ENHANCE))) + META_GENERIC : DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -679,11 +748,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); - __attach_io_flag(fio); - bio_set_op_attrs(bio, fio->op, fio->op_flags); - inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page): WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page)); __submit_bio(fio->sbi, bio, fio->type); return 0; @@ -742,9 +808,9 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) f2fs_bug_on(sbi, 1); - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_add_tail(&be->list, &io->bio_list); - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } static void del_bio_entry(struct bio_entry *be) @@ -766,7 +832,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct list_head *head = &io->bio_list; struct bio_entry *be; - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (be->bio != *bio) continue; @@ -790,7 +856,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, __submit_bio(sbi, *bio, DATA); break; } - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } if (ret) { @@ -816,7 +882,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, if (list_empty(head)) continue; - down_read(&io->bio_list_lock); + f2fs_down_read(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); @@ -826,14 +892,14 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, if (found) break; } - up_read(&io->bio_list_lock); + f2fs_up_read(&io->bio_list_lock); if (!found) continue; found = false; - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); @@ -846,7 +912,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, break; } } - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } if (found) @@ -864,8 +930,10 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, - __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + __is_meta_io(fio) ? 
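Each -EFSCORRUPTED return in this file is now paired with f2fs_handle_error(), which records why the corruption was detected. The helper itself is outside this hunk; below is a sketch of its likely shape, assuming it follows the upstream error-recording design where the ERROR_* codes index a bitmap persisted in the superblock (error_lock, errors[] and error_dirty are names from that design, not from this diff):

void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error)
{
	unsigned long flags;

	if (error >= ERROR_MAX)
		return;

	/* record only the first occurrence; flushed with a later sb write */
	spin_lock_irqsave(&sbi->error_lock, flags);
	if (!test_bit(error, (unsigned long *)sbi->errors)) {
		__set_bit(error, (unsigned long *)sbi->errors);
		sbi->error_dirty = true;
	}
	spin_unlock_irqrestore(&sbi->error_lock, flags);
}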
META_GENERIC : DATA_GENERIC)) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -875,10 +943,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); - __attach_io_flag(fio); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, fio->page->index, fio, GFP_NOIO); - bio_set_op_attrs(bio, fio->op, fio->op_flags); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { @@ -906,7 +972,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); - down_write(&io->io_rwsem); + f2fs_down_write(&io->io_rwsem); next: if (fio->in_list) { spin_lock(&io->io_lock); @@ -973,7 +1039,7 @@ out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); - up_write(&io->io_rwsem); + f2fs_up_write(&io->io_rwsem); } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, @@ -984,17 +1050,18 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio *bio; struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; + sector_t sector; + struct block_device *bdev = f2fs_target_device(sbi, blkaddr, &sector); bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, bio_max_segs(nr_pages), &f2fs_bioset); + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (!bio) return ERR_PTR(-ENOMEM); - + bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); - - f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= STEP_DECRYPT; @@ -1016,6 +1083,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; + ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); @@ -1042,9 +1110,8 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, bio_put(bio); return -EFAULT; } - ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); return 0; } @@ -1081,7 +1148,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; f2fs_set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); + f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ @@ -1150,7 +1217,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) struct extent_info ei = {0, }; struct inode *inode = dn->inode; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn->data_blkaddr = ei.blk + index - ei.fofs; return 0; } @@ -1159,7 +1226,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write) + int op_flags, bool for_write, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -1171,11 +1239,13 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if
(f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; @@ -1183,12 +1253,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; + } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && @@ -1196,6 +1271,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } got_it: @@ -1230,7 +1307,8 @@ put_err: return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1240,7 +1318,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = f2fs_get_read_data_page(inode, index, 0, false); + page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; @@ -1266,7 +1344,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = f2fs_get_read_data_page(inode, index, 0, for_write); + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; @@ -1354,7 +1432,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1376,68 +1454,16 @@ alloc: f2fs_invalidate_compress_page(sbi, old_blkaddr); } f2fs_update_data_blkaddr(dn, dn->data_blkaddr); - - /* - * i_size will be updated by direct_IO. Otherwise, we'll get stale - * data from unwritten block via dio_read. - */ return 0; } -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct f2fs_map_blocks map; - int flag; - int err = 0; - bool direct_io = iocb->ki_flags & IOCB_DIRECT; - - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); - map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); - if (map.m_len > map.m_lblk) - map.m_len -= map.m_lblk; - else - map.m_len = 0; - - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = true; - - if (direct_io) { - map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, iocb, from) ? 
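The next_pgofs out-parameter added to f2fs_get_read_data_page() and f2fs_find_data_page() lets a caller leap over an entire hole after a single failed lookup instead of probing every missing index. A hypothetical caller sketch (dir and npages are assumed locals; a directory walker like f2fs_readdir is the kind of user this targets):

	pgoff_t pgofs = 0, next_pgofs = 0;
	struct page *page;

	while (pgofs < npages) {
		page = f2fs_find_data_page(dir, pgofs, &next_pgofs);
		if (IS_ERR(page)) {
			if (PTR_ERR(page) == -ENOENT) {
				pgofs = next_pgofs;	/* skip the whole hole */
				continue;
			}
			return PTR_ERR(page);
		}
		/* ... consume the page ... */
		f2fs_put_page(page, 0);
		pgofs++;
	}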
- F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO; - goto map_blocks; - } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - if (f2fs_has_inline_data(inode)) - return err; - - flag = F2FS_GET_BLOCK_PRE_AIO; - -map_blocks: - err = f2fs_map_blocks(inode, &map, 1, flag); - if (map.m_len > 0 && err == -ENOSPC) { - if (!direct_io) - set_inode_flag(inode, FI_NO_PREALLOC); - err = 0; - } - return err; -} - void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { if (lock) - down_read(&sbi->node_change); + f2fs_down_read(&sbi->node_change); else - up_read(&sbi->node_change); + f2fs_up_read(&sbi->node_change); } else { if (lock) f2fs_lock_op(sbi); @@ -1465,10 +1491,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; + int bidx = 0; if (!maxblocks) return 0; + map->m_bdev = inode->i_sb->s_bdev; + map->m_multidev_dio = + f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); + map->m_len = 0; map->m_flags = 0; @@ -1476,7 +1507,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1491,6 +1522,21 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + map->m_len = min(map->m_len, + FDEV(bidx).end_blk + 1 - map->m_pblk); + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + } goto out; } @@ -1541,6 +1587,7 @@ next_block: if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } @@ -1570,8 +1617,11 @@ next_block: flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) + if (!err) { + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); + } } if (err) goto sync_out; @@ -1583,6 +1633,8 @@ next_block: (flag != F2FS_GET_BLOCK_FIEMAP || IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); goto sync_out; } if (flag == F2FS_GET_BLOCK_BMAP) { @@ -1609,6 +1661,9 @@ next_block: if (flag == F2FS_GET_BLOCK_PRE_AIO) goto skip; + if (map->m_multidev_dio) + bidx = f2fs_target_device_index(sbi, blkaddr); + if (map->m_len == 0) { /* preallocated unwritten block should be mapped for fiemap. 
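A worked example of the m_multidev_dio remapping above, with a hypothetical two-device layout: FDEV(0) covers blocks 0x0000-0x3fff and FDEV(1) covers 0x4000-0x7fff. A mapping of m_pblk = 0x4100, m_len = 0x200 resolves to bidx = 1, so m_bdev = FDEV(1).bdev, m_pblk becomes 0x4100 - 0x4000 = 0x100 (device-relative), and m_len = min(0x200, FDEV(1).end_blk + 1 - 0x100) = 0x200. The caller can then submit the DIO bio straight to the component device.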
*/ if (blkaddr == NEW_ADDR) @@ -1617,10 +1672,15 @@ next_block: map->m_pblk = blkaddr; map->m_len = 1; + + if (map->m_multidev_dio) + map->m_bdev = FDEV(bidx).bdev; } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || flag == F2FS_GET_BLOCK_PRE_DIO) { + if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) + goto sync_out; ofs++; map->m_len++; } else { @@ -1657,7 +1717,7 @@ skip: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1673,16 +1733,36 @@ skip: sync_out: - /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + /* + * for hardware encryption, but to avoid potential issue + * in future + */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + + f2fs_bug_on(sbi, blk_addr + map->m_len > + FDEV(bidx).end_blk + 1); + } + } + if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1696,7 +1776,7 @@ unlock_out: f2fs_balance_fs(sbi, dn.node_changed); } out: - trace_f2fs_map_blocks(inode, map, err); + trace_f2fs_map_blocks(inode, map, create, flag, err); return err; } @@ -1736,47 +1816,6 @@ static inline u64 blks_to_bytes(struct inode *inode, u64 blks) return (blks << inode->i_blkbits); } -static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs, int seg_type, bool may_write) -{ - struct f2fs_map_blocks map; - int err; - - map.m_lblk = iblock; - map.m_len = bytes_to_blks(inode, bh->b_size); - map.m_next_pgofs = next_pgofs; - map.m_next_extent = NULL; - map.m_seg_type = seg_type; - map.m_may_create = may_write; - - err = f2fs_map_blocks(inode, &map, create, flag); - if (!err) { - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = blks_to_bytes(inode, map.m_len); - } - return err; -} - -static int get_data_block_dio_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - true); -} - -static int get_data_block_dio(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - false); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -1796,7 +1835,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return 
err; @@ -1819,7 +1858,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); - if (err || err == 1) + if (err) return err; } @@ -1828,7 +1867,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -2083,6 +2122,8 @@ got_it: if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto out; } } else { @@ -2131,8 +2172,8 @@ submit_and_realloc: goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); - f2fs_update_iostat(F2FS_I_SB(inode), FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); *last_block_in_bio = block_nr; goto out; confused: @@ -2159,7 +2200,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct extent_info ei = {0, }; + struct extent_info ei = {}; bool from_dnode = true; int i; int ret = 0; @@ -2193,7 +2234,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; - if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) @@ -2252,7 +2293,7 @@ skip_reading_dnode: if (f2fs_load_compressed_page(sbi, page, blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic); + f2fs_decompress_cluster(dic, true); continue; } @@ -2270,7 +2311,7 @@ submit_and_realloc: page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret); + f2fs_decompress_end_io(dic, ret, true); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; @@ -2285,9 +2326,7 @@ submit_and_realloc: refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = blkaddr; } @@ -2304,7 +2343,6 @@ out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); - ClearPageError(cc->rpages[i]); unlock_page(cc->rpages[i]); } } @@ -2401,7 +2439,6 @@ read_single_page: #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif - SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } @@ -2512,6 +2549,9 @@ static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; + if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) && + is_inode_flag_set(inode, FI_OPU_WRITE)) + return false; if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) @@ -2554,7 +2594,7 @@ bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, 
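F2FS_IPU_HONOR_OPU_WRITE is checked before any bit that forces in-place updates, which gives check_inplace_update_policy() this decision order (first match wins):

/*
 * FI_OPU_WRITE set && HONOR_OPU_WRITE policy bit -> never IPU
 * F2FS_IPU_FORCE                                 -> IPU
 * F2FS_IPU_SSR && f2fs_need_SSR()                -> IPU
 * ...remaining policy bits...
 *
 * FI_OPU_WRITE also defeats the file_is_cold() shortcut above, so files
 * marked for out-of-place update are never pinned to in-place writes
 * merely for being cold.
 */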
fio); @@ -2582,6 +2622,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return true; + if (is_inode_flag_set(inode, FI_OPU_WRITE)) + return true; + if (fio) { if (page_private_gcing(fio->page)) return true; @@ -2614,14 +2657,22 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) bool ipu_force = false; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && - f2fs_lookup_extent_cache(inode, page->index, &ei)) { + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, - DATA_GENERIC_ENHANCE)) + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(fio->sbi, + ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } ipu_force = true; fio->need_lock = LOCK_DONE; @@ -2649,8 +2700,10 @@ got_it: !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } + /* * If current allocation needs SSR, * it had better in-place writes for updated data. @@ -2688,7 +2741,7 @@ got_it: fio->need_lock = LOCK_REQ; } - err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; @@ -2747,6 +2800,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .submitted = false, .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, + .post_read = f2fs_post_read_required(inode), .io_type = io_type, .io_wbc = wbc, .bio = bio, @@ -2787,11 +2841,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, write: if (f2fs_is_drop_cache(inode)) goto out; - /* we should not write 0'th page having journal header */ - if (f2fs_is_volatile_file(inode) && (!page->index || - (!wbc->for_reclaim && - f2fs_available_free_memory(sbi, BASE_CHECK)))) - goto redirty_out; /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { @@ -2801,13 +2850,13 @@ write: * the below discard race condition. 
*/ if (IS_NOQUOTA(inode)) - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); goto done; } @@ -2923,7 +2972,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, { int ret = 0; int done = 0, retry = 0; - struct pagevec pvec; + struct page *pages[F2FS_ONSTACK_PAGES]; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; @@ -2937,6 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, + .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, @@ -2953,8 +3003,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int submitted = 0; int i; - pagevec_init(&pvec); - if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); @@ -2980,18 +3028,22 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); + nr_pages = find_get_pages_range_tag(mapping, &index, end, + tag, F2FS_ONSTACK_PAGES, pages); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + struct page *page = pages[i]; bool need_readd; readd: need_readd = false; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + ret = f2fs_init_compress_ctx(&cc); if (ret) { done = 1; @@ -3010,27 +3062,27 @@ readd: if (unlikely(f2fs_cp_error(sbi))) goto lock_page; - if (f2fs_cluster_is_empty(&cc)) { - void *fsdata = NULL; - struct page *pagep; - int ret2; + if (!f2fs_cluster_is_empty(&cc)) + goto lock_page; - ret2 = f2fs_prepare_compress_overwrite( + if (f2fs_all_cluster_page_ready(&cc, + pages, i, nr_pages, true)) + goto lock_page; + + ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, page->index, &fsdata); - if (ret2 < 0) { - ret = ret2; - done = 1; - break; - } else if (ret2 && - !f2fs_compress_write_end(inode, - fsdata, page->index, - 1)) { - retry = 1; - break; - } - } else { - goto lock_page; + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + (!f2fs_compress_write_end(inode, + fsdata, page->index, 1) || + !f2fs_all_cluster_page_ready(&cc, + pages, i, nr_pages, false))) { + retry = 1; + break; } } #endif @@ -3098,8 +3150,7 @@ result: } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, + f2fs_io_schedule_timeout( DEFAULT_IO_TIMEOUT); goto retry_write; } @@ -3119,7 +3170,7 @@ next: if (need_readd) goto readd; } - pagevec_release(&pvec); + release_pages(pages, nr_pages); cond_resched(); } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -3205,8 +3256,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping, f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; - /* skip writing during file defragment */ - if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + /* skip writing in file defragment preparing stage */ + if (is_inode_flag_set(inode, FI_SKIP_WRITES)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, DATA); @@ -3259,7 +3310,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, FS_CP_DATA_IO : FS_DATA_IO); } -static void f2fs_write_failed(struct inode *inode, 
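The writeback loop above also drops struct pagevec in favor of a bare on-stack page array filled by find_get_pages_range_tag(). The shape of that conversion, condensed (F2FS_ONSTACK_PAGES is defined in f2fs.h, outside this hunk; process() is a placeholder for the loop body):

	struct page *pages[F2FS_ONSTACK_PAGES];
	int nr_pages, i;

	/* was: pagevec_lookup_range_tag(&pvec, mapping, &index, end, tag) */
	nr_pages = find_get_pages_range_tag(mapping, &index, end, tag,
					F2FS_ONSTACK_PAGES, pages);
	for (i = 0; i < nr_pages; i++)
		process(pages[i]);
	/* was: pagevec_release(&pvec) */
	release_pages(pages, nr_pages);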
loff_t to) +void f2fs_write_failed(struct inode *inode, loff_t to) { loff_t i_size = i_size_read(inode); @@ -3268,14 +3319,14 @@ static void f2fs_write_failed(struct inode *inode, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3293,12 +3344,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, int flag; /* - * we already allocated all the blocks, so we don't need to get - * the block addresses when there is no need to fill the page. + * If a whole page is being written and we already preallocated all the + * blocks, then there is no need to get a block address now. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && - !is_inode_flag_set(inode, FI_NO_PREALLOC) && - !f2fs_verity_in_progress(inode)) + if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ @@ -3339,7 +3388,7 @@ restart: } else if (locked) { err = f2fs_get_block(&dn, index); } else { - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3366,6 +3415,104 @@ unlock_out: return err; } +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr = NULL_ADDR; + + /* If pos is beyond the end of file, reserve a new block in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + goto reserve_block; + + /* Look for the block in COW inode first */ + err = 
__find_data_block(cow_inode, index, blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) + goto reserve_block; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + +reserve_block: + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + inc_atomic_write_cnt(inode); + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -3374,7 +3521,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false, drop_atomic = false; + bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3385,14 +3532,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } - if ((f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) || - is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - err = -ENOMEM; - drop_atomic = true; - goto fail; - } - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -3440,7 +3579,11 @@ repeat: *pagep = page; - err = prepare_write_begin(sbi, page, pos, len, + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; @@ -3475,6 +3618,7 @@ repeat: if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); @@ -3496,8 +3640,6 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); - if (drop_atomic) - f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -3541,166 +3683,18 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); if (pos + copied > i_size_read(inode) && - !f2fs_verity_in_progress(inode)) + !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } -static int check_direct_IO(struct inode *inode, struct iov_iter *iter, - loff_t offset) -{ - unsigned i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned blkbits = i_blkbits; - unsigned blocksize_mask = (1 << blkbits) - 1; - unsigned long align = offset | iov_iter_alignment(iter); - struct block_device *bdev = inode->i_sb->s_bdev; - - if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode)) - return 1; - - if (align & blocksize_mask) { - if (bdev) - blkbits = blksize_bits(bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if (align & blocksize_mask) - return -EINVAL; - return 1; - } - return 0; -} - -static void f2fs_dio_end_io(struct bio *bio) -{ - struct f2fs_private_dio *dio = bio->bi_private; - - dec_page_count(F2FS_I_SB(dio->inode), - 
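/*
 * Decision ladder of prepare_atomic_write_begin() above, summarized:
 *   1. write beyond i_size               -> reserve a new block in cow_inode
 *   2. block already staged in cow_inode -> reuse it as-is
 *   3. FI_ATOMIC_REPLACE set             -> skip the original file, reserve
 *   4. otherwise                         -> remember the original block,
 *      still reserve in cow_inode, then return the original address so the
 *      old data is read in before a partial-page update lands.
 */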
dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ); - - bio->bi_private = dio->orig_private; - bio->bi_end_io = dio->orig_end_io; - - kfree(dio); - - bio_endio(bio); -} - -static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, - loff_t file_offset) -{ - struct f2fs_private_dio *dio; - bool write = (bio_op(bio) == REQ_OP_WRITE); - - dio = f2fs_kzalloc(F2FS_I_SB(inode), - sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) - goto out; - - dio->inode = inode; - dio->orig_end_io = bio->bi_end_io; - dio->orig_private = bio->bi_private; - dio->write = write; - - bio->bi_end_io = f2fs_dio_end_io; - bio->bi_private = dio; - - inc_page_count(F2FS_I_SB(inode), - write ? F2FS_DIO_WRITE : F2FS_DIO_READ); - - submit_bio(bio); - return; -out: - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); -} - -static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - int rw = iov_iter_rw(iter); - int err; - enum rw_hint hint = iocb->ki_hint; - int whint_mode = F2FS_OPTION(sbi).whint_mode; - bool do_opu; - - err = check_direct_IO(inode, iter, offset); - if (err) - return err < 0 ? err : 0; - - if (f2fs_force_buffered_io(inode, iocb, iter)) - return 0; - - do_opu = rw == WRITE && f2fs_lfs_mode(sbi); - - trace_f2fs_direct_IO_enter(inode, offset, count, rw); - - if (rw == WRITE && whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { - up_read(&fi->i_gc_rwsem[rw]); - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - } else { - down_read(&fi->i_gc_rwsem[rw]); - if (do_opu) - down_read(&fi->i_gc_rwsem[READ]); - } - - err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, rw == WRITE ? get_data_block_dio_write : - get_data_block_dio, NULL, f2fs_dio_submit_bio, - rw == WRITE ? 
DIO_LOCKING | DIO_SKIP_HOLES : - DIO_SKIP_HOLES); - - if (do_opu) - up_read(&fi->i_gc_rwsem[READ]); - - up_read(&fi->i_gc_rwsem[rw]); - - if (rw == WRITE) { - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; - if (err > 0) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - err); - if (!do_opu) - set_inode_flag(inode, FI_UPDATE_WRITE); - } else if (err == -EIOCBQUEUED) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - count - iov_iter_count(iter)); - } else if (err < 0) { - f2fs_write_failed(inode, offset + count); - } - } else { - if (err > 0) - f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err); - else if (err == -EIOCBQUEUED) - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO, - count - iov_iter_count(iter)); - } - -out: - trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); - - return err; -} - void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { @@ -3724,15 +3718,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_page_private_gcing(page); - if (test_opt(sbi, COMPRESS_CACHE)) { - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(page); - } - - if (page_private_atomic(page)) - return f2fs_drop_inmem_page(inode, page); + if (test_opt(sbi, COMPRESS_CACHE) && + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); detach_page_private(page); set_page_private(page, 0); @@ -3744,17 +3732,10 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (PageDirty(page)) return 0; - /* This is atomic written page, keep Private */ - if (page_private_atomic(page)) - return 0; - if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct inode *inode = page->mapping->host; - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode))) clear_page_private_data(page); } @@ -3776,20 +3757,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (PageSwapCache(page)) return __set_page_dirty_nobuffers(page); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!page_private_atomic(page)) { - f2fs_register_inmem_page(inode, page); - return 1; - } - /* - * Previously, this page has been registered, we just - * return here. - */ - return 0; - } - - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + if (__set_page_dirty_nobuffers(page)) { f2fs_update_dirty_page(inode, page); return 1; } @@ -3867,42 +3835,14 @@ out: int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { - int rc, extra_count; - struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = page_private_atomic(page); + int rc, extra_count = 0; BUG_ON(PageWriteback(page)); - /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written) { - if (mode != MIGRATE_SYNC) - return -EBUSY; - if (!mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; - } - - /* one extra reference was held for atomic_write page */ - extra_count = atomic_written ? 
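/*
 * The deletions around this point are one half of the atomic-write rework
 * in this series: pages of an atomic file are no longer pinned in memory
 * and tagged via page_private_atomic(), so the invalidate, release, dirty
 * and migrate hooks all lose their special cases; updates are staged in
 * the per-inode cow_inode instead (see prepare_atomic_write_begin()
 * earlier in this patch).
 */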
1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) { - if (atomic_written) - mutex_unlock(&fi->inmem_lock); + if (rc != MIGRATEPAGE_SUCCESS) return rc; - } - - if (atomic_written) { - struct inmem_pages *cur; - - list_for_each_entry(cur, &fi->inmem_pages, list) - if (cur->page == page) { - cur->page = newpage; - break; - } - mutex_unlock(&fi->inmem_lock); - put_page(page); - get_page(newpage); - } /* guarantee to start from no stale private field */ set_page_private(newpage, 0); @@ -3936,19 +3876,20 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int end_sec = secidx + blkcnt / blk_per_sec; int ret = 0; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); + set_inode_flag(inode, FI_OPU_WRITE); for (; secidx < end_sec; secidx++) { - down_write(&sbi->pin_sem); + f2fs_down_write(&sbi->pin_sem); f2fs_lock_op(sbi); f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); - set_inode_flag(inode, FI_DO_DEFRAG); + set_inode_flag(inode, FI_SKIP_WRITES); for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { struct page *page; @@ -3956,7 +3897,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, page = f2fs_get_lock_data_page(inode, blkidx, true); if (IS_ERR(page)) { - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); ret = PTR_ERR(page); goto done; } @@ -3965,22 +3906,23 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, f2fs_put_page(page, 1); } - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); ret = filemap_fdatawrite(inode->i_mapping); - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); if (ret) break; } done: - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); + clear_inode_flag(inode, FI_OPU_WRITE); clear_inode_flag(inode, FI_ALIGNED_WRITE); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -4123,6 +4065,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret < 0) return ret; + stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; @@ -4132,6 +4075,7 @@ static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); + stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else @@ -4156,7 +4100,7 @@ const struct address_space_operations f2fs_dblock_aops = { .set_page_dirty = f2fs_set_data_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, - .direct_IO = f2fs_direct_IO, + .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, @@ -4212,9 +4156,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); - if (!sbi->post_read_wq) - return -ENOMEM; - return 0; + return sbi->post_read_wq ? 
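/*
 * Why noop_direct_IO above: keeping ->direct_IO non-NULL preserves the
 * generic "does this mapping support O_DIRECT" checks, while actual direct
 * I/O is expected to be issued through iomap using the f2fs_iomap_ops
 * added at the end of this file.
 */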
0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) @@ -4227,12 +4169,72 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - if (!bio_entry_slab) - return -ENOMEM; - return 0; + return bio_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } + +static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct f2fs_map_blocks map = {}; + pgoff_t next_pgofs = 0; + int err; + + map.m_lblk = bytes_to_blks(inode, offset); + map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + if (flags & IOMAP_WRITE) + map.m_may_create = true; + + err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE, + F2FS_GET_BLOCK_DIO); + if (err) + return err; + + iomap->offset = blks_to_bytes(inode, map.m_lblk); + + /* + * When inline encryption is enabled, sometimes I/O to an encrypted file + * has to be broken up to guarantee DUN contiguity. Handle this by + * limiting the length of the mapping returned. + */ + map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + + if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) { + iomap->length = blks_to_bytes(inode, map.m_len); + if (map.m_flags & F2FS_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; + } else { + iomap->type = IOMAP_UNWRITTEN; + } + if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk))) + return -EINVAL; + + iomap->bdev = map.m_bdev; + iomap->addr = blks_to_bytes(inode, map.m_pblk); + } else { + iomap->length = blks_to_bytes(inode, next_pgofs) - + iomap->offset; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } + + if (map.m_flags & F2FS_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + if ((inode->i_state & I_DIRTY_DATASYNC) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + return 0; +} + +const struct iomap_ops f2fs_iomap_ops = { + .iomap_begin = f2fs_iomap_begin, +}; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b449c7a372a4..32af4f0c5735 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -39,7 +39,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = BLKS_PER_SEC(sbi); + blks_per_sec = CAP_BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, true); @@ -72,15 +72,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); - /* validation check of the segment numbers */ + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ si->hit_largest = atomic64_read(&sbi->read_hit_largest); - si->hit_cached = 
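/*
 * Worked example for f2fs_iomap_begin() above, assuming 4 KiB blocks: a
 * 16 KiB request at file offset 8 KiB that hits a 3-block extent at
 * physical block 1000 comes in as m_lblk = 2, m_len = 4 and is reported as
 *
 *   iomap->offset = 8192        (blks_to_bytes(inode, 2))
 *   iomap->length = 12288       (only 3 blocks are actually mapped)
 *   iomap->type   = IOMAP_MAPPED, flags |= IOMAP_F_MERGED
 *   iomap->addr   = 4096000     (blks_to_bytes(inode, 1000))
 *
 * and the iomap core then calls back at offset 20480 for the remainder.
 */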
atomic64_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); - si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = atomic_read(&sbi->total_ext_tree); - si->zombie_tree = atomic_read(&sbi->total_zombie_tree); - si->ext_node = atomic_read(&sbi->total_ext_node); + si->hit_total[EX_READ] += si->hit_largest; + + /* block age extent_cache only */ + si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks); + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); @@ -91,11 +102,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->aw_cnt = sbi->atomic_files; - si->vw_cnt = atomic_read(&sbi->vw_cnt); + si->aw_cnt = atomic_read(&sbi->atomic_files); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); - si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); @@ -138,6 +146,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; @@ -167,8 +176,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; - si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; - si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -296,28 +303,34 @@ get_cache: sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * sizeof(struct nat_entry_set); - si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += atomic_read(&sbi->total_ext_tree) * + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree); - si->cache_mem += atomic_read(&sbi->total_ext_node) * + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } si->page_mem = 0; if (sbi->node_inode) { - unsigned npages = NODE_MAPPING(sbi)->nrpages; + unsigned long npages = NODE_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { - unsigned npages = META_MAPPING(sbi)->nrpages; + unsigned long npages = META_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (sbi->compress_inode) { - unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + unsigned long npages = 
COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #endif @@ -353,7 +366,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_readonly(si->sbi->sb) ? "RO" : "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); if (si->sbi->s_flag) { @@ -391,6 +404,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -463,64 +478,78 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); - seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " - "Cur time: %4d(ms), Peak time: %4d(ms))\n", - si->nr_queued_ckpt, si->nr_issued_ckpt, - si->nr_total_ckpt, si->cur_ckpt_time, - si->peak_ckpt_time); + seq_puts(s, "CP merge:\n"); + seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt); + seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt); + seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); + seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); + seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); - seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " - "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Low (%d)\n", - si->sbi->gc_reclaimed_segs[GC_NORMAL], - si->sbi->gc_reclaimed_segs[GC_IDLE_CB], - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], - si->sbi->gc_reclaimed_segs[GC_IDLE_AT], - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], - si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_puts(s, " - Reclaimed segs :\n"); + seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Idle Greedy : %d\n", + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + seq_printf(s, " - Urgent High : %d\n", + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "Skipped : atomic write %llu (%llu)\n", - si->skipped_atomic_files[BG_GC] + - si->skipped_atomic_files[FG_GC], - si->skipped_atomic_files[BG_GC]); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); - seq_puts(s, "\nExtent Cache:\n"); + seq_puts(s, "\nExtent Cache (Read):\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", - si->hit_largest, si->hit_cached, - si->hit_rbtree); + 
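/*
 * On the stats refactor here: counters that used to be single fields on
 * f2fs_sb_info are now arrays indexed by extent type (EX_READ vs
 * EX_BLOCK_AGE), and for EX_READ the "largest extent" hits are folded into
 * hit_total on top of the cached/rbtree hits gathered in the loop.
 */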
si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", - !si->total_ext ? 0 : - div64_u64(si->hit_total * 100, si->total_ext), - si->hit_total, si->total_ext); + !si->total_ext[EX_READ] ? 0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", - si->ext_tree, si->zombie_tree, si->ext_node); + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); + seq_puts(s, "\nExtent Cache (Block Age):\n"); + seq_printf(s, " - Allocated Data Blocks: %llu\n", + si->allocated_data_blocks); + seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n", + si->hit_cached[EX_BLOCK_AGE], + si->hit_rbtree[EX_BLOCK_AGE]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_BLOCK_AGE] ? 0 : + div64_u64(si->hit_total[EX_BLOCK_AGE] * 100, + si->total_ext[EX_BLOCK_AGE]), + si->hit_total[EX_BLOCK_AGE], + si->total_ext[EX_BLOCK_AGE]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_BLOCK_AGE], + si->zombie_tree[EX_BLOCK_AGE], + si->ext_node[EX_BLOCK_AGE]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); - seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " - "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->flush_list_empty, + si->flush_list_empty); + seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " - "volatile IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt, - si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - atomic IO: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); @@ -534,6 +563,9 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); + seq_printf(s, " - fsync mark: %4lld\n", + percpu_counter_sum_positive( + &si->sbi->rf_node_block_count)); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", @@ -570,8 +602,12 @@ static int stat_show(struct seq_file *s, void *v) (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %llu KB\n", + seq_printf(s, " - cached all: %llu KB\n", si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - block age extent cache: %llu KB\n", + si->ext_mem[EX_BLOCK_AGE] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } @@ -604,23 +640,28 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic64_set(&sbi->total_hit_ext, 0); - atomic64_set(&sbi->read_hit_rbtree, 0); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ atomic64_set(&sbi->read_hit_largest, 0); - atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); - atomic_set(&sbi->max_vw_cnt, 0); raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5c78350158df..fd764b1d69d7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; bool room = false; int max_slots; @@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; - for (; bidx < end_block; bidx++) { + while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = f2fs_find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; + bidx = next_pgofs; continue; } else { *res_page = dentry_page; @@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); + + bidx++; } if (!de && room && F2FS_I(dir)->chash != fname->hash) { @@ -767,7 +771,7 @@ add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); if (inode) { - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, 
dir, fname, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -794,7 +798,7 @@ add_dentry: f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); f2fs_put_page(dentry_page, 1); @@ -859,7 +863,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) struct page *page; int err = 0; - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -870,7 +874,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) clear_inode_flag(inode, FI_NEW_INODE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); return err; } @@ -878,7 +882,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); @@ -889,7 +893,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) f2fs_i_links_write(inode, false); f2fs_i_size_write(inode, 0); } - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) f2fs_add_orphan_inode(inode); @@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_dir(struct inode *dir) { - unsigned long bidx; + unsigned long bidx = 0; struct page *dentry_page; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; @@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir) if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); - for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = f2fs_get_lock_data_page(dir, bidx, false); + while (bidx < nblock) { + pgoff_t next_pgofs; + + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) + if (PTR_ERR(dentry_page) == -ENOENT) { + bidx = next_pgofs; continue; - else + } else { return false; + } } dentry_blk = page_address(dentry_page); @@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir) NR_DENTRY_IN_BLOCK, bit_pos); - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; + + bidx++; } return true; } @@ -1000,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; - bool readdir_ra = sbi->readdir_ra == 1; + bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; @@ -1041,6 +1051,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } @@ -1103,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -1117,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = 
f2fs_find_data_page(inode, n, &next_pgofs); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { err = 0; + n = next_pgofs; continue; } else { goto out_free; @@ -1140,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } f2fs_put_page(dentry_page, 0); + + n++; } out_free: fscrypt_fname_free_buffer(&fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 6a9ab5c11939..342af24b2f8c 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -6,6 +6,10 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim * Chao Yu + * + * block_age-based extent cache added by: + * Copyright (c) 2022 xiaomi Co., Ltd. + * http://www.xiaomi.com/ */ #include <linux/fs.h> @@ -15,6 +19,123 @@ #include "node.h" #include <trace/events/f2fs.h> +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen, + unsigned long age, unsigned long last_blocks, + enum extent_type type) +{ + ei->fofs = fofs; + ei->len = len; + + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif + } else if (type == EX_BLOCK_AGE) { + ei->age = age; + ei->last_blocks = last_blocks; + } +} + +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); +} + +static bool __may_age_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return false; + /* don't cache block age info for cold file */ + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) +{ + if (type == EX_READ) + return __may_read_extent_tree(inode); + else if (type == EX_BLOCK_AGE) + return __may_age_extent_tree(inode); + return false; +} + +static bool __may_extent_tree(struct inode *inode, enum extent_type type) +{ + /* + * for recovered files during mount do not create extents + * if shrinker is not registered.
+ */ + if (list_empty(&F2FS_I_SB(inode)->s_list)) + return false; + + return __init_may_extent_tree(inode, type); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (et->type != EX_READ) + return; + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front, enum extent_type type) +{ + if (type == EX_READ) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } else if (type == EX_BLOCK_AGE) { + return (back->fofs + back->len == front->fofs && + abs(back->age - front->age) <= SAME_AGE_REGION && + abs(back->last_blocks - front->last_blocks) <= + SAME_AGE_REGION); + } + return false; +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back, enum extent_type type) +{ + return __is_extent_mergeable(back, cur, type); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front, enum extent_type type) +{ + return __is_extent_mergeable(cur, front, type); +} + static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, unsigned int ofs) { @@ -237,6 +358,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct rb_node *parent, struct rb_node **p, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en; en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); @@ -250,16 +372,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); - atomic_inc(&sbi->total_ext_node); + atomic_inc(&eti->total_ext_node); return en; } static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); - atomic_dec(&sbi->total_ext_node); + atomic_dec(&eti->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; @@ -275,61 +399,51 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, static void __release_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - spin_lock(&sbi->extent_lock); + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); f2fs_bug_on(sbi, list_empty(&en->list)); list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); } -static struct extent_tree *__grab_extent_tree(struct inode *inode) +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et; nid_t ino = inode->i_ino; - mutex_lock(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS, true, NULL); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + 
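/*
 * Worked example for __is_extent_mergeable() above: for EX_READ, a back
 * extent covering file blocks [100, 104) at disk blocks [2000, 2004) and a
 * front extent starting at fofs 104 with blk 2004 are contiguous in both
 * spaces, so they merge into one [100, 108) node; for EX_BLOCK_AGE the
 * block addresses are ignored and the two ages (and last_blocks stamps)
 * merely have to lie within SAME_AGE_REGION of each other.
 */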
f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; + et->type = type; et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); atomic_set(&et->node_cnt, 0); - atomic_inc(&sbi->total_ext_tree); + atomic_inc(&eti->total_ext_tree); } else { - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_zombie_tree); list_del_init(&et->list); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); /* never died until evict_inode */ - F2FS_I(inode)->extent_tree = et; + F2FS_I(inode)->extent_tree[type] = et; return et; } -static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei) -{ - struct rb_node **p = &et->root.rb_root.rb_node; - struct extent_node *en; - - en = __attach_extent_node(sbi, et, ei, NULL, p, true); - if (!en) - return NULL; - - et->largest = en->ei; - et->cached_en = en; - return en; -} - static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et) { @@ -358,71 +472,89 @@ static void __drop_largest_extent(struct extent_tree *et, } } -/* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL; + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) { - /* drop largest extent */ + if (!__may_extent_tree(inode, EX_READ)) { + /* drop largest read extent */ if (i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); - return; } - return; + goto out; } - et = __grab_extent_tree(inode); + et = __grab_extent_tree(inode, EX_READ); if (!i_ext || !i_ext->len) - return; + goto out; - get_extent_info(&ei, i_ext); + get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) - goto out; + goto unlock_out; - en = __init_extent_tree(sbi, et, &ei); + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); if (en) { - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); + et->largest = en->ei; + et->cached_en = en; + + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); } -out: +unlock_out: write_unlock(&et->lock); -} - -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) -{ - __f2fs_init_extent_tree(inode, ipage); - - if (!F2FS_I(inode)->extent_tree) +out: + if (!F2FS_I(inode)->extent_tree[EX_READ]) set_inode_flag(inode, FI_NO_EXTENT); } -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +void f2fs_init_age_extent_tree(struct inode *inode) +{ + if (!__init_may_extent_tree(inode, EX_BLOCK_AGE)) + return; + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + +void f2fs_init_extent_tree(struct inode *inode) +{ + /* initialize read cache */ + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); + + /* initialize block age cache */ + if (__init_may_extent_tree(inode, EX_BLOCK_AGE)) + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + +static bool 
__lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en; bool ret = false; if (!et) return false; - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); read_lock(&et->lock); - if (et->largest.fofs <= pgofs && + if (type == EX_READ && + et->largest.fofs <= pgofs && et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; @@ -436,23 +568,26 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; if (en == et->cached_en) - stat_inc_cached_node_hit(sbi); + stat_inc_cached_node_hit(sbi, type); else - stat_inc_rbtree_node_hit(sbi); + stat_inc_rbtree_node_hit(sbi, type); *ei = en->ei; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); ret = true; out: - stat_inc_total_hit(sbi); + stat_inc_total_hit(sbi, type); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + else if (type == EX_BLOCK_AGE) + trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei); return ret; } @@ -461,18 +596,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en = NULL; - if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } - if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { next_ex->ei.fofs = ei->fofs; - next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; if (en) __release_extent_node(sbi, et, prev_ex); @@ -484,12 +621,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } @@ -499,6 +636,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -521,48 +659,55 @@ do_insert: __try_update_largest_extent(et, en); /* update in global extent list */ - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); et->cached_en = en; - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } -static void f2fs_update_extent_tree_range(struct inode *inode, - pgoff_t fofs, block_t blkaddr, 
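/*
 * Lookup order in __lookup_extent_tree() above, in brief: the per-tree
 * "largest" extent first (EX_READ only), then the cached_en shortcut, then
 * a full rbtree walk; every hit moves the node to the tail of the per-type
 * LRU list so that the shrinker reclaims the coldest entries first.
 */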
unsigned int len) +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; - unsigned int pos = (unsigned int)fofs; bool updated = false; bool leftmost = false; if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); + else if (type == EX_BLOCK_AGE) + trace_f2fs_update_age_extent_tree_range(inode, fofs, len, + tei->age, tei->last_blocks); write_lock(&et->lock); - if (is_inode_flag_set(inode, FI_NO_EXTENT)) { - write_unlock(&et->lock); - return; + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } + + prev = et->largest; + dei.len = 0; + + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); } - prev = et->largest; - dei.len = 0; - - /* - * drop largest extent before lookup, in case it's already - * been shrunk from extent tree - */ - __drop_largest_extent(et, fofs, len); - /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, (struct rb_entry *)et->cached_en, fofs, @@ -582,26 +727,32 @@ static void f2fs_update_extent_tree_range(struct inode *inode, dei = en->ei; org_end = dei.fofs + dei.len; - f2fs_bug_on(sbi, pos >= org_end); + f2fs_bug_on(sbi, fofs >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - en->ei.len = pos - en->ei.fofs; + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; prev_en = en; parts = 1; } - if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { if (parts) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - org_end - end); + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false, + dei.age, dei.last_blocks, + type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true, + dei.age, dei.last_blocks, + type); next_en = en; } parts++; @@ -631,10 +782,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode, en = next_en; } - /* 3. update extent in extent cache */ - if (blkaddr) { + if (type == EX_BLOCK_AGE) + goto update_age_extent_cache; - set_extent_info(&ei, fofs, blkaddr, len); + /* 3. 
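/*
 * Worked example for the splitting logic above (EX_READ): updating
 * [64, 128) inside an existing extent [0, 256) at blk 5000 keeps a head
 * [0, 64) at blk 5000 and inserts a tail [128, 256) at blk 5128; a part
 * that would end up shorter than F2FS_MIN_EXTENT_LEN is simply dropped
 * from the read cache instead of being kept.
 */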
update extent in read extent cache */ + BUG_ON(type != EX_READ); + + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, + 0, 0, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -656,7 +812,17 @@ static void f2fs_update_extent_tree_range(struct inode *inode, et->largest_updated = false; updated = true; } + goto out_read_extent_cache; +update_age_extent_cache: + if (!tei->last_blocks) + goto out_read_extent_cache; + __set_extent_info(&ei, fofs, len, 0, false, + tei->age, tei->last_blocks, EX_BLOCK_AGE); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +out_read_extent_cache: write_unlock(&et->lock); if (updated) @@ -664,19 +830,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION -void f2fs_update_extent_tree_range_compressed(struct inode *inode, +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_node *en = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei; struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -693,7 +860,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - set_extent_info(&ei, fofs, blkaddr, llen); + __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -704,24 +871,114 @@ unlock_out: } #endif -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +static unsigned long long __calculate_block_age(unsigned long long new, + unsigned long long old) { + unsigned long long diff; + + diff = (new >= old) ? new - (new - old) : new + (old - new); + + return div_u64(diff * LAST_AGE_WEIGHT, 100); +} + +/* This returns a new age and allocated blocks in ei */ +static int __get_new_block_age(struct inode *inode, struct extent_info *ei, + block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t f_size = i_size_read(inode); + unsigned long long cur_blocks = + atomic64_read(&sbi->allocated_data_blocks); + struct extent_info tei = *ei; /* only fofs and len are valid */ + + /* + * When I/O is not aligned to a PAGE_SIZE, update will happen to the last + * file block even in seq write. So don't record age for newly last file + * block here. 
+ */ + if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && + blkaddr == NEW_ADDR) + return -EINVAL; + + if (__lookup_extent_tree(inode, ei->fofs, &tei, EX_BLOCK_AGE)) { + unsigned long long cur_age; + + if (cur_blocks >= tei.last_blocks) + cur_age = cur_blocks - tei.last_blocks; + else + /* allocated_data_blocks overflow */ + cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks; + + if (tei.age) + ei->age = __calculate_block_age(cur_age, tei.age); + else + ei->age = cur_age; + ei->last_blocks = cur_blocks; + WARN_ON(ei->age > cur_blocks); + return 0; + } + + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); + + /* the data block was allocated for the first time */ + if (blkaddr == NEW_ADDR) + goto out; + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { + f2fs_bug_on(sbi, 1); + return -EINVAL; + } +out: + /* + * init block age with zero, this can happen when the block age extent + * was reclaimed due to memory constraint or system reboot + */ + ei->age = 0; + ei->last_blocks = cur_blocks; + return 0; +} + +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) +{ + struct extent_info ei = {}; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } else if (type == EX_BLOCK_AGE) { + if (__get_new_block_age(dn->inode, &ei, dn->data_blkaddr)) + return; + } + __update_extent_tree_range(dn->inode, &ei, type); +} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et, *next; struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, EXTENT_CACHE)) - return 0; - - if (!atomic_read(&sbi->total_zombie_tree)) + if (!atomic_read(&eti->total_zombie_tree)) goto free_node; - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et); @@ -729,61 +986,137 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); list_del_init(&et->list); - radix_tree_delete(&sbi->extent_tree_root, et->ino); + radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); tree_cnt++; if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; cond_resched(); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); free_node: /* 2. 
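/*
 * A note on the age bookkeeping above: __update_extent_cache() dispatches
 * on type -- EX_READ caches the block address (downgrading NEW_ADDR to
 * NULL_ADDR), EX_BLOCK_AGE records a decayed age. As written, both
 * branches of the ternary in __calculate_block_age() reduce algebraically
 * to `old', so the helper returns old * LAST_AGE_WEIGHT / 100; the freshly
 * measured cur_age feeds later decisions through the saved last_blocks
 * stamp rather than through this return value.
 */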
remove LRU extent entries */ - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); for (; remained > 0; remained--) { - if (list_empty(&sbi->extent_list)) + if (list_empty(&eti->extent_list)) break; - en = list_first_entry(&sbi->extent_list, + en = list_first_entry(&eti->extent_list, struct extent_node, list); et = en->et; if (!write_trylock(&et->lock)) { /* refresh this extent node's position in extent list */ - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); continue; } list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); write_unlock(&et->lock); node_cnt++; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); unlock_out: - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); return node_cnt + tree_cnt; } -unsigned int f2fs_destroy_extent_node(struct inode *inode) +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +/* block age extent cache operations */ +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_BLOCK_AGE)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + }; + + if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE); +} + +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et || !atomic_read(&et->node_cnt)) @@ -796,31 +1129,46 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } -void f2fs_drop_extent_tree(struct inode 
*inode) +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); + __destroy_extent_node(inode, EX_BLOCK_AGE); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; - if (!f2fs_may_extent_tree(inode)) + if (!__may_extent_tree(inode, type)) return; write_lock(&et->lock); - set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); - if (et->largest.len) { - et->largest.len = 0; - updated = true; + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } } write_unlock(&et->lock); if (updated) f2fs_mark_inode_dirty_sync(inode, true); } -void f2fs_destroy_extent_tree(struct inode *inode) +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); + __drop_extent_tree(inode, EX_BLOCK_AGE); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et) @@ -828,76 +1176,56 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - mutex_lock(&sbi->extent_tree_lock); - list_add_tail(&et->list, &sbi->zombie_list); - atomic_inc(&sbi->total_zombie_tree); - mutex_unlock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); return; } /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); + node_cnt = __destroy_extent_node(inode, type); /* delete extent tree entry in radix tree */ - mutex_lock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - mutex_unlock(&sbi->extent_tree_lock); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); - F2FS_I(inode)->extent_tree = NULL; + F2FS_I(inode)->extent_tree[type] = NULL; - trace_f2fs_destroy_extent_tree(inode, node_cnt); + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); } -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +void f2fs_destroy_extent_tree(struct inode *inode) { - if (!f2fs_may_extent_tree(inode)) - return false; - - return f2fs_lookup_extent_tree(inode, pgofs, ei); + __destroy_extent_tree(inode, EX_READ); + __destroy_extent_tree(inode, EX_BLOCK_AGE); } -void f2fs_update_extent_cache(struct dnode_of_data *dn) +static void __init_extent_tree_info(struct extent_tree_info *eti) { - pgoff_t fofs; - block_t blkaddr; - - if (!f2fs_may_extent_tree(dn->inode)) - return; - - if (dn->data_blkaddr == NEW_ADDR) - blkaddr = NULL_ADDR; - else - blkaddr = dn->data_blkaddr; - - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); -} - -void 
f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len) - -{ - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); } void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - mutex_init(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - atomic_set(&sbi->total_ext_tree, 0); - INIT_LIST_HEAD(&sbi->zombie_list); - atomic_set(&sbi->total_zombie_tree, 0); - atomic_set(&sbi->total_ext_node, 0); + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); + __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]); + + /* initialize for block age extents */ + atomic64_set(&sbi->allocated_data_blocks, 0); + sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; + sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a144471c5316..54f7ca03f4ac 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -56,6 +56,8 @@ enum { FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, + FAULT_LOCK_OP, + FAULT_BLKADDR, FAULT_MAX, }; @@ -87,7 +89,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_USRQUOTA 0x00080000 @@ -102,6 +104,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -120,6 +123,20 @@ typedef u32 nid_t; #define COMPRESS_EXT_NUM 16 +/* + * An implementation of an rwsem that is explicitly unfair to readers. This + * prevents priority inversion when a low-priority reader acquires the read lock + * while sleeping on the write lock but the write lock is needed by + * higher-priority clients. 
+ */ + +struct f2fs_rwsem { + struct rw_semaphore internal_rwsem; +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_queue_head_t read_waiters; +#endif +}; + struct f2fs_mount_info { unsigned int opt; int write_io_size_bits; /* Write IO size bits */ @@ -137,11 +154,11 @@ struct f2fs_mount_info { int s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ - int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ + int memory_mode; /* memory mode */ int discard_unit; /* * discard command's offset/size should * be aligned to this unit: block, @@ -184,10 +201,6 @@ struct f2fs_mount_info { #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) -#define F2FS_SET_FEATURE(sbi, mask) \ - (sbi->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sbi, mask) \ - (sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* * Default values for user and/or group using reserved blocks @@ -212,7 +225,6 @@ enum { #define CP_PAUSE 0x00000040 #define CP_RESIZE 0x00000080 -#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ @@ -261,7 +273,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ - TRANS_DIR_INO, /* for trasactions dir ino list */ + TRANS_DIR_INO, /* for transactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; @@ -311,8 +323,12 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* minimum discard granularity, unit: block count */ +#define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 +/* default maximum discard granularity of ordered discard, unit: block count */ +#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* max discard pend list number */ #define MAX_PLIST_NUM 512 @@ -387,7 +403,13 @@ struct discard_cmd_control { struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ + unsigned int max_discard_request; /* max. discard request per round */ + unsigned int min_discard_issue_time; /* min. interval between discard issue */ + unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ + unsigned int max_discard_issue_time; /* max. 
interval between discard issue */ + unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ + unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ @@ -562,15 +584,45 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ +/* dirty segments threshold for triggering CP */ +#define DEFAULT_DIRTY_THRESHOLD 4 + +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ -#define EXTENT_CACHE_SHRINK_NUMBER 128 +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 + +/* number of age extent info in extent cache we try to shrink */ +#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 +#define LAST_AGE_WEIGHT 30 +#define SAME_AGE_REGION 1024 + +/* + * Define data block with age less than 1GB as hot data + * define data block with age less than 10GB but more than 1GB as warm data + */ +#define DEF_HOT_DATA_AGE_THRESHOLD 262144 +#define DEF_WARM_DATA_AGE_THRESHOLD 2621440 + +/* extent cache type */ +enum extent_type { + EX_READ, + EX_BLOCK_AGE, + NR_EXTENT_CACHES, +}; struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ @@ -586,10 +638,24 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - u32 blk; /* start block address of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int c_len; /* physical extent length of compressed blocks */ + /* physical extent length of compressed blocks */ + unsigned int c_len; #endif + }; + /* block age extent_cache */ + struct { + /* block age of the extent */ + unsigned long long age; + /* last total blocks allocated */ + unsigned long long last_blocks; + }; + }; }; struct extent_node { @@ -601,13 +667,25 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ - struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie 
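The default age thresholds match the 1GB/10GB comment once expressed in f2fs's 4 KiB blocks; a quick check of the arithmetic, assuming age is counted in allocated blocks as the extent_cache.c hunks above do:

	/* 262144 blocks  * 4 KiB = 2^18 * 2^12 B =  1 GiB -> "hot" boundary  */
	/* 2621440 blocks * 4 KiB = 10 * 2^30   B = 10 GiB -> "warm" boundary */
	_Static_assert(262144ULL * 4096 == 1ULL << 30, "hot threshold = 1 GiB");
	_Static_assert(2621440ULL * 4096 == 10ULL << 30, "warm threshold = 10 GiB");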
tree count */ + atomic_t total_ext_node; /* extent info count */ }; /* @@ -622,6 +700,7 @@ struct extent_tree { F2FS_MAP_UNWRITTEN) struct f2fs_map_blocks { + struct block_device *m_bdev; /* for multi-device dio */ block_t m_pblk; block_t m_lblk; unsigned int m_len; @@ -630,6 +709,7 @@ struct f2fs_map_blocks { pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; bool m_may_create; /* indicate it is from write path */ + bool m_multidev_dio; /* indicate it allows multi-device dio */ }; /* for flag in get_data_block */ @@ -653,6 +733,7 @@ enum { #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 +#define FADVISE_TRUNC_BIT 0x80 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) @@ -680,11 +761,14 @@ enum { #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) +#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) +#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) +#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) + #define DEF_DIR_LEVEL 0 enum { GC_FAILURE_PIN, - GC_FAILURE_ATOMIC, MAX_GC_FAILURE }; @@ -706,20 +790,18 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ - FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ - FI_DO_DEFRAG, /* indicate defragment is running */ + FI_SKIP_WRITES, /* should skip data page writeback */ + FI_OPU_WRITE, /* used for opu per file */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ - FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ @@ -727,6 +809,9 @@ enum { FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ + FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_MAX, /* max flag, never be used */ }; @@ -743,7 +828,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ - struct rw_semaphore i_sem; /* protect fi info */ + struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -762,15 +847,14 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ - struct list_head 
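Because extent_info introduced above overlays the read and block-age payloads in an anonymous union, only the arm matching the tree's extent_type may be touched. A sketch consistent with the initializers used in the extent_cache.c hunks earlier in this patch (fofs, blkaddr and cur_blocks stand in for caller-supplied values):

	/* EX_READ arm: logical range -> start block address */
	struct extent_info read_ei = {
		.fofs = fofs,
		.len  = 1,
		.blk  = blkaddr,		/* read-cache payload */
	};

	/* EX_BLOCK_AGE arm: same header fields, different payload */
	struct extent_info age_ei = {
		.fofs        = fofs,
		.len         = 1,
		.age         = 0,		/* no history yet */
		.last_blocks = cur_blocks,	/* allocation-counter snapshot */
	};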
inmem_ilist; /* list for inmem inodes */ - struct list_head inmem_pages; /* inmemory pages managed by f2fs */ - struct task_struct *inmem_task; /* store inmemory task */ - struct mutex inmem_lock; /* lock for inmemory pages */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct task_struct *atomic_write_task; /* store atomic write task */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ - struct rw_semaphore i_gc_rwsem[2]; - struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + struct f2fs_rwsem i_gc_rwsem[2]; + struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ @@ -785,9 +869,12 @@ struct f2fs_inode_info { unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ + + unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ }; -static inline void get_extent_info(struct extent_info *ext, +static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); @@ -795,7 +882,7 @@ static inline void get_extent_info(struct extent_info *ext, ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); @@ -803,17 +890,6 @@ static inline void set_raw_extent(struct extent_info *ext, i_ext->len = cpu_to_le32(ext->len); } -static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, - u32 blk, unsigned int len) -{ - ei->fofs = fofs; - ei->blk = blk; - ei->len = len; -#ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; -#endif -} - static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { @@ -833,41 +909,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, return __is_discard_mergeable(cur, front, max_len); } -static inline bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) -{ -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; -#endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); -} - -static inline bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) -{ - return __is_extent_mergeable(back, cur); -} - -static inline bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) -{ - return __is_extent_mergeable(cur, front); -} - -extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) -{ - if (en->ei.len > et->largest.len) { - et->largest = en->ei; - et->largest_updated = true; - } -} - /* * For free nid management */ @@ -889,6 +930,7 @@ struct f2fs_nm_info { nid_t max_nid; /* maximum possible node ids */ nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + nid_t max_rf_node_blocks; /* 
max # of nodes for recovery */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ @@ -896,7 +938,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat entry tree */ + struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ @@ -1009,7 +1051,7 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct rw_semaphore curseg_lock; /* for preventing curseg change */ + struct f2fs_rwsem curseg_lock; /* for preventing curseg change */ block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ @@ -1024,9 +1066,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ @@ -1059,7 +1098,6 @@ enum count_type { F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, - F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, @@ -1089,11 +1127,7 @@ enum page_type { META, NR_PAGE_TYPE, META_FLUSH, - INMEM, /* the below types are used by tracepoints only. */ - INMEM_DROP, - INMEM_INVALIDATE, - INMEM_REVOKE, - IPU, + IPU, /* the below types are used by tracepoints only. 
*/ OPU, }; @@ -1130,7 +1164,10 @@ enum iostat_type { APP_BUFFERED_IO, /* app buffered write IOs */ APP_WRITE_IO, /* app write IOs */ APP_MAPPED_IO, /* app mapped IOs */ + APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ + APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ FS_META_IO, /* meta IOs from kworker/reclaimer */ FS_GC_DATA_IO, /* data IOs from forground gc */ @@ -1144,6 +1181,8 @@ enum iostat_type { APP_BUFFERED_READ_IO, /* app buffered read IOs */ APP_READ_IO, /* app read IOs */ APP_MAPPED_READ_IO, /* app mapped read IOs */ + APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ + APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ FS_DATA_READ_IO, /* data read IOs */ FS_GDATA_READ_IO, /* data read IOs from background gc */ FS_CDATA_READ_IO, /* compressed data read IOs */ @@ -1175,6 +1214,7 @@ struct f2fs_io_info { bool retry; /* need to reallocate block address */ int compr_blocks; /* # of compressed block addresses */ bool encrypted; /* indicate file is encrypted */ + bool post_read; /* require post read */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ @@ -1193,11 +1233,11 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ - struct rw_semaphore io_rwsem; /* blocking op for bio */ + struct f2fs_rwsem io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ struct list_head bio_list; /* bio entry list head */ - struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */ + struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */ }; #define FDEV(i) (sbi->devs[i]) @@ -1211,7 +1251,6 @@ struct f2fs_dev_info { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_blkz; /* Total number of zones */ unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ - block_t *zone_capacity_blocks; /* Array of zone capacity in blks */ #endif }; @@ -1219,7 +1258,6 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ - ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1243,6 +1281,15 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -1272,6 +1319,7 @@ enum { MAX_TIME, }; +/* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, @@ -1279,6 +1327,7 @@ enum { GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, + GC_URGENT_MID, MAX_GC_MODE, }; @@ -1292,14 +1341,10 @@ enum { }; enum { - FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ - FS_MODE_LFS, /* use lfs allocation only */ -}; - -enum { - 
WHINT_MODE_OFF, /* not pass down write hints */ - WHINT_MODE_USER, /* try to pass down hints given by users */ - WHINT_MODE_FS, /* pass down hints with F2FS policy */ + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ + FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; enum { @@ -1331,6 +1376,13 @@ enum { DISCARD_UNIT_SECTION, /* basic discard unit is section */ }; +enum { + MEMORY_MODE_NORMAL, /* memory mode for normal devices */ + MEMORY_MODE_LOW, /* memory mode for low memory devices */ +}; + + + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1486,6 +1538,7 @@ struct compress_ctx { unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ + unsigned int valid_nr_cpages; /* valid page number in cpages */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ @@ -1550,6 +1603,7 @@ struct decompress_io_ctx { void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ + struct work_struct free_work; /* work for late free this structure itself */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -1561,7 +1615,7 @@ struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - struct rw_semaphore sb_lock; /* lock for raw super block */ + struct f2fs_rwsem sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ @@ -1581,18 +1635,20 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ /* keep migration IO order for LFS mode */ - struct rw_semaphore io_order_lock; + struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ - struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */ - struct rw_semaphore cp_rwsem; /* blocking FS operations */ - struct rw_semaphore node_write; /* locking node writes */ - struct rw_semaphore node_change; /* locking node change */ + struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */ + struct f2fs_rwsem cp_rwsem; /* blocking FS operations */ + struct f2fs_rwsem node_write; /* locking node writes */ + struct f2fs_rwsem node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -1614,14 +1670,12 @@ struct f2fs_sb_info { struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ - struct radix_tree_root extent_tree_root;/* cache 
extent cache entries */ - struct mutex extent_tree_lock; /* locking extent radix tree */ - struct list_head extent_list; /* lru list for shrinker */ - spinlock_t extent_lock; /* locking extent lru list */ - atomic_t total_ext_tree; /* extent tree count */ - struct list_head zombie_list; /* extent zombie tree list */ - atomic_t total_zombie_tree; /* extent zombie tree count */ - atomic_t total_ext_node; /* extent info count */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + atomic64_t allocated_data_blocks; /* for block age extent_cache */ + + /* The threshold used for hot and warm data seperation*/ + unsigned int hot_data_age_threshold; + unsigned int warm_data_age_threshold; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1632,13 +1686,14 @@ struct f2fs_sb_info { unsigned int meta_ino_num; /* meta inode number*/ unsigned int log_blocks_per_seg; /* log2 blocks per segment */ unsigned int blocks_per_seg; /* blocks per segment */ + unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ unsigned int segs_per_sec; /* segments per section */ unsigned int secs_per_zone; /* sections per zone */ unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ - int readdir_ra; /* readahead inode in readdir */ + bool readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ @@ -1652,12 +1707,14 @@ struct f2fs_sb_info { block_t unusable_block_count; /* # of blocks saved by last cp */ unsigned int nquota_files; /* # of quota sysfile */ - struct rw_semaphore quota_sem; /* blocking cp for flags */ + struct f2fs_rwsem quota_sem; /* blocking cp for flags */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* # of node block writes as roll forward recovery */ + struct percpu_counter rf_node_block_count; /* writeback control */ atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ @@ -1668,7 +1725,7 @@ struct f2fs_sb_info { struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ - struct rw_semaphore gc_lock; /* + struct f2fs_rwsem gc_lock; /* * semaphore for GC, avoid * race between GC and GC or CP */ @@ -1677,15 +1734,16 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ + spinlock_t gc_remaining_trials_lock; + /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ + unsigned int gc_remaining_trials; /* for skip statistic */ - unsigned int atomic_files; /* # of opened atomic file */ - unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; - struct rw_semaphore pin_sem; + struct f2fs_rwsem pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -1702,18 +1760,22 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic64_t total_hit_ext; /* # of lookup extent cache */ - atomic64_t 
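allocated_data_blocks is the global clock that block ages are measured against. A sketch of the intended flow, assuming the counter is bumped on each data-block allocation (the allocation-side hunk is not shown here) and that the two thresholds map onto f2fs's HOT/WARM/COLD temperatures:

	/* at allocation time (assumed call site in the segment allocator) */
	atomic64_inc(&sbi->allocated_data_blocks);

	/* at classification time: "age" = allocation ticks since last update */
	static enum temp_type age_to_temp(struct f2fs_sb_info *sbi,
					  struct extent_info *ei)
	{
		u64 now = atomic64_read(&sbi->allocated_data_blocks);
		u64 age = now - ei->last_blocks;

		if (age < sbi->hot_data_age_threshold)
			return HOT;	/* < 1 GiB allocated since last touch */
		if (age < sbi->warm_data_age_threshold)
			return WARM;	/* < 10 GiB */
		return COLD;
	}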
read_hit_rbtree; /* # of hit rbtree extent node */ - atomic64_t read_hit_largest; /* # of hit largest extent node */ - atomic64_t read_hit_cached; /* # of hit cached extent node */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ - atomic_t vw_cnt; /* # of volatile writes */ + atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ - atomic_t max_vw_cnt; /* max # of volatile writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -1724,7 +1786,7 @@ struct f2fs_sb_info { unsigned int data_io_flag; unsigned int node_io_flag; - /* For sysfs suppport */ + /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; @@ -1736,12 +1798,15 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + /* For multi devices */ int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ - struct mutex umount_mutex; - unsigned int shrinker_run_no; + bool aligned_blksize; /* all devices has the same logical blksize */ /* For write statistics */ u64 sectors_written_start; @@ -1755,6 +1820,10 @@ struct f2fs_sb_info { struct workqueue_struct *post_read_wq; /* post read workqueue */ + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + spinlock_t error_lock; /* protect errors array */ + bool error_dirty; /* errors of sb is dirty */ + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ @@ -1764,6 +1833,15 @@ struct f2fs_sb_info { unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. 
files in fadvise */ + int max_fragment_chunk; /* max chunk size for block fragmentation mode */ + int max_fragment_hole; /* max hole size for block fragmentation mode */ + + /* For atomic write statistics */ + atomic64_t current_atomic_write; + s64 peak_atomic_write; + u64 committed_atomic_block; + u64 revoked_atomic_block; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -1795,13 +1873,6 @@ struct f2fs_sb_info { #endif }; -struct f2fs_private_dio { - struct inode *inode; - void *orig_private; - bio_end_io_t *orig_end_io; - bool write; -}; - #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(sbi, type) \ printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ @@ -2080,29 +2151,105 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } +#define init_f2fs_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_f2fs_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, + const char *sem_name, struct lock_class_key *key) +{ + __init_rwsem(&sem->internal_rwsem, sem_name, key); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + init_waitqueue_head(&sem->read_waiters); +#endif +} + +static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) +{ + return rwsem_is_locked(&sem->internal_rwsem); +} + +static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) +{ + return rwsem_is_contended(&sem->internal_rwsem); +} + +static inline void f2fs_down_read(struct f2fs_rwsem *sem) +{ +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); +#else + down_read(&sem->internal_rwsem); +#endif +} + +static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) +{ + return down_read_trylock(&sem->internal_rwsem); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_read_nested(&sem->internal_rwsem, subclass); +} +#else +#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +#endif + +static inline void f2fs_up_read(struct f2fs_rwsem *sem) +{ + up_read(&sem->internal_rwsem); +} + +static inline void f2fs_down_write(struct f2fs_rwsem *sem) +{ + down_write(&sem->internal_rwsem); +} + +static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) +{ + return down_write_trylock(&sem->internal_rwsem); +} + +static inline void f2fs_up_write(struct f2fs_rwsem *sem) +{ + up_write(&sem->internal_rwsem); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wake_up_all(&sem->read_waiters); +#endif +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - down_read(&sbi->cp_rwsem); + f2fs_down_read(&sbi->cp_rwsem); } static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { - return down_read_trylock(&sbi->cp_rwsem); + if (time_to_inject(sbi, FAULT_LOCK_OP)) { + f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + return 0; + } + return f2fs_down_read_trylock(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - up_read(&sbi->cp_rwsem); + f2fs_up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write(&sbi->cp_rwsem); + f2fs_down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - up_write(&sbi->cp_rwsem); + f2fs_up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) @@ 
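Why the helpers above are deliberately unfair, in short: with CONFIG_F2FS_UNFAIR_RWSEM a reader only ever try-locks and otherwise parks on read_waiters, so it never joins the rwsem's internal wait queue where it could hold up a writer; the writer's release then wakes every parked reader at once. A caller-side sketch using the cp_rwsem wrappers defined above:

	/* reader (any blocked FS operation), e.g. via f2fs_lock_op() */
	f2fs_down_read(&sbi->cp_rwsem);	/* loops on trylock + read_waiters */
	/* ... operation that must not overlap a checkpoint ... */
	f2fs_up_read(&sbi->cp_rwsem);

	/* writer (checkpoint), e.g. via f2fs_lock_all() */
	f2fs_down_write(&sbi->cp_rwsem);
	/* ... write the checkpoint ... */
	f2fs_up_write(&sbi->cp_rwsem);	/* wake_up_all() lets readers retry */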
-2309,6 +2456,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } +static inline void inc_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + u64 current_write; + + fi->atomic_write_cnt++; + atomic64_inc(&sbi->current_atomic_write); + current_write = atomic64_read(&sbi->current_atomic_write); + if (current_write > sbi->peak_atomic_write) + sbi->peak_atomic_write = current_write; +} + +static inline void release_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + + atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write); + fi->atomic_write_cnt = 0; +} + static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { return atomic_read(&sbi->nr_pages[count_type]); @@ -2374,7 +2543,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) - return &ckpt->sit_nat_version_bitmap; + return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { @@ -2412,6 +2581,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { @@ -2581,16 +2751,6 @@ static inline struct page *f2fs_pagecache_get_page( return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } -static inline void f2fs_copy_page(struct page *src, struct page *dst) -{ - char *src_kaddr = kmap(src); - char *dst_kaddr = kmap(dst); - - memcpy(dst_kaddr, src_kaddr, PAGE_SIZE); - kunmap(dst); - kunmap(src); -} - static inline void f2fs_put_page(struct page *page, int unlock) { if (!page) @@ -2671,6 +2831,9 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) if (is_inflight_io(sbi, type)) return false; + if (sbi->gc_mode == GC_URGENT_MID) + return true; + if (sbi->gc_mode == GC_URGENT_LOW && (type == DISCARD_TIME || type == GC_TIME)) return true; @@ -2811,7 +2974,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) + F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ @@ -2909,6 +3072,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode, set_inode_flag(inode, FI_AUTO_RECOVER); } +static inline bool f2fs_is_atomic_file(struct inode *inode); + static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); @@ -2918,6 +3083,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); @@ -3056,6 +3225,10 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). 
+ */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); @@ -3086,14 +3259,9 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +static inline bool f2fs_is_cow_file(struct inode *inode) { - return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); -} - -static inline bool f2fs_is_volatile_file(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_COW_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) @@ -3126,12 +3294,16 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { + if (is_file(inode, type)) + return; F2FS_I(inode)->i_advise |= type; f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { + if (!is_file(inode, type)) + return; F2FS_I(inode)->i_advise &= ~type; f2fs_mark_inode_dirty_sync(inode, true); } @@ -3324,6 +3496,8 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode); /* * dir.c @@ -3392,6 +3566,8 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3416,7 +3592,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni); + struct node_info *ni, bool checkpoint_context); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); @@ -3459,11 +3635,8 @@ void f2fs_destroy_node_manager_caches(void); * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); -void f2fs_register_inmem_page(struct inode *inode, struct page *page); -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void f2fs_drop_inmem_pages(struct inode *inode); -void f2fs_drop_inmem_page(struct inode *inode, struct page *page); -int f2fs_commit_inmem_pages(struct inode *inode); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); @@ -3517,6 +3690,8 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); +void 
f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); @@ -3534,17 +3709,26 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); +#define DEF_FRAGMENT_SIZE 4 +#define MIN_FRAGMENT_SIZE 1 +#define MAX_FRAGMENT_SIZE 512 + +static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || + F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; +} + /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); @@ -3554,7 +3738,8 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); -void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks); long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type); void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); @@ -3596,6 +3781,7 @@ int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, @@ -3607,18 +3793,18 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio); + block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); + int op_flags, bool for_write, pgoff_t *next_pgofs); +struct page *f2fs_find_data_page(struct 
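FS_MODE_FRAGMENT_SEG and FS_MODE_FRAGMENT_BLK are test modes that deliberately fragment allocation, and f2fs_need_rand_seg() above is the gate the allocator checks. A rough, hypothetical sketch of the idea; the concrete policy (and how the chunk/hole bounds above are applied) lives in segment.c, which this hunk does not show:

	/* hypothetical allocator snippet: with a fragment mode active, start
	 * from a randomly chosen segment instead of the next sequential one */
	if (f2fs_need_rand_seg(sbi))
		segno = prandom_u32() % total_segments;	/* assumed bound */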
inode *inode, pgoff_t index, + pgoff_t *next_pgofs); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, @@ -3637,6 +3823,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance); +void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3650,6 +3837,7 @@ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +extern const struct iomap_ops f2fs_iomap_ops; /* * gc.c @@ -3657,8 +3845,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, - unsigned int segno); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); @@ -3681,12 +3868,21 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - unsigned long long hit_largest, hit_cached, hit_rbtree; - unsigned long long hit_total, total_ext; - int ext_tree, zombie_tree, ext_node; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; + /* for block age extent cache */ + unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - int inmem_pages; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; @@ -3702,9 +3898,9 @@ struct f2fs_stat_info { int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode; + int compr_inode, swapfile_inode; unsigned long long compr_blocks; - int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -3716,7 +3912,6 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; - unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; @@ -3744,10 +3939,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_other_skip_bggc_count(sbi) 
((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) -#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -3792,6 +3987,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3811,22 +4014,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = F2FS_I_SB(inode)->atomic_files; \ + int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_volatile_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_dec_volatile_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_update_max_volatile_write(inode) \ - do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ - int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ - if (cur > max) \ - atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ - } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -3873,10 +4065,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sbi) do { } while (0) -#define stat_inc_rbtree_node_hit(sbi) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) -#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) @@ -3887,10 +4079,11 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) +#define 
stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) -#define stat_inc_volatile_write(inode) do { } while (0) -#define stat_dec_volatile_write(inode) do { } while (0) -#define stat_update_max_volatile_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) @@ -3981,20 +4174,34 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); +void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); -unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei); -void f2fs_update_extent_cache(struct dnode_of_data *dn); -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); +/* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + +/* block age extent cache ops */ +void f2fs_init_age_extent_tree(struct inode *inode); +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_age_extent_cache(struct dnode_of_data *dn); +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len); +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ @@ -4050,11 +4257,13 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_cluster(struct decompress_io_ctx *dic); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr); + block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, + int index, int nr_pages, bool uptodate); bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, @@ -4062,15 
+4271,16 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); -void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); -void f2fs_put_page_dic(struct page *page); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task); +void f2fs_put_page_dic(struct page *page, bool in_task); unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); @@ -4116,13 +4326,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } -static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { } +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, + bool in_task) { } static inline void f2fs_end_read_compressed_page(struct page *page, - bool failed, block_t blkaddr) + bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); } -static inline void f2fs_put_page_dic(struct page *page) +static inline void f2fs_put_page_dic(struct page *page, bool in_task) { WARN_ON_ONCE(1); } @@ -4143,13 +4354,15 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) -static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len) { } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } #endif -static inline void set_compress_context(struct inode *inode) +static inline int set_compress_context(struct inode *inode) { +#ifdef CONFIG_F2FS_FS_COMPRESSION struct f2fs_sb_info *sbi = F2FS_I_SB(inode); F2FS_I(inode)->i_compress_algorithm = @@ -4172,6 +4385,10 @@ static inline void set_compress_context(struct inode *inode) stat_inc_compr_inode(inode); inc_compr_inode_stat(inode); f2fs_mark_inode_dirty_sync(inode, true); + return 0; +#else + return -EOPNOTSUPP; +#endif } static inline bool f2fs_disable_compressed_file(struct inode *inode) @@ -4191,7 +4408,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } @@ -4211,26 +4428,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct 
f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. - */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) @@ -4289,11 +4486,15 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } +static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW; +} + static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } @@ -4301,8 +4502,8 @@ static inline bool f2fs_may_compress(struct inode *inode) static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { - int diff = F2FS_I(inode)->i_cluster_size - blocks; struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) @@ -4318,43 +4519,14 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode, f2fs_mark_inode_dirty_sync(inode, true); } -static inline int block_unaligned_IO(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) +static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, + int flag) { - unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned int blocksize_mask = (1 << i_blkbits) - 1; - loff_t offset = iocb->ki_pos; - unsigned long align = offset | iov_iter_alignment(iter); - - return align & blocksize_mask; -} - -static inline bool f2fs_force_buffered_io(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - if (f2fs_post_read_required(inode)) - return true; - if (f2fs_is_multi_device(sbi)) - return true; - /* - * for blkzoned device, fallback direct IO to buffered IO, so - * all IOs can be serialized by log-structured write. 
- */ - if (f2fs_sb_has_blkzoned(sbi)) - return true; - if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { - if (block_unaligned_IO(inode, iocb, iter)) - return true; - if (F2FS_IO_ALIGNED(sbi)) - return true; - } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) - return true; - - return false; + if (!f2fs_is_multi_device(sbi)) + return false; + if (flag != F2FS_GET_BLOCK_DIO) + return false; + return sbi->aligned_blksize; } static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) @@ -4388,6 +4560,32 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; } +static inline void f2fs_io_schedule_timeout(long timeout) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(timeout); +} + +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + +static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 758048a885d2..cd971d580f61 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -42,8 +43,8 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) ret = filemap_fault(vmf); if (!ret) - f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, - F2FS_BLKSIZE); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_MAPPED_READ_IO, F2FS_BLKSIZE); trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); @@ -153,7 +154,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) SetPageUptodate(page); - f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); trace_f2fs_vm_page_mkwrite(page, DATA); @@ -169,6 +170,7 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, + .speculative = true, }; static int get_parent_ino(struct inode *inode, nid_t *pino) @@ -236,13 +238,13 @@ static void try_to_fix_pino(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); nid_t pino; - down_write(&fi->i_sem); + f2fs_down_write(&fi->i_sem); if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); file_got_pino(inode); } - up_write(&fi->i_sem); + f2fs_up_write(&fi->i_sem); } static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, @@ -317,9 +319,9 @@ go_write: * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. 
*/ - down_read(&F2FS_I(inode)->i_sem); + f2fs_down_read(&F2FS_I(inode)->i_sem); cp_reason = need_do_checkpoint(inode); - up_read(&F2FS_I(inode)->i_sem); + f2fs_up_read(&F2FS_I(inode)->i_sem); if (cp_reason) { /* all the dirty node pages should be flushed for POR */ @@ -371,7 +373,8 @@ sync_nodes: f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); @@ -569,7 +572,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - /* Assumption: truncateion starts with cluster */ + /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); @@ -616,7 +619,8 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) */ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; - f2fs_update_extent_cache_range(dn, fofs, 0, len); + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); + f2fs_update_age_extent_cache_range(dn, fofs, nr_free); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -806,12 +810,40 @@ int f2fs_truncate(struct inode *inode) return 0; } +static bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!fscrypt_dio_supported(inode)) + return true; + if (fsverity_active(inode)) + return true; + if (f2fs_compressed_file(inode)) + return true; + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. 
+ */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + return true; + if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) + return true; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + return true; + + return false; +} + int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_inode *ri; + struct f2fs_inode *ri = NULL; unsigned int flags; if (f2fs_has_extra_attr(inode) && @@ -843,7 +875,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(mnt_userns, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -903,7 +935,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(mnt_userns, dentry, attr); if (err) return err; @@ -957,7 +989,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; } - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, attr->ia_size); @@ -969,7 +1001,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * larger than i_size. */ filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -979,10 +1011,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, spin_unlock(&F2FS_I(inode)->i_size_lock); } - __setattr_copy(&init_user_ns, inode, attr); + __setattr_copy(mnt_userns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -1111,7 +1143,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_pagecache_range(inode, blk_start, blk_end - 1); @@ -1121,7 +1153,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) f2fs_unlock_op(sbi); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1159,6 +1191,7 @@ next_dnode: !f2fs_is_valid_blkaddr(sbi, *blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -1232,7 +1265,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - ret = f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (ret) { f2fs_put_dnode(&dn); return ret; @@ -1276,7 +1309,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, f2fs_put_page(psrc, 1); return PTR_ERR(pdst); } - f2fs_copy_page(psrc, pdst); + 
memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE); set_page_dirty(pdst); f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); @@ -1354,7 +1387,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); f2fs_lock_op(sbi); @@ -1364,7 +1397,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_unlock_op(sbi); filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1443,6 +1476,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); break; } @@ -1451,7 +1485,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn); } - f2fs_update_extent_cache_range(dn, start, 0, index - start); + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -1507,7 +1541,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, unsigned int end_offset; pgoff_t end; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); truncate_pagecache_range(inode, @@ -1521,7 +1555,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) { f2fs_unlock_op(sbi); filemap_invalidate_unlock(mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1533,7 +1567,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_unlock_op(sbi); filemap_invalidate_unlock(mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1607,7 +1641,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); truncate_pagecache(inode, offset); @@ -1625,7 +1659,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_unlock_op(sbi); } filemap_invalidate_unlock(mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ filemap_invalidate_lock(mapping); @@ -1645,6 +1679,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1674,20 +1713,20 @@ static int expand_inode_data(struct inode *inode, loff_t offset, return 0; if (f2fs_is_pinned_file(inode)) { - block_t sec_blks = BLKS_PER_SEC(sbi); + block_t sec_blks = CAP_BLKS_PER_SEC(sbi); block_t sec_len = roundup(map.m_len, 
sec_blks); map.m_len = sec_blks; next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { - down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + f2fs_down_write(&sbi->gc_lock); + err = f2fs_gc(sbi, &gc_control); + if (err && err != -ENODATA) goto out_err; } - down_write(&sbi->pin_sem); + f2fs_down_write(&sbi->pin_sem); f2fs_lock_op(sbi); f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); @@ -1695,8 +1734,9 @@ next_alloc: map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + file_dont_truncate(inode); - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); expanded += map.m_len; sec_len -= map.m_len; @@ -1756,7 +1796,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; - if (f2fs_compressed_file(inode) && + /* + * Pinned file should not support partial truncation since the block + * can be used by applications. + */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; @@ -1810,16 +1854,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - /* some remained atomic pages should discarded */ - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - set_inode_flag(inode, FI_DROP_CACHE); - filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(inode, FI_DROP_CACHE); - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - } + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1833,9 +1868,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * until all the writers close its file. Since this should be done * before dropping file lock, it needs to do in ->flush.
*/ - if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->inmem_task == current) - f2fs_drop_inmem_pages(inode); + if (F2FS_I(inode)->atomic_write_task == current) + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1870,12 +1904,16 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } else { + /* try to convert inline_data to support compression */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (!f2fs_may_compress(inode)) return -EINVAL; - if (S_ISREG(inode->i_mode) && inode->i_size) + if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) return -EINVAL; - - set_compress_context(inode); + if (set_compress_context(inode)) + return -EOPNOTSUPP; } } @@ -1985,14 +2023,17 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) return put_user(inode->i_generation, (int __user *)arg); } -static int f2fs_ioc_start_atomic_write(struct file *filp) +static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; + loff_t isize; int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2012,45 +2053,67 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } - if (f2fs_is_atomic_file(inode)) { - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) - ret = -EINVAL; + if (f2fs_is_atomic_file(inode)) goto out; - } ret = f2fs_convert_inline_inode(inode); if (ret) goto out; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by * f2fs_is_atomic_file. 
*/ if (get_dirty_pages(inode)) - f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); - sbi->atomic_files++; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + + f2fs_write_inode(inode, NULL); + + stat_inc_atomic_inode(inode); - /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + set_inode_flag(fi->cow_inode, FI_COW_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->inmem_task = current; + isize = i_size_read(inode); + fi->original_i_size = isize; + if (truncate) { + set_inode_flag(inode, FI_ATOMIC_REPLACE); + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, 0); + isize = 0; + } + f2fs_i_size_write(fi->cow_inode, isize); + + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; stat_update_max_atomic_write(inode); + fi->atomic_write_cnt = 0; out: inode_unlock(inode); mnt_drop_write_file(filp); @@ -2060,9 +2123,10 @@ out: static int f2fs_ioc_commit_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2073,73 +2137,28 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_volatile_file(inode)) { - ret = -EINVAL; - goto err_out; - } - if (f2fs_is_atomic_file(inode)) { - ret = f2fs_commit_inmem_pages(inode); - if (ret) - goto err_out; - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_commit_atomic_write(inode); if (!ret) - f2fs_drop_inmem_pages(inode); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -err_out: - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - ret = -EINVAL; - } + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } -static int f2fs_ioc_start_volatile_write(struct file *filp) +static int f2fs_ioc_abort_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EACCES; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_volatile_file(inode)) - goto out; - 
- ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out; - - stat_inc_volatile_write(inode); - stat_update_max_volatile_write(inode); - - set_inode_flag(inode, FI_VOLATILE_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_release_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - int ret; - - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2148,44 +2167,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) inode_lock(inode); - if (!f2fs_is_volatile_file(inode)) - goto out; - - if (!f2fs_is_first_block_written(inode)) { - ret = truncate_partial_data_page(inode, 0, true); - goto out; - } - - ret = punch_hole(inode, 0, F2FS_BLKSIZE); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_abort_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - int ret; - - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - } - - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + f2fs_abort_atomic_write(inode, true); inode_unlock(inode); @@ -2213,7 +2195,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) { if (ret == -EROFS) { ret = 0; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); trace_f2fs_shutdown(sbi, in, ret); } @@ -2226,7 +2209,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = freeze_bdev(sb->s_bdev); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev); break; @@ -2235,16 +2218,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = f2fs_sync_fs(sb, 1); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: @@ -2353,7 +2336,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) return err; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) goto got_it; @@ -2372,7 +2355,7 @@ got_it: 16)) err = -EFAULT; out_err: - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); return err; } @@ -2432,6 +2415,10 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { 
.victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false, + .nr_free_secs = 0 }; __u32 sync; int ret; @@ -2449,15 +2436,17 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); return ret; @@ -2466,6 +2455,12 @@ out: static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; u64 end; int ret; @@ -2485,22 +2480,22 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) do_more: if (!range->sync) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, false, - GET_SEGNO(sbi, range->start)); + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; goto out; } - range->start += BLKS_PER_SEC(sbi); + range->start += CAP_BLKS_PER_SEC(sbi); if (range->start <= end) goto do_more; out: @@ -2553,7 +2548,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2561,10 +2556,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, bool fragmented = false; int err; - /* if in-place-update policy is enabled, don't waste time here */ - if (f2fs_should_update_inplace(inode, NULL)) - return -EINVAL; - pg_start = range->start >> PAGE_SHIFT; pg_end = (range->start + range->len) >> PAGE_SHIFT; @@ -2572,6 +2563,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, inode_lock(inode); + /* if in-place-update policy is enabled, don't waste time here */ + set_inode_flag(inode, FI_OPU_WRITE); + if (f2fs_should_update_inplace(inode, NULL)) { + err = -EINVAL; + goto out; + } + /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, range->start + range->len - 1); @@ -2582,7 +2580,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * lookup mapping info in extent cache, skip defragmenting if physical * block addresses are continuous. 
*/ - if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { if (ei.fofs + ei.len >= pg_end) goto out; } @@ -2622,7 +2620,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; } - sec_num = DIV_ROUND_UP(total, BLKS_PER_SEC(sbi)); + sec_num = DIV_ROUND_UP(total, CAP_BLKS_PER_SEC(sbi)); /* * make sure there are enough free section for LFS allocation, this can @@ -2653,7 +2651,7 @@ do_map: goto check; } - set_inode_flag(inode, FI_DO_DEFRAG); + set_inode_flag(inode, FI_SKIP_WRITES); idx = map.m_lblk; while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { @@ -2679,15 +2677,16 @@ check: if (map.m_lblk < pg_end && cnt < blk_per_seg) goto do_map; - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); err = filemap_fdatawrite(inode->i_mapping); if (err) goto out; } clear_out: - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); out: + clear_inode_flag(inode, FI_OPU_WRITE); inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_SHIFT; @@ -2823,10 +2822,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) goto out_src; } @@ -2844,9 +2843,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_unlock_op(sbi); if (src != dst) - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); out_src: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); out_unlock: if (src != dst) inode_unlock(dst); @@ -2905,6 +2904,11 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) unsigned int start_segno = 0, end_segno = 0; unsigned int dev_start_segno = 0, dev_end_segno = 0; struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true, + .nr_free_secs = 0 }; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2941,14 +2945,16 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, true, start_segno); + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; else if (ret < 0) @@ -2993,7 +2999,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *ipage; + struct f2fs_inode *ri = NULL; kprojid_t kprojid; int err; @@ -3009,7 +3015,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) kprojid = make_kprojid(&init_user_ns, (projid_t)projid); - if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + if (projid_eq(kprojid, fi->i_projid)) return 0; err = -EPERM; @@ -3017,17 +3023,8 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (IS_NOQUOTA(inode)) return err; - ipage = 
f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - - if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, - i_projid)) { - err = -EOVERFLOW; - f2fs_put_page(ipage, 1); - return err; - } - f2fs_put_page(ipage, 1); + if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + return -EOVERFLOW; err = f2fs_dquot_initialize(inode); if (err) @@ -3038,7 +3035,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (err) goto out_unlock; - F2FS_I(inode)->i_projid = kprojid; + fi->i_projid = kprojid; inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: @@ -3218,9 +3215,9 @@ int f2fs_precache_extents(struct inode *inode) while (map.m_lblk < end) { map.m_len = end - map.m_lblk; - down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); - up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; @@ -3297,11 +3294,11 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) if (!vbuf) return -ENOMEM; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); count = utf16s_to_utf8s(sbi->raw_super->volume_name, ARRAY_SIZE(sbi->raw_super->volume_name), UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME); - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); if (copy_to_user((char __user *)arg, vbuf, min(FSLABEL_MAX, count))) @@ -3329,7 +3326,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) if (err) goto out; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); memset(sbi->raw_super->volume_name, 0, sizeof(sbi->raw_super->volume_name)); @@ -3339,7 +3336,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) err = f2fs_commit_super(sbi, false); - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); out: @@ -3377,8 +3374,10 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3465,7 +3464,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3502,7 +3501,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); out: inode_unlock(inode); @@ -3539,8 +3538,10 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3618,7 +3619,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) goto unlock_inode; } - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); last_idx = 
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3655,7 +3656,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); @@ -3773,7 +3774,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) if (ret) goto err; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); ret = filemap_write_and_wait_range(mapping, range.start, @@ -3812,6 +3813,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -3862,7 +3865,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) prev_block, len, range.flags); out: filemap_invalidate_unlock(mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); err: inode_unlock(inode); file_end_write(filp); @@ -3969,10 +3972,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) for (i = 0; i < page_len; i++, redirty_idx++) { page = find_lock_page(mapping, redirty_idx); - if (!page) { - ret = -ENOMEM; - break; - } + + /* It will never fail, when page has pinned above */ + f2fs_bug_on(F2FS_I_SB(inode), !page); + set_page_dirty(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); @@ -3988,7 +3991,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t page_idx = 0, last_idx; unsigned int blk_per_seg = sbi->blocks_per_seg; - int cluster_size = F2FS_I(inode)->i_cluster_size; + int cluster_size = fi->i_cluster_size; int count, ret; if (!f2fs_sb_has_compression(sbi) || @@ -4133,15 +4136,16 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: - return f2fs_ioc_start_atomic_write(filp); + return f2fs_ioc_start_atomic_write(filp, false); + case F2FS_IOC_START_ATOMIC_REPLACE: + return f2fs_ioc_start_atomic_write(filp, true); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_ABORT_ATOMIC_WRITE: + return f2fs_ioc_abort_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: - return f2fs_ioc_start_volatile_write(filp); case F2FS_IOC_RELEASE_VOLATILE_WRITE: - return f2fs_ioc_release_volatile_write(filp); - case F2FS_IOC_ABORT_VOLATILE_WRITE: - return f2fs_ioc_abort_volatile_write(filp); + return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); case FITRIM: @@ -4227,27 +4231,402 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return __f2fs_ioctl(filp, cmd, arg); } -static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +/* + * Return %true if the given read or write request should use direct I/O, or + * %false if it should use buffered I/O. 
+ */ +static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, + struct iov_iter *iter) +{ + unsigned int align; + + if (!(iocb->ki_flags & IOCB_DIRECT)) + return false; + + if (f2fs_force_buffered_io(inode, iov_iter_rw(iter))) + return false; + + /* + * Direct I/O not aligned to the disk's logical_block_size will be + * attempted, but will fail with -EINVAL. + * + * f2fs additionally requires that direct I/O be aligned to the + * filesystem block size, which is often a stricter requirement. + * However, f2fs traditionally falls back to buffered I/O on requests + * that are logical_block_size-aligned but not fs-block aligned. + * + * The below logic implements this behavior. + */ + align = iocb->ki_pos | iov_iter_alignment(iter); + if (!IS_ALIGNED(align, i_blocksize(inode)) && + IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) + return false; + + return true; +} + +static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_READ); + if (error) + return error; + f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = { + .end_io = f2fs_dio_read_end_io, +}; + +static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - int ret; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(to); + struct iomap_dio *dio; + ssize_t ret; + + if (count == 0) + return 0; /* skip atime update */ + + trace_f2fs_direct_IO_enter(inode, iocb, count, READ); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + ret = -EAGAIN; + goto out; + } + } else { + f2fs_down_read(&fi->i_gc_rwsem[READ]); + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_READ counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_READ); + dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, + &f2fs_iomap_dio_read_ops, 0, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_READ); + } else { + ret = iomap_dio_complete(dio); + } + + f2fs_up_read(&fi->i_gc_rwsem[READ]); + + file_accessed(file); +out: + trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret); + return ret; +} + +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; + ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - ret = generic_file_read_iter(iocb, iter); + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_READ_IO, ret); + if (!p) + goto skip_read_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } + + trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: + if (f2fs_should_use_dio(inode, iocb, to)) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); + } + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); + return ret; +} + +static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t count; + int err; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return -EPERM; + + count = generic_write_checks(iocb, from); + if (count <= 0) + return count; + + err = file_modified(file); + if (err) + return err; + return count; +} + +/* + * Preallocate blocks for a write request, if it is possible and helpful to do + * so. Returns a positive number if blocks may have been preallocated, 0 if no + * blocks were preallocated, or a negative errno value if something went + * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the + * requested blocks (not just some of them) have been allocated. + */ +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, + bool dio) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(iter); + struct f2fs_map_blocks map = {}; + int flag; + int ret; + + /* If it will be an out-of-place direct write, don't bother. */ + if (dio && f2fs_lfs_mode(sbi)) + return 0; + /* + * Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into + * buffered IO, if DIO meets any holes. + */ + if (dio && i_size_read(inode) && + (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode)))) + return 0; + + /* No-wait I/O can't allocate blocks. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return 0; + + /* If it will be a short write, don't bother. */ + if (fault_in_iov_iter_readable(iter, count)) + return 0; + + if (f2fs_has_inline_data(inode)) { + /* If the data will fit inline, don't bother. 
*/ + if (pos + count <= MAX_INLINE_DATA(inode)) + return 0; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + /* Do not preallocate blocks that will be written partially in 4KB. */ + map.m_lblk = F2FS_BLK_ALIGN(pos); + map.m_len = F2FS_BYTES_TO_BLK(pos + count); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; + map.m_may_create = true; + if (dio) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + flag = F2FS_GET_BLOCK_PRE_DIO; + } else { + map.m_seg_type = NO_CHECK_TYPE; + flag = F2FS_GET_BLOCK_PRE_AIO; + } + + ret = f2fs_map_blocks(inode, &map, 1, flag); + /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */ + if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) + return ret; + if (ret == 0) + set_inode_flag(inode, FI_PREALLOCATED_ALL); + return map.m_len; +} + +static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + + if (ret > 0) { + iocb->ki_pos += ret; + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_IO, ret); + } + return ret; +} + +static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_WRITE); + if (error) + return error; + f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { + .end_io = f2fs_dio_write_end_io, +}; + +static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, + bool *may_need_sync) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const bool do_opu = f2fs_lfs_mode(sbi); + const loff_t pos = iocb->ki_pos; + const ssize_t count = iov_iter_count(from); + unsigned int dio_flags; + struct iomap_dio *dio; + ssize_t ret; + + trace_f2fs_direct_IO_enter(inode, iocb, count, WRITE); + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* f2fs_convert_inline_inode() and block allocation can block */ + if (f2fs_has_inline_data(inode) || + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EAGAIN; + goto out; + } + + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) { + ret = -EAGAIN; + goto out; + } + if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); + ret = -EAGAIN; + goto out; + } + } else { + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + f2fs_down_read(&fi->i_gc_rwsem[WRITE]); + if (do_opu) + f2fs_down_read(&fi->i_gc_rwsem[READ]); + } + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_WRITE counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_WRITE); + dio_flags = 0; + if (pos + count > inode->i_size) + dio_flags |= IOMAP_DIO_FORCE_WAIT; + dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, + &f2fs_iomap_dio_write_ops, dio_flags, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_WRITE); + } else { + ret = iomap_dio_complete(dio); + } + + if (do_opu) + f2fs_up_read(&fi->i_gc_rwsem[READ]); + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); + + if (ret < 0) + goto out; + if (pos + ret > inode->i_size) + f2fs_i_size_write(inode, pos + ret); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); + + if (iov_iter_count(from)) { + ssize_t ret2; + loff_t bufio_start_pos = iocb->ki_pos; + + /* + * The direct write was partial, so we need to fall back to a + * buffered write for the remainder. + */ + + ret2 = f2fs_buffered_write_iter(iocb, from); + if (iov_iter_count(from)) + f2fs_write_failed(inode, iocb->ki_pos); + if (ret2 < 0) + goto out; + + /* + * Ensure that the pagecache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + if (ret2 > 0) { + loff_t bufio_end_pos = bufio_start_pos + ret2 - 1; + + ret += ret2; + + ret2 = filemap_write_and_wait_range(file->f_mapping, + bufio_start_pos, + bufio_end_pos); + if (ret2 < 0) + goto out; + invalidate_mapping_pages(file->f_mapping, + bufio_start_pos >> PAGE_SHIFT, + bufio_end_pos >> PAGE_SHIFT); + } + } else { + /* iomap_dio_rw() already handled the generic_write_sync(). */ + *may_need_sync = false; + } +out: + trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret); return ret; } static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t orig_pos = iocb->ki_pos; + const size_t orig_count = iov_iter_count(from); + loff_t target_size; + bool dio; + bool may_need_sync = true; + int preallocated; ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -4269,91 +4648,64 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } - if (unlikely(IS_IMMUTABLE(inode))) { - ret = -EPERM; - goto unlock; - } + ret = f2fs_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; - if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { - ret = -EPERM; - goto unlock; - } + /* Determine whether we will do a direct write or a buffered write. */ + dio = f2fs_should_use_dio(inode, iocb, from); - ret = generic_write_checks(iocb, from); - if (ret > 0) { - bool preallocated = false; - size_t target_size = 0; - int err; + /* Possibly preallocate the blocks for the write. 
*/ + target_size = iocb->ki_pos + iov_iter_count(from); + preallocated = f2fs_preallocate_blocks(iocb, from, dio); + if (preallocated < 0) { + ret = preallocated; + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; - if (fault_in_iov_iter_readable(from, iov_iter_count(from))) - set_inode_flag(inode, FI_NO_PREALLOC); - - if ((iocb->ki_flags & IOCB_NOWAIT)) { - if (!f2fs_overwrite_io(inode, iocb->ki_pos, - iov_iter_count(from)) || - f2fs_has_inline_data(inode) || - f2fs_force_buffered_io(inode, iocb, from)) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = -EAGAIN; - goto out; + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; } - goto write; + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); } +skip_write_trace: + /* Do the actual write. */ + ret = dio ? + f2fs_dio_write_iter(iocb, from, &may_need_sync) : + f2fs_buffered_write_iter(iocb, from); - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - goto write; - - if (iocb->ki_flags & IOCB_DIRECT) { - /* - * Convert inline data for Direct I/O before entering - * f2fs_direct_IO(). - */ - err = f2fs_convert_inline_inode(inode); - if (err) - goto out_err; - /* - * If force_buffere_io() is true, we have to allocate - * blocks all the time, since f2fs_direct_IO will fall - * back to buffered IO. - */ - if (!f2fs_force_buffered_io(inode, iocb, from) && - f2fs_lfs_mode(F2FS_I_SB(inode))) - goto write; - } - preallocated = true; - target_size = iocb->ki_pos + iov_iter_count(from); - - err = f2fs_preallocate_blocks(iocb, from); - if (err) { -out_err: - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; - goto out; - } -write: - ret = __generic_file_write_iter(iocb, from); - clear_inode_flag(inode, FI_NO_PREALLOC); - - /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - filemap_invalidate_lock(inode->i_mapping); - f2fs_truncate(inode); - filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - } - - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); } -unlock: + + /* Don't leave any preallocated blocks around past i_size. 
*/ + if (preallocated && i_size_read(inode) < target_size) { + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + if (!f2fs_truncate(inode)) + file_dont_truncate(inode); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } else { + file_dont_truncate(inode); + } + + clear_inode_flag(inode, FI_PREALLOCATED_ALL); +out_unlock: inode_unlock(inode); out: - trace_f2fs_file_write_iter(inode, iocb->ki_pos, - iov_iter_count(from), ret); - if (ret > 0) + trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); + if (ret > 0 && may_need_sync) ret = generic_write_sync(iocb, ret); return ret; } @@ -4361,12 +4713,12 @@ out: static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) { - struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; + struct inode *inode = file_inode(filp); + int err; if (advice == POSIX_FADV_SEQUENTIAL) { - inode = file_inode(filp); if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -4383,7 +4735,13 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, return 0; } - return generic_fadvise(filp, offset, len, advice); + err = generic_fadvise(filp, offset, len, advice); + if (!err && advice == POSIX_FADV_DONTNEED && + test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + + return err; } #ifdef CONFIG_COMPAT @@ -4456,7 +4814,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: case F2FS_IOC_START_VOLATILE_WRITE: case F2FS_IOC_RELEASE_VOLATILE_WRITE: - case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_ABORT_ATOMIC_WRITE: case F2FS_IOC_SHUTDOWN: case FITRIM: case FS_IOC_SET_ENCRYPTION_POLICY: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 280a93855eaf..352f706fcccd 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -34,6 +35,10 @@ static int gc_thread_func(void *data) wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false, + .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -69,7 +74,8 @@ static int gc_thread_func(void *data) if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); } if (!sb_start_write_trylock(sbi->sb)) { @@ -90,23 +96,24 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (sbi->gc_mode == GC_URGENT_HIGH) { + if (sbi->gc_mode == GC_URGENT_HIGH || + sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); goto do_gc; } if (foreground) { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); goto do_gc; - } else if (!down_write_trylock(&sbi->gc_lock)) { + } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); stat_io_skip_bggc_count(sbi); goto next; } @@ -125,9 +132,20 @@ do_gc: if (foreground) sync_mode = false; + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 1 : 0; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) - wait_ms = gc_th->no_gc_sleep_time; + if (f2fs_gc(sbi, &gc_control)) { + /* don't bother wait_ms by foreground gc */ + if (!foreground) + wait_ms = gc_th->no_gc_sleep_time; + } else { + /* reset wait_ms to default sleep time */ + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->min_sleep_time; + } if (foreground) wake_up_all(&gc_th->fggc_wq); @@ -138,6 +156,15 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_remaining_trials_lock); + } sb_end_write(sbi->sb); } while (!kthread_should_stop()); @@ -148,13 +175,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0; gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; - } + if (!gc_th) + return -ENOMEM; gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; @@ -169,12 +193,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + + return 0; } void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) @@ -257,7 +283,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->max_search = sbi->max_victim_search; /* let's select beginning hot/small space first in no_heap mode*/ - if (test_opt(sbi, NOHEAP) && + if (f2fs_need_rand_seg(sbi)) + p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else @@ -461,7 +489,7 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, unsigned long long age, u, accu; unsigned long long max_mtime = sit_i->dirty_max_mtime; unsigned long long min_mtime = sit_i->dirty_min_mtime; - unsigned int sec_blocks = BLKS_PER_SEC(sbi); + unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi); unsigned int vblocks; unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * @@ -628,6 +656,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi) 
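Note: the gc_thread_func() hunks above replace the old positional f2fs_gc(sbi, sync, background, force, segno) arguments with a single struct f2fs_gc_control. A minimal sketch of how a foreground caller might fill it, using only the fields these hunks actually show (victim_segno, init_gc_type, no_bg_gc, should_migrate_blocks, err_gc_skipped, nr_free_secs); any semantics beyond what is visible here are assumptions:

    struct f2fs_gc_control gc_control = {
        .victim_segno = NULL_SEGNO,         /* no preselected victim section */
        .init_gc_type = FG_GC,              /* start as foreground GC */
        .no_bg_gc = true,                   /* assumption: don't demote to BG_GC */
        .should_migrate_blocks = false,     /* plain reclaim, no forced migration */
        .err_gc_skipped = false,            /* a skipped pass is not an error */
        .nr_free_secs = 1,                  /* mirrors the foreground case above */
    };
    int ret = f2fs_gc(sbi, &gc_control);

Packing the knobs into one struct means a new policy (like the GC_URGENT_MID mode added above) only has to touch the callers that care about it, not every f2fs_gc() call site.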
f2fs_bug_on(sbi, !list_empty(&am->victim_list)); } +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -769,6 +845,9 @@ retry: if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + if (is_atgc) { add_victim_entry(sbi, &p, segno); goto next; @@ -944,7 +1023,7 @@ next_step: continue; } - if (f2fs_get_node_info(sbi, nid, &ni)) { + if (f2fs_get_node_info(sbi, nid, &ni, false)) { f2fs_put_page(node_page, 1); continue; } @@ -1002,7 +1081,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node, max_addrs; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1012,7 +1091,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(node_page)) return false; - if (f2fs_get_node_info(sbi, nid, dni)) { + if (f2fs_get_node_info(sbi, nid, dni, false)) { f2fs_put_page(node_page, 1); return false; } @@ -1028,11 +1107,17 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - max_addrs = IS_INODE(node_page) ? 
DEF_ADDRS_PER_INODE : - DEF_ADDRS_PER_BLOCK; - if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", - ofs_in_node, dni->ino, dni->nid, max_addrs); + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); f2fs_put_page(node_page, 1); return false; } @@ -1065,7 +1150,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1083,11 +1168,12 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } goto got_it; @@ -1106,6 +1192,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } got_it: @@ -1135,8 +1222,8 @@ got_it: f2fs_put_page(fio.encrypted_page, 0); f2fs_put_page(page, 1); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); return 0; put_encrypted_page: @@ -1185,18 +1272,9 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } - - if (f2fs_is_pinned_file(inode)) { - f2fs_pin_file_control(inode, true); - err = -EAGAIN; - goto out; - } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); @@ -1217,7 +1295,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto put_out; @@ -1226,7 +1304,7 @@ static int move_data_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) - down_write(&fio.sbi->io_order_lock); + f2fs_down_write(&fio.sbi->io_order_lock); mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi), fio.old_blkaddr, false); @@ -1245,8 +1323,10 @@ static int move_data_block(struct inode *inode, block_t bidx, goto up_out; } - f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, + F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, + F2FS_BLKSIZE); lock_page(mpage); if (unlikely(mpage->mapping != 
META_MAPPING(fio.sbi) || @@ -1298,7 +1378,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } - f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -1312,7 +1392,7 @@ recover_block: true, true, true); up_out: if (lfs_mode) - up_write(&fio.sbi->io_order_lock); + f2fs_up_write(&fio.sbi->io_order_lock); put_out: f2fs_put_dnode(&dn); out: @@ -1335,18 +1415,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; - goto out; - } if (gc_type == BG_GC) { if (PageWriteback(page)) { @@ -1438,7 +1509,7 @@ next_step: */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || (!force_migrate && get_valid_blocks(sbi, segno, true) == - BLKS_PER_SEC(sbi))) + CAP_BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1467,12 +1538,20 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + int err; + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode) || special_file(inode->i_mode)) continue; - if (!down_write_trylock( + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { + iput(inode); + return submitted; + } + + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); sbi->skipped_gc_rwsem++; @@ -1485,7 +1564,7 @@ next_step: if (f2fs_post_read_required(inode)) { int err = ra_data_block(inode, start_bidx); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) { iput(inode); continue; @@ -1494,9 +1573,9 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, - start_bidx, REQ_RAHEAD, true); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, NULL); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -1515,14 +1594,14 @@ next_step: int err; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->i_gc_rwsem[READ])) { + if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) { sbi->skipped_gc_rwsem++; continue; } - if (!down_write_trylock( + if (!f2fs_down_write_trylock( &fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; - up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); continue; } locked = true; @@ -1545,8 +1624,8 @@ next_step: submitted++; if (locked) { - up_write(&fi->i_gc_rwsem[WRITE]); - up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); } stat_inc_data_blk_count(sbi, 1, gc_type); @@ -1646,7 +1725,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); goto skip; } @@ -1692,23 +1772,21 @@ skip: return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool 
background, bool force, unsigned int segno) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - int gc_type = sync ? FG_GC : BG_GC; + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0; int ret = 0; struct cp_control cpc; - unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; - trace_f2fs_gc_begin(sbi->sb, sync, background, + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1719,7 +1797,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); sbi->skipped_gc_rwsem = 0; - first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1736,8 +1813,7 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1747,54 +1823,69 @@ gc_more: } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ - if (gc_type == BG_GC && !background) { + if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } +retry: ret = __get_victim(sbi, &segno, gc_type); - if (ret) + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } goto stop; + } - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); - if (gc_type == FG_GC && - seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) - sec_freed++; + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); total_freed += seg_freed; - if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) - skipped_round++; - last_skipped = sbi->skipped_atomic_files[FG_GC]; - round++; - } + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (sync) + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? 
sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; goto stop; - - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round <= MAX_SKIP_GC_COUNT || - skipped_round * 2 < round) { - segno = NULL_SEGNO; - goto gc_more; - } - - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) - ret = f2fs_write_checkpoint(sbi, &cpc); } + + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { + ret = f2fs_write_checkpoint(sbi, &cpc); + goto stop; + } + } + + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi)) { + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } +go_gc_more: + segno = NULL_SEGNO; + goto gc_more; + stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; - SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; + + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1805,11 +1896,11 @@ stop: reserved_segments(sbi), prefree_segments(sbi)); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); put_gc_inode(&gc_list); - if (sync && !ret) + if (gc_control->err_gc_skipped && !ret) ret = sec_freed ? 0 : -EAGAIN; return ret; } @@ -1818,9 +1909,7 @@ int __init f2fs_create_garbage_collection_cache(void) { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); - if (!victim_entry_slab) - return -ENOMEM; - return 0; + return victim_entry_slab ? 
0 : -ENOMEM; } void f2fs_destroy_garbage_collection_cache(void) @@ -1934,7 +2023,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) long long block_count; int segs = secs * sbi->segs_per_sec; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); section_count = le32_to_cpu(raw_sb->section_count); segment_count = le32_to_cpu(raw_sb->segment_count); @@ -1955,7 +2044,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) cpu_to_le32(dev_segs + segs); } - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) @@ -2029,7 +2118,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); /* stop other GC */ - if (!down_write_trylock(&sbi->gc_lock)) + if (!f2fs_down_write_trylock(&sbi->gc_lock)) return -EAGAIN; /* stop CP to protect MAIN_SEC in free_segment_range */ @@ -2049,13 +2138,13 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) out_unlock: f2fs_unlock_op(sbi); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); if (err) return err; freeze_super(sbi->sb); - down_write(&sbi->gc_lock); - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->cp_global_sem); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2102,8 +2191,8 @@ recover_out: spin_unlock(&sbi->stat_lock); } out_err: - up_write(&sbi->cp_global_sem); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); return err; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 3fe145e8e594..19b956c2d697 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -120,15 +120,13 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) return free_blks - ovp_blks; } -static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_invalid_user_blocks(block_t user_block_count) { - return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100; + return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100; } -static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks) { - block_t reclaimable_user_blocks = sbi->user_block_count - - written_block_count(sbi); return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } @@ -163,15 +161,16 @@ static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) { - block_t invalid_user_blocks = sbi->user_block_count - - written_block_count(sbi); + block_t user_block_count = sbi->user_block_count; + block_t invalid_user_blocks = user_block_count - + written_block_count(sbi); /* * Background GC is triggered with the following conditions. * 1. There are a number of invalid blocks. * 2. There is not enough free space. 
*/ - if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && - free_user_blocks(sbi) < limit_free_user_blocks(sbi)) - return true; - return false; + return (invalid_user_blocks > + limit_invalid_user_blocks(user_block_count) && + free_user_blocks(sbi) < + limit_free_user_blocks(invalid_user_blocks)); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d0e3fc963cf2..21a495234ffd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -64,7 +64,6 @@ bool f2fs_may_inline_dentry(struct inode *inode) void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; - void *src_addr, *dst_addr; if (PageUptodate(page)) return; @@ -74,11 +73,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage) zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(inode, ipage); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - flush_dcache_page(page); - kunmap_atomic(dst_addr); + memcpy_to_page(page, 0, inline_data_addr(inode, ipage), + MAX_INLINE_DATA(inode)); if (!PageUptodate(page)) SetPageUptodate(page); } @@ -150,7 +146,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false); if (err) { f2fs_truncate_data_blocks_range(dn, 1); f2fs_put_dnode(dn); @@ -164,6 +160,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_sbi_flag(fio.sbi, SBI_NEED_FSCK); f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -246,7 +243,6 @@ out: int f2fs_write_inline_data(struct inode *inode, struct page *page) { - void *src_addr, *dst_addr; struct dnode_of_data dn; int err; @@ -263,10 +259,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); - src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(inode, dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - kunmap_atomic(src_addr); + memcpy_from_page(inline_data_addr(inode, dn.inode_page), + page, 0, MAX_INLINE_DATA(inode)); set_page_dirty(dn.inode_page); f2fs_clear_page_cache_dirty_tag(page); @@ -419,6 +413,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); + f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; goto out; } @@ -648,7 +643,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, } if (inode) { - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, fname, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -677,7 +672,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, f2fs_update_parent_metadata(dir, inode, 0); fail: if (inode) - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); out: f2fs_put_page(ipage, 1); return err; @@ -805,7 +800,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; 
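Note: the inline.c hunks above drop the open-coded kmap_atomic()/memcpy()/flush_dcache_page()/kunmap_atomic() sequences in favor of the memcpy_to_page()/memcpy_from_page() helpers, which perform the map-copy-flush-unmap dance internally. Restated side by side for the read path (identifiers taken from the hunk; this is illustration, not extra patch content):

    /* Before: manual highmem handling around the copy. */
    void *dst_addr = kmap_atomic(page);
    memcpy(dst_addr, inline_data_addr(inode, ipage), MAX_INLINE_DATA(inode));
    flush_dcache_page(page);
    kunmap_atomic(dst_addr);

    /* After: one helper maps, copies, flushes the dcache, and unmaps. */
    memcpy_to_page(page, 0, inline_data_addr(inode, ipage),
                   MAX_INLINE_DATA(inode));

memcpy_from_page() is the mirror image used in f2fs_write_inline_data() above, copying out of the page into the inline data area.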
ilen -= start; - err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false); if (err) goto out; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bd8960f4966b..200e1d866c3e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -81,8 +81,10 @@ static int __written_first_block(struct f2fs_sb_info *sbi, if (!__is_valid_data_blkaddr(addr)) return 1; - if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } return 0; } @@ -260,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (F2FS_I(inode)->extent_tree) { - struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + if (fi->extent_tree[EX_READ]) { + struct extent_info *ei = &fi->extent_tree[EX_READ]->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -333,6 +335,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return true; } +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -380,8 +392,6 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, node_page); - get_inline_info(inode, ri); fi->i_extra_isize = f2fs_has_extra_attr(inode) ? @@ -405,6 +415,7 @@ static int do_read_inode(struct inode *inode) if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } @@ -465,10 +476,12 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_age_extent_tree(inode); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -480,6 +493,12 @@ static int do_read_inode(struct inode *inode) return 0; } +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -491,16 +510,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return ERR_PTR(ret); + } + trace_f2fs_iget(inode); return inode; } - if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) - goto make_now; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (ino == F2FS_COMPRESS_INO(sbi)) + if (is_meta_ino(sbi, ino)) goto make_now; -#endif ret = do_read_inode(inode); if (ret) @@ -548,6 +573,15 @@ make_now: goto bad_inode; 
} f2fs_set_inode_flags(inode); + + if (file_should_truncate(inode) && + !is_sbi_flag_set(sbi, SBI_POR_DOING)) { + ret = f2fs_truncate(inode); + if (ret) + goto bad_inode; + file_dont_truncate(inode); + } + unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -576,7 +610,7 @@ retry: void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); @@ -590,12 +624,15 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); - ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + if (et) { read_lock(&et->lock); - set_raw_extent(&et->largest, &ri->i_ext); + set_raw_read_extent(&et->largest, &ri->i_ext); read_unlock(&et->lock); } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); @@ -667,11 +704,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_page_private_inline(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; - + init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); #endif @@ -690,7 +723,8 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_UPDATE_INODE); } return; } @@ -735,14 +769,13 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - /* some remained atomic pages should discarded */ - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); - if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + if ((inode->i_nlink || is_bad_inode(inode)) && + test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) f2fs_invalidate_compress_pages(sbi, inode->i_ino); if (inode->i_ino == F2FS_NODE_INO(sbi) || @@ -888,7 +921,7 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. 
*/ - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); set_inode_flag(inode, FI_FREE_NID); diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index cdcf54ae0db8..3166a8939ed4 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -31,55 +31,65 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) /* print app write IOs */ seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_IO]); /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", + seq_printf(seq, "fs gc node: %-16llu\n", sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", + seq_printf(seq, "fs cp data: %-16llu\n", sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", + seq_printf(seq, "fs cp node: %-16llu\n", sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", + seq_printf(seq, "fs cp meta: %-16llu\n", sbi->rw_iostat[FS_CP_META_IO]); /* print app read IOs */ seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_READ_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs compr_data: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_READ_IO]); /* print other IOs */ seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", + seq_printf(seq, "fs discard: %-16llu\n", sbi->rw_iostat[FS_DISCARD]); return 0; @@ -91,8 +101,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info 
*sbi) unsigned int cnt; struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + unsigned long flags; - spin_lock_irq(&sbi->iostat_lat_lock); + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { cnt = io_lat->bio_cnt[idx][io]; @@ -106,7 +117,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) io_lat->bio_cnt[idx][io] = 0; } } - spin_unlock_irq(&sbi->iostat_lat_lock); + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); trace_f2fs_iostat_latency(sbi, iostat_lat); } @@ -115,14 +126,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) { unsigned long long iostat_diff[NR_IO_TYPE]; int i; + unsigned long flags; if (time_is_after_jiffies(sbi->iostat_next_period)) return; /* Need double check under the lock */ - spin_lock(&sbi->iostat_lock); + spin_lock_irqsave(&sbi->iostat_lock, flags); if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); return; } sbi->iostat_next_period = jiffies + @@ -133,7 +145,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i]; sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; } - spin_unlock(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); trace_f2fs_iostat(sbi, iostat_diff); @@ -145,37 +157,58 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int i; - spin_lock(&sbi->iostat_lock); + spin_lock_irq(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { sbi->rw_iostat[i] = 0; sbi->prev_rw_iostat[i] = 0; } - spin_unlock(&sbi->iostat_lock); + spin_unlock_irq(&sbi->iostat_lock); spin_lock_irq(&sbi->iostat_lat_lock); memset(io_lat, 0, sizeof(struct iostat_lat_info)); spin_unlock_irq(&sbi->iostat_lat_lock); } -void f2fs_update_iostat(struct f2fs_sb_info *sbi, +void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) { + unsigned long flags; + if (!sbi->iostat_enable) return; - spin_lock(&sbi->iostat_lock); + spin_lock_irqsave(&sbi->iostat_lock, flags); sbi->rw_iostat[type] += io_bytes; - if (type == APP_WRITE_IO || type == APP_DIRECT_IO) - sbi->rw_iostat[APP_BUFFERED_IO] = - sbi->rw_iostat[APP_WRITE_IO] - - sbi->rw_iostat[APP_DIRECT_IO]; + if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_WRITE_IO] += io_bytes; - if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_BUFFERED_READ_IO] = - sbi->rw_iostat[APP_READ_IO] - - sbi->rw_iostat[APP_DIRECT_READ_IO]; - spin_unlock(&sbi->iostat_lock); + if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) + sbi->rw_iostat[APP_READ_IO] += io_bytes; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (inode && f2fs_compressed_file(inode)) { + if (type == APP_BUFFERED_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_READ_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + + if (type == FS_DATA_READ_IO) + sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + + if (type == FS_DATA_IO) + sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + } +#endif + + spin_unlock_irqrestore(&sbi->iostat_lock, flags); f2fs_record_iostat(sbi); } @@ -185,10 +218,10 @@ 
static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, { unsigned long ts_diff; unsigned int iotype = iostat_ctx->type; - unsigned long flags; struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int idx; + unsigned long flags; if (!sbi->iostat_enable) return; diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 22a2d01f57ef..2c048307b6e0 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -31,7 +31,7 @@ struct iostat_lat_info { extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset); extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); -extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes); struct bio_iostat_ctx { @@ -65,7 +65,7 @@ extern void f2fs_destroy_iostat_processing(void); extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) {} static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7a86a8dcf4f1..2c2c87d1e40a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,139 +22,6 @@ #include "acl.h" #include -static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - nid_t ino; - struct inode *inode; - bool nid_free = false; - bool encrypt = false; - int xattr_size = 0; - int err; - - inode = new_inode(dir->i_sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - f2fs_lock_op(sbi); - if (!f2fs_alloc_nid(sbi, &ino)) { - f2fs_unlock_op(sbi); - err = -ENOSPC; - goto fail; - } - f2fs_unlock_op(sbi); - - nid_free = true; - - inode_init_owner(&init_user_ns, inode, dir, mode); - - inode->i_ino = ino; - inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = prandom_u32(); - - if (S_ISDIR(inode->i_mode)) - F2FS_I(inode)->i_current_depth = 1; - - err = insert_inode_locked(inode); - if (err) { - err = -EINVAL; - goto fail; - } - - if (f2fs_sb_has_project_quota(sbi) && - (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) - F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; - else - F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, - F2FS_DEF_PROJID); - - err = fscrypt_prepare_new_inode(dir, inode, &encrypt); - if (err) - goto fail_drop; - - err = f2fs_dquot_initialize(inode); - if (err) - goto fail_drop; - - set_inode_flag(inode, FI_NEW_INODE); - - if (encrypt) - f2fs_set_encrypted_inode(inode); - - if (f2fs_sb_has_extra_attr(sbi)) { - set_inode_flag(inode, FI_EXTRA_ATTR); - F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; - } - - if (test_opt(sbi, INLINE_XATTR)) - set_inode_flag(inode, FI_INLINE_XATTR); - - if (f2fs_may_inline_dentry(inode)) - set_inode_flag(inode, FI_INLINE_DENTRY); - - if (f2fs_sb_has_flexible_inline_xattr(sbi)) { - f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); - if (f2fs_has_inline_xattr(inode)) - xattr_size = F2FS_OPTION(sbi).inline_xattr_size; - /* Otherwise, will be 0 */ - } else if (f2fs_has_inline_xattr(inode) || - f2fs_has_inline_dentry(inode)) { - xattr_size = 
DEFAULT_INLINE_XATTR_ADDRS; - } - F2FS_I(inode)->i_inline_xattr_size = xattr_size; - - f2fs_init_extent_tree(inode, NULL); - - F2FS_I(inode)->i_flags = - f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); - - if (S_ISDIR(inode->i_mode)) - F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; - - if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) - set_inode_flag(inode, FI_PROJ_INHERIT); - - if (f2fs_sb_has_compression(sbi)) { - /* Inherit the compression flag in directory */ - if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && - f2fs_may_compress(inode)) - set_compress_context(inode); - } - - /* Should enable inline_data after compression set */ - if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(inode, FI_INLINE_DATA); - - stat_inc_inline_xattr(inode); - stat_inc_inline_inode(inode); - stat_inc_inline_dir(inode); - - f2fs_set_inode_flags(inode); - - trace_f2fs_new_inode(inode, 0); - return inode; - -fail: - trace_f2fs_new_inode(inode, err); - make_bad_inode(inode); - if (nid_free) - set_inode_flag(inode, FI_FREE_NID); - iput(inode); - return ERR_PTR(err); -fail_drop: - trace_f2fs_new_inode(inode, err); - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; - if (nid_free) - set_inode_flag(inode, FI_FREE_NID); - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - return ERR_PTR(err); -} - static inline int is_extension_exist(const unsigned char *s, const char *sub, bool tmp_ext) { @@ -189,36 +56,6 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub, return 0; } -/* - * Set file's temperature for hot/cold data separation - */ -static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, cold_count, hot_count; - - down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], true)) - break; - } - - up_read(&sbi->sb_lock); - - if (i == cold_count + hot_count) - return; - - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); -} - int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set) { @@ -285,56 +122,215 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, return 0; } -static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, const unsigned char *name) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; int i, cold_count, hot_count; - if (!f2fs_sb_has_compression(sbi) || - F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode) || - (!ext_cnt && !noext_cnt)) + if (!f2fs_sb_has_compression(sbi)) return; - down_read(&sbi->sb_lock); + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + /* This name comes only from normal files. */ + if (!name) + return; + + /* Don't compress hot files. 
*/ + f2fs_down_read(&sbi->sb_lock); cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; - - for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], false)) { - up_read(&sbi->sb_lock); - return; - } - } - - up_read(&sbi->sb_lock); - - for (i = 0; i < noext_cnt; i++) { - if (is_extension_exist(name, noext[i], false)) { - f2fs_disable_compressed_file(inode); - return; - } - } - - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], false)) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) return; + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_extension_exist(name, noext[i], false)) + return; + + /* Compress wanting extension. */ for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i], false)) - continue; - - /* Do not use inline_data with compression */ - stat_dec_inline_inode(inode); - clear_inode_flag(inode, FI_INLINE_DATA); - set_compress_context(inode); - return; + if (is_extension_exist(name, ext[i], false)) { + set_compress_context(inode); + return; + } } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + +/* + * Set file's temperature for hot/cold data separation + */ +static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = 0; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], true)) + break; + f2fs_up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + +static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, + struct inode *dir, umode_t mode, + const char *name) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + nid_t ino; + struct inode *inode; + bool nid_free = false; + bool encrypt = false; + int xattr_size = 0; + int err; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!f2fs_alloc_nid(sbi, &ino)) { + err = -ENOSPC; + goto fail; + } + + nid_free = true; + + inode_init_owner(mnt_userns, inode, dir, mode); + + inode->i_ino = ino; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + F2FS_I(inode)->i_crtime = inode->i_mtime; + inode->i_generation = prandom_u32(); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_current_depth = 1; + + err = insert_inode_locked(inode); + if (err) { + err = -EINVAL; + goto fail; + } + + if (f2fs_sb_has_project_quota(sbi) && + (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, + F2FS_DEF_PROJID); + + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto fail_drop; + + err = f2fs_dquot_initialize(inode); + if (err) + goto fail_drop; + + set_inode_flag(inode, FI_NEW_INODE); + + if (encrypt) + 
f2fs_set_encrypted_inode(inode); + + if (f2fs_sb_has_extra_attr(sbi)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + + if (f2fs_may_inline_dentry(inode)) + set_inode_flag(inode, FI_INLINE_DENTRY); + + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + /* Check compression first. */ + set_compress_new_inode(sbi, dir, inode, name); + + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, name); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + + f2fs_set_inode_flags(inode); + + f2fs_init_extent_tree(inode); + + trace_f2fs_new_inode(inode, 0); + return inode; + +fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + iput(inode); + return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, @@ -354,15 +350,10 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); - if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_file_temperature(sbi, inode, dentry->d_name.name); - - set_compress_inode(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -634,6 +625,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. 
Eventually we'll want avoid @@ -644,8 +637,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - f2fs_unlock_op(sbi); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -691,7 +682,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -762,7 +753,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, S_IFDIR | mode); + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -819,7 +810,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -846,8 +837,9 @@ out: return err; } -static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool is_whiteout, + struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -857,11 +849,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); - if (whiteout) { + if (is_whiteout) { init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); inode->i_op = &f2fs_special_inode_operations; } else { @@ -886,21 +878,25 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, f2fs_add_orphan_inode(inode); f2fs_alloc_nid_done(sbi, inode->i_ino); - if (whiteout) { + if (is_whiteout) { f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); - - *whiteout = inode; } else { - d_tmpfile(dentry, inode); + if (dentry) + d_tmpfile(dentry, inode); + else + f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. 
*/ f2fs_unlock_op(sbi); unlock_new_inode(inode); + if (new_inode) + *new_inode = inode; + f2fs_balance_fs(sbi, true); return 0; @@ -921,20 +917,28 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(dir, dentry, mode, NULL); + return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); } -static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) +static int f2fs_create_whiteout(struct user_namespace *mnt_userns, + struct inode *dir, struct inode **whiteout) { if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) return -EIO; - return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); + return __f2fs_tmpfile(mnt_userns, dir, NULL, + S_IFCHR | WHITEOUT_MODE, true, whiteout); } -static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); +} + +static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); struct inode *old_inode = d_inode(old_dentry); @@ -972,7 +976,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(old_dir, &whiteout); + err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout); if (err) return err; } @@ -1035,11 +1039,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_page = NULL; new_inode->i_ctime = current_time(new_inode); - down_write(&F2FS_I(new_inode)->i_sem); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); f2fs_i_links_write(new_inode, false); - up_write(&F2FS_I(new_inode)->i_sem); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); if (!new_inode->i_nlink) f2fs_add_orphan_inode(new_inode); @@ -1060,13 +1064,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, true); } - down_write(&F2FS_I(old_inode)->i_sem); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry || whiteout) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(old_inode, new_dir->i_ino); - up_write(&F2FS_I(old_inode)->i_sem); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode, false); @@ -1119,8 +1123,7 @@ out_dir: out_old: f2fs_put_page(old_page, 0); out: - if (whiteout) - iput(whiteout); + iput(whiteout); return err; } @@ -1226,38 +1229,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, /* update directory entry info of old dir inode */ f2fs_set_link(old_dir, old_entry, old_page, new_inode); - down_write(&F2FS_I(old_inode)->i_sem); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(old_inode, new_dir->i_ino); - up_write(&F2FS_I(old_inode)->i_sem); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); old_dir->i_ctime = current_time(old_dir); if (old_nlink) { - down_write(&F2FS_I(old_dir)->i_sem); + f2fs_down_write(&F2FS_I(old_dir)->i_sem); 
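Note: across every file in this patch, bare down_read()/down_write()/up_*() calls on f2fs-owned locks become f2fs_down_*()/f2fs_up_*() (and node.c below adds f2fs_rwsem_is_locked()), which implies those locks now sit behind an f2fs-private wrapper type rather than a raw rw_semaphore. A minimal sketch of what such a wrapper could look like; the struct layout and the reader-wakeup detail are assumptions, not taken from these hunks:

    /* Assumed shape of the wrapper; the real definition is outside this diff. */
    struct f2fs_rwsem {
        struct rw_semaphore internal_rwsem;
        wait_queue_head_t read_waiters;     /* assumption: readers parked here */
    };

    static inline void f2fs_down_write(struct f2fs_rwsem *sem)
    {
        down_write(&sem->internal_rwsem);
    }

    static inline void f2fs_up_write(struct f2fs_rwsem *sem)
    {
        up_write(&sem->internal_rwsem);
        wake_up_all(&sem->read_waiters);    /* assumption: unpark waiting readers */
    }

    static inline bool f2fs_rwsem_is_locked(struct f2fs_rwsem *sem)
    {
        return rwsem_is_locked(&sem->internal_rwsem);
    }

Whatever the real implementation, funneling every acquire/release through one API is what lets f2fs change lock fairness or add instrumentation later without another tree-wide sweep like this one.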
 	f2fs_i_links_write(old_dir, old_nlink > 0);
-	up_write(&F2FS_I(old_dir)->i_sem);
+	f2fs_up_write(&F2FS_I(old_dir)->i_sem);
 	}
 	f2fs_mark_inode_dirty_sync(old_dir, false);
 	/* update directory entry info of new dir inode */
 	f2fs_set_link(new_dir, new_entry, new_page, old_inode);
-	down_write(&F2FS_I(new_inode)->i_sem);
+	f2fs_down_write(&F2FS_I(new_inode)->i_sem);
 	if (!new_dir_entry)
 		file_lost_pino(new_inode);
 	else
 		/* adjust dir's i_pino to pass fsck check */
 		f2fs_i_pino_write(new_inode, old_dir->i_ino);
-	up_write(&F2FS_I(new_inode)->i_sem);
+	f2fs_up_write(&F2FS_I(new_inode)->i_sem);
 	new_dir->i_ctime = current_time(new_dir);
 	if (new_nlink) {
-		down_write(&F2FS_I(new_dir)->i_sem);
+		f2fs_down_write(&F2FS_I(new_dir)->i_sem);
 		f2fs_i_links_write(new_dir, new_nlink > 0);
-		up_write(&F2FS_I(new_dir)->i_sem);
+		f2fs_up_write(&F2FS_I(new_dir)->i_sem);
 	}
 	f2fs_mark_inode_dirty_sync(new_dir, false);
@@ -1312,7 +1315,8 @@ static int f2fs_rename2(struct user_namespace *mnt_userns,
 	 * VFS has already handled the new dentry existence case,
 	 * here, we just deal with "RENAME_NOREPLACE" as regular rename.
 	 */
-	return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+	return f2fs_rename(mnt_userns, old_dir, old_dentry,
+				new_dir, new_dentry, flags);
 }
 static const char *f2fs_encrypted_get_link(struct dentry *dentry,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f810c6bbeff0..d44b386a37f3 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
 		f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.",
 			  __func__, nid);
+		f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
 		return -EFSCORRUPTED;
 	}
 	return 0;
@@ -59,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
 	avail_ram = val.totalram - val.totalhigh;
 	/*
-	 * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+	 * give 25%, 25%, 50%, 50%, 25%, 25% memory for each component respectively
 	 */
 	if (type == FREE_NIDS) {
 		mem_size = (nm_i->nid_cnt[FREE_NID] *
@@ -84,16 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
 				sizeof(struct ino_entry);
 		mem_size >>= PAGE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
-	} else if (type == EXTENT_CACHE) {
-		mem_size = (atomic_read(&sbi->total_ext_tree) *
+	} else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) {
+		enum extent_type etype = type == READ_EXTENT_CACHE ?
+ EX_READ : EX_BLOCK_AGE; + struct extent_tree_info *eti = &sbi->extent_tree[etype]; + + mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + - atomic_read(&sbi->total_ext_node) * + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; - res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == INMEM_PAGES) { - /* it allows 20% / total_ram for inmemory pages */ - mem_size = get_pages(sbi, F2FS_INMEM_PAGES); - res = mem_size < (val.totalram / 5); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; @@ -382,14 +383,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return need; } @@ -399,11 +400,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -413,13 +414,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return need_update; } @@ -430,11 +431,15 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; + /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ + if (f2fs_rwsem_is_locked(&sbi->cp_global_sem)) + return; + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) e = __init_nat_entry(nm_i, new, ne, false); @@ -443,7 +448,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); if (e != new) __free_nat_entry(new); } @@ -455,7 +460,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct nat_entry *e; struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = __init_nat_entry(nm_i, new, NULL, true); @@ -504,7 +509,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); } int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -512,7 +517,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int 
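
The mechanical down_read()/up_read() to f2fs_down_read()/f2fs_up_read() conversions above (and throughout the rest of the patch) move nat_tree_lock and its siblings onto a wrapper type. A minimal sketch of what that wrapper plausibly looks like; the field names are assumptions and the real definition lives in f2fs.h:

struct f2fs_rwsem {
        struct rw_semaphore internal_rwsem;
        wait_queue_head_t read_waiters;         /* assumed field name */
};

static inline void f2fs_down_read(struct f2fs_rwsem *sem)
{
        /* readers retry a trylock instead of queueing behind writers,
         * trading strict FIFO fairness for lower checkpoint latency */
        wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem));
}
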
nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - if (!down_write_trylock(&nm_i->nat_tree_lock)) + if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock)) return 0; spin_lock(&nm_i->nat_list_lock); @@ -534,12 +539,12 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } spin_unlock(&nm_i->nat_list_lock); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni) + struct node_info *ni, bool checkpoint_context) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -556,13 +561,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, ni->nid = nid; retry: /* Check nat cache */ - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return 0; } @@ -572,10 +577,11 @@ retry: * nat_tree_lock. Therefore, we should retry, if we failed to grab here * while not bothering checkpoint. */ - if (!rwsem_is_locked(&sbi->cp_global_sem)) { + if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { down_read(&curseg->journal_rwsem); - } else if (!down_read_trylock(&curseg->journal_rwsem)) { - up_read(&nm_i->nat_tree_lock); + } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) || + !down_read_trylock(&curseg->journal_rwsem)) { + f2fs_up_read(&nm_i->nat_tree_lock); goto retry; } @@ -586,13 +592,13 @@ retry: } up_read(&curseg->journal_rwsem); if (i >= 0) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); goto cache; } /* Fill node_info from nat page */ index = current_nat_addr(sbi, nid); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); page = f2fs_get_meta_page(sbi, index); if (IS_ERR(page)) @@ -857,7 +863,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + 1); - f2fs_update_extent_tree_range_compressed(dn->inode, + f2fs_update_read_extent_tree_range_compressed(dn->inode, index, blkaddr, F2FS_I(dn->inode)->i_cluster_size, c_len); @@ -887,7 +893,7 @@ static int truncate_node(struct dnode_of_data *dn) int err; pgoff_t index; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1286,7 +1292,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false); if (err) { dec_valid_node_count(sbi, dn->inode, !ofs); goto fail; @@ -1294,6 +1300,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) if (unlikely(new_ni.blk_addr != NULL_ADDR)) { err = -EFSCORRUPTED; set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } #endif @@ -1352,7 +1359,7 @@ static int read_node_page(struct page *page, int op_flags) return LOCKED_PAGE; } - err = f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni, false); if (err) return err; @@ -1367,7 +1374,7 @@ static int read_node_page(struct page *page, int op_flags) err = f2fs_submit_page_bio(&fio); if (!err) - f2fs_update_iostat(sbi, FS_NODE_READ_IO, 
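
f2fs_get_node_info() now refuses to sleep on the journal rwsem while a checkpoint holds cp_global_sem: it trylocks, and on contention drops nat_tree_lock entirely and starts over. The back-off shape, as a sketch with hypothetical names standing in for the two locks and the checkpoint test:

retry:
        f2fs_down_read(&outer);                 /* nat_tree_lock */
        if (!checkpoint_running || checkpoint_context)
                down_read(&inner);              /* journal_rwsem: safe to sleep */
        else if (f2fs_rwsem_is_contended(&outer) ||
                 !down_read_trylock(&inner)) {
                f2fs_up_read(&outer);           /* let the checkpoint make progress */
                goto retry;
        }
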
F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE); return err; } @@ -1414,8 +1421,7 @@ repeat: err = read_node_page(page, 0); if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + goto out_put_err; } else if (err == LOCKED_PAGE) { err = 0; goto page_hit; @@ -1441,19 +1447,23 @@ repeat: goto out_err; } page_hit: - if (unlikely(nid != nid_of_node(page))) { - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; out_err: - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(err); - } - return page; + ClearPageUptodate(page); +out_put_err: + /* ENOENT comes from read_node_page which is not an error. */ + if (err != -ENOENT) + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1603,21 +1613,21 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); - if (f2fs_get_node_info(sbi, nid, &ni)) + if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) + if (!f2fs_down_read_trylock(&sbi->node_write)) goto redirty_out; } else { - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); } /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); unlock_page(page); return 0; } @@ -1625,11 +1635,11 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (__is_valid_data_blkaddr(ni.blk_addr) && !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ @@ -1646,7 +1656,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); @@ -1780,6 +1790,7 @@ continue_unlock: if (!atomic || page == last_page) { set_fsync_mark(page, 1); + percpu_counter_inc(&sbi->rf_node_block_count); if (IS_INODE(page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) @@ -1943,7 +1954,6 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; bool submitted = false; - bool may_dirty = true; /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[NODE]) && @@ -1996,11 +2006,8 @@ continue_unlock: } /* flush dirty inode */ - if (IS_INODE(page) && may_dirty) { - may_dirty = false; - if (flush_dirty_inode(page)) - goto lock_node; - } + if 
(IS_INODE(page) && flush_dirty_inode(page)) + goto lock_node; write_node: f2fs_wait_on_page_writeback(page, NODE, true, true); @@ -2144,8 +2151,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (IS_INODE(page)) f2fs_inode_chksum_set(F2FS_P_SB(page), page); #endif - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + if (__set_page_dirty_nobuffers(page)) { inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); set_page_private_reference(page); return 1; @@ -2227,14 +2233,14 @@ bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) unsigned int i; bool ret = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) { ret = false; break; } } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return ret; } @@ -2417,7 +2423,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) unsigned int i, idx; nid_t nid; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) @@ -2440,7 +2446,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) out: scan_curseg_cache(sbi); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); } static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, @@ -2475,7 +2481,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); while (1) { if (!test_bit_le(NAT_BLOCK_OFFSET(nid), @@ -2490,7 +2496,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, } if (ret) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); return ret; } @@ -2510,7 +2516,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, /* find free nids from current sum_pages */ scan_curseg_cache(sbi); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -2708,7 +2714,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - err = f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni, false); if (err) return err; @@ -2748,7 +2754,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) struct page *ipage; int err; - err = f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni, false); if (err) return err; @@ -2955,7 +2961,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { unsigned int valid = 0, nid_ofs = 0; @@ -2975,7 +2981,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) __update_nat_bits(nm_i, nat_ofs, valid); } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -3073,15 +3079,15 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * nat_cnt[DIRTY_NAT]. 
 	 */
 	if (cpc->reason & CP_UMOUNT) {
-		down_write(&nm_i->nat_tree_lock);
+		f2fs_down_write(&nm_i->nat_tree_lock);
 		remove_nats_in_journal(sbi);
-		up_write(&nm_i->nat_tree_lock);
+		f2fs_up_write(&nm_i->nat_tree_lock);
 	}
 	if (!nm_i->nat_cnt[DIRTY_NAT])
 		return 0;
-	down_write(&nm_i->nat_tree_lock);
+	f2fs_down_write(&nm_i->nat_tree_lock);
 	/*
 	 * if there are no enough space in journal to store dirty nat
@@ -3110,7 +3116,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 			break;
 	}
-	up_write(&nm_i->nat_tree_lock);
+	f2fs_up_write(&nm_i->nat_tree_lock);
 	/* Allow dirty nats by node block allocation in write_begin */
 	return err;
@@ -3220,6 +3226,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	nm_i->ram_thresh = DEF_RAM_THRESHOLD;
 	nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
 	nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
+	nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
 	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
 	INIT_LIST_HEAD(&nm_i->free_nid_list);
@@ -3230,7 +3237,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	mutex_init(&nm_i->build_lock);
 	spin_lock_init(&nm_i->nid_list_lock);
-	init_rwsem(&nm_i->nat_tree_lock);
+	init_f2fs_rwsem(&nm_i->nat_tree_lock);
 	nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
 	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -3336,7 +3343,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
 	spin_unlock(&nm_i->nid_list_lock);
 	/* destroy nat cache */
-	down_write(&nm_i->nat_tree_lock);
+	f2fs_down_write(&nm_i->nat_tree_lock);
 	while ((found = __gang_lookup_nat_cache(nm_i,
 					nid, NATVEC_SIZE, natvec))) {
 		unsigned idx;
@@ -3366,7 +3373,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
 			kmem_cache_free(nat_entry_set_slab, setvec[idx]);
 		}
 	}
-	up_write(&nm_i->nat_tree_lock);
+	f2fs_up_write(&nm_i->nat_tree_lock);
 	kvfree(nm_i->nat_block_bitmap);
 	if (nm_i->free_nid_bitmap) {
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index ff14a6e5ac1c..99454d46a939 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -31,6 +31,9 @@
 /* control total # of nats */
 #define DEF_NAT_CACHE_THRESHOLD 100000
+/* control total # of node writes used for roll-forward recovery */
+#define DEF_RF_NODE_BLOCKS 0
+
 /* vector size for gang look-up from nat cache that consists of radix tree */
 #define NATVEC_SIZE 64
 #define SETVEC_SIZE 32
@@ -138,18 +141,13 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
 	return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD;
 }
-static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
-{
-	return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8;
-}
-
 enum mem_type {
 	FREE_NIDS,	/* indicates the free nid list */
 	NAT_ENTRIES,	/* indicates the cached nat entry */
 	DIRTY_DENTS,	/* indicates dirty dentry pages */
 	INO_ENTRIES,	/* indicates inode entries */
-	EXTENT_CACHE,	/* indicates extent cache */
-	INMEM_PAGES,	/* indicates inmemory pages */
+	READ_EXTENT_CACHE,	/* indicates read extent cache */
+	AGE_EXTENT_CACHE,	/* indicates age extent cache */
 	DISCARD_CACHE,	/* indicates memory of cached discard cmds */
 	COMPRESS_PAGE,	/* indicates memory of cached compressed pages */
 	BASE_CHECK,	/* check kernel status */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index ed21e34b59c7..9eb9ec9bddb2 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -55,6 +55,10 @@ bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
 	if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
 		return false;
+	if (NM_I(sbi)->max_rf_node_blocks &&
+
percpu_counter_sum_positive(&sbi->rf_node_block_count) >= + NM_I(sbi)->max_rf_node_blocks) + return false; return true; } @@ -342,6 +346,19 @@ static int recover_inode(struct inode *inode, struct page *page) return 0; } +static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, + unsigned int ra_blocks, unsigned int blkaddr, + unsigned int next_blkaddr) +{ + if (blkaddr + 1 == next_blkaddr) + ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS, + ra_blocks * 2); + else if (next_blkaddr % sbi->blocks_per_seg) + ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, + ra_blocks / 2); + return ra_blocks; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool check_only) { @@ -349,6 +366,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct page *page = NULL; block_t blkaddr; unsigned int loop_cnt = 0; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - valid_user_blocks(sbi); int err = 0; @@ -423,11 +441,14 @@ next: break; } + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - f2fs_ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } return err; } @@ -485,6 +506,7 @@ got_it: if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); return -EFSCORRUPTED; } @@ -604,7 +626,7 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) goto err; @@ -615,6 +637,7 @@ retry_dn: inode->i_ino, ofs_of_node(dn.node_page), ofs_of_node(page)); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } @@ -627,12 +650,14 @@ retry_dn: if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } if (__is_valid_data_blkaddr(dest) && !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } @@ -691,6 +716,8 @@ retry_prev: f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", dest, inode->i_ino, dn.ofs_in_node); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto err; } @@ -721,6 +748,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, struct page *page = NULL; int err = 0; block_t blkaddr; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -732,8 +760,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; - f2fs_ra_meta_pages_cond(sbi, blkaddr); - page = f2fs_get_tmp_page(sbi, blkaddr); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -776,9 +802,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, if (entry->blkaddr == blkaddr) list_move_tail(&entry->list, tmp_inode_list); next: + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); + + 
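
adjust_por_ra_blocks() above grows the recovery readahead window while the warm-node chain stays physically contiguous and shrinks it when the chain jumps inside a segment. A standalone model of the heuristic; RA_MIN/RA_MAX are illustrative stand-ins for RECOVERY_MIN_RA_BLOCKS/RECOVERY_MAX_RA_BLOCKS, whose values are not shown here:

#include <stdio.h>

#define RA_MIN 1                                /* assumed bound */
#define RA_MAX 8                                /* assumed bound */

static unsigned int adjust(unsigned int ra, unsigned int blkaddr,
                           unsigned int next, unsigned int blocks_per_seg)
{
        if (blkaddr + 1 == next)                /* contiguous chain: double */
                ra = ra * 2 > RA_MAX ? RA_MAX : ra * 2;
        else if (next % blocks_per_seg)         /* in-segment jump: halve */
                ra = ra / 2 < RA_MIN ? RA_MIN : ra / 2;
        return ra;
}

int main(void)
{
        unsigned int ra = RA_MAX;

        ra = adjust(ra, 100, 101, 512);         /* contiguous: stays at 8 */
        ra = adjust(ra, 101, 300, 512);         /* jump: halves to 4 */
        printf("ra=%u\n", ra);
        return 0;
}
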
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } if (!err) f2fs_allocate_new_segments(sbi); @@ -804,8 +835,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif @@ -815,7 +844,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->cp_global_sem); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); @@ -833,10 +862,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); - else { - /* restore s_flags to let iput() trash data */ - sbi->sb->s_flags = s_flags; - } + else + f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); skip: fix_curseg_write_pointer = !check_only || list_empty(&inode_list); @@ -866,7 +893,7 @@ skip: if (!err) clear_sbi_flag(sbi, SBI_POR_DOING); - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); @@ -896,9 +923,7 @@ int __init f2fs_create_recovery_cache(void) { fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; - return 0; + return fsync_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_recovery_cache(void) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 194c0811fbdf..6dbf8bfe512c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -28,7 +29,7 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; -static struct kmem_cache *inmem_entry_slab; +static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { @@ -183,312 +184,195 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void f2fs_register_inmem_page(struct inode *inode, struct page *page) +void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct inmem_pages *new; + struct f2fs_inode_info *fi = F2FS_I(inode); - set_page_private_atomic(page); + if (!f2fs_is_atomic_file(inode)) + return; - new = f2fs_kmem_cache_alloc(inmem_entry_slab, - GFP_NOFS, true, NULL); + clear_inode_flag(fi->cow_inode, FI_COW_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_inode(inode); - /* add atomic page indices to the list */ - new->page = page; - INIT_LIST_HEAD(&new->list); - - /* increase reference count with clean state */ - get_page(page); - mutex_lock(&F2FS_I(inode)->inmem_lock); - list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); - inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&F2FS_I(inode)->inmem_lock); - - trace_f2fs_register_inmem_page(page, INMEM); + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + 
f2fs_i_size_write(inode, fi->original_i_size); + } } -static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inmem_pages *cur, *tmp; - int err = 0; + struct dnode_of_data dn; + struct node_info ni; + int err; + +retry: + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; + } + return err; + } + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); + } + } else { + blkcnt_t count = 1; + + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } + + f2fs_put_dnode(&dn); + return 0; +} + +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) +{ + struct revoke_entry *cur, *tmp; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - struct page *page = cur->page; - - if (drop) - trace_f2fs_commit_inmem_page(page, INMEM_DROP); - - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. 
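
The rewritten commit path keeps a revoke list: before a block of the COW inode replaces a block of the original file, the old address is recorded so that a failed commit can be undone block by block. The bookkeeping amounts to roughly this (a sketch; the real struct revoke_entry is declared elsewhere and may carry more fields):

struct revoke_entry {
        struct list_head list;
        block_t old_addr;       /* the file's block address before the commit */
        pgoff_t index;          /* page index the block belongs to */
};

On success __complete_revoke_list() simply frees the entries, truncating leftover blocks when FI_ATOMIC_REPLACE is set; on failure it feeds each entry back through __replace_atomic_write_block() with recover set, restoring the old addresses.
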
- */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - if (recover) { - struct dnode_of_data dn; - struct node_info ni; - - trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); -retry: - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, page->index, - LOOKUP_NODE); - if (err) { - if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); - goto retry; - } - err = -EAGAIN; - goto next; - } - - err = f2fs_get_node_info(sbi, dn.nid, &ni); - if (err) { - f2fs_put_dnode(&dn); - return err; - } - - if (cur->old_addr == NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } else - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, - cur->old_addr, ni.version, true, true); - f2fs_put_dnode(&dn); - } -next: - /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) { - ClearPageUptodate(page); - clear_page_private_gcing(page); - } - detach_page_private(page); - set_page_private(page, 0); - f2fs_put_page(page, 1); + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + kmem_cache_free(revoke_entry_slab, cur); } - return err; + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, 0, false); } -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) -{ - struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; - struct inode *inode; - struct f2fs_inode_info *fi; - unsigned int count = sbi->atomic_files; - unsigned int looped = 0; -next: - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(head)) { - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - return; - } - fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); - inode = igrab(&fi->vfs_inode); - if (inode) - list_move_tail(&fi->inmem_ilist, head); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - if (inode) { - if (gc_failure) { - if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto skip; - } - set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_drop_inmem_pages(inode); -skip: - iput(inode); - } - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - cond_resched(); - if (gc_failure) { - if (++looped >= count) - return; - } - goto next; -} - -void f2fs_drop_inmem_pages(struct inode *inode) +static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - - do { - mutex_lock(&fi->inmem_lock); - if (list_empty(&fi->inmem_pages)) { - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - mutex_unlock(&fi->inmem_lock); - break; - } - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - mutex_unlock(&fi->inmem_lock); - } while (1); -} - -void f2fs_drop_inmem_page(struct inode *inode, struct page *page) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct list_head *head = &fi->inmem_pages; - struct inmem_pages *cur = NULL; - struct inmem_pages *tmp; - - f2fs_bug_on(sbi, !page_private_atomic(page)); - - mutex_lock(&fi->inmem_lock); - 
list_for_each_entry(tmp, head, list) { - if (tmp->page == page) { - cur = tmp; - break; - } - } - - f2fs_bug_on(sbi, !cur); - list_del(&cur->list); - mutex_unlock(&fi->inmem_lock); - - dec_page_count(sbi, F2FS_INMEM_PAGES); - kmem_cache_free(inmem_entry_slab, cur); - - ClearPageUptodate(page); - clear_page_private_atomic(page); - f2fs_put_page(page, 0); - - detach_page_private(page); - set_page_private(page, 0); - - trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); -} - -static int __f2fs_commit_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct inmem_pages *cur, *tmp; - struct f2fs_io_info fio = { - .sbi = sbi, - .ino = inode->i_ino, - .type = DATA, - .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_PRIO, - .io_type = FS_DATA_IO, - }; + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; struct list_head revoke_list; - bool submit_bio = false; - int err = 0; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; INIT_LIST_HEAD(&revoke_list); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - struct page *page = cur->page; + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); - lock_page(page); - if (page->mapping == inode->i_mapping) { - trace_f2fs_commit_inmem_page(page, INMEM); - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { - inode_dec_dirty_pages(inode); - f2fs_remove_dirty_inode(inode); - } -retry: - fio.page = page; - fio.old_blkaddr = NULL_ADDR; - fio.encrypted_page = NULL; - fio.need_lock = LOCK_DONE; - err = f2fs_do_write_data_page(&fio); - if (err) { - if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); - goto retry; - } - unlock_page(page); - break; - } - /* record old blkaddr for revoking */ - cur->old_addr = fio.old_blkaddr; - submit_bio = true; + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; } - unlock_page(page); - list_move_tail(&cur->list, &revoke_list); + + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto out; + } + + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); + + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } + + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; } - if (submit_bio) - f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); - - if (err) { - /* - * try to revoke all committed pages, but still we could fail - * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means in such case, transaction is - * already not integrity, caller should use 
journal to do the - * recovery or rewrite & commit last transaction. For other - * error number, revoking was done by filesystem itself. - */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); - - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); +out: + if (ret) { + sbi->revoked_atomic_block += fi->atomic_write_cnt; } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); + sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); } - return err; + __complete_revoke_list(inode, &revoke_list, ret ? true : false); + + return ret; } -int f2fs_commit_inmem_pages(struct inode *inode) +int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); - - down_write(&fi->i_gc_rwsem[WRITE]); + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); f2fs_lock_op(sbi); - set_inode_flag(inode, FI_ATOMIC_COMMIT); - mutex_lock(&fi->inmem_lock); - err = __f2fs_commit_inmem_pages(inode); - mutex_unlock(&fi->inmem_lock); - - clear_inode_flag(inode, FI_ATOMIC_COMMIT); + err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); - up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; } @@ -501,7 +385,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); } /* balance_fs_bg is able to be pending */ @@ -526,20 +410,52 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { - down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); + f2fs_gc(sbi, &gc_control); } } } +static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) +{ + int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 
3 : 2; + unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); + unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); + unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int threshold = sbi->blocks_per_seg * factor * + DEFAULT_DIRTY_THRESHOLD; + unsigned int global_threshold = threshold * 3 / 2; + + if (dents >= threshold || qdata >= threshold || + nodes >= threshold || meta >= threshold || + imeta >= threshold) + return true; + return dents + qdata + nodes + meta + imeta > global_threshold; +} + void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return; /* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); + + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) @@ -550,13 +466,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) else f2fs_build_free_nids(sbi, false, false); - if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) || - excess_prefree_segs(sbi)) + if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || + excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) goto do_sync; /* there is background inflight IO or foreground operation recently */ if (is_inflight_io(sbi, REQ_TIME) || - (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem))) + (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) return; /* exceed periodical checkpoint timeout threshold */ @@ -580,7 +496,7 @@ do_sync: mutex_unlock(&sbi->flush_lock); } - f2fs_sync_fs(sbi->sb, true); + f2fs_sync_fs(sbi->sb, 1); stat_inc_bg_cp_count(sbi->stat_info); } @@ -721,12 +637,11 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err = 0; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) - return err; + return 0; goto init_thread; } @@ -739,19 +654,19 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) - return err; + return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { - err = PTR_ERR(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->fcc_info = NULL; + int err = PTR_ERR(fcc->f2fs_issue_flush); + + fcc->f2fs_issue_flush = NULL; return err; } - return err; + return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) @@ -789,12 +704,12 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (ret && --count); if (ret) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); break; } @@ -835,7 +750,7 @@ static void 
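
excess_dirty_threshold() replaces the old dirty-node-pages check with one budget per dirty type plus a combined cap at 1.5x the per-type budget. Plugging in example numbers (blocks_per_seg and DEFAULT_DIRTY_THRESHOLD are assumed here; only the arithmetic comes from the patch):

#include <stdio.h>

int main(void)
{
        unsigned int blocks_per_seg = 512;      /* assumed */
        unsigned int dirty_threshold = 4;       /* stands in for DEFAULT_DIRTY_THRESHOLD */
        int cp_locked = 0;                      /* f2fs_rwsem_is_locked(&sbi->cp_rwsem) */
        int factor = cp_locked ? 3 : 2;
        unsigned int per_type = blocks_per_seg * factor * dirty_threshold;
        unsigned int global = per_type * 3 / 2;

        /* one type over per_type, or all five together over global,
         * pushes f2fs_balance_fs_bg() into a sync */
        printf("per-type=%u global=%u\n", per_type, global);
        return 0;
}
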
__locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, unlikely(!valid_blocks || - valid_blocks == BLKS_PER_SEC(sbi))); + valid_blocks == CAP_BLKS_PER_SEC(sbi))); if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); @@ -871,7 +786,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || - valid_blocks == BLKS_PER_SEC(sbi)) { + valid_blocks == CAP_BLKS_PER_SEC(sbi)) { clear_bit(secno, dirty_i->dirty_secmap); return; } @@ -957,7 +872,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) } mutex_unlock(&dirty_i->seglist_lock); - unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE]; + unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; @@ -1142,34 +1057,34 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->ordered = false; dpolicy->granularity = granularity; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->max_requests = dcc->max_discard_request; dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; + if (utilization(sbi) > dcc->discard_urgent_util) { + dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = - DEF_MIN_DISCARD_ISSUE_TIME; + dcc->min_discard_issue_time; } } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } @@ -1228,13 +1143,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; - goto submit; - } - err = __blkdev_issue_discard(bdev, + } else { + err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, 0, &bio); -submit: + } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) @@ -1272,7 +1186,7 @@ submit: atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); lstart += len; start += len; @@ -1445,13 +1359,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } -static int __queue_discard_cmd(struct f2fs_sb_info *sbi, +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t 
blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) - return 0; + return; trace_f2fs_queue_discard(bdev, blkstart, blklen); @@ -1463,7 +1377,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); - return 0; } static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, @@ -1551,7 +1464,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; @@ -1654,33 +1567,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - bool need_wait; + struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: - need_wait = false; + dc = NULL; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart + dc->len <= start || end <= dc->lstart) + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) continue; - if (dc->len < dpolicy->granularity) + if (iter->len < dpolicy->granularity) continue; - if (dc->state == D_DONE && !dc->ref) { - wait_for_completion_io(&dc->wait); - if (!dc->error) - trimmed += dc->len; - __remove_discard_cmd(sbi, dc); + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); } else { - dc->ref++; - need_wait = true; + iter->ref++; + dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { + if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } @@ -1749,6 +1661,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; + if (!atomic_read(&dcc->discard_cmd_cnt)) + return false; + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); @@ -1767,12 +1682,17 @@ static int issue_discard_thread(void *data) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; - unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + unsigned int wait_ms = dcc->min_discard_issue_time; int issued; set_freezable(); do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); @@ -1780,14 +1700,6 @@ static int issue_discard_thread(void *data) __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); - if (!atomic_read(&dcc->discard_cmd_cnt)) - wait_ms = dpolicy.max_interval; - - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, - msecs_to_jiffies(wait_ms)); - if (dcc->discard_wake) dcc->discard_wake = 0; @@ -1801,12 +1713,11 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; - if (is_sbi_flag_set(sbi, 
SBI_NEED_FSCK)) { + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } - if (!atomic_read(&dcc->discard_cmd_cnt)) - continue; sb_start_intwrite(sbi->sb); @@ -1821,6 +1732,8 @@ static int issue_discard_thread(void *data) } else { wait_ms = dpolicy.max_interval; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); @@ -1864,7 +1777,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } /* For conventional zones, use regular discard if supported */ - return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; } #endif @@ -1875,7 +1789,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __queue_discard_cmd(sbi, bdev, blkstart, blklen); + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -2152,6 +2067,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) @@ -2168,6 +2084,11 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; + dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; + dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; + dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; @@ -2198,8 +2119,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ - if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; @@ -2635,6 +2555,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) unsigned short seg_type = curseg->seg_type; sanity_check_seg_type(sbi, seg_type); + if (f2fs_need_rand_seg(sbi)) + return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ if (__is_large_section(sbi)) @@ -2686,6 +2608,9 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + curseg->fragment_remained_chunk = + prandom_u32() % sbi->max_fragment_chunk + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2712,12 +2637,22 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi, static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { - if (seg->alloc_type == SSR) + if (seg->alloc_type == SSR) { seg->next_blkoff = __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); - else + } else { seg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) { + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk <= 0) { + seg->fragment_remained_chunk = + prandom_u32() % sbi->max_fragment_chunk + 1; + seg->next_blkoff += + prandom_u32() % sbi->max_fragment_hole + 1; + } + } + } } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) @@ -2729,7 +2664,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2737,9 +2672,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) struct f2fs_summary_block *sum_node; struct page *sum_page; - if (flush) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -2778,7 +2711,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; @@ -2794,7 +2727,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) if (!sbi->am.atgc_enabled) return; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); @@ -2804,7 +2737,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) up_write(&SIT_I(sbi)->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) @@ -2922,31 +2855,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, return 0; } -/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) - new_curseg(sbi, type, true); - else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - curseg->seg_type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && - 
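
Under FS_MODE_FRAGMENT_BLK the LFS allocator deliberately fragments its own writes: each new current segment draws a random chunk budget, and whenever the budget is exhausted the next block offset skips a random hole before drawing a new one. A user-space model of the pattern, with the max_fragment_chunk/max_fragment_hole tunables given assumed values:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        unsigned int max_chunk = 4, max_hole = 3;       /* assumed tunables */
        unsigned int blkoff = 0;
        int chunk = rand() % max_chunk + 1;             /* fragment_remained_chunk */

        for (int i = 0; i < 12; i++) {
                printf("alloc blkoff %u\n", blkoff);
                blkoff++;                               /* plain LFS bump */
                if (--chunk <= 0) {                     /* budget spent: new chunk + hole */
                        chunk = rand() % max_chunk + 1;
                        blkoff += rand() % max_hole + 1;
                }
        }
        return 0;
}
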
is_next_segment_free(sbi, curseg, type) && - likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && - get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); - - stat_inc_seg_type(sbi, curseg); + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; } void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, @@ -2955,7 +2877,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); @@ -2964,7 +2886,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, true); @@ -2979,7 +2901,7 @@ unlock: type, segno, curseg->segno); mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, @@ -2999,7 +2921,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + new_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } @@ -3011,29 +2934,25 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi, void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); __allocate_new_section(sbi, type, force); up_write(&SIT_I(sbi)->sentry_lock); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; - bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { @@ -3106,7 +3025,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: @@ -3165,9 +3084,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); if (err) goto out; @@ -3214,101 +3133,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint) } } -/* This 
returns write hints for each segment type. This hints will be
- * passed down to block layer. There are mapping tables which depend on
- * the mount option 'whint_mode'.
- *
- * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
- *
- * 2) whint_mode=user-based. F2FS tries to pass down hints given by users.
- *
- * User                  F2FS                     Block
- * ----                  ----                     -----
- *                       META                     WRITE_LIFE_NOT_SET
- *                       HOT_NODE                 "
- *                       WARM_NODE                "
- *                       COLD_NODE                "
- * ioctl(COLD)           COLD_DATA                WRITE_LIFE_EXTREME
- * extension list        "                        "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE       "                        "
- * WRITE_LIFE_MEDIUM     "                        "
- * WRITE_LIFE_LONG       "                        "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
- *
- * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
- *
- * User                  F2FS                     Block
- * ----                  ----                     -----
- *                       META                     WRITE_LIFE_MEDIUM;
- *                       HOT_NODE                 WRITE_LIFE_NOT_SET
- *                       WARM_NODE                "
- *                       COLD_NODE                WRITE_LIFE_NONE
- * ioctl(COLD)           COLD_DATA                WRITE_LIFE_EXTREME
- * extension list        "                        "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_LONG
- * WRITE_LIFE_NONE       "                        "
- * WRITE_LIFE_MEDIUM     "                        "
- * WRITE_LIFE_LONG       "                        "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
- */
-
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
-		enum page_type type, enum temp_type temp)
-{
-	if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) {
-		if (type == DATA) {
-			if (temp == WARM)
-				return WRITE_LIFE_NOT_SET;
-			else if (temp == HOT)
-				return WRITE_LIFE_SHORT;
-			else if (temp == COLD)
-				return WRITE_LIFE_EXTREME;
-		} else {
-			return WRITE_LIFE_NOT_SET;
-		}
-	} else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) {
-		if (type == DATA) {
-			if (temp == WARM)
-				return WRITE_LIFE_LONG;
-			else if (temp == HOT)
-				return WRITE_LIFE_SHORT;
-			else if (temp == COLD)
-				return WRITE_LIFE_EXTREME;
-		} else if (type == NODE) {
-			if (temp == WARM || temp == HOT)
-				return WRITE_LIFE_NOT_SET;
-			else if (temp == COLD)
-				return WRITE_LIFE_NONE;
-		} else if (type == META) {
-			return WRITE_LIFE_MEDIUM;
-		}
-	}
-	return WRITE_LIFE_NOT_SET;
-}
-
 static int __get_segment_type_2(struct f2fs_io_info *fio)
 {
 	if (fio->type == DATA)
@@ -3334,10 +3158,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio)
 	}
 }
 
+static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_info ei = {};
+
+	if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) {
+		if (!ei.age)
+			return NO_CHECK_TYPE;
+		if (ei.age <= sbi->hot_data_age_threshold)
+			return CURSEG_HOT_DATA;
+		if (ei.age <= sbi->warm_data_age_threshold)
+			return CURSEG_WARM_DATA;
+		return CURSEG_COLD_DATA;
+	}
+	return NO_CHECK_TYPE;
+}
+
 static int __get_segment_type_6(struct f2fs_io_info *fio)
 {
 	if (fio->type == DATA) {
 		struct inode *inode = fio->page->mapping->host;
+		int type;
 
 		if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
return CURSEG_COLD_DATA_PINNED; @@ -3352,10 +3194,14 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->page->index); + if (type != NO_CHECK_TYPE) + return type; + if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_cow_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { @@ -3404,7 +3250,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); @@ -3449,11 +3295,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, update_sit_entry(sbi, old_blkaddr, -1); if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + /* + * Flush out current segment and replace it with new segment. + */ + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - sit_i->s_ops->allocate_segment(sbi, type, false); + } else { + if (need_new_seg(sbi, type)) + new_curseg(sbi, type, false); + else + change_curseg(sbi, type); + stat_inc_seg_type(sbi, curseg); + } } /* * segment dirty status should be updated after segment allocation, @@ -3463,6 +3317,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + if (IS_DATASEG(type)) + atomic64_inc(&sbi->allocated_data_blocks); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { @@ -3487,27 +3344,33 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static void update_device_state(struct f2fs_io_info *fio) +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt) { - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int devidx; - if (!f2fs_is_multi_device(sbi)) return; - devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + while (1) { + unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); + unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; - /* update device state for fsync */ - f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + /* update device state for fsync */ + f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); - /* update device state for checkpoint */ - if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { - spin_lock(&sbi->dev_lock); - f2fs_set_bit(devidx, (char *)&sbi->dirty_device); - spin_unlock(&sbi->dev_lock); + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + if (blkcnt <= blks) + break; + blkcnt -= blks; + blkaddr += blks; } } @@ -3517,7 +3380,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); if (keep_order) - down_read(&fio->sbi->io_order_lock); + f2fs_down_read(&fio->sbi->io_order_lock); reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, 
fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); @@ -3534,10 +3397,10 @@ reallocate: goto reallocate; } - update_device_state(fio); + f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); if (keep_order) - up_read(&fio->sbi->io_order_lock); + f2fs_up_read(&fio->sbi->io_order_lock); } void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, @@ -3564,7 +3427,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); - f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -3574,7 +3437,7 @@ void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, @@ -3584,11 +3447,13 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); - f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) @@ -3608,6 +3473,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } @@ -3616,6 +3482,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) goto drop_bio; } + if (fio->post_read) + invalidate_mapping_pages(META_MAPPING(sbi), + fio->new_blkaddr, fio->new_blkaddr); + stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) @@ -3623,8 +3493,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) else err = f2fs_submit_page_bio(fio); if (!err) { - update_device_state(fio); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_device_state(fio->sbi, fio->ino, + fio->new_blkaddr, 1); + f2fs_update_iostat(fio->sbi, fio->page->mapping->host, + fio->io_type, F2FS_BLKSIZE); } return err; @@ -3668,7 +3540,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, se = get_seg_entry(sbi, segno); type = se->type; - down_write(&SM_I(sbi)->curseg_lock); + f2fs_down_write(&SM_I(sbi)->curseg_lock); if (!recover_curseg) { /* for recovery flow */ @@ -3701,7 +3573,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3729,7 +3601,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; @@ -3737,7 +3609,7 @@ void 
f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - up_write(&SM_I(sbi)->curseg_lock); + f2fs_up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -3795,10 +3667,16 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; + if (!f2fs_post_read_required(inode)) + return; + for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); + + invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) @@ -4142,10 +4020,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses, return; list_for_each_entry_continue(next, head, set_list) - if (ses->entry_cnt <= next->entry_cnt) - break; + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } - list_move_tail(&ses->set_list, &next->set_list); + list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) @@ -4418,9 +4298,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; - sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; @@ -4542,6 +4419,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } @@ -4578,6 +4457,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } @@ -4597,6 +4477,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } @@ -4628,6 +4509,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } @@ -4636,6 +4518,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } @@ -4673,7 +4556,6 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; block_t valid_blocks, usable_blks_in_seg; - block_t blks_per_sec = BLKS_PER_SEC(sbi); while (1) { /* find dirty segment based on free segmap */ @@ -4702,7 +4584,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); - if (!valid_blocks || valid_blocks == blks_per_sec) + if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; if (IS_CURSEC(sbi, secno)) continue; @@ -4719,6 
+4601,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; return 0; } @@ -4780,6 +4669,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } @@ -4797,6 +4687,7 @@ out: "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } @@ -5078,7 +4969,7 @@ static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno, static inline unsigned int f2fs_usable_zone_segs_in_sec( struct f2fs_sb_info *sbi, unsigned int segno) { - unsigned int dev_idx, zone_idx, unusable_segs_in_sec; + unsigned int dev_idx, zone_idx; dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno)); zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx); @@ -5087,18 +4978,12 @@ static inline unsigned int f2fs_usable_zone_segs_in_sec( if (is_conv_zone(sbi, zone_idx, dev_idx)) return sbi->segs_per_sec; - /* - * If the zone_capacity_blocks array is NULL, then zone capacity - * is equal to the zone size for all zones - */ - if (!FDEV(dev_idx).zone_capacity_blocks) + if (!sbi->unusable_blocks_per_sec) return sbi->segs_per_sec; /* Get the segment count beyond zone capacity block */ - unusable_segs_in_sec = (sbi->blocks_per_blkz - - FDEV(dev_idx).zone_capacity_blocks[zone_idx]) >> - sbi->log_blocks_per_seg; - return sbi->segs_per_sec - unusable_segs_in_sec; + return sbi->segs_per_sec - (sbi->unusable_blocks_per_sec >> + sbi->log_blocks_per_seg); } /* @@ -5127,12 +5012,11 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( if (is_conv_zone(sbi, zone_idx, dev_idx)) return sbi->blocks_per_seg; - if (!FDEV(dev_idx).zone_capacity_blocks) + if (!sbi->unusable_blocks_per_sec) return sbi->blocks_per_seg; sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); - sec_cap_blkaddr = sec_start_blkaddr + - FDEV(dev_idx).zone_capacity_blocks[zone_idx]; + sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); /* * If segment starts before zone capacity and spans beyond @@ -5252,13 +5136,11 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sm_info->sit_entry_set); - init_rwsem(&sm_info->curseg_lock); + init_f2fs_rwsem(&sm_info->curseg_lock); - if (!f2fs_readonly(sbi->sb)) { - err = f2fs_create_flush_cmd_control(sbi); - if (err) - return err; - } + err = f2fs_create_flush_cmd_control(sbi); + if (err) + return err; err = create_discard_cmd_control(sbi); if (err) @@ -5307,6 +5189,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } @@ -5417,9 +5300,9 @@ int __init f2fs_create_segment_manager_caches(void) if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry", - sizeof(struct inmem_pages)); - if (!inmem_entry_slab) + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct 
revoke_entry)); + if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; @@ -5438,5 +5321,5 @@ void f2fs_destroy_segment_manager_caches(void) kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); - kmem_cache_destroy(inmem_entry_slab); + kmem_cache_destroy(revoke_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 957edb6d70d7..3ad1b7b6fa94 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,6 +101,9 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define CAP_BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ + (sbi)->unusable_blocks_per_sec) #define GET_SEC_FROM_SEG(sbi, segno) \ (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ @@ -219,21 +222,15 @@ struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; -struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); -}; - #define MAX_SKIP_GC_COUNT 16 -struct inmem_pages { +struct revoke_entry { struct list_head list; - struct page *page; block_t old_addr; /* for revoking when fail to commit */ + pgoff_t index; }; struct sit_info { - const struct segment_allocation *s_ops; - block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ @@ -295,6 +292,9 @@ struct dirty_seglist_info { struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ + unsigned long *pinned_secmap; /* pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ }; /* victim selection function for cleaning and SSR */ @@ -315,6 +315,7 @@ struct curseg_info { unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ bool inited; /* indicate inmem log is inited */ }; @@ -605,10 +606,10 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); - unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); - unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); - unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -659,7 +660,9 @@ static inline int utilization(struct f2fs_sb_info *sbi) * pages over min_fsync_blocks. (=default option) * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. * F2FS_IPU_NOCACHE - disable IPU bio cache. - * F2FS_IPUT_DISABLE - disable IPU. 
(=default option in LFS mode) + * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has + * FI_OPU_WRITE flag. + * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 @@ -675,6 +678,7 @@ enum { F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, F2FS_IPU_NOCACHE, + F2FS_IPU_HONOR_OPU_WRITE, }; static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, @@ -743,6 +747,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } @@ -757,6 +762,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index dd3c3c7a90ec..83d6fb97dcae 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) return count > 0 ? count : 0; } -static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) { - return atomic_read(&sbi->total_zombie_tree) + - atomic_read(&sbi->total_ext_node); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, @@ -53,8 +56,11 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* count extent cache entries */ - count += __count_extent_cache(sbi); + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); + + /* count block age extent cache entries */ + count += __count_extent_cache(sbi, EX_BLOCK_AGE); /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -100,7 +106,10 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; /* shrink extent cache entries */ - freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2); + + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2); /* shrink clean nat cache entries */ if (freed < nr) @@ -130,7 +139,9 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { - f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + f2fs_shrink_age_extent_tree(sbi, + __count_extent_cache(sbi, EX_BLOCK_AGE)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f4e8de1f4789..76ce1d948047 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -59,6 +61,8 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR] = "invalid blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int 
rate, @@ -108,6 +112,7 @@ enum { Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, + Opt_barrier, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, @@ -137,7 +142,6 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, - Opt_whint, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -159,6 +163,8 @@ enum { Opt_gc_merge, Opt_nogc_merge, Opt_discard_unit, + Opt_memory_mode, + Opt_age_extent_cache, Opt_err, }; @@ -184,6 +190,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, @@ -213,7 +220,6 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, @@ -236,6 +242,8 @@ static match_table_t f2fs_tokens = { {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, + {Opt_memory_mode, "memory=%s"}, + {Opt_age_extent_cache, "age_extent_cache"}, {Opt_err, NULL}, }; @@ -290,9 +298,7 @@ static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", F2FS_NAME_LEN); - if (!f2fs_cf_name_slab) - return -ENOMEM; - return 0; + return f2fs_cf_name_slab ? 0 : -ENOMEM; } static void f2fs_destroy_casefold_cache(void) @@ -500,9 +506,19 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); -#ifdef CONFIG_FS_ENCRYPTION + struct fs_parameter param = { + .type = fs_value_is_string, + .string = arg->from ? arg->from : "", + }; + struct fscrypt_dummy_policy *policy = + &F2FS_OPTION(sbi).dummy_enc_policy; int err; + if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; + } + if (!f2fs_sb_has_encrypt(sbi)) { f2fs_err(sbi, "Encrypt feature is off"); return -EINVAL; @@ -514,12 +530,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. 
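 * (Editorial note, not part of the original patch: the hunk below swaps
 * the combined fscrypt_set_test_dummy_encryption() helper for the split
 * fscrypt_parse_test_dummy_encryption() + fscrypt_add_test_dummy_key()
 * pair, so parsing the mount option and inserting the test key become
 * separate steps whose failures are reported independently.)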
*/ - if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) { + if (is_remount && !fscrypt_is_dummy_policy_set(policy)) { f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); return -EINVAL; } - err = fscrypt_set_test_dummy_encryption( - sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy); + + err = fscrypt_parse_test_dummy_encryption(¶m, policy); if (err) { if (err == -EEXIST) f2fs_warn(sbi, @@ -532,10 +548,13 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, opt, err); return -EINVAL; } + err = fscrypt_add_test_dummy_key(sb, policy); + if (err) { + f2fs_warn(sbi, "Error adding test dummy encryption key [%d]", + err); + return err; + } f2fs_warn(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_warn(sbi, "Test dummy encryption mount option ignored"); -#endif return 0; } @@ -798,14 +817,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; + case Opt_barrier: + clear_opt(sbi, NOBARRIER); + break; case Opt_fastboot: set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - clear_opt(sbi, EXTENT_CACHE); + clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); @@ -858,6 +880,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + } else if (!strcmp(name, "fragment:segment")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; + } else if (!strcmp(name, "fragment:block")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; } else { kfree(name); return -EINVAL; @@ -977,22 +1003,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "quota operations not supported"); break; #endif - case Opt_whint: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "user-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; - } else if (!strcmp(name, "off")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - } else if (!strcmp(name, "fs-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; case Opt_alloc: name = match_strdup(&args[0]); if (!name) @@ -1241,6 +1251,25 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_memory_mode: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "normal")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_NORMAL; + } else if (!strcmp(name, "low")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_LOW; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_age_extent_cache: + set_opt(sbi, AGE_EXTENT_CACHE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1330,11 +1359,15 @@ default_check: return -EINVAL; } - /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_PERSIST_TYPE. 
- */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE) - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with ATGC"); + return -EINVAL; + } + + if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { + f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); + return -EINVAL; + } if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); @@ -1357,16 +1390,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); atomic_set(&fi->i_compr_blocks, 0); - init_rwsem(&fi->i_sem); + init_f2fs_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); - INIT_LIST_HEAD(&fi->inmem_ilist); - INIT_LIST_HEAD(&fi->inmem_pages); - mutex_init(&fi->inmem_lock); - init_rwsem(&fi->i_gc_rwsem[READ]); - init_rwsem(&fi->i_gc_rwsem[WRITE]); - init_rwsem(&fi->i_xattr_sem); + init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); + init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); + init_f2fs_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; @@ -1404,9 +1434,7 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - /* some remained atomic pages should discarded */ - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1503,8 +1531,9 @@ static void f2fs_free_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); + percpu_counter_destroy(&sbi->rf_node_block_count); + percpu_counter_destroy(&sbi->alloc_valid_block_count); } static void destroy_device_list(struct f2fs_sb_info *sbi) @@ -1515,7 +1544,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) blkdev_put(FDEV(i).bdev, FMODE_EXCL); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); - kfree(FDEV(i).zone_capacity_blocks); #endif } kvfree(sbi->devs); @@ -1557,8 +1585,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_issue_discard_timeout(sbi); - if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && - !sbi->discard_blks && !dropped) { + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -1727,18 +1754,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - spin_lock(&sbi->stat_lock); if 
(unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else @@ -1751,14 +1783,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; - buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_ffree = min(avail_node_count - total_valid_node_count, buf->f_bavail); } @@ -1922,16 +1952,22 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_dentry"); else seq_puts(seq, ",noinline_dentry"); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + if (test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + else + seq_puts(seq, ",noflush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); + else + seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); - if (test_opt(sbi, EXTENT_CACHE)) + if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, AGE_EXTENT_CACHE)) + seq_puts(seq, ",age_extent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); @@ -1940,6 +1976,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "adaptive"); else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) seq_puts(seq, "lfs"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG) + seq_puts(seq, "fragment:segment"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", @@ -1970,10 +2010,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) - seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) - seq_printf(seq, ",whint_mode=%s", "fs-based"); fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); @@ -2013,6 +2049,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) seq_printf(seq, ",discard_unit=%s", "section"); + if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) + seq_printf(seq, ",memory=%s", "normal"); + else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) + seq_printf(seq, ",memory=%s", "low"); + return 0; } @@ -2025,8 +2066,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= + SMALL_VOLUME_SEGMENTS) + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + else + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); @@ -2035,19 +2079,21 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).compress_ext_cnt = 0; 
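/*
 * Editorial sketch, not part of the patch: the default_options() hunk
 * above now derives alloc_mode from the volume size recorded in the raw
 * superblock instead of unconditionally using ALLOC_MODE_DEFAULT. As a
 * standalone helper it would read roughly as below; the helper name
 * f2fs_default_alloc_mode() is illustrative only, while the fields and
 * constants are the ones visible in the diff.
 */
static int f2fs_default_alloc_mode(struct f2fs_super_block *raw_super)
{
	/* small volumes benefit from SSR-style segment reuse */
	if (le32_to_cpu(raw_super->segment_count_main) <=
						SMALL_VOLUME_SEGMENTS)
		return ALLOC_MODE_REUSE;

	return ALLOC_MODE_DEFAULT;
}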
F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; sbi->sb->s_flags &= ~SB_INLINECRYPT; set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - set_opt(sbi, FLUSH_MERGE); + if (!f2fs_is_readonly(sbi)) + set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { @@ -2076,6 +2122,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; + unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; @@ -2086,11 +2133,25 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } sbi->sb->s_flags |= SB_ACTIVE; + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + f2fs_update_time(sbi, DISABLE_TIME); + sbi->gc_mode = GC_URGENT_HIGH; + while (!f2fs_time_over(sbi, DISABLE_TIME)) { - down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 1 }; + + f2fs_down_write(&sbi->gc_lock); + err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; @@ -2111,7 +2172,8 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } - down_write(&sbi->gc_lock); +skip_gc: + f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); err = f2fs_write_checkpoint(sbi, &cpc); @@ -2123,8 +2185,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); out_unlock: - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); restore_flag: + sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } @@ -2136,19 +2199,18 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* we should flush all the data to keep data consistency */ do { sync_inodes_sb(sbi->sb); - cond_resched(); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); if (unlikely(retry < 0)) f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); @@ -2166,14 +2228,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; - bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); bool no_discard = 
!test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); - struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2256,11 +2318,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } /* disallow enable/disable extent_cache dynamically */ - if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; } + /* disallow enable/disable age extent_cache dynamically */ + if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); + goto restore_opts; + } if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { err = -EINVAL; @@ -2305,8 +2373,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY || - F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { + if (*flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2355,10 +2422,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_flush; need_stop_discard = true; } else { - dcc = SM_I(sbi)->dcc_info; f2fs_stop_discard_thread(sbi); - if (atomic_read(&dcc->discard_cmd_cnt)) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); need_restart_discard = true; } } @@ -2442,7 +2507,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t toread; loff_t i_size = i_size_read(inode); struct page *page; - char *kaddr; if (off > i_size) return 0; @@ -2476,9 +2540,7 @@ repeat: return -EIO; } - kaddr = kmap_atomic(page); - memcpy(data, kaddr + offset, tocopy); - kunmap_atomic(kaddr); + memcpy_from_page(data, page, offset, tocopy); f2fs_put_page(page, 1); offset = 0; @@ -2500,7 +2562,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct page *page; void *fsdata = NULL; - char *kaddr; int err = 0; int tocopy; @@ -2512,18 +2573,14 @@ retry: &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); break; } - kaddr = kmap_atomic(page); - memcpy(kaddr + offset, data, tocopy); - kunmap_atomic(kaddr); - flush_dcache_page(page); + memcpy_to_page(page, offset, data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, page, fsdata); @@ -2716,18 +2773,18 @@ int f2fs_quota_sync(struct super_block *sb, int type) /* * do_quotactl * f2fs_quota_sync - * down_read(quota_sem) + * f2fs_down_read(quota_sem) * dquot_writeback_dquots() * f2fs_dquot_commit * block_operation - * down_read(quota_sem) + * f2fs_down_read(quota_sem) */ f2fs_lock_op(sbi); - down_read(&sbi->quota_sem); + f2fs_down_read(&sbi->quota_sem); ret = f2fs_quota_sync_file(sbi, cnt); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); if (!f2fs_sb_has_quota_ino(sbi)) @@ -2853,11 +2910,11 @@ static int f2fs_dquot_commit(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); + f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); ret = dquot_commit(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); + 
f2fs_up_read(&sbi->quota_sem); return ret; } @@ -2866,11 +2923,11 @@ static int f2fs_dquot_acquire(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read(&sbi->quota_sem); + f2fs_down_read(&sbi->quota_sem); ret = dquot_acquire(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); return ret; } @@ -3018,23 +3075,24 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, *lblk_bits_ret = 8 * sizeof(block_t); } -static int f2fs_get_num_devices(struct super_block *sb) -{ - struct f2fs_sb_info *sbi = F2FS_SB(sb); - - if (f2fs_is_multi_device(sbi)) - return sbi->s_ndevs; - return 1; -} - -static void f2fs_get_devices(struct super_block *sb, - struct request_queue **devs) +static struct block_device **f2fs_get_devices(struct super_block *sb, + unsigned int *num_devs) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct block_device **devs; int i; + if (!f2fs_is_multi_device(sbi)) + return NULL; + + devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); + for (i = 0; i < sbi->s_ndevs; i++) - devs[i] = bdev_get_queue(FDEV(i).bdev); + devs[i] = FDEV(i).bdev; + *num_devs = sbi->s_ndevs; + return devs; } static const struct fscrypt_operations f2fs_cryptops = { @@ -3043,10 +3101,8 @@ static const struct fscrypt_operations f2fs_cryptops = { .set_context = f2fs_set_context, .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, - .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, - .get_num_devices = f2fs_get_num_devices, .get_devices = f2fs_get_devices, }; #endif @@ -3585,11 +3641,16 @@ static void init_sb_info(struct f2fs_sb_info *sbi) F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->gc_mode = GC_NORMAL; sbi->next_victim_seg[BG_GC] = NULL_SEGNO; sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; sbi->migration_granularity = sbi->segs_per_sec; sbi->seq_file_ra_mul = MIN_RA_MUL; + sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; + sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; + spin_lock_init(&sbi->gc_remaining_trials_lock); + atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -3609,14 +3670,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - init_rwsem(&sbi->io_order_lock); + init_f2fs_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); - init_rwsem(&sbi->sb_lock); - init_rwsem(&sbi->pin_sem); + init_f2fs_rwsem(&sbi->sb_lock); + init_f2fs_rwsem(&sbi->pin_sem); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -3627,35 +3688,49 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) if (err) return err; + err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL); + if (err) + goto err_valid_block; + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, GFP_KERNEL); if (err) - percpu_counter_destroy(&sbi->alloc_valid_block_count); + goto err_node_block; + return 0; +err_node_block: + percpu_counter_destroy(&sbi->rf_node_block_count); +err_valid_block: + percpu_counter_destroy(&sbi->alloc_valid_block_count); return err; } #ifdef CONFIG_BLK_DEV_ZONED struct f2fs_report_zones_args { + struct 
f2fs_sb_info *sbi; struct f2fs_dev_info *dev; - bool zone_cap_mismatch; }; static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct f2fs_report_zones_args *rz_args = data; + block_t unusable_blocks = (zone->len - zone->capacity) >> + F2FS_LOG_SECTORS_PER_BLOCK; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return 0; set_bit(idx, rz_args->dev->blkz_seq); - rz_args->dev->zone_capacity_blocks[idx] = zone->capacity >> - F2FS_LOG_SECTORS_PER_BLOCK; - if (zone->len != zone->capacity && !rz_args->zone_cap_mismatch) - rz_args->zone_cap_mismatch = true; - + if (!rz_args->sbi->unusable_blocks_per_sec) { + rz_args->sbi->unusable_blocks_per_sec = unusable_blocks; + return 0; + } + if (rz_args->sbi->unusable_blocks_per_sec != unusable_blocks) { + f2fs_err(rz_args->sbi, "F2FS supports single zone capacity\n"); + return -EINVAL; + } return 0; } @@ -3664,22 +3739,29 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; - if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) { + f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + } + + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(zone_sectors)) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, @@ -3689,26 +3771,13 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (!FDEV(devi).blkz_seq) return -ENOMEM; - /* Get block zones type and zone-capacity */ - FDEV(devi).zone_capacity_blocks = f2fs_kzalloc(sbi, - FDEV(devi).nr_blkz * sizeof(block_t), - GFP_KERNEL); - if (!FDEV(devi).zone_capacity_blocks) - return -ENOMEM; - + rep_zone_arg.sbi = sbi; rep_zone_arg.dev = &FDEV(devi); - rep_zone_arg.zone_cap_mismatch = false; ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, &rep_zone_arg); if (ret < 0) return ret; - - if (!rep_zone_arg.zone_cap_mismatch) { - kfree(FDEV(devi).zone_capacity_blocks); - FDEV(devi).zone_capacity_blocks = NULL; - } - return 0; } #endif @@ -3810,10 +3879,73 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int err; + + f2fs_down_write(&sbi->sb_lock); + + if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", + reason, err); + f2fs_up_write(&sbi->sb_lock); +} + +static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + spin_lock(&sbi->error_lock); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); 
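	/*
	 * Editorial note, not part of the patch: f2fs_save_errors() only
	 * latches the error bit in the in-core sbi->errors bitmap under
	 * error_lock; error_dirty records that the on-disk copy is stale.
	 * f2fs_handle_error() below then copies the bitmap into the raw
	 * superblock's s_errors via f2fs_update_errors() and commits it
	 * with f2fs_commit_super() outside the spinlock, under sb_lock.
	 */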
+ sbi->error_dirty = true; + } + spin_unlock(&sbi->error_lock); +} + +static bool f2fs_update_errors(struct f2fs_sb_info *sbi) +{ + bool need_update = false; + + spin_lock(&sbi->error_lock); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + need_update = true; + } + spin_unlock(&sbi->error_lock); + + return need_update; +} + +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + int err; + + f2fs_save_errors(sbi, error); + + f2fs_down_write(&sbi->sb_lock); + + if (!f2fs_update_errors(sbi)) + goto out_unlock; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); +out_unlock: + f2fs_up_write(&sbi->sb_lock); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; + unsigned int logical_blksize; int i; /* Initialize single device information */ @@ -3834,6 +3966,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) if (!sbi->devs) return -ENOMEM; + logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); + sbi->aligned_blksize = true; + for (i = 0; i < max_devices; i++) { if (i > 0 && !RDEV(i).path[0]) @@ -3870,6 +4005,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* to release errored devices */ sbi->s_ndevs = i + 1; + if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev)) + sbi->aligned_blksize = false; + #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && !f2fs_sb_has_blkzoned(sbi)) { @@ -3944,17 +4082,16 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_i = SM_I(sbi); - /* adjust parameters according to the volume size */ - if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + SM_I(sbi)->dcc_info->discard_granularity = + MIN_DISCARD_GRANULARITY; + SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | + 1 << F2FS_IPU_HONOR_OPU_WRITE; } - sbi->readdir_ra = 1; + sbi->readdir_ra = true; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -3982,6 +4119,24 @@ try_onemore: sbi->sb = sb; + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -4005,6 +4160,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -4061,44 +4218,14 @@ try_onemore: /* init f2fs-specific super block info */ 
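/*
 * Editorial sketch, not part of the patch: the fill_super hunk below drops
 * the open-coded per-page-type f2fs_bio_info allocation loop in favour of
 * a single f2fs_init_write_merge_io() call. Reconstructed from the deleted
 * lines (with the patch's f2fs_rwsem conversion applied), the factored-out
 * helper would look roughly like this:
 */
static int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi)
{
	int i, j;

	for (i = 0; i < NR_PAGE_TYPE; i++) {
		/* META keeps one bio slot; DATA/NODE get one per temp level */
		int n = (i == META) ? 1 : NR_TEMP_TYPE;

		sbi->write_io[i] = f2fs_kmalloc(sbi,
				array_size(n, sizeof(struct f2fs_bio_info)),
				GFP_KERNEL);
		if (!sbi->write_io[i])
			return -ENOMEM;

		for (j = HOT; j < n; j++) {
			init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
			sbi->write_io[i][j].sbi = sbi;
			sbi->write_io[i][j].bio = NULL;
			spin_lock_init(&sbi->write_io[i][j].io_lock);
			INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
			INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
			init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
		}
	}

	return 0;
}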
sbi->valid_super_block = valid_super_block; - init_rwsem(&sbi->gc_lock); - mutex_init(&sbi->writepages); - init_rwsem(&sbi->cp_global_sem); - init_rwsem(&sbi->node_write); - init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); - spin_lock_init(&sbi->stat_lock); - for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 1 : NR_TEMP_TYPE; - int j; + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; - sbi->write_io[i] = - f2fs_kmalloc(sbi, - array_size(n, - sizeof(struct f2fs_bio_info)), - GFP_KERNEL); - if (!sbi->write_io[i]) { - err = -ENOMEM; - goto free_bio_info; - } - - for (j = HOT; j < n; j++) { - init_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_rwsem(&sbi->write_io[i][j].bio_list_lock); - } - } - - init_rwsem(&sbi->cp_rwsem); - init_rwsem(&sbi->quota_sem); - init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); err = f2fs_init_iostat(sbi); @@ -4176,12 +4303,6 @@ try_onemore: limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); - for (i = 0; i < NR_INODE_TYPE; i++) { - INIT_LIST_HEAD(&sbi->inode_list[i]); - spin_lock_init(&sbi->inode_lock[i]); - } - mutex_init(&sbi->flush_lock); - f2fs_init_extent_cache_info(sbi); f2fs_init_ino_entry_info(sbi); @@ -4387,6 +4508,8 @@ reset_checkpoint: f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + + cleancache_init_fs(sb); return 0; sync_free_meta: @@ -4522,7 +4645,7 @@ static struct file_system_type f2fs_fs_type = { .name = "f2fs", .mount = f2fs_mount, .kill_sb = kill_f2fs_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("f2fs"); @@ -4531,9 +4654,7 @@ static int __init init_inodecache(void) f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); - if (!f2fs_inode_cachep) - return -ENOMEM; - return 0; + return f2fs_inode_cachep ? 
0 : -ENOMEM; } static void destroy_inodecache(void) @@ -4598,7 +4719,7 @@ static int __init init_f2fs_fs(void) goto free_iostat; err = f2fs_init_bioset(); if (err) - goto free_bio_enrty_cache; + goto free_bio_entry_cache; err = f2fs_init_compress_mempool(); if (err) goto free_bioset; @@ -4615,7 +4736,7 @@ free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); -free_bio_enrty_cache: +free_bio_entry_cache: f2fs_destroy_bio_entry_cache(); free_iostat: f2fs_destroy_iostat_processing(); @@ -4674,5 +4795,6 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_SOFTDEP("pre: crc32"); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8b36e61fe7ed..b89874f06744 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -41,11 +41,21 @@ enum { ATGC_INFO, /* struct atgc_management */ }; +static const char *gc_mode_names[MAX_GC_MODE] = { + "GC_NORMAL", + "GC_IDLE_CB", + "GC_IDLE_GREEDY", + "GC_IDLE_AT", + "GC_URGENT_HIGH", + "GC_URGENT_LOW", + "GC_URGENT_MID" +}; + struct f2fs_attr { struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); + ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); + ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, + const char *buf, size_t len); int struct_type; int offset; int id; @@ -85,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(dirty_segments(sbi))); } static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(free_segments(sbi))); } static ssize_t ovp_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(overprovision_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1))); @@ -115,7 +125,28 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, static ssize_t sb_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%lx\n", sbi->s_flag); + return sysfs_emit(buf, "%lx\n", sbi->s_flag); +} + +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); +} + +static ssize_t pending_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->discard_cmd_cnt)); +} + +static ssize_t gc_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]); } static ssize_t features_show(struct f2fs_attr *a, @@ -174,7 +205,7 @@ static ssize_t features_show(struct 
f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%u\n", sbi->current_reserved_blocks); + return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks); } static ssize_t unusable_show(struct f2fs_attr *a, @@ -186,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, unusable = sbi->unusable_block_count; else unusable = f2fs_get_unusable_blocks(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)unusable); + return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable); } static ssize_t encoding_show(struct f2fs_attr *a, @@ -196,19 +227,19 @@ static ssize_t encoding_show(struct f2fs_attr *a, struct super_block *sb = sbi->sb; if (f2fs_sb_has_casefold(sbi)) - return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", + return sysfs_emit(buf, "%s (%d.%d.%d)\n", sb->s_encoding->charset, (sb->s_encoding->version >> 16) & 0xff, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sprintf(buf, "(none)"); + return sysfs_emit(buf, "(none)\n"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS @@ -217,7 +248,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->tot_blks - (si->bg_data_blks + si->bg_node_blks))); } @@ -227,7 +258,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); } @@ -238,14 +269,14 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, si->dirty_count = dirty_segments(sbi); f2fs_update_sit_info(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); } #endif static ssize_t main_blkaddr_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)MAIN_BLKADDR(sbi)); } @@ -316,9 +347,24 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); } + if (!strcmp(a->attr.name, "current_atomic_write")) { + s64 current_write = atomic64_read(&sbi->current_atomic_write); + + return sysfs_emit(buf, "%lld\n", current_write); + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) + return sysfs_emit(buf, "%lld\n", sbi->peak_atomic_write); + + if (!strcmp(a->attr.name, "committed_atomic_block")) + return sysfs_emit(buf, "%llu\n", sbi->committed_atomic_block); + + if (!strcmp(a->attr.name, "revoked_atomic_block")) + return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block); + ui = (unsigned int *)(ptr + a->offset); - return sprintf(buf, "%u\n", *ui); + return sysfs_emit(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -355,7 +401,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) return -EINVAL; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); ret = f2fs_update_extension_list(sbi, name, hot, set); if (ret) @@ -365,7 +411,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (ret) 
f2fs_update_extension_list(sbi, name, hot, !set); out: - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); return ret ? ret : count; } @@ -439,14 +485,27 @@ out: return count; } + if (!strcmp(a->attr.name, "max_ordered_discard")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "discard_urgent_util")) { + if (t > 100) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; } - if (!strcmp(a->attr.name, "trim_sections")) - return -EINVAL; - if (!strcmp(a->attr.name, "gc_urgent")) { if (t == 0) { sbi->gc_mode = GC_NORMAL; @@ -460,6 +519,13 @@ out: } } else if (t == 2) { sbi->gc_mode = GC_URGENT_LOW; + } else if (t == 3) { + sbi->gc_mode = GC_URGENT_MID; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + } } else { return -EINVAL; } @@ -480,6 +546,14 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_remaining_trials")) { + spin_lock(&sbi->gc_remaining_trials_lock); + sbi->gc_remaining_trials = t; + spin_unlock(&sbi->gc_remaining_trials_lock); + + return count; + } + #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; @@ -553,6 +627,66 @@ out: return count; } + if (!strcmp(a->attr.name, "max_fragment_chunk")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_chunk = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_hole")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_hole = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) { + if (t != 0) + return -EINVAL; + sbi->peak_atomic_write = 0; + return count; + } + + if (!strcmp(a->attr.name, "committed_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->committed_atomic_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "revoked_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->revoked_atomic_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "readdir_ra")) { + sbi->readdir_ra = !!t; + return count; + } + + if (!strcmp(a->attr.name, "hot_data_age_threshold")) { + if (t == 0 || t >= sbi->warm_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "warm_data_age_threshold")) { + if (t == 0 || t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -625,7 +759,7 @@ static void f2fs_sb_release(struct kobject *kobj) static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "supported\n"); + return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ @@ -638,8 +772,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (F2FS_HAS_FEATURE(sbi, a->id)) - return sprintf(buf, "supported\n"); - return sprintf(buf, "unsupported\n"); + return sysfs_emit(buf, "supported\n"); + return sysfs_emit(buf, "unsupported\n"); } #define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ @@ -658,6 +792,11 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .offset = _offset \ } +#define F2FS_RO_ATTR(struct_type, struct_name, name, 
elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0444, \ + f2fs_sbi_show, NULL, \ + offsetof(struct struct_name, elname)) + #define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0644, \ f2fs_sbi_show, f2fs_sbi_store, \ @@ -683,9 +822,14 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -695,6 +839,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); @@ -719,6 +864,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -730,6 +876,8 @@ F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); +F2FS_GENERAL_RO_ATTR(pending_discard); +F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -749,6 +897,8 @@ F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned); +F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, unusable_blocks_per_sec, + unusable_blocks_per_sec); #endif F2FS_FEATURE_RO_ATTR(atomic_write); F2FS_FEATURE_RO_ATTR(extra_attr); @@ -783,6 +933,18 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); F2FS_RW_ATTR(F2FS_SBI, 
f2fs_sb_info, gc_segment_mode, gc_segment_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); + +/* For atomic write */ +F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, current_atomic_write, current_atomic_write); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); + +/* For block age extent cache */ +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -795,8 +957,15 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reclaim_segments), ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), + ATTR_LIST(max_discard_request), + ATTR_LIST(min_discard_issue_time), + ATTR_LIST(mid_discard_issue_time), + ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), - ATTR_LIST(batched_trim_sections), + ATTR_LIST(max_ordered_discard), + ATTR_LIST(pending_discard), + ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), @@ -809,6 +978,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(max_roll_forward_node_blocks), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), @@ -828,6 +998,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(gc_remaining_trials), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), @@ -848,6 +1019,9 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), #endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(unusable_blocks_per_sec), +#endif #ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compr_written_block), ATTR_LIST(compr_saved_block), @@ -861,6 +1035,14 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(seq_file_ra_mul), ATTR_LIST(gc_segment_mode), ATTR_LIST(gc_reclaimed_segments), + ATTR_LIST(max_fragment_chunk), + ATTR_LIST(max_fragment_hole), + ATTR_LIST(current_atomic_write), + ATTR_LIST(peak_atomic_write), + ATTR_LIST(committed_atomic_block), + ATTR_LIST(revoked_atomic_block), + ATTR_LIST(hot_data_age_threshold), + ATTR_LIST(warm_data_age_threshold), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -901,8 +1083,10 @@ static struct attribute *f2fs_feat_attrs[] = { ATTRIBUTE_GROUPS(f2fs_feat); F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), + ATTR_LIST(cp_status), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); @@ -1107,6 +1291,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i, count; + + seq_puts(seq, "Discard pend list(Show discard_cmd count on each entry, .:not exist):\n"); + if (!f2fs_realtime_discard_enable(sbi))
+ return 0; + + if (dcc) { + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + + if (i % 8 == 0) + seq_printf(seq, " %-3d", i); + count = 0; + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + count++; + if (count) + seq_printf(seq, " %7d", count); + else + seq_puts(seq, " ."); + if (i % 8 == 7) + seq_putc(seq, '\n'); + } + seq_putc(seq, '\n'); + mutex_unlock(&dcc->cmd_lock); + } + + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -1177,6 +1399,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) #endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + discard_plist_seq_show, sb); } return 0; put_feature_list_kobj: @@ -1200,6 +1424,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry("discard_plist_info", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index a28968bb56e6..a2719326382d 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -47,16 +47,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); @@ -82,7 +79,6 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, @@ -90,9 +86,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + memcpy_to_page(page, offset_in_page(pos), buf, n); res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, page, fsdata); @@ -128,7 +122,7 @@ static int f2fs_begin_enable_verity(struct file *filp) if (f2fs_verity_in_progress(inode)) return -EBUSY; - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) return -EOPNOTSUPP; /* @@ -208,7 +202,7 @@ cleanup: * from re-instantiating cached pages we are truncating (since unlike * normal file accesses, garbage collection isn't limited by i_size). 
*/ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); truncate_inode_pages(inode->i_mapping, inode->i_size); err2 = f2fs_truncate(inode); if (err2) { @@ -216,7 +210,7 @@ cleanup: err2); set_sbi_flag(sbi, SBI_NEED_FSCK); } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); return err ?: err2; } @@ -245,6 +239,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); return -EFSCORRUPTED; } if (buf_size) { diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 797ac505a075..dc2e8637189e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -226,15 +226,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index) } static struct f2fs_xattr_entry *__find_xattr(void *base_addr, - void *last_base_addr, int index, - size_t len, const char *name) + void *last_base_addr, void **last_addr, + int index, size_t len, const char *name) { struct f2fs_xattr_entry *entry; list_for_each_xattr(entry, base_addr) { if ((void *)(entry) + sizeof(__u32) > last_base_addr || - (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + if (last_addr) + *last_addr = entry; return NULL; + } if (entry->e_name_index != index) continue; @@ -254,19 +257,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, unsigned int inline_size = inline_xattr_size(inode); void *max_addr = base_addr + inline_size; - list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > max_addr || - (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { - *last_addr = entry; - return NULL; - } - if (entry->e_name_index != index) - continue; - if (entry->e_name_len != len) - continue; - if (!memcmp(entry->e_name, name, len)) - break; - } + entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name); + if (!entry) + return NULL; /* inline xattr header or entry across max inline xattr size */ if (IS_XATTR_LAST_ENTRY(entry) && @@ -368,12 +361,14 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, else cur_addr = txattr_addr; - *xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name); + *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto out; } check: @@ -532,10 +527,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - down_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr, &base_size, &is_inline); - up_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -569,9 +564,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error; size_t rest = buffer_size; - down_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); - 
up_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -590,6 +585,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto cleanup; } @@ -659,12 +656,14 @@ static int __f2fs_setxattr(struct inode *inode, int index, last_base_addr = (void *)base_addr + XATTR_SIZE(inode); /* find entry with wanted name. */ - here = __find_xattr(base_addr, last_base_addr, index, len, name); + here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } @@ -691,6 +690,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino, ENTRY_SIZE(last)); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } last = XATTR_NEXT_ENTRY(last); @@ -793,9 +794,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - down_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - up_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); f2fs_unlock_op(sbi); f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0c9b013a85..f5b1f59c467a 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1980,3 +1980,4 @@ module_init(init_fat_fs) module_exit(exit_fat_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index efba301d68ae..c2e7e0c3ac60 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -682,6 +682,7 @@ static void __exit exit_msdos_fs(void) } MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_AUTHOR("Werner Almesberger"); MODULE_DESCRIPTION("MS-DOS filesystem support"); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 5369d82e0bfb..2ec184effb0e 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1078,6 +1078,7 @@ static void __exit exit_vfat_fs(void) } MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_DESCRIPTION("VFAT filesystem support"); MODULE_AUTHOR("Gordon Chaffee"); diff --git a/fs/file.c b/fs/file.c index 214364e19d76..17b8c3a1c7d9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -670,7 +670,7 @@ int close_fd(unsigned fd) return filp_close(file, files); } -EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +EXPORT_SYMBOL_NS(close_fd, ANDROID_GKI_VFS_EXPORT_ONLY); /* for ksys_close() */ /** * last_fd - return last valid index into fd table diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 578a5062706e..88fe3f470409 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -52,6 +52,7 @@ MODULE_AUTHOR("Christoph Hellwig, Krzysztof Blaszkowski"); MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); static struct kmem_cache *vxfs_inode_cachep; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f4a5a0c2858a..3a79887b692c 100644 --- 
a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2541,7 +2541,7 @@ out_unlock: spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); } -EXPORT_SYMBOL(__mark_inode_dirty); +EXPORT_SYMBOL_NS(__mark_inode_dirty, ANDROID_GKI_VFS_EXPORT_ONLY); /* * The @s_sync_lock is used to serialise concurrent sync operations @@ -2707,7 +2707,7 @@ void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); } -EXPORT_SYMBOL(try_to_writeback_inodes_sb); +EXPORT_SYMBOL_NS(try_to_writeback_inodes_sb, ANDROID_GKI_VFS_EXPORT_ONLY); /** * sync_inodes_sb - sync sb inode pages @@ -2774,7 +2774,7 @@ int write_inode_now(struct inode *inode, int sync) might_sleep(); return writeback_single_inode(inode, &wbc); } -EXPORT_SYMBOL(write_inode_now); +EXPORT_SYMBOL_NS(write_inode_now, ANDROID_GKI_VFS_EXPORT_ONLY); /** * sync_inode_metadata - write an inode to disk @@ -2794,4 +2794,4 @@ int sync_inode_metadata(struct inode *inode, int wait) return writeback_single_inode(inode, &wbc); } -EXPORT_SYMBOL(sync_inode_metadata); +EXPORT_SYMBOL_NS(sync_inode_metadata, ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/fs_types.c b/fs/fs_types.c index 78365e5dc08c..a11a1d8c7811 100644 --- a/fs/fs_types.c +++ b/fs/fs_types.c @@ -41,7 +41,7 @@ unsigned char fs_ftype_to_dtype(unsigned int filetype) return fs_dtype_by_ftype[filetype]; } -EXPORT_SYMBOL_GPL(fs_ftype_to_dtype); +EXPORT_SYMBOL_NS_GPL(fs_ftype_to_dtype, ANDROID_GKI_VFS_EXPORT_ONLY); /* * dirent file type to fs on-disk file type conversion diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 03a871d689bb..14739ac30081 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -3,6 +3,8 @@ # Makefile for general filesystem caching code # +ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=ANDROID_GKI_VFS_EXPORT_ONLY + fscache-y := \ cache.o \ cookie.o \ diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 40ce9a1c12e5..d6bd90b5c38a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -52,3 +52,11 @@ config FUSE_DAX If you want to allow mounting a Virtio Filesystem with the "dax" option, answer Y. 
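The export changes above all follow one pattern: a VFS symbol moves into the ANDROID_GKI_VFS_EXPORT_ONLY namespace, either per symbol via EXPORT_SYMBOL_NS()/EXPORT_SYMBOL_NS_GPL() or for a whole directory via the DEFAULT_SYMBOL_NAMESPACE define seen in the fs/fscache Makefile, and every module that consumes those symbols must declare a matching import. A minimal sketch of how the two halves pair up, using a hypothetical my_fs_helper symbol and consumer module (only the namespace name is taken from this patch):

#include <linux/module.h>

/*
 * Provider side: the symbol is exported into the namespace, so only
 * modules that import ANDROID_GKI_VFS_EXPORT_ONLY can resolve it.
 */
int my_fs_helper(void)
{
	return 0;
}
EXPORT_SYMBOL_NS(my_fs_helper, ANDROID_GKI_VFS_EXPORT_ONLY);

/*
 * Consumer side: modpost warns at build time, and the module loader
 * refuses to load the module (unless the kernel is configured to allow
 * missing namespace imports) if this declaration is absent.
 */
MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY);
MODULE_LICENSE("GPL");

That pairing is why the filesystem modules in this series (f2fs, fat, msdos, vfat, vxfs) each gain a MODULE_IMPORT_NS line next to their existing MODULE_LICENSE.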
+ +config FUSE_BPF + bool "Adds BPF to fuse" + depends on FUSE_FS + depends on BPF + help + Extends FUSE by adding BPF to prefilter calls and potentially pass them to a + backing file system diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 0c48b35c058d..096bd78f06e7 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -8,6 +8,8 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += passthrough.o fuse-$(CONFIG_FUSE_DAX) += dax.o +fuse-$(CONFIG_FUSE_BPF) += backing.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/OWNERS b/fs/fuse/OWNERS new file mode 100644 index 000000000000..5ee6098fadeb --- /dev/null +++ b/fs/fuse/OWNERS @@ -0,0 +1 @@ +balsini@google.com diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c new file mode 100644 index 000000000000..4f0044e5f6f1 --- /dev/null +++ b/fs/fuse/backing.c @@ -0,0 +1,2468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE-BPF: Filesystem in Userspace with BPF + * Copyright (c) 2021 Google LLC + */ + +#include "fuse_i.h" + +#include +#include +#include +#include + +#include "../internal.h" + +#define FUSE_BPF_IOCB_MASK (IOCB_APPEND | IOCB_DSYNC | IOCB_HIPRI | IOCB_NOWAIT | IOCB_SYNC) + +struct fuse_bpf_aio_req { + struct kiocb iocb; + refcount_t ref; + struct kiocb *iocb_orig; +}; + +static struct kmem_cache *fuse_bpf_aio_request_cachep; + +static void fuse_stat_to_attr(struct fuse_conn *fc, struct inode *inode, + struct kstat *stat, struct fuse_attr *attr); + +static void fuse_file_accessed(struct file *dst_file, struct file *src_file) +{ + struct inode *dst_inode; + struct inode *src_inode; + + if (dst_file->f_flags & O_NOATIME) + return; + + dst_inode = file_inode(dst_file); + src_inode = file_inode(src_file); + + if ((!timespec64_equal(&dst_inode->i_mtime, &src_inode->i_mtime) || + !timespec64_equal(&dst_inode->i_ctime, &src_inode->i_ctime))) { + dst_inode->i_mtime = src_inode->i_mtime; + dst_inode->i_ctime = src_inode->i_ctime; + } + + touch_atime(&dst_file->f_path); +} + +int fuse_open_initialize(struct fuse_bpf_args *fa, struct fuse_open_io *foio, + struct inode *inode, struct file *file, bool isdir) +{ + foio->foi = (struct fuse_open_in) { + .flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY), + }; + + foio->foo = (struct fuse_open_out) {0}; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(inode)->nodeid, + .opcode = isdir ?
FUSE_OPENDIR : FUSE_OPEN, + .in_numargs = 1, + .out_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(foio->foi), + .value = &foio->foi, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(foio->foo), + .value = &foio->foo, + }, + }; + + return 0; +} + +int fuse_open_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + const struct fuse_open_in *foi = fa->in_args[0].value; + struct fuse_file *ff; + int retval; + int mask; + struct fuse_dentry *fd = get_fuse_dentry(file->f_path.dentry); + struct file *backing_file; + + ff = fuse_file_alloc(fm); + if (!ff) + return -ENOMEM; + file->private_data = ff; + + switch (foi->flags & O_ACCMODE) { + case O_RDONLY: + mask = MAY_READ; + break; + + case O_WRONLY: + mask = MAY_WRITE; + break; + + case O_RDWR: + mask = MAY_READ | MAY_WRITE; + break; + + default: + return -EINVAL; + } + + retval = inode_permission(&init_user_ns, + get_fuse_inode(inode)->backing_inode, mask); + if (retval) + return retval; + + backing_file = dentry_open(&fd->backing_path, + foi->flags, + current_cred()); + + if (IS_ERR(backing_file)) { + fuse_file_free(ff); + file->private_data = NULL; + return PTR_ERR(backing_file); + } + ff->backing_file = backing_file; + + return 0; +} + +void *fuse_open_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_file *ff = file->private_data; + struct fuse_open_out *foo = fa->out_args[0].value; + + if (ff) { + ff->fh = foo->fh; + ff->nodeid = get_fuse_inode(inode)->nodeid; + } + return 0; +} + +int fuse_create_open_initialize( + struct fuse_bpf_args *fa, struct fuse_create_open_io *fcoio, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode) +{ + fcoio->fci = (struct fuse_create_in) { + .flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY), + .mode = mode, + }; + + fcoio->feo = (struct fuse_entry_out) {0}; + fcoio->foo = (struct fuse_open_out) {0}; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_CREATE, + .in_numargs = 2, + .out_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(fcoio->fci), + .value = &fcoio->fci, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(fcoio->feo), + .value = &fcoio->feo, + }, + .out_args[1] = (struct fuse_bpf_arg) { + .size = sizeof(fcoio->foo), + .value = &fcoio->foo, + }, + }; + + return 0; +} + +static int fuse_open_file_backing(struct inode *inode, struct file *file) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct dentry *entry = file->f_path.dentry; + struct fuse_dentry *fuse_dentry = get_fuse_dentry(entry); + struct fuse_file *fuse_file; + struct file *backing_file; + + fuse_file = fuse_file_alloc(fm); + if (!fuse_file) + return -ENOMEM; + file->private_data = fuse_file; + + backing_file = dentry_open(&fuse_dentry->backing_path, file->f_flags, + current_cred()); + if (IS_ERR(backing_file)) { + fuse_file_free(fuse_file); + file->private_data = NULL; + return PTR_ERR(backing_file); + } + fuse_file->backing_file = backing_file; + + return 0; +} + +int fuse_create_open_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode) +{ + struct fuse_inode *dir_fuse_inode = get_fuse_inode(dir); + struct fuse_dentry *dir_fuse_dentry = 
get_fuse_dentry(entry->d_parent); + struct dentry *backing_dentry = NULL; + struct inode *inode = NULL; + struct dentry *newent; + int err = 0; + const struct fuse_create_in *fci = fa->in_args[0].value; + struct inode *d_inode = entry->d_inode; + u64 target_nodeid = 0; + + if (!dir_fuse_inode || !dir_fuse_dentry) + return -EIO; + + inode_lock_nested(dir_fuse_inode->backing_inode, I_MUTEX_PARENT); + backing_dentry = lookup_one_len(fa->in_args[1].value, + dir_fuse_dentry->backing_path.dentry, + strlen(fa->in_args[1].value)); + inode_unlock(dir_fuse_inode->backing_inode); + + if (IS_ERR(backing_dentry)) + return PTR_ERR(backing_dentry); + + if (d_really_is_positive(backing_dentry)) { + err = -EIO; + goto out; + } + + err = vfs_create(&init_user_ns, dir_fuse_inode->backing_inode, + backing_dentry, fci->mode, true); + if (err) + goto out; + + if (get_fuse_dentry(entry)->backing_path.dentry) + path_put(&get_fuse_dentry(entry)->backing_path); + get_fuse_dentry(entry)->backing_path = (struct path) { + .mnt = dir_fuse_dentry->backing_path.mnt, + .dentry = backing_dentry, + }; + path_get(&get_fuse_dentry(entry)->backing_path); + + if (d_inode) + target_nodeid = get_fuse_inode(d_inode)->nodeid; + + inode = fuse_iget_backing(dir->i_sb, target_nodeid, + get_fuse_dentry(entry)->backing_path.dentry->d_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + if (get_fuse_inode(inode)->bpf) + bpf_prog_put(get_fuse_inode(inode)->bpf); + get_fuse_inode(inode)->bpf = dir_fuse_inode->bpf; + if (get_fuse_inode(inode)->bpf) + bpf_prog_inc(dir_fuse_inode->bpf); + + newent = d_splice_alias(inode, entry); + if (IS_ERR(newent)) { + err = PTR_ERR(newent); + goto out; + } + + entry = newent ? newent : entry; + err = finish_open(file, entry, fuse_open_file_backing); + +out: + dput(backing_dentry); + return err; +} + +void *fuse_create_open_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(file->f_inode); + struct fuse_entry_out *feo = fa->out_args[0].value; + struct fuse_open_out *foo = fa->out_args[1].value; + + if (fi) + fi->nodeid = feo->nodeid; + if (ff) + ff->fh = foo->fh; + return 0; +} + +int fuse_release_initialize(struct fuse_bpf_args *fa, struct fuse_release_in *fri, + struct inode *inode, struct file *file) +{ + struct fuse_file *fuse_file = file->private_data; + + /* Always put backing file whatever bpf/userspace says */ + fput(fuse_file->backing_file); + + *fri = (struct fuse_release_in) { + .fh = ((struct fuse_file *)(file->private_data))->fh, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(inode)->nodeid, + .opcode = FUSE_RELEASE, + .in_numargs = 1, + .in_args[0].size = sizeof(*fri), + .in_args[0].value = fri, + }; + + return 0; +} + +int fuse_releasedir_initialize(struct fuse_bpf_args *fa, + struct fuse_release_in *fri, + struct inode *inode, struct file *file) +{ + struct fuse_file *fuse_file = file->private_data; + + /* Always put backing file whatever bpf/userspace says */ + fput(fuse_file->backing_file); + + *fri = (struct fuse_release_in) { + .fh = ((struct fuse_file *)(file->private_data))->fh, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(inode)->nodeid, + .opcode = FUSE_RELEASEDIR, + .in_numargs = 1, + .in_args[0].size = sizeof(*fri), + .in_args[0].value = fri, + }; + + return 0; +} + +int fuse_release_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct 
file *file) +{ + return 0; +} + +void *fuse_release_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file) +{ + fuse_file_free(file->private_data); + return NULL; +} + +int fuse_flush_initialize(struct fuse_bpf_args *fa, struct fuse_flush_in *ffi, + struct file *file, fl_owner_t id) +{ + struct fuse_file *fuse_file = file->private_data; + + *ffi = (struct fuse_flush_in) { + .fh = fuse_file->fh, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(file->f_inode), + .opcode = FUSE_FLUSH, + .in_numargs = 1, + .in_args[0].size = sizeof(*ffi), + .in_args[0].value = ffi, + .flags = FUSE_BPF_FORCE, + }; + + return 0; +} + +int fuse_flush_backing(struct fuse_bpf_args *fa, struct file *file, fl_owner_t id) +{ + struct fuse_file *fuse_file = file->private_data; + struct file *backing_file = fuse_file->backing_file; + + if (backing_file->f_op->flush) + return backing_file->f_op->flush(backing_file, id); + return 0; +} + +void *fuse_flush_finalize(struct fuse_bpf_args *fa, struct file *file, fl_owner_t id) +{ + return NULL; +} + +int fuse_lseek_initialize(struct fuse_bpf_args *fa, struct fuse_lseek_io *flio, + struct file *file, loff_t offset, int whence) +{ + struct fuse_file *fuse_file = file->private_data; + + flio->fli = (struct fuse_lseek_in) { + .fh = fuse_file->fh, + .offset = offset, + .whence = whence, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(file->f_inode), + .opcode = FUSE_LSEEK, + .in_numargs = 1, + .in_args[0].size = sizeof(flio->fli), + .in_args[0].value = &flio->fli, + .out_numargs = 1, + .out_args[0].size = sizeof(flio->flo), + .out_args[0].value = &flio->flo, + }; + + return 0; +} + +int fuse_lseek_backing(struct fuse_bpf_args *fa, struct file *file, loff_t offset, int whence) +{ + const struct fuse_lseek_in *fli = fa->in_args[0].value; + struct fuse_lseek_out *flo = fa->out_args[0].value; + struct fuse_file *fuse_file = file->private_data; + struct file *backing_file = fuse_file->backing_file; + loff_t ret; + + /* TODO: Handle changing of the file handle */ + if (offset == 0) { + if (whence == SEEK_CUR) { + flo->offset = file->f_pos; + return flo->offset; + } + + if (whence == SEEK_SET) { + flo->offset = vfs_setpos(file, 0, 0); + return flo->offset; + } + } + + inode_lock(file->f_inode); + backing_file->f_pos = file->f_pos; + ret = vfs_llseek(backing_file, fli->offset, fli->whence); + flo->offset = ret; + inode_unlock(file->f_inode); + return ret; +} + +void *fuse_lseek_finalize(struct fuse_bpf_args *fa, struct file *file, loff_t offset, int whence) +{ + struct fuse_lseek_out *flo = fa->out_args[0].value; + + if (!fa->error_in) + file->f_pos = flo->offset; + return ERR_PTR(flo->offset); +} + +int fuse_copy_file_range_initialize(struct fuse_bpf_args *fa, struct fuse_copy_file_range_io *fcf, + struct file *file_in, loff_t pos_in, struct file *file_out, + loff_t pos_out, size_t len, unsigned int flags) +{ + struct fuse_file *fuse_file_in = file_in->private_data; + struct fuse_file *fuse_file_out = file_out->private_data; + + + fcf->fci = (struct fuse_copy_file_range_in) { + .fh_in = fuse_file_in->fh, + .off_in = pos_in, + .nodeid_out = fuse_file_out->nodeid, + .fh_out = fuse_file_out->fh, + .off_out = pos_out, + .len = len, + .flags = flags, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(file_in->f_inode), + .opcode = FUSE_COPY_FILE_RANGE, + .in_numargs = 1, + .in_args[0].size = sizeof(fcf->fci), + .in_args[0].value = &fcf->fci, + .out_numargs = 1, + .out_args[0].size = sizeof(fcf->fwo), + .out_args[0].value = 
&fcf->fwo, + }; + + return 0; +} + +int fuse_copy_file_range_backing(struct fuse_bpf_args *fa, struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len, + unsigned int flags) +{ + const struct fuse_copy_file_range_in *fci = fa->in_args[0].value; + struct fuse_file *fuse_file_in = file_in->private_data; + struct file *backing_file_in = fuse_file_in->backing_file; + struct fuse_file *fuse_file_out = file_out->private_data; + struct file *backing_file_out = fuse_file_out->backing_file; + + /* TODO: Handle changing of in/out files */ + if (backing_file_out) + return vfs_copy_file_range(backing_file_in, fci->off_in, backing_file_out, + fci->off_out, fci->len, fci->flags); + else + return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, + flags); +} + +void *fuse_copy_file_range_finalize(struct fuse_bpf_args *fa, struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len, + unsigned int flags) +{ + return NULL; +} + +int fuse_fsync_initialize(struct fuse_bpf_args *fa, struct fuse_fsync_in *ffi, + struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fuse_file *fuse_file = file->private_data; + + *ffi = (struct fuse_fsync_in) { + .fh = fuse_file->fh, + .fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(file->f_inode)->nodeid, + .opcode = FUSE_FSYNC, + .in_numargs = 1, + .in_args[0].size = sizeof(*ffi), + .in_args[0].value = ffi, + .flags = FUSE_BPF_FORCE, + }; + + return 0; +} + +int fuse_fsync_backing(struct fuse_bpf_args *fa, + struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fuse_file *fuse_file = file->private_data; + struct file *backing_file = fuse_file->backing_file; + const struct fuse_fsync_in *ffi = fa->in_args[0].value; + int new_datasync = (ffi->fsync_flags & FUSE_FSYNC_FDATASYNC) ? 1 : 0; + + return vfs_fsync(backing_file, new_datasync); +} + +void *fuse_fsync_finalize(struct fuse_bpf_args *fa, + struct file *file, loff_t start, loff_t end, int datasync) +{ + return NULL; +} + +int fuse_dir_fsync_initialize(struct fuse_bpf_args *fa, struct fuse_fsync_in *ffi, + struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fuse_file *fuse_file = file->private_data; + + *ffi = (struct fuse_fsync_in) { + .fh = fuse_file->fh, + .fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(file->f_inode)->nodeid, + .opcode = FUSE_FSYNCDIR, + .in_numargs = 1, + .in_args[0].size = sizeof(*ffi), + .in_args[0].value = ffi, + .flags = FUSE_BPF_FORCE, + }; + + return 0; +} + +int fuse_getxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_getxattr_io *fgio, + struct dentry *dentry, const char *name, void *value, + size_t size) +{ + *fgio = (struct fuse_getxattr_io) { + .fgi.size = size, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(dentry->d_inode)->nodeid, + .opcode = FUSE_GETXATTR, + .in_numargs = 2, + .out_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(fgio->fgi), + .value = &fgio->fgi, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = strlen(name) + 1, + .value = name, + }, + .flags = size ? FUSE_BPF_OUT_ARGVAR : 0, + .out_args[0].size = size ? size : sizeof(fgio->fgo), + .out_args[0].value = size ? 
value : &fgio->fgo, + }; + return 0; +} + +int fuse_getxattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name, void *value, + size_t size) +{ + ssize_t ret = vfs_getxattr(&init_user_ns, + get_fuse_dentry(dentry)->backing_path.dentry, + fa->in_args[1].value, value, size); + + if (fa->flags & FUSE_BPF_OUT_ARGVAR) + fa->out_args[0].size = ret; + else + ((struct fuse_getxattr_out *)fa->out_args[0].value)->size = ret; + + return 0; +} + +void *fuse_getxattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name, void *value, + size_t size) +{ + struct fuse_getxattr_out *fgo; + + if (fa->flags & FUSE_BPF_OUT_ARGVAR) + return ERR_PTR(fa->out_args[0].size); + + fgo = fa->out_args[0].value; + + return ERR_PTR(fgo->size); + +} + +int fuse_listxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_getxattr_io *fgio, + struct dentry *dentry, char *list, size_t size) +{ + *fgio = (struct fuse_getxattr_io){ + .fgi.size = size, + }; + + *fa = (struct fuse_bpf_args){ + .nodeid = get_fuse_inode(dentry->d_inode)->nodeid, + .opcode = FUSE_LISTXATTR, + .in_numargs = 1, + .out_numargs = 1, + .in_args[0] = + (struct fuse_bpf_in_arg){ + .size = sizeof(fgio->fgi), + .value = &fgio->fgi, + }, + .flags = size ? FUSE_BPF_OUT_ARGVAR : 0, + .out_args[0].size = size ? size : sizeof(fgio->fgo), + .out_args[0].value = size ? (void *)list : &fgio->fgo, + }; + + return 0; +} + +int fuse_listxattr_backing(struct fuse_bpf_args *fa, struct dentry *dentry, + char *list, size_t size) +{ + ssize_t ret = + vfs_listxattr(get_fuse_dentry(dentry)->backing_path.dentry, + list, size); + + if (ret < 0) + return ret; + + if (fa->flags & FUSE_BPF_OUT_ARGVAR) + fa->out_args[0].size = ret; + else + ((struct fuse_getxattr_out *)fa->out_args[0].value)->size = ret; + + return ret; +} + +void *fuse_listxattr_finalize(struct fuse_bpf_args *fa, struct dentry *dentry, + char *list, size_t size) +{ + struct fuse_getxattr_out *fgo; + + if (fa->error_in) + return NULL; + + if (fa->flags & FUSE_BPF_OUT_ARGVAR) + return ERR_PTR(fa->out_args[0].size); + + fgo = fa->out_args[0].value; + return ERR_PTR(fgo->size); +} + +int fuse_setxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_setxattr_in *fsxi, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + *fsxi = (struct fuse_setxattr_in) { + .size = size, + .flags = flags, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(dentry->d_inode)->nodeid, + .opcode = FUSE_SETXATTR, + .in_numargs = 3, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fsxi), + .value = fsxi, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = strlen(name) + 1, + .value = name, + }, + .in_args[2] = (struct fuse_bpf_in_arg) { + .size = size, + .value = value, + }, + }; + + return 0; +} + +int fuse_setxattr_backing(struct fuse_bpf_args *fa, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags) +{ + return vfs_setxattr(&init_user_ns, + get_fuse_dentry(dentry)->backing_path.dentry, name, + value, size, flags); +} + +void *fuse_setxattr_finalize(struct fuse_bpf_args *fa, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags) +{ + return NULL; +} + +int fuse_removexattr_initialize(struct fuse_bpf_args *fa, + struct fuse_dummy_io *unused, + struct dentry *dentry, const char *name) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(dentry->d_inode)->nodeid, + .opcode = FUSE_REMOVEXATTR, + .in_numargs = 1, + .in_args[0] = (struct 
fuse_bpf_in_arg) { + .size = strlen(name) + 1, + .value = name, + }, + }; + + return 0; +} + +int fuse_removexattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name) +{ + struct path *backing_path = + &get_fuse_dentry(dentry)->backing_path; + + /* TODO account for changes of the name by prefilter */ + return vfs_removexattr(&init_user_ns, backing_path->dentry, name); +} + +void *fuse_removexattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name) +{ + return NULL; +} + +static inline void fuse_bpf_aio_put(struct fuse_bpf_aio_req *aio_req) +{ + if (refcount_dec_and_test(&aio_req->ref)) + kmem_cache_free(fuse_bpf_aio_request_cachep, aio_req); +} + +static void fuse_bpf_aio_cleanup_handler(struct fuse_bpf_aio_req *aio_req) +{ + struct kiocb *iocb = &aio_req->iocb; + struct kiocb *iocb_orig = aio_req->iocb_orig; + + if (iocb->ki_flags & IOCB_WRITE) { + __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, + SB_FREEZE_WRITE); + file_end_write(iocb->ki_filp); + fuse_copyattr(iocb_orig->ki_filp, iocb->ki_filp); + } + iocb_orig->ki_pos = iocb->ki_pos; + fuse_bpf_aio_put(aio_req); +} + +static void fuse_bpf_aio_rw_complete(struct kiocb *iocb, long res, long res2) +{ + struct fuse_bpf_aio_req *aio_req = + container_of(iocb, struct fuse_bpf_aio_req, iocb); + struct kiocb *iocb_orig = aio_req->iocb_orig; + + fuse_bpf_aio_cleanup_handler(aio_req); + iocb_orig->ki_complete(iocb_orig, res, res2); +} + + +int fuse_file_read_iter_initialize( + struct fuse_bpf_args *fa, struct fuse_file_read_iter_io *fri, + struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + fri->fri = (struct fuse_read_in) { + .fh = ff->fh, + .offset = iocb->ki_pos, + .size = to->count, + }; + + fri->frio = (struct fuse_read_iter_out) { + .ret = fri->fri.size, + }; + + /* TODO we can't assume 'to' is a kvec */ + /* TODO we also can't assume the vector has only one component */ + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_READ, + .nodeid = ff->nodeid, + .in_numargs = 1, + .in_args[0].size = sizeof(fri->fri), + .in_args[0].value = &fri->fri, + .out_numargs = 1, + .out_args[0].size = sizeof(fri->frio), + .out_args[0].value = &fri->frio, + /* + * TODO Design this properly. 
+ * Possible approach: do not pass buf to bpf + * If going to userland, do a deep copy + * For extra credit, do that to/from the vector, rather than + * making an extra copy in the kernel + */ + }; + + return 0; +} + +int fuse_file_read_iter_backing(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *to) +{ + struct fuse_read_iter_out *frio = fa->out_args[0].value; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + ssize_t ret; + + if (!iov_iter_count(to)) + return 0; + + if ((iocb->ki_flags & IOCB_DIRECT) && + (!ff->backing_file->f_mapping->a_ops || + !ff->backing_file->f_mapping->a_ops->direct_IO)) + return -EINVAL; + + /* TODO This just plain ignores any change to fuse_read_in */ + if (is_sync_kiocb(iocb)) { + ret = vfs_iter_read(ff->backing_file, to, &iocb->ki_pos, + iocb_to_rw_flags(iocb->ki_flags, FUSE_BPF_IOCB_MASK)); + } else { + struct fuse_bpf_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(fuse_bpf_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + aio_req->iocb_orig = iocb; + kiocb_clone(&aio_req->iocb, iocb, ff->backing_file); + aio_req->iocb.ki_complete = fuse_bpf_aio_rw_complete; + refcount_set(&aio_req->ref, 2); + ret = vfs_iocb_iter_read(ff->backing_file, &aio_req->iocb, to); + fuse_bpf_aio_put(aio_req); + if (ret != -EIOCBQUEUED) + fuse_bpf_aio_cleanup_handler(aio_req); + } + + frio->ret = ret; + + /* TODO Need to point value at the buffer for post-modification */ + +out: + fuse_file_accessed(file, ff->backing_file); + + return ret; +} + +void *fuse_file_read_iter_finalize(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *to) +{ + struct fuse_read_iter_out *frio = fa->out_args[0].value; + + return ERR_PTR(frio->ret); +} + +int fuse_file_write_iter_initialize( + struct fuse_bpf_args *fa, struct fuse_file_write_iter_io *fwio, + struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + *fwio = (struct fuse_file_write_iter_io) { + .fwi.fh = ff->fh, + .fwi.offset = iocb->ki_pos, + .fwi.size = from->count, + }; + + /* TODO we can't assume 'from' is a kvec */ + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_WRITE, + .nodeid = ff->nodeid, + .in_numargs = 2, + .in_args[0].size = sizeof(fwio->fwi), + .in_args[0].value = &fwio->fwi, + .in_args[1].size = fwio->fwi.size, + .in_args[1].value = from->kvec->iov_base, + .out_numargs = 1, + .out_args[0].size = sizeof(fwio->fwio), + .out_args[0].value = &fwio->fwio, + }; + + return 0; +} + +int fuse_file_write_iter_backing(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_write_iter_out *fwio = fa->out_args[0].value; + ssize_t ret; + + if (!iov_iter_count(from)) + return 0; + + /* TODO This just plain ignores any change to fuse_write_in */ + /* TODO uint32_t seems smaller than ssize_t.... right? 
*/ + inode_lock(file_inode(file)); + + fuse_copyattr(file, ff->backing_file); + + if (is_sync_kiocb(iocb)) { + file_start_write(ff->backing_file); + ret = vfs_iter_write(ff->backing_file, from, &iocb->ki_pos, + iocb_to_rw_flags(iocb->ki_flags, FUSE_BPF_IOCB_MASK)); + file_end_write(ff->backing_file); + + /* Must reflect change in size of backing file to upper file */ + if (ret > 0) + fuse_copyattr(file, ff->backing_file); + } else { + struct fuse_bpf_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(fuse_bpf_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + file_start_write(ff->backing_file); + __sb_writers_release(file_inode(ff->backing_file)->i_sb, SB_FREEZE_WRITE); + aio_req->iocb_orig = iocb; + kiocb_clone(&aio_req->iocb, iocb, ff->backing_file); + aio_req->iocb.ki_complete = fuse_bpf_aio_rw_complete; + refcount_set(&aio_req->ref, 2); + ret = vfs_iocb_iter_write(ff->backing_file, &aio_req->iocb, from); + fuse_bpf_aio_put(aio_req); + if (ret != -EIOCBQUEUED) + fuse_bpf_aio_cleanup_handler(aio_req); + } + +out: + inode_unlock(file_inode(file)); + fwio->ret = ret; + if (ret < 0) + return ret; + return 0; +} + +void *fuse_file_write_iter_finalize(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *from) +{ + struct fuse_write_iter_out *fwio = fa->out_args[0].value; + + return ERR_PTR(fwio->ret); +} + +ssize_t fuse_backing_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + struct fuse_file *ff = file->private_data; + struct inode *fuse_inode = file_inode(file); + struct file *backing_file = ff->backing_file; + struct inode *backing_inode = file_inode(backing_file); + + if (!backing_file->f_op->mmap) + return -ENODEV; + + if (WARN_ON(file != vma->vm_file)) + return -EIO; + + vma->vm_file = get_file(backing_file); + + ret = call_mmap(vma->vm_file, vma); + + if (ret) + fput(backing_file); + else + fput(file); + + if (file->f_flags & O_NOATIME) + return ret; + + if ((!timespec64_equal(&fuse_inode->i_mtime, + &backing_inode->i_mtime) || + !timespec64_equal(&fuse_inode->i_ctime, + &backing_inode->i_ctime))) { + fuse_inode->i_mtime = backing_inode->i_mtime; + fuse_inode->i_ctime = backing_inode->i_ctime; + } + touch_atime(&file->f_path); + + return ret; +} + +int fuse_file_fallocate_initialize(struct fuse_bpf_args *fa, + struct fuse_fallocate_in *ffi, + struct file *file, int mode, loff_t offset, loff_t length) +{ + struct fuse_file *ff = file->private_data; + + *ffi = (struct fuse_fallocate_in) { + .fh = ff->fh, + .offset = offset, + .length = length, + .mode = mode + }; + + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_FALLOCATE, + .nodeid = ff->nodeid, + .in_numargs = 1, + .in_args[0].size = sizeof(*ffi), + .in_args[0].value = ffi, + }; + + return 0; +} + +int fuse_file_fallocate_backing(struct fuse_bpf_args *fa, + struct file *file, int mode, loff_t offset, loff_t length) +{ + const struct fuse_fallocate_in *ffi = fa->in_args[0].value; + struct fuse_file *ff = file->private_data; + + return vfs_fallocate(ff->backing_file, ffi->mode, ffi->offset, + ffi->length); +} + +void *fuse_file_fallocate_finalize(struct fuse_bpf_args *fa, + struct file *file, int mode, loff_t offset, loff_t length) +{ + return NULL; +} + +/******************************************************************************* + * Directory operations after here * + ******************************************************************************/ + +int fuse_lookup_initialize(struct fuse_bpf_args *fa, struct fuse_lookup_io *fli, + struct inode *dir, struct dentry 
*entry, unsigned int flags) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(dir)->nodeid, + .opcode = FUSE_LOOKUP, + .in_numargs = 1, + .out_numargs = 2, + .flags = FUSE_BPF_OUT_ARGVAR, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(fli->feo), + .value = &fli->feo, + }, + .out_args[1] = (struct fuse_bpf_arg) { + .size = sizeof(fli->feb.out), + .value = &fli->feb.out, + }, + }; + + return 0; +} + +int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, + struct dentry *entry, unsigned int flags) +{ + struct fuse_dentry *fuse_entry = get_fuse_dentry(entry); + struct fuse_dentry *dir_fuse_entry = get_fuse_dentry(entry->d_parent); + struct dentry *dir_backing_entry = dir_fuse_entry->backing_path.dentry; + struct inode *dir_backing_inode = dir_backing_entry->d_inode; + struct dentry *backing_entry; + struct fuse_entry_out *feo = (void *)fa->out_args[0].value; + struct kstat stat; + int err; + + /* TODO this will not handle lookups over mount points */ + inode_lock_nested(dir_backing_inode, I_MUTEX_PARENT); + backing_entry = lookup_one_len(entry->d_name.name, dir_backing_entry, + entry->d_name.len); + inode_unlock(dir_backing_inode); + + if (IS_ERR(backing_entry)) + return PTR_ERR(backing_entry); + + fuse_entry->backing_path = (struct path) { + .dentry = backing_entry, + .mnt = mntget(dir_fuse_entry->backing_path.mnt), + }; + + if (d_is_negative(backing_entry)) { + fa->error_in = -ENOENT; + return 0; + } + + err = vfs_getattr(&fuse_entry->backing_path, &stat, + STATX_BASIC_STATS, 0); + if (err) { + path_put_init(&fuse_entry->backing_path); + return err; + } + + fuse_stat_to_attr(get_fuse_conn(dir), + backing_entry->d_inode, &stat, &feo->attr); + return 0; +} + +int fuse_handle_backing(struct fuse_entry_bpf *feb, struct inode **backing_inode, + struct path *backing_path) +{ + switch (feb->out.backing_action) { + case FUSE_ACTION_KEEP: + /* backing inode/path are added in fuse_lookup_backing */ + break; + + case FUSE_ACTION_REMOVE: + iput(*backing_inode); + *backing_inode = NULL; + path_put_init(backing_path); + break; + + case FUSE_ACTION_REPLACE: { + struct file *backing_file = feb->backing_file; + + if (!backing_file) + return -EINVAL; + if (IS_ERR(backing_file)) + return PTR_ERR(backing_file); + + if (backing_inode) + iput(*backing_inode); + *backing_inode = backing_file->f_inode; + ihold(*backing_inode); + + path_put(backing_path); + *backing_path = backing_file->f_path; + path_get(backing_path); + + fput(backing_file); + break; + } + + default: + return -EINVAL; + } + + return 0; +} + +int fuse_handle_bpf_prog(struct fuse_entry_bpf *feb, struct inode *parent, + struct bpf_prog **bpf) +{ + struct fuse_inode *pi; + + /* + * FUSE_ACTION_KEEP with no parent means there is nothing to inherit + * from; don't touch the bpf program at all in this case. + */ + if (feb->out.bpf_action == FUSE_ACTION_KEEP && !parent) + goto out; + + if (*bpf) { + bpf_prog_put(*bpf); + *bpf = NULL; + } + + switch (feb->out.bpf_action) { + case FUSE_ACTION_KEEP: + pi = get_fuse_inode(parent); + *bpf = pi->bpf; + if (*bpf) + bpf_prog_inc(*bpf); + break; + + case FUSE_ACTION_REMOVE: + break; + + case FUSE_ACTION_REPLACE: { + struct file *bpf_file = feb->bpf_file; + struct bpf_prog *bpf_prog = ERR_PTR(-EINVAL); + + if (bpf_file && !IS_ERR(bpf_file)) + bpf_prog = fuse_get_bpf_prog(bpf_file); + + if (IS_ERR(bpf_prog)) + return PTR_ERR(bpf_prog); + + *bpf = bpf_prog; + break; + } + + default: + return -EINVAL;
+ } + +out: + return 0; +} + +struct dentry *fuse_lookup_finalize(struct fuse_bpf_args *fa, struct inode *dir, + struct dentry *entry, unsigned int flags) +{ + struct fuse_dentry *fd; + struct dentry *bd; + struct inode *inode, *backing_inode; + struct inode *d_inode = entry->d_inode; + struct fuse_entry_out *feo = fa->out_args[0].value; + struct fuse_entry_bpf_out *febo = fa->out_args[1].value; + struct fuse_entry_bpf *feb = container_of(febo, struct fuse_entry_bpf, out); + int error; + u64 target_nodeid = 0; + + fd = get_fuse_dentry(entry); + if (!fd) + return ERR_PTR(-EIO); + bd = fd->backing_path.dentry; + if (!bd) + return ERR_PTR(-ENOENT); + backing_inode = bd->d_inode; + if (!backing_inode) + return NULL; + + if (d_inode) + target_nodeid = get_fuse_inode(d_inode)->nodeid; + + inode = fuse_iget_backing(dir->i_sb, target_nodeid, backing_inode); + + if (IS_ERR(inode)) + return ERR_CAST(inode); + + error = fuse_handle_bpf_prog(feb, dir, &get_fuse_inode(inode)->bpf); + if (error) + return ERR_PTR(error); + + error = fuse_handle_backing(feb, &get_fuse_inode(inode)->backing_inode, &fd->backing_path); + if (error) + return ERR_PTR(error); + + get_fuse_inode(inode)->nodeid = feo->nodeid; + + return d_splice_alias(inode, entry); +} + +int fuse_revalidate_backing(struct dentry *entry, unsigned int flags) +{ + struct fuse_dentry *fuse_dentry = get_fuse_dentry(entry); + struct dentry *backing_entry = fuse_dentry->backing_path.dentry; + + spin_lock(&backing_entry->d_lock); + if (d_unhashed(backing_entry)) { + spin_unlock(&backing_entry->d_lock); + return 0; + } + spin_unlock(&backing_entry->d_lock); + + if (unlikely(backing_entry->d_flags & DCACHE_OP_REVALIDATE)) + return backing_entry->d_op->d_revalidate(backing_entry, flags); + return 1; +} + +int fuse_canonical_path_initialize(struct fuse_bpf_args *fa, + struct fuse_dummy_io *fdi, + const struct path *path, + struct path *canonical_path) +{ + fa->opcode = FUSE_CANONICAL_PATH; + return 0; +} + +int fuse_canonical_path_backing(struct fuse_bpf_args *fa, const struct path *path, + struct path *canonical_path) +{ + get_fuse_backing_path(path->dentry, canonical_path); + return 0; +} + +void *fuse_canonical_path_finalize(struct fuse_bpf_args *fa, + const struct path *path, + struct path *canonical_path) +{ + return NULL; +} + +int fuse_mknod_initialize( + struct fuse_bpf_args *fa, struct fuse_mknod_in *fmi, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) +{ + *fmi = (struct fuse_mknod_in) { + .mode = mode, + .rdev = new_encode_dev(rdev), + .umask = current_umask(), + }; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_MKNOD, + .in_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fmi), + .value = fmi, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + }; + + return 0; +} + +int fuse_mknod_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) +{ + int err = 0; + const struct fuse_mknod_in *fmi = fa->in_args[0].value; + struct fuse_inode *fuse_inode = get_fuse_inode(dir); + struct inode *backing_inode = fuse_inode->backing_inode; + struct path backing_path = {}; + struct inode *inode = NULL; + + //TODO Actually deal with changing the backing entry in mknod + get_fuse_backing_path(entry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + mode = fmi->mode; + if
(!IS_POSIXACL(backing_inode)) + mode &= ~fmi->umask; + err = vfs_mknod(&init_user_ns, backing_inode, backing_path.dentry, + mode, new_decode_dev(fmi->rdev)); + inode_unlock(backing_inode); + if (err) + goto out; + if (d_really_is_negative(backing_path.dentry) || + unlikely(d_unhashed(backing_path.dentry))) { + err = -EINVAL; + /* + * TODO: overlayfs responds to this situation with a + * lookup_one_len(). Should we do that too? + */ + goto out; + } + inode = fuse_iget_backing(dir->i_sb, fuse_inode->nodeid, backing_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + d_instantiate(entry, inode); +out: + path_put(&backing_path); + return err; +} + +void *fuse_mknod_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) +{ + return NULL; +} + +int fuse_mkdir_initialize( + struct fuse_bpf_args *fa, struct fuse_mkdir_in *fmi, + struct inode *dir, struct dentry *entry, umode_t mode) +{ + *fmi = (struct fuse_mkdir_in) { + .mode = mode, + .umask = current_umask(), + }; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_MKDIR, + .in_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fmi), + .value = fmi, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + }; + + return 0; +} + +int fuse_mkdir_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode) +{ + int err = 0; + const struct fuse_mkdir_in *fmi = fa->in_args[0].value; + struct fuse_inode *fuse_inode = get_fuse_inode(dir); + struct inode *backing_inode = fuse_inode->backing_inode; + struct path backing_path = {}; + struct inode *inode = NULL; + struct dentry *d; + + //TODO Actually deal with changing the backing entry in mkdir + get_fuse_backing_path(entry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + mode = fmi->mode; + if (!IS_POSIXACL(backing_inode)) + mode &= ~fmi->umask; + err = vfs_mkdir(&init_user_ns, backing_inode, backing_path.dentry, mode); + if (err) + goto out; + if (d_really_is_negative(backing_path.dentry) || + unlikely(d_unhashed(backing_path.dentry))) { + d = lookup_one_len(entry->d_name.name, backing_path.dentry->d_parent, + entry->d_name.len); + if (IS_ERR(d)) { + err = PTR_ERR(d); + goto out; + } + dput(backing_path.dentry); + backing_path.dentry = d; + } + inode = fuse_iget_backing(dir->i_sb, fuse_inode->nodeid, backing_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + d_instantiate(entry, inode); +out: + inode_unlock(backing_inode); + path_put(&backing_path); + return err; +} + +void *fuse_mkdir_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode) +{ + return NULL; +} + +int fuse_rmdir_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *dummy, + struct inode *dir, struct dentry *entry) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_RMDIR, + .in_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + }; + + return 0; +} + +int fuse_rmdir_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry) +{ + int err = 0; + struct path backing_path = {}; + struct dentry *backing_parent_dentry; + struct inode *backing_inode; + + /* TODO Actually deal with changing the backing entry in rmdir */ + get_fuse_backing_path(entry, &backing_path); + if
(!backing_path.dentry) + return -EBADF; + + /* TODO Not sure if we should reverify like overlayfs, or get inode from d_parent */ + backing_parent_dentry = dget_parent(backing_path.dentry); + backing_inode = d_inode(backing_parent_dentry); + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + err = vfs_rmdir(&init_user_ns, backing_inode, backing_path.dentry); + inode_unlock(backing_inode); + + dput(backing_parent_dentry); + if (!err) + d_drop(entry); + path_put(&backing_path); + return err; +} + +void *fuse_rmdir_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry) +{ + return NULL; +} + +static int fuse_rename_backing_common( + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + int err = 0; + struct path old_backing_path; + struct path new_backing_path; + struct dentry *old_backing_dir_dentry; + struct dentry *old_backing_dentry; + struct dentry *new_backing_dir_dentry; + struct dentry *new_backing_dentry; + struct dentry *trap = NULL; + struct inode *target_inode; + struct renamedata rd; + + //TODO Actually deal with changing anything that isn't a flag + get_fuse_backing_path(oldent, &old_backing_path); + if (!old_backing_path.dentry) + return -EBADF; + get_fuse_backing_path(newent, &new_backing_path); + if (!new_backing_path.dentry) { + /* + * TODO A file being moved from a backing path to another + * backing path which is not yet instrumented with FUSE-BPF. + * This may be slow and should be substituted with something + * more clever. + */ + err = -EXDEV; + goto put_old_path; + } + if (new_backing_path.mnt != old_backing_path.mnt) { + err = -EXDEV; + goto put_new_path; + } + old_backing_dentry = old_backing_path.dentry; + new_backing_dentry = new_backing_path.dentry; + old_backing_dir_dentry = dget_parent(old_backing_dentry); + new_backing_dir_dentry = dget_parent(new_backing_dentry); + target_inode = d_inode(newent); + + trap = lock_rename(old_backing_dir_dentry, new_backing_dir_dentry); + if (trap == old_backing_dentry) { + err = -EINVAL; + goto put_parents; + } + if (trap == new_backing_dentry) { + err = -ENOTEMPTY; + goto put_parents; + } + rd = (struct renamedata) { + .old_mnt_userns = &init_user_ns, + .old_dir = d_inode(old_backing_dir_dentry), + .old_dentry = old_backing_dentry, + .new_mnt_userns = &init_user_ns, + .new_dir = d_inode(new_backing_dir_dentry), + .new_dentry = new_backing_dentry, + .flags = flags, + }; + err = vfs_rename(&rd); + if (err) + goto unlock; + if (target_inode) + fsstack_copy_attr_all(target_inode, + get_fuse_inode(target_inode)->backing_inode); + fsstack_copy_attr_all(d_inode(oldent), d_inode(old_backing_dentry)); +unlock: + unlock_rename(old_backing_dir_dentry, new_backing_dir_dentry); +put_parents: + dput(new_backing_dir_dentry); + dput(old_backing_dir_dentry); +put_new_path: + path_put(&new_backing_path); +put_old_path: + path_put(&old_backing_path); + return err; +} + +int fuse_rename2_initialize(struct fuse_bpf_args *fa, struct fuse_rename2_in *fri, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + *fri = (struct fuse_rename2_in) { + .newdir = get_node_id(newdir), + .flags = flags, + }; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(olddir), + .opcode = FUSE_RENAME2, + .in_numargs = 3, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fri), + .value = fri, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = oldent->d_name.len + 1, + .value = 
oldent->d_name.name, + }, + .in_args[2] = (struct fuse_bpf_in_arg) { + .size = newent->d_name.len + 1, + .value = newent->d_name.name, + }, + }; + + return 0; +} + +int fuse_rename2_backing(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + const struct fuse_rename2_in *fri = fa->in_args[0].value; + + /* TODO: deal with changing dirs/ents */ + return fuse_rename_backing_common(olddir, oldent, newdir, newent, fri->flags); +} + +void *fuse_rename2_finalize(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + return NULL; +} + +int fuse_rename_initialize(struct fuse_bpf_args *fa, struct fuse_rename_in *fri, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + *fri = (struct fuse_rename_in) { + .newdir = get_node_id(newdir), + }; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(olddir), + .opcode = FUSE_RENAME, + .in_numargs = 3, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fri), + .value = fri, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = oldent->d_name.len + 1, + .value = oldent->d_name.name, + }, + .in_args[2] = (struct fuse_bpf_in_arg) { + .size = newent->d_name.len + 1, + .value = newent->d_name.name, + }, + }; + + return 0; +} + +int fuse_rename_backing(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + /* TODO: deal with changing dirs/ents */ + return fuse_rename_backing_common(olddir, oldent, newdir, newent, 0); +} + +void *fuse_rename_finalize(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return NULL; +} + +int fuse_unlink_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *dummy, + struct inode *dir, struct dentry *entry) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_UNLINK, + .in_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + }; + + return 0; +} + +int fuse_unlink_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry) +{ + int err = 0; + struct path backing_path = {}; + struct dentry *backing_parent_dentry; + struct inode *backing_inode; + + /* TODO Actually deal with changing the backing entry in unlink */ + get_fuse_backing_path(entry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + /* TODO Not sure if we should reverify like overlayfs, or get inode from d_parent */ + backing_parent_dentry = dget_parent(backing_path.dentry); + backing_inode = d_inode(backing_parent_dentry); + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + err = vfs_unlink(&init_user_ns, backing_inode, backing_path.dentry, NULL); + inode_unlock(backing_inode); + + dput(backing_parent_dentry); + if (!err) + d_drop(entry); + path_put(&backing_path); + return err; +} + +void *fuse_unlink_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry) +{ + return NULL; +} + +int fuse_link_initialize(struct fuse_bpf_args *fa, struct fuse_link_in *fli, + struct dentry *entry, struct inode *dir, + struct dentry *newent) +{ + struct inode *src_inode = entry->d_inode; + + *fli = (struct fuse_link_in){ + .oldnodeid = get_node_id(src_inode), + }; + + fa->opcode = FUSE_LINK; + fa->in_numargs = 2; + fa->in_args[0].size = 
sizeof(*fli); + fa->in_args[0].value = fli; + fa->in_args[1].size = newent->d_name.len + 1; + fa->in_args[1].value = newent->d_name.name; + + return 0; +} + +int fuse_link_backing(struct fuse_bpf_args *fa, struct dentry *entry, + struct inode *dir, struct dentry *newent) +{ + int err = 0; + struct path backing_old_path = {}; + struct path backing_new_path = {}; + struct dentry *backing_dir_dentry; + struct inode *fuse_new_inode = NULL; + struct fuse_inode *fuse_dir_inode = get_fuse_inode(dir); + struct inode *backing_dir_inode; + + get_fuse_backing_path(entry, &backing_old_path); + if (!backing_old_path.dentry) + return -EBADF; + + get_fuse_backing_path(newent, &backing_new_path); + if (!backing_new_path.dentry) { + err = -EBADF; + goto err_dst_path; + } + + backing_dir_dentry = dget_parent(backing_new_path.dentry); + backing_dir_inode = d_inode(backing_dir_dentry); + + inode_lock_nested(backing_dir_inode, I_MUTEX_PARENT); + err = vfs_link(backing_old_path.dentry, &init_user_ns, + backing_dir_inode, backing_new_path.dentry, NULL); + inode_unlock(backing_dir_inode); + if (err) + goto out; + + if (d_really_is_negative(backing_new_path.dentry) || + unlikely(d_unhashed(backing_new_path.dentry))) { + err = -EINVAL; + /* + * TODO: overlayfs responds to this situation with a + * lookup_one_len(). Should we do that too? + */ + goto out; + } + + fuse_new_inode = fuse_iget_backing(dir->i_sb, fuse_dir_inode->nodeid, backing_dir_inode); + if (IS_ERR(fuse_new_inode)) { + err = PTR_ERR(fuse_new_inode); + goto out; + } + d_instantiate(newent, fuse_new_inode); + +out: + dput(backing_dir_dentry); + path_put(&backing_new_path); +err_dst_path: + path_put(&backing_old_path); + return err; +} + +void *fuse_link_finalize(struct fuse_bpf_args *fa, struct dentry *entry, + struct inode *dir, struct dentry *newent) +{ + return NULL; +} + +int fuse_getattr_initialize(struct fuse_bpf_args *fa, struct fuse_getattr_io *fgio, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + fgio->fgi = (struct fuse_getattr_in) { + .getattr_flags = flags, + .fh = -1, /* TODO is this OK?
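+ * (FUSE only consults fh when FUSE_GETATTR_FH is set in getattr_flags, + * so a placeholder value here should be ignored by conforming servers.)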
*/ + }; + + fgio->fao = (struct fuse_attr_out) {0}; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(entry->d_inode), + .opcode = FUSE_GETATTR, + .in_numargs = 1, + .out_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(fgio->fgi), + .value = &fgio->fgi, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(fgio->fao), + .value = &fgio->fao, + }, + }; + + return 0; +} + +static void fuse_stat_to_attr(struct fuse_conn *fc, struct inode *inode, + struct kstat *stat, struct fuse_attr *attr) +{ + unsigned int blkbits; + + /* see the comment in fuse_change_attributes() */ + if (fc->writeback_cache && S_ISREG(inode->i_mode)) { + stat->size = i_size_read(inode); + stat->mtime.tv_sec = inode->i_mtime.tv_sec; + stat->mtime.tv_nsec = inode->i_mtime.tv_nsec; + stat->ctime.tv_sec = inode->i_ctime.tv_sec; + stat->ctime.tv_nsec = inode->i_ctime.tv_nsec; + } + + attr->ino = stat->ino; + attr->mode = (inode->i_mode & S_IFMT) | (stat->mode & 07777); + attr->nlink = stat->nlink; + attr->uid = from_kuid(fc->user_ns, stat->uid); + attr->gid = from_kgid(fc->user_ns, stat->gid); + attr->atime = stat->atime.tv_sec; + attr->atimensec = stat->atime.tv_nsec; + attr->mtime = stat->mtime.tv_sec; + attr->mtimensec = stat->mtime.tv_nsec; + attr->ctime = stat->ctime.tv_sec; + attr->ctimensec = stat->ctime.tv_nsec; + attr->size = stat->size; + attr->blocks = stat->blocks; + + if (stat->blksize != 0) + blkbits = ilog2(stat->blksize); + else + blkbits = inode->i_sb->s_blocksize_bits; + + attr->blksize = 1 << blkbits; +} + +int fuse_getattr_backing(struct fuse_bpf_args *fa, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct path *backing_path = + &get_fuse_dentry(entry)->backing_path; + struct inode *backing_inode = backing_path->dentry->d_inode; + struct fuse_attr_out *fao = fa->out_args[0].value; + struct kstat tmp; + int err; + + if (!stat) + stat = &tmp; + + err = vfs_getattr(backing_path, stat, request_mask, flags); + + if (!err) + fuse_stat_to_attr(get_fuse_conn(entry->d_inode), + backing_inode, stat, &fao->attr); + + return err; +} + +void *fuse_getattr_finalize(struct fuse_bpf_args *fa, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct fuse_attr_out *outarg = fa->out_args[0].value; + struct inode *inode = entry->d_inode; + u64 attr_version = fuse_get_attr_version(get_fuse_mount(inode)->fc); + int err = 0; + + /* TODO: Ensure this doesn't happen if we had an error getting attrs in + * backing. 
+ */ + err = finalize_attr(inode, outarg, attr_version, stat); + return ERR_PTR(err); +} + +static void fattr_to_iattr(struct fuse_conn *fc, + const struct fuse_setattr_in *arg, + struct iattr *iattr) +{ + unsigned int fvalid = arg->valid; + + if (fvalid & FATTR_MODE) + iattr->ia_valid |= ATTR_MODE, iattr->ia_mode = arg->mode; + if (fvalid & FATTR_UID) { + iattr->ia_valid |= ATTR_UID; + iattr->ia_uid = make_kuid(fc->user_ns, arg->uid); + } + if (fvalid & FATTR_GID) { + iattr->ia_valid |= ATTR_GID; + iattr->ia_gid = make_kgid(fc->user_ns, arg->gid); + } + if (fvalid & FATTR_SIZE) + iattr->ia_valid |= ATTR_SIZE, iattr->ia_size = arg->size; + if (fvalid & FATTR_ATIME) { + iattr->ia_valid |= ATTR_ATIME; + iattr->ia_atime.tv_sec = arg->atime; + iattr->ia_atime.tv_nsec = arg->atimensec; + if (!(fvalid & FATTR_ATIME_NOW)) + iattr->ia_valid |= ATTR_ATIME_SET; + } + if (fvalid & FATTR_MTIME) { + iattr->ia_valid |= ATTR_MTIME; + iattr->ia_mtime.tv_sec = arg->mtime; + iattr->ia_mtime.tv_nsec = arg->mtimensec; + if (!(fvalid & FATTR_MTIME_NOW)) + iattr->ia_valid |= ATTR_MTIME_SET; + } + if (fvalid & FATTR_CTIME) { + iattr->ia_valid |= ATTR_CTIME; + iattr->ia_ctime.tv_sec = arg->ctime; + iattr->ia_ctime.tv_nsec = arg->ctimensec; + } +} + +int fuse_setattr_initialize(struct fuse_bpf_args *fa, struct fuse_setattr_io *fsio, + struct dentry *dentry, struct iattr *attr, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(dentry->d_inode); + + *fsio = (struct fuse_setattr_io) {0}; + iattr_to_fattr(fc, attr, &fsio->fsi, true); + + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_SETATTR, + .nodeid = get_node_id(dentry->d_inode), + .in_numargs = 1, + .in_args[0].size = sizeof(fsio->fsi), + .in_args[0].value = &fsio->fsi, + .out_numargs = 1, + .out_args[0].size = sizeof(fsio->fao), + .out_args[0].value = &fsio->fao, + }; + + return 0; +} + +int fuse_setattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, struct iattr *attr, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(dentry->d_inode); + const struct fuse_setattr_in *fsi = fa->in_args[0].value; + struct iattr new_attr = {0}; + struct path *backing_path = &get_fuse_dentry(dentry)->backing_path; + int res; + + fattr_to_iattr(fc, fsi, &new_attr); + /* TODO: Some info doesn't get saved by the attr->fattr->attr transition. + * When we actually allow the bpf to change these, we may have to consider + * the extra flags more, or pass more info into the bpf. Until then we can + * keep everything except for ATTR_FILE, since we'd need a file on the + * lower fs. For what it's worth, neither f2fs nor ext4 make use of that + * even if it is present.
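+ * (Note that ATTR_FILE carries a struct file * belonging to the upper + * filesystem, which would be meaningless to the lower filesystem, so it + * is masked out below.)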
+ */ + new_attr.ia_valid = attr->ia_valid & ~ATTR_FILE; + inode_lock(d_inode(backing_path->dentry)); + res = notify_change(&init_user_ns, backing_path->dentry, &new_attr, + NULL); + inode_unlock(d_inode(backing_path->dentry)); + + if (res == 0 && (new_attr.ia_valid & ATTR_SIZE)) + i_size_write(dentry->d_inode, new_attr.ia_size); + return res; +} + +void *fuse_setattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, struct iattr *attr, struct file *file) +{ + return NULL; +} + +int fuse_statfs_initialize( + struct fuse_bpf_args *fa, struct fuse_statfs_out *fso, + struct dentry *dentry, struct kstatfs *buf) +{ + *fso = (struct fuse_statfs_out) {0}; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(d_inode(dentry)), + .opcode = FUSE_STATFS, + .out_numargs = 1, + .out_args[0].size = sizeof(*fso), + .out_args[0].value = fso, + }; + + return 0; +} + +int fuse_statfs_backing( + struct fuse_bpf_args *fa, + struct dentry *dentry, struct kstatfs *buf) +{ + int err = 0; + struct path backing_path; + struct fuse_statfs_out *fso = fa->out_args[0].value; + + get_fuse_backing_path(dentry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + err = vfs_statfs(&backing_path, buf); + path_put(&backing_path); + buf->f_type = FUSE_SUPER_MAGIC; + + //TODO Provide postfilter opportunity to modify + if (!err) + convert_statfs_to_fuse(&fso->st, buf); + + return err; +} + +void *fuse_statfs_finalize( + struct fuse_bpf_args *fa, + struct dentry *dentry, struct kstatfs *buf) +{ + struct fuse_statfs_out *fso = fa->out_args[0].value; + + if (!fa->error_in) + convert_fuse_statfs(buf, &fso->st); + return NULL; +} + +int fuse_get_link_initialize(struct fuse_bpf_args *fa, struct fuse_dummy_io *unused, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out) +{ + /* + * TODO + * If we want to handle changing these things, we'll need to copy + * the lower fs's data into our own buffer, and provide our own callback + * to free that buffer.
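+ * + * One possible shape for that copy, as an untested sketch (lower_dentry + * and lower_done are illustrative names, and error checking is omitted): + * + *	link = vfs_get_link(lower_dentry, &lower_done); + *	buf = kmemdup_nul(link, strlen(link), GFP_KERNEL); + *	do_delayed_call(&lower_done); + *	set_delayed_call(callback, kfree_link, buf); + *	return buf;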
+ * + * A prefilter could change the name we look up; + * a postfilter could change the name we return. + * + * We ought to only make that buffer if it's been requested, so this + * is left unimplemented for the moment. + */ + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_READLINK, + .nodeid = get_node_id(inode), + .in_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = dentry->d_name.len + 1, + .value = dentry->d_name.name, + }, + /* + * .out_argvar = 1, + * .out_numargs = 1, + * .out_args[0].size = , + * .out_args[0].value = , + */ + }; + + return 0; +} + +int fuse_get_link_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out) +{ + struct path backing_path; + + if (!dentry) { + *out = ERR_PTR(-ECHILD); + return PTR_ERR(*out); + } + + get_fuse_backing_path(dentry, &backing_path); + if (!backing_path.dentry) { + *out = ERR_PTR(-ECHILD); + return PTR_ERR(*out); + } + + /* + * TODO: If we want to do our own thing, copy the data and then call the + * callback + */ + *out = vfs_get_link(backing_path.dentry, callback); + + path_put(&backing_path); + return 0; +} + +void *fuse_get_link_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out) +{ + return NULL; +} + +int fuse_symlink_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *unused, + struct inode *dir, struct dentry *entry, const char *link, int len) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_SYMLINK, + .in_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = len, + .value = link, + }, + }; + + return 0; +} + +int fuse_symlink_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, const char *link, int len) +{ + int err = 0; + struct fuse_inode *fuse_inode = get_fuse_inode(dir); + struct inode *backing_inode = fuse_inode->backing_inode; + struct path backing_path = {}; + struct inode *inode = NULL; + + //TODO Actually deal with changing the backing entry in symlink + get_fuse_backing_path(entry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + err = vfs_symlink(&init_user_ns, backing_inode, backing_path.dentry, + link); + inode_unlock(backing_inode); + if (err) + goto out; + if (d_really_is_negative(backing_path.dentry) || + unlikely(d_unhashed(backing_path.dentry))) { + err = -EINVAL; + /* + * TODO: overlayfs responds to this situation with a + * lookup_one_len(). Should we do that too?
+ */ + goto out; + } + inode = fuse_iget_backing(dir->i_sb, fuse_inode->nodeid, backing_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + d_instantiate(entry, inode); +out: + path_put(&backing_path); + return err; +} + +void *fuse_symlink_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, const char *link, int len) +{ + return NULL; +} + +int fuse_readdir_initialize(struct fuse_bpf_args *fa, struct fuse_read_io *frio, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued) +{ + struct fuse_file *ff = file->private_data; + u8 *page = (u8 *)__get_free_page(GFP_KERNEL); + + if (!page) + return -ENOMEM; + + *fa = (struct fuse_bpf_args) { + .nodeid = ff->nodeid, + .opcode = FUSE_READDIR, + .in_numargs = 1, + .flags = FUSE_BPF_OUT_ARGVAR, + .out_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(frio->fri), + .value = &frio->fri, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(frio->fro), + .value = &frio->fro, + }, + .out_args[1] = (struct fuse_bpf_arg) { + .size = PAGE_SIZE, + .value = page, + }, + }; + + frio->fri = (struct fuse_read_in) { + .fh = ff->fh, + .offset = ctx->pos, + .size = PAGE_SIZE, + }; + frio->fro = (struct fuse_read_out) { + .again = 0, + .offset = 0, + }; + *force_again = false; + *allow_force = true; + return 0; +} + +struct extfuse_ctx { + struct dir_context ctx; + u8 *addr; + size_t offset; +}; + +static int filldir(struct dir_context *ctx, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct extfuse_ctx *ec = container_of(ctx, struct extfuse_ctx, ctx); + struct fuse_dirent *fd = (struct fuse_dirent *) (ec->addr + ec->offset); + + if (ec->offset + sizeof(struct fuse_dirent) + namelen > PAGE_SIZE) + return -ENOMEM; + + *fd = (struct fuse_dirent) { + .ino = ino, + .off = offset, + .namelen = namelen, + .type = d_type, + }; + + memcpy(fd->name, name, namelen); + ec->offset += FUSE_DIRENT_SIZE(fd); + + return 0; +} + +static int parse_dirfile(char *buf, size_t nbytes, struct dir_context *ctx) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + ctx->pos = dirent->off; + if (!dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, + dirent->type)) + break; + + buf += reclen; + nbytes -= reclen; + } + + return 0; +} + + +int fuse_readdir_backing(struct fuse_bpf_args *fa, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued) +{ + struct fuse_file *ff = file->private_data; + struct file *backing_dir = ff->backing_file; + struct fuse_read_out *fro = fa->out_args[0].value; + struct extfuse_ctx ec; + int err; + + ec = (struct extfuse_ctx) { + .ctx.actor = filldir, + .ctx.pos = ctx->pos, + .addr = fa->out_args[1].value, + }; + + if (!ec.addr) + return -ENOMEM; + + if (!is_continued) + backing_dir->f_pos = file->f_pos; + + err = iterate_dir(backing_dir, &ec.ctx); + if (ec.offset == 0) + *allow_force = false; + fa->out_args[1].size = ec.offset; + + fro->offset = ec.ctx.pos; + fro->again = false; + return err; +} + +void *fuse_readdir_finalize(struct fuse_bpf_args *fa, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued) +{ + struct 
fuse_read_out *fro = fa->out_args[0].value; + struct fuse_file *ff = file->private_data; + struct file *backing_dir = ff->backing_file; + int err = 0; + + err = parse_dirfile(fa->out_args[1].value, fa->out_args[1].size, ctx); + *force_again = !!fro->again; + if (*force_again && !*allow_force) + err = -EINVAL; + + ctx->pos = fro->offset; + backing_dir->f_pos = fro->offset; + + free_page((unsigned long) fa->out_args[1].value); + return ERR_PTR(err); +} + +int fuse_access_initialize(struct fuse_bpf_args *fa, struct fuse_access_in *fai, + struct inode *inode, int mask) +{ + *fai = (struct fuse_access_in) { + .mask = mask, + }; + + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_ACCESS, + .nodeid = get_node_id(inode), + .in_numargs = 1, + .in_args[0].size = sizeof(*fai), + .in_args[0].value = fai, + }; + + return 0; +} + +int fuse_access_backing(struct fuse_bpf_args *fa, struct inode *inode, int mask) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + const struct fuse_access_in *fai = fa->in_args[0].value; + + return inode_permission(&init_user_ns, fi->backing_inode, fai->mask); +} + +void *fuse_access_finalize(struct fuse_bpf_args *fa, struct inode *inode, int mask) +{ + return NULL; +} + +int __init fuse_bpf_init(void) +{ + fuse_bpf_aio_request_cachep = kmem_cache_create("fuse_bpf_aio_req", + sizeof(struct fuse_bpf_aio_req), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!fuse_bpf_aio_request_cachep) + return -ENOMEM; + + return 0; +} + +void __exit fuse_bpf_cleanup(void) +{ + kmem_cache_destroy(fuse_bpf_aio_request_cachep); +} + +ssize_t fuse_bpf_simple_request(struct fuse_mount *fm, struct fuse_bpf_args *bpf_args) +{ + int i; + ssize_t res; + struct fuse_args args = { + .nodeid = bpf_args->nodeid, + .opcode = bpf_args->opcode, + .error_in = bpf_args->error_in, + .in_numargs = bpf_args->in_numargs, + .out_numargs = bpf_args->out_numargs, + .force = !!(bpf_args->flags & FUSE_BPF_FORCE), + .out_argvar = !!(bpf_args->flags & FUSE_BPF_OUT_ARGVAR), + }; + + for (i = 0; i < args.in_numargs; ++i) + args.in_args[i] = (struct fuse_in_arg) { + .size = bpf_args->in_args[i].size, + .value = bpf_args->in_args[i].value, + }; + for (i = 0; i < args.out_numargs; ++i) + args.out_args[i] = (struct fuse_arg) { + .size = bpf_args->out_args[i].size, + .value = bpf_args->out_args[i].value, + }; + + res = fuse_simple_request(fm, &args); + + *bpf_args = (struct fuse_bpf_args) { + .nodeid = args.nodeid, + .opcode = args.opcode, + .error_in = args.error_in, + .in_numargs = args.in_numargs, + .out_numargs = args.out_numargs, + }; + if (args.force) + bpf_args->flags |= FUSE_BPF_FORCE; + if (args.out_argvar) + bpf_args->flags |= FUSE_BPF_OUT_ARGVAR; + for (i = 0; i < args.in_numargs; ++i) + bpf_args->in_args[i] = (struct fuse_bpf_in_arg) { + .size = args.in_args[i].size, + .value = args.in_args[i].value, + }; + for (i = 0; i < args.out_numargs; ++i) + bpf_args->out_args[i] = (struct fuse_bpf_arg) { + .size = args.out_args[i].size, + .value = args.out_args[i].value, + }; + return res; +} diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 79f01d09c78c..4aae0e606ba4 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -395,7 +395,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void __exit fuse_ctl_cleanup(void) +void fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d6b5339c56e2..cb26f9a129e7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include 
#include @@ -207,10 +208,13 @@ static unsigned int fuse_req_hash(u64 unique) /** * A new request is available, wake fiq->waitq */ -static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) +static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq, bool sync) __releases(fiq->lock) { - wake_up(&fiq->waitq); + if (sync) + wake_up_sync(&fiq->waitq); + else + wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); spin_unlock(&fiq->lock); } @@ -223,14 +227,14 @@ const struct fuse_iqueue_ops fuse_dev_fiq_ops = { EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); static void queue_request_and_unlock(struct fuse_iqueue *fiq, - struct fuse_req *req) + struct fuse_req *req, bool sync) __releases(fiq->lock) { req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); - fiq->ops->wake_pending_and_unlock(fiq); + fiq->ops->wake_pending_and_unlock(fiq, sync); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -238,6 +242,11 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, { struct fuse_iqueue *fiq = &fc->iq; + if (nodeid == 0) { + kfree(forget); + return; + } + forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; @@ -245,7 +254,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, if (fiq->connected) { fiq->forget_list_tail->next = forget; fiq->forget_list_tail = forget; - fiq->ops->wake_forget_and_unlock(fiq); + fiq->ops->wake_forget_and_unlock(fiq, false); } else { kfree(forget); spin_unlock(&fiq->lock); @@ -265,7 +274,7 @@ static void flush_bg_queue(struct fuse_conn *fc) fc->active_background++; spin_lock(&fiq->lock); req->in.h.unique = fuse_get_unique(fiq); - queue_request_and_unlock(fiq, req); + queue_request_and_unlock(fiq, req, false); } } @@ -358,7 +367,7 @@ static int queue_interrupt(struct fuse_req *req) spin_unlock(&fiq->lock); return 0; } - fiq->ops->wake_interrupt_and_unlock(fiq); + fiq->ops->wake_interrupt_and_unlock(fiq, false); } else { spin_unlock(&fiq->lock); } @@ -425,7 +434,7 @@ static void __fuse_request_send(struct fuse_req *req) /* acquire extra reference, since request is still needed after fuse_request_end() */ __fuse_get_request(req); - queue_request_and_unlock(fiq, req); + queue_request_and_unlock(fiq, req, true); request_wait_answer(req); /* Pairs with smp_wmb() in fuse_request_end() */ @@ -479,6 +488,7 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) { req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; + req->in.h.padding = args->error_in; req->args = args; if (args->end) __set_bit(FR_ASYNC, &req->flags); @@ -600,7 +610,7 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, spin_lock(&fiq->lock); if (fiq->connected) { - queue_request_and_unlock(fiq, req); + queue_request_and_unlock(fiq, req, false); } else { err = -ENODEV; spin_unlock(&fiq->lock); @@ -785,7 +795,8 @@ static int fuse_check_page(struct page *page) 1 << PG_active | 1 << PG_workingset | 1 << PG_reclaim | - 1 << PG_waiters))) { + 1 << PG_waiters | + LRU_GEN_MASK | LRU_REFS_MASK))) { dump_page(page, "fuse: trying to steal weird page"); return 1; } @@ -1930,6 +1941,27 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, err = copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); + if (!err && req->in.h.opcode == FUSE_CANONICAL_PATH) { + char *path = (char *)req->args->out_args[0].value; + + path[req->args->out_args[0].size - 1] = 
0; + req->out.h.error = + kern_path(path, 0, req->args->canonical_path); + } + + if (!err && (req->in.h.opcode == FUSE_LOOKUP || + req->in.h.opcode == (FUSE_LOOKUP | FUSE_POSTFILTER)) && + req->args->out_args[1].size == sizeof(struct fuse_entry_bpf_out)) { + struct fuse_entry_bpf_out *febo = (struct fuse_entry_bpf_out *) + req->args->out_args[1].value; + struct fuse_entry_bpf *feb = container_of(febo, struct fuse_entry_bpf, out); + + if (febo->backing_action == FUSE_ACTION_REPLACE) + feb->backing_file = fget(febo->backing_fd); + if (febo->bpf_action == FUSE_ACTION_REPLACE) + feb->bpf_file = fget(febo->bpf_fd); + } + spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) @@ -2276,7 +2308,8 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, * uses the same ioctl handler. */ if (old->f_op == file->f_op && - old->f_cred->user_ns == file->f_cred->user_ns) + old->f_cred->user_ns == + file->f_cred->user_ns) fud = fuse_get_dev(old); if (fud) { @@ -2288,6 +2321,15 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } } break; + case FUSE_DEV_IOC_PASSTHROUGH_OPEN: + res = -EFAULT; + if (!get_user(oldfd, (__u32 __user *)arg)) { + res = -EINVAL; + fud = fuse_get_dev(file); + if (fud) + res = fuse_passthrough_open(fud, oldfd); + } + break; default: res = -ENOTTY; break; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 80a2181b402b..ec7026f9528f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -8,8 +8,10 @@ #include "fuse_i.h" +#include #include #include +#include #include #include #include @@ -18,6 +20,8 @@ #include #include +#include "../internal.h" + static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -25,7 +29,7 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } -#if BITS_PER_LONG >= 64 +#if BITS_PER_LONG >= 64 && !defined(CONFIG_FUSE_BPF) static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) { entry->d_fsdata = (void *) time; @@ -37,19 +41,15 @@ static inline u64 fuse_dentry_time(const struct dentry *entry) } #else -union fuse_dentry { - u64 time; - struct rcu_head rcu; -}; static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) { - ((union fuse_dentry *) dentry->d_fsdata)->time = time; + ((struct fuse_dentry *) dentry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(const struct dentry *entry) { - return ((union fuse_dentry *) entry->d_fsdata)->time; + return ((struct fuse_dentry *) entry->d_fsdata)->time; } #endif @@ -74,26 +74,16 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) __fuse_dentry_settime(dentry, time); } -/* - * FUSE caches dentries and attributes with separate timeout. The - * time in jiffies until the dentry/attributes are valid is stored in - * dentry->d_fsdata and fuse_inode->i_time respectively. 
- */ - -/* - * Calculate the time in jiffies until a dentry/attributes are valid - */ -static u64 time_to_jiffies(u64 sec, u32 nsec) +void fuse_init_dentry_root(struct dentry *root, struct file *backing_dir) { - if (sec || nsec) { - struct timespec64 ts = { - sec, - min_t(u32, nsec, NSEC_PER_SEC - 1) - }; +#ifdef CONFIG_FUSE_BPF + struct fuse_dentry *fuse_dentry = root->d_fsdata; - return get_jiffies_64() + timespec64_to_jiffies(&ts); - } else - return 0; + if (backing_dir) { + fuse_dentry->backing_path = backing_dir->f_path; + path_get(&fuse_dentry->backing_path); + } +#endif } /* @@ -106,11 +96,6 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); } -static u64 attr_timeout(struct fuse_attr_out *o) -{ - return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); -} - u64 entry_attr_timeout(struct fuse_entry_out *o) { return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); @@ -171,7 +156,8 @@ static void fuse_invalidate_entry(struct dentry *entry) static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, u64 nodeid, const struct qstr *name, - struct fuse_entry_out *outarg) + struct fuse_entry_out *outarg, + struct fuse_entry_bpf_out *bpf_outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; @@ -179,11 +165,52 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, args->in_numargs = 1; args->in_args[0].size = name->len + 1; args->in_args[0].value = name->name; - args->out_numargs = 1; + args->out_argvar = true; + args->out_numargs = 2; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; + args->out_args[1].size = sizeof(struct fuse_entry_bpf_out); + args->out_args[1].value = bpf_outarg; } +#ifdef CONFIG_FUSE_BPF +static bool backing_data_changed(struct fuse_inode *fi, struct dentry *entry, + struct fuse_entry_bpf *bpf_arg) +{ + struct path new_backing_path; + struct inode *new_backing_inode; + struct bpf_prog *bpf = NULL; + int err; + bool ret = true; + + if (!entry) + return false; + + get_fuse_backing_path(entry, &new_backing_path); + new_backing_inode = fi->backing_inode; + ihold(new_backing_inode); + + err = fuse_handle_backing(bpf_arg, &new_backing_inode, &new_backing_path); + + if (err) + goto put_inode; + + err = fuse_handle_bpf_prog(bpf_arg, entry->d_parent->d_inode, &bpf); + if (err) + goto put_bpf; + + ret = (bpf != fi->bpf || fi->backing_inode != new_backing_inode || + !path_equal(&get_fuse_dentry(entry)->backing_path, &new_backing_path)); +put_bpf: + if (bpf) + bpf_prog_put(bpf); +put_inode: + iput(new_backing_inode); + path_put(&new_backing_path); + return ret; +} +#endif + /* * Check whether the dentry is still valid * @@ -204,9 +231,23 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; - else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + +#ifdef CONFIG_FUSE_BPF + /* TODO: Do we need bpf support for revalidate? + * If the lower filesystem says the entry is invalid, FUSE probably shouldn't + * try to fix that without going through the normal lookup path... 
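+ * (For now fuse_revalidate_backing() defers to the lower filesystem's + * ->d_revalidate() when one exists, so an invalid lower entry already + * invalidates the fuse dentry without bpf involvement.)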
+ */ + if (get_fuse_dentry(entry)->backing_path.dentry) { + ret = fuse_revalidate_backing(entry, flags); + if (ret <= 0) { + goto out; + } + } +#endif + if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { struct fuse_entry_out outarg; + struct fuse_entry_bpf bpf_arg; FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; @@ -218,27 +259,44 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) ret = -ECHILD; if (flags & LOOKUP_RCU) goto out; - fm = get_fuse_mount(inode); + parent = dget_parent(entry); + +#ifdef CONFIG_FUSE_BPF + /* TODO: Once we're handling timeouts for backing inodes, do a + * bpf based lookup_revalidate here. + */ + if (get_fuse_inode(parent->d_inode)->backing_inode) { + dput(parent); + ret = 1; + goto out; + } +#endif forget = fuse_alloc_forget(); ret = -ENOMEM; - if (!forget) + if (!forget) { + dput(parent); goto out; + } attr_version = fuse_get_attr_version(fm->fc); - parent = dget_parent(entry); fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), - &entry->d_name, &outarg); + &entry->d_name, &outarg, &bpf_arg.out); ret = fuse_simple_request(fm, &args); dput(parent); + /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; - if (!ret) { + if (!ret || ret == sizeof(bpf_arg.out)) { fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode) || +#ifdef CONFIG_FUSE_BPF + (ret == sizeof(bpf_arg.out) && + backing_data_changed(fi, entry, &bpf_arg)) || +#endif (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); @@ -280,17 +338,20 @@ invalid: goto out; } -#if BITS_PER_LONG < 64 +#if BITS_PER_LONG < 64 || defined(CONFIG_FUSE_BPF) static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + dentry->d_fsdata = kzalloc(sizeof(struct fuse_dentry), GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } static void fuse_dentry_release(struct dentry *dentry) { - union fuse_dentry *fd = dentry->d_fsdata; + struct fuse_dentry *fd = dentry->d_fsdata; + + if (fd && fd->backing_path.dentry) + path_put(&fd->backing_path); kfree_rcu(fd, rcu); } @@ -328,18 +389,70 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) return mnt; } +/* + * Get the canonical path. Since we must translate to a path, this must be done + * in the context of the userspace daemon; however, the userspace daemon cannot + * look up paths on its own. Instead, we handle the lookup as a special case + * inside the write request.
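+ * (Concretely, fuse_dev_do_write() in dev.c NUL-terminates the string the + * daemon writes back for FUSE_CANONICAL_PATH and resolves it with + * kern_path() into args->canonical_path; see the dev.c hunk above.)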
+ */ +static void fuse_dentry_canonical_path(const struct path *path, + struct path *canonical_path) +{ + struct inode *inode = d_inode(path->dentry); + //struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount_super(path->mnt->mnt_sb); + FUSE_ARGS(args); + char *path_name; + int err; + +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_dummy_io, + fuse_canonical_path_initialize, + fuse_canonical_path_backing, + fuse_canonical_path_finalize, path, + canonical_path); + if (fer.ret) + return; +#endif + + path_name = (char *)get_zeroed_page(GFP_KERNEL); + if (!path_name) + goto default_path; + + args.opcode = FUSE_CANONICAL_PATH; + args.nodeid = get_node_id(inode); + args.in_numargs = 0; + args.out_numargs = 1; + args.out_args[0].size = PATH_MAX; + args.out_args[0].value = path_name; + args.canonical_path = canonical_path; + args.out_argvar = 1; + + err = fuse_simple_request(fm, &args); + free_page((unsigned long)path_name); + if (err > 0) + return; +default_path: + canonical_path->dentry = path->dentry; + canonical_path->mnt = path->mnt; + path_get(canonical_path); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, -#if BITS_PER_LONG < 64 +#if BITS_PER_LONG < 64 || defined(CONFIG_FUSE_BPF) .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif .d_automount = fuse_dentry_automount, + .d_canonical_path = fuse_dentry_canonical_path, }; const struct dentry_operations fuse_root_dentry_operations = { -#if BITS_PER_LONG < 64 +#if BITS_PER_LONG < 64 || defined(CONFIG_FUSE_BPF) .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif @@ -358,10 +471,13 @@ bool fuse_invalid_attr(struct fuse_attr *attr) } int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, - struct fuse_entry_out *outarg, struct inode **inode) + struct fuse_entry_out *outarg, + struct dentry *entry, + struct inode **inode) { struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); + struct fuse_entry_bpf bpf_arg = {0}; struct fuse_forget_link *forget; u64 attr_version; int err; @@ -379,23 +495,68 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name attr_version = fuse_get_attr_version(fm->fc); - fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); + fuse_lookup_init(fm->fc, &args, nodeid, name, outarg, &bpf_arg.out); err = fuse_simple_request(fm, &args); - /* Zero nodeid is same as -ENOENT, but with valid timeout */ - if (err || !outarg->nodeid) - goto out_put_forget; - err = -EIO; - if (!outarg->nodeid) - goto out_put_forget; - if (fuse_invalid_attr(&outarg->attr)) - goto out_put_forget; +#ifdef CONFIG_FUSE_BPF + if (err == sizeof(bpf_arg.out)) { + /* TODO Make sure this handles invalid handles */ + struct file *backing_file; + struct inode *backing_inode; + + err = -ENOENT; + if (!entry) + goto out_queue_forget; + + err = -EINVAL; + backing_file = bpf_arg.backing_file; + if (!backing_file) + goto out_queue_forget; + + if (IS_ERR(backing_file)) { + err = PTR_ERR(backing_file); + goto out_queue_forget; + } + + backing_inode = backing_file->f_inode; + *inode = fuse_iget_backing(sb, outarg->nodeid, backing_inode); + if (!*inode) + goto bpf_arg_out; + + err = fuse_handle_backing(&bpf_arg, + &get_fuse_inode(*inode)->backing_inode, + &get_fuse_dentry(entry)->backing_path); + if (err) + goto out; + + err = fuse_handle_bpf_prog(&bpf_arg, NULL, &get_fuse_inode(*inode)->bpf); + if (err) + 
goto out; +bpf_arg_out: + fput(backing_file); + } else +#endif + { + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (fuse_invalid_attr(&outarg->attr)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + attr_version); + } - *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, - &outarg->attr, entry_attr_timeout(outarg), - attr_version); err = -ENOMEM; - if (!*inode) { +#ifdef CONFIG_FUSE_BPF +out_queue_forget: +#endif + if (!*inode && outarg->nodeid) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); goto out; } @@ -417,12 +578,23 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, bool outarg_valid = true; bool locked; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_lookup_io, + fuse_lookup_initialize, fuse_lookup_backing, + fuse_lookup_finalize, + dir, entry, flags); + if (fer.ret) + return fer.result; +#endif + if (fuse_is_bad(dir)) return ERR_PTR(-EIO); locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, - &outarg, &inode); + &outarg, entry, &inode); fuse_unlock_inode(dir, locked); if (err == -ENOENT) { outarg_valid = false; @@ -468,6 +640,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, { int err; struct inode *inode; + struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); struct fuse_forget_link *forget; @@ -480,6 +653,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_create_open_io, + fuse_create_open_initialize, + fuse_create_open_backing, + fuse_create_open_finalize, + dir, entry, file, flags, mode); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) @@ -529,6 +716,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, ff->fh = outopen.fh; ff->nodeid = outentry.nodeid; ff->open_flags = outopen.open_flags; + fuse_passthrough_setup(fc, ff, &outopen); inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, entry_attr_timeout(&outentry), 0); if (!inode) { @@ -678,6 +866,17 @@ static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_mknod_in, + fuse_mknod_initialize, fuse_mknod_backing, + fuse_mknod_finalize, + dir, entry, mode, rdev); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (!fm->fc->dont_mask) mode &= ~current_umask(); @@ -707,6 +906,17 @@ static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_mkdir_in, + fuse_mkdir_initialize, fuse_mkdir_backing, + fuse_mkdir_finalize, + dir, entry, mode); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (!fm->fc->dont_mask) mode &= ~current_umask(); @@ -729,6 +939,17 @@ static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, 
unsigned len = strlen(link) + 1; FUSE_ARGS(args); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_dummy_io, + fuse_symlink_initialize, fuse_symlink_backing, + fuse_symlink_finalize, + dir, entry, link, len); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + args.opcode = FUSE_SYMLINK; args.in_numargs = 2; args.in_args[0].size = entry->d_name.len + 1; @@ -763,6 +984,20 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) if (fuse_is_bad(dir)) return -EIO; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_dummy_io, + fuse_unlink_initialize, + fuse_unlink_backing, + fuse_unlink_finalize, + dir, entry); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); args.in_numargs = 1; @@ -802,6 +1037,20 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (fuse_is_bad(dir)) return -EIO; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_dummy_io, + fuse_rmdir_initialize, + fuse_rmdir_backing, + fuse_rmdir_finalize, + dir, entry); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); args.in_numargs = 1; @@ -887,6 +1136,18 @@ static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, return -EINVAL; if (flags) { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(olddir, struct fuse_rename2_in, + fuse_rename2_initialize, fuse_rename2_backing, + fuse_rename2_finalize, + olddir, oldent, newdir, newent, flags); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + + /* TODO: how should this go with bpfs involved? */ if (fc->no_rename2 || fc->minor < 23) return -EINVAL; @@ -898,6 +1159,17 @@ static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, err = -EINVAL; } } else { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(olddir, struct fuse_rename_in, + fuse_rename_initialize, fuse_rename_backing, + fuse_rename_finalize, + olddir, oldent, newdir, newent); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); @@ -915,6 +1187,16 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_link_in, fuse_link_initialize, + fuse_link_backing, fuse_link_finalize, entry, + newdir, newent); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.opcode = FUSE_LINK; @@ -946,7 +1228,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, return err; } -static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, +void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; @@ -1015,23 +1297,13 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); - if (!err) { - if (fuse_invalid_attr(&outarg.attr) || - inode_wrong_type(inode, outarg.attr.mode)) { - fuse_make_bad(inode); - err = -EIO; - } else { - fuse_change_attributes(inode, &outarg.attr, - attr_timeout(&outarg), - 
attr_version); - if (stat) - fuse_fillattr(inode, &outarg.attr, stat); - } - } + if (!err) + err = finalize_attr(inode, &outarg, attr_version, stat); return err; } static int fuse_update_get_attr(struct inode *inode, struct file *file, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -1039,6 +1311,17 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int err = 0; bool sync; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_getattr_io, + fuse_getattr_initialize, fuse_getattr_backing, + fuse_getattr_finalize, + path->dentry, stat, request_mask, flags); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) @@ -1063,7 +1346,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file) { /* Do *not* need to get atime for internal purposes */ - return fuse_update_get_attr(inode, file, NULL, + return fuse_update_get_attr(inode, file, &file->f_path, NULL, STATX_BASIC_STATS & ~STATX_ATIME, 0); } @@ -1172,6 +1455,16 @@ static int fuse_access(struct inode *inode, int mask) struct fuse_access_in inarg; int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_access_in, + fuse_access_initialize, fuse_access_backing, + fuse_access_finalize, inode, mask); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + BUG_ON(mask & MAY_NOT_BLOCK); if (fm->fc->no_access) @@ -1220,6 +1513,10 @@ static int fuse_permission(struct user_namespace *mnt_userns, struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; + struct fuse_inode *fi = get_fuse_inode(inode); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; +#endif if (fuse_is_bad(inode)) return -EIO; @@ -1227,12 +1524,19 @@ static int fuse_permission(struct user_namespace *mnt_userns, if (!fuse_allow_current_process(fc)) return -EACCES; +#ifdef CONFIG_FUSE_BPF + fer = fuse_bpf_backing(inode, struct fuse_access_in, + fuse_access_initialize, fuse_access_backing, + fuse_access_finalize, inode, mask); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + /* * If attributes are needed, refresh them before proceeding */ if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { - struct fuse_inode *fi = get_fuse_inode(inode); u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID; if (perm_mask & READ_ONCE(fi->inval_mask) || @@ -1323,6 +1627,21 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (fuse_is_bad(inode)) goto out_err; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + const char *out = NULL; + + fer = fuse_bpf_backing(inode, struct fuse_dummy_io, + fuse_get_link_initialize, + fuse_get_link_backing, + fuse_get_link_finalize, + inode, dentry, callback, &out); + if (fer.ret) + return fer.result ?: out; + } +#endif + if (fc->cache_symlinks) return page_get_link(dentry, inode, callback); @@ -1356,8 +1675,18 @@ static int fuse_dir_open(struct inode *inode, struct file *file) static int fuse_dir_release(struct inode *inode, struct file *file) { - fuse_release_common(file, true); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + fer = fuse_bpf_backing(inode, struct fuse_release_in, + fuse_releasedir_initialize, fuse_release_backing, + fuse_release_finalize, + inode, file); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + + fuse_release_common(file, true); 
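+	/*
+	 * A backing path, if one claimed the release above, has already
+	 * returned; from here this is the plain FUSE releasedir, which
+	 * never fails from the VFS point of view.
+	 */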
return 0; } @@ -1371,6 +1700,19 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, if (fuse_is_bad(inode)) return -EIO; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_fsync_in, + fuse_dir_fsync_initialize, fuse_fsync_backing, + fuse_fsync_finalize, + file, start, end, datasync); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + if (fc->no_fsyncdir) return 0; @@ -1409,58 +1751,6 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } -static bool update_mtime(unsigned ivalid, bool trust_local_mtime) -{ - /* Always update if mtime is explicitly set */ - if (ivalid & ATTR_MTIME_SET) - return true; - - /* Or if kernel i_mtime is the official one */ - if (trust_local_mtime) - return true; - - /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ - if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) - return false; - - /* In all other cases update */ - return true; -} - -static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, - struct fuse_setattr_in *arg, bool trust_local_cmtime) -{ - unsigned ivalid = iattr->ia_valid; - - if (ivalid & ATTR_MODE) - arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; - if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); - if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); - if (ivalid & ATTR_SIZE) - arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; - if (ivalid & ATTR_ATIME) { - arg->valid |= FATTR_ATIME; - arg->atime = iattr->ia_atime.tv_sec; - arg->atimensec = iattr->ia_atime.tv_nsec; - if (!(ivalid & ATTR_ATIME_SET)) - arg->valid |= FATTR_ATIME_NOW; - } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { - arg->valid |= FATTR_MTIME; - arg->mtime = iattr->ia_mtime.tv_sec; - arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) - arg->valid |= FATTR_MTIME_NOW; - } - if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { - arg->valid |= FATTR_CTIME; - arg->ctime = iattr->ia_ctime.tv_sec; - arg->ctimensec = iattr->ia_ctime.tv_nsec; - } -} - /* * Prevent concurrent writepages on inode * @@ -1575,6 +1865,16 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); bool fault_blocked = false; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_setattr_io, + fuse_setattr_initialize, fuse_setattr_backing, + fuse_setattr_finalize, dentry, attr, file); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -1749,11 +2049,22 @@ static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, * This should be done on write(), truncate() and chown(). */ if (!fc->handle_killpriv && !fc->handle_killpriv_v2) { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. 
*/ - ret = fuse_do_getattr(inode, NULL, file); + fer = fuse_bpf_backing(inode, struct fuse_getattr_io, + fuse_getattr_initialize, fuse_getattr_backing, + fuse_getattr_finalize, + entry, NULL, 0, 0); + if (fer.ret) + ret = PTR_ERR(fer.result); + else +#endif + ret = fuse_do_getattr(inode, NULL, file); if (ret) return ret; @@ -1810,7 +2121,8 @@ static int fuse_getattr(struct user_namespace *mnt_userns, return -EACCES; } - return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); + return fuse_update_get_attr(inode, NULL, path, stat, request_mask, + flags); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index cc95a1c37644..6149a4b56f3d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -8,6 +8,7 @@ #include "fuse_i.h" +#include #include #include #include @@ -146,7 +147,7 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, if (!err) { ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; - + fuse_passthrough_setup(fc, ff, &outarg); } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); @@ -240,6 +241,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_open_io, + fuse_open_initialize, + fuse_open_backing, + fuse_open_finalize, + inode, file, isdir); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + if (is_wb_truncate || dax_truncate) { inode_lock(inode); fuse_set_nowrite(inode); @@ -305,6 +320,8 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; + fuse_passthrough_release(&ff->passthrough); + fuse_prepare_release(fi, ff, open_flags, opcode); if (ff->flock) { @@ -341,6 +358,17 @@ static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_release_in, + fuse_release_initialize, fuse_release_backing, + fuse_release_finalize, + inode, file); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + /* see fuse_vma_close() for !writeback_cache case */ if (fc->writeback_cache) write_inode_now(inode, 1); @@ -480,6 +508,17 @@ static int fuse_flush(struct file *file, fl_owner_t id) FUSE_ARGS(args); int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(file->f_inode, struct fuse_flush_in, + fuse_flush_initialize, fuse_flush_backing, + fuse_flush_finalize, + file, id); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; @@ -552,6 +591,17 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_fsync_in, + fuse_fsync_initialize, fuse_fsync_backing, + fuse_fsync_finalize, + file, start, end, datasync); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; @@ -1584,7 +1634,23 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_file_read_iter_io, + fuse_file_read_iter_initialize, + 
fuse_file_read_iter_backing, + fuse_file_read_iter_finalize, + iocb, to); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + + if (ff->passthrough.filp) + return fuse_passthrough_read_iter(iocb, to); + else if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_read_iter(iocb, to); else return fuse_direct_read_iter(iocb, to); @@ -1602,7 +1668,23 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_file_write_iter_io, + fuse_file_write_iter_initialize, + fuse_file_write_iter_backing, + fuse_file_write_iter_finalize, + iocb, from); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + + if (ff->passthrough.filp) + return fuse_passthrough_write_iter(iocb, from); + else if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_write_iter(iocb, from); else return fuse_direct_write_iter(iocb, from); @@ -1849,6 +1931,19 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; + /** + * TODO - fully understand why this is necessary + * + * With fuse-bpf, fsstress fails if rename is enabled without this + * + * We are getting writes here on directory inodes, which do not have an + * initialized file list so crash. + * + * The question is why we are getting those writes + */ + if (!S_ISREG(inode->i_mode)) + return 0; + /* * Inode is always written before the last reference is dropped and * hence this should not be reached from reclaim. @@ -2396,6 +2491,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, + .speculative = true, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -2406,6 +2502,15 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) if (FUSE_IS_DAX(file_inode(file))) return fuse_dax_mmap(file, vma); +#ifdef CONFIG_FUSE_BPF + /* TODO - this is simply passthrough, not a proper BPF filter */ + if (ff->backing_file) + return fuse_backing_mmap(file, vma); +#endif + + if (ff->passthrough.filp) + return fuse_passthrough_mmap(file, vma); + if (ff->open_flags & FOPEN_DIRECT_IO) { /* Can't provide the coherency needed for MAP_SHARED */ if (vma->vm_flags & VM_MAYSHARE) @@ -2651,6 +2756,17 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) { loff_t retval; struct inode *inode = file_inode(file); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_lseek_io, + fuse_lseek_initialize, + fuse_lseek_backing, + fuse_lseek_finalize, + file, offset, whence); + if (fer.ret) + return PTR_ERR(fer.result); +#endif switch (whence) { case SEEK_SET: @@ -2941,6 +3057,18 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, (!(mode & FALLOC_FL_KEEP_SIZE) || (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_fallocate_in, + fuse_file_fallocate_initialize, + fuse_file_fallocate_backing, + fuse_file_fallocate_finalize, + file, mode, offset, length); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; @@ -3046,6 +3174,18 @@ static ssize_t __fuse_copy_file_range(struct 
file *file_in, loff_t pos_in, bool is_unstable = (!fc->writeback_cache) && ((pos_out + len) > inode_out->i_size); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(file_in->f_inode, struct fuse_copy_file_range_io, + fuse_copy_file_range_initialize, + fuse_copy_file_range_backing, + fuse_copy_file_range_finalize, + file_in, pos_in, file_out, pos_out, len, flags); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fc->no_copy_file_range) return -EOPNOTSUPP; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c3a87586a15f..75042483b34b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -13,6 +13,9 @@ # define pr_fmt(fmt) "fuse: " fmt #endif +#include +#include +#include #include #include #include @@ -31,6 +34,9 @@ #include #include #include +#include + +#define FUSE_SUPER_MAGIC 0x65735546 /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 @@ -63,11 +69,57 @@ struct fuse_forget_link { struct fuse_forget_link *next; }; +/** FUSE specific dentry data */ +#if BITS_PER_LONG < 64 || defined(CONFIG_FUSE_BPF) +struct fuse_dentry { + union { + u64 time; + struct rcu_head rcu; + }; + struct path backing_path; +}; + +static inline struct fuse_dentry *get_fuse_dentry(const struct dentry *entry) +{ + return entry->d_fsdata; +} +#endif + +#ifdef CONFIG_FUSE_BPF +static inline void get_fuse_backing_path(const struct dentry *d, + struct path *path) +{ + struct fuse_dentry *di = get_fuse_dentry(d); + + if (!di) { + *path = (struct path) {}; + return; + } + + *path = di->backing_path; + path_get(path); +} +#endif + /** FUSE inode */ struct fuse_inode { /** Inode data */ struct inode inode; +#ifdef CONFIG_FUSE_BPF + /** + * Backing inode, if this inode is from a backing file system. + * If this is set, nodeid is 0. + */ + struct inode *backing_inode; + + /** + * bpf_prog, run on all operations to determine whether to pass through + * or handle in place + */ + struct bpf_prog *bpf; +#endif + /** Unique ID, which identifies the inode between userspace * and kernel */ u64 nodeid; @@ -173,6 +225,17 @@ struct fuse_conn; struct fuse_mount; struct fuse_release_args; +/** + * Reference to lower filesystem file for read/write operations handled in + * passthrough mode. + * This struct also tracks the credentials to be used for handling read/write + * operations. 
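+ *
+ * Typical flow: fuse_passthrough_setup() resolves the passthrough handle
+ * the daemon returned in fuse_open_out (registered earlier through
+ * fuse_passthrough_open()) into @filp and captures @cred; the read/write
+ * iter paths then operate directly on @filp (e.g. vfs_iter_read()) while
+ * running under override_creds(@cred).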
+ */ +struct fuse_passthrough { + struct file *filp; + struct cred *cred; +}; + /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ @@ -218,6 +281,17 @@ struct fuse_file { } readdir; + /** Container for data related to the passthrough functionality */ + struct fuse_passthrough passthrough; + +#ifdef CONFIG_FUSE_BPF + /** + * TODO: Reconcile with passthrough file + * backing file when in bpf mode + */ + struct file *backing_file; +#endif + /** RB node to be linked on fuse_conn->polled_files */ struct rb_node polled_node; @@ -249,6 +323,7 @@ struct fuse_page_desc { struct fuse_args { uint64_t nodeid; uint32_t opcode; + uint32_t error_in; unsigned short in_numargs; unsigned short out_numargs; bool force:1; @@ -261,9 +336,12 @@ struct fuse_args { bool page_zeroing:1; bool page_replace:1; bool may_block:1; - struct fuse_in_arg in_args[3]; - struct fuse_arg out_args[2]; + struct fuse_in_arg in_args[FUSE_MAX_IN_ARGS]; + struct fuse_arg out_args[FUSE_MAX_OUT_ARGS]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); + + /* Path used for completing d_canonical_path */ + struct path *canonical_path; }; struct fuse_args_pages { @@ -367,10 +445,8 @@ struct fuse_req { /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; -#if IS_ENABLED(CONFIG_VIRTIO_FS) /** virtio-fs's physically contiguous buffer for in and out args */ void *argbuf; -#endif /** fuse_mount this request belongs to */ struct fuse_mount *fm; @@ -390,19 +466,19 @@ struct fuse_iqueue_ops { /** * Signal that a forget has been queued */ - void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) + void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq, bool sync) __releases(fiq->lock); /** * Signal that an INTERRUPT request has been queued */ - void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) + void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq, bool sync) __releases(fiq->lock); /** * Signal that a request has been queued */ - void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) + void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq, bool sync) __releases(fiq->lock); /** @@ -499,9 +575,12 @@ struct fuse_fs_context { bool no_force_umount:1; bool legacy_opts_show:1; bool dax:1; + bool no_daemon:1; unsigned int max_read; unsigned int blksize; const char *subtype; + struct bpf_prog *root_bpf; + struct file *root_dir; /* DAX device, may be NULL */ struct dax_device *dax_dev; @@ -763,9 +842,15 @@ struct fuse_conn { /* Auto-mount submounts announced by the server */ unsigned int auto_submounts:1; + /** Passthrough mode for read/write IO */ + unsigned int passthrough:1; + /* Propagate syncfs() to server */ unsigned int sync_fs:1; + /** BPF Only, no Daemon running */ + unsigned int no_daemon:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -812,6 +897,12 @@ struct fuse_conn { /* New writepages go into this bucket */ struct fuse_sync_bucket __rcu *curr_bucket; + + /** IDR for passthrough requests */ + struct idr passthrough_req; + + /** Protects passthrough_req */ + spinlock_t passthrough_req_lock; }; /* @@ -931,14 +1022,18 @@ extern const struct dentry_operations fuse_dentry_operations; extern const struct dentry_operations fuse_root_dentry_operations; /** - * Get a filled in inode + * Get a filled-in inode */ +struct inode *fuse_iget_backing(struct super_block *sb, + u64 nodeid, + struct inode *backing_inode); struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr 
*attr, u64 attr_valid, u64 attr_version); int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, - struct fuse_entry_out *outarg, struct inode **inode); + struct fuse_entry_out *outarg, + struct dentry *entry, struct inode **inode); /** * Send FORGET command @@ -975,7 +1070,6 @@ struct fuse_io_args { void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); - /** * Send OPEN or OPENDIR request */ @@ -1045,7 +1139,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void __exit fuse_ctl_cleanup(void); +void fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing @@ -1073,6 +1167,7 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); u64 entry_attr_timeout(struct fuse_entry_out *o); +void fuse_init_dentry_root(struct dentry *root, struct file *backing_dir); void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); /** @@ -1283,4 +1378,647 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); +/* passthrough.c */ +void fuse_copyattr(struct file *dst_file, struct file *src_file); +int fuse_passthrough_open(struct fuse_dev *fud, u32 lower_fd); +int fuse_passthrough_setup(struct fuse_conn *fc, struct fuse_file *ff, + struct fuse_open_out *openarg); +void fuse_passthrough_release(struct fuse_passthrough *passthrough); +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *to); +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *from); +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); + +/* backing.c */ + +/* + * Dummy io passed to fuse_bpf_backing when io operation needs no scratch space + */ +struct fuse_dummy_io { + int unused; +}; + +struct fuse_open_io { + struct fuse_open_in foi; + struct fuse_open_out foo; +}; + +int fuse_open_initialize(struct fuse_bpf_args *fa, struct fuse_open_io *foi, + struct inode *inode, struct file *file, bool isdir); +int fuse_open_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file, bool isdir); +void *fuse_open_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file, bool isdir); + +struct fuse_create_open_io { + struct fuse_create_in fci; + struct fuse_entry_out feo; + struct fuse_open_out foo; +}; + +int fuse_create_open_initialize( + struct fuse_bpf_args *fa, struct fuse_create_open_io *fcoi, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode); +int fuse_create_open_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode); +void *fuse_create_open_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode); + +int fuse_mknod_initialize( + struct fuse_bpf_args *fa, struct fuse_mknod_in *fmi, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev); +int fuse_mknod_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev); +void *fuse_mknod_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev); + +int fuse_mkdir_initialize( + struct fuse_bpf_args *fa, struct fuse_mkdir_in *fmi, + struct inode 
*dir, struct dentry *entry, umode_t mode); +int fuse_mkdir_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode); +void *fuse_mkdir_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode); + +int fuse_rmdir_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *fmi, + struct inode *dir, struct dentry *entry); +int fuse_rmdir_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry); +void *fuse_rmdir_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry); + +int fuse_rename2_initialize(struct fuse_bpf_args *fa, struct fuse_rename2_in *fri, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags); +int fuse_rename2_backing(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags); +void *fuse_rename2_finalize(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags); + +int fuse_rename_initialize(struct fuse_bpf_args *fa, struct fuse_rename_in *fri, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent); +int fuse_rename_backing(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent); +void *fuse_rename_finalize(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent); + +int fuse_unlink_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *fmi, + struct inode *dir, struct dentry *entry); +int fuse_unlink_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry); +void *fuse_unlink_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry); + +int fuse_link_initialize(struct fuse_bpf_args *fa, struct fuse_link_in *fli, + struct dentry *entry, struct inode *dir, + struct dentry *newent); +int fuse_link_backing(struct fuse_bpf_args *fa, struct dentry *entry, + struct inode *dir, struct dentry *newent); +void *fuse_link_finalize(struct fuse_bpf_args *fa, struct dentry *entry, + struct inode *dir, struct dentry *newent); + +int fuse_release_initialize(struct fuse_bpf_args *fa, struct fuse_release_in *fri, + struct inode *inode, struct file *file); +int fuse_releasedir_initialize(struct fuse_bpf_args *fa, + struct fuse_release_in *fri, + struct inode *inode, struct file *file); +int fuse_release_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file); +void *fuse_release_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file); + +int fuse_flush_initialize(struct fuse_bpf_args *fa, struct fuse_flush_in *ffi, + struct file *file, fl_owner_t id); +int fuse_flush_backing(struct fuse_bpf_args *fa, struct file *file, fl_owner_t id); +void *fuse_flush_finalize(struct fuse_bpf_args *fa, + struct file *file, fl_owner_t id); + +struct fuse_lseek_io { + struct fuse_lseek_in fli; + struct fuse_lseek_out flo; +}; + +int fuse_lseek_initialize(struct fuse_bpf_args *fa, struct fuse_lseek_io *fli, + struct file *file, loff_t offset, int whence); +int fuse_lseek_backing(struct fuse_bpf_args *fa, struct file *file, loff_t offset, int whence); +void *fuse_lseek_finalize(struct fuse_bpf_args *fa, struct file *file, loff_t offset, int whence); + +struct fuse_copy_file_range_io { + struct 
fuse_copy_file_range_in fci; + struct fuse_write_out fwo; +}; + +int fuse_copy_file_range_initialize(struct fuse_bpf_args *fa, + struct fuse_copy_file_range_io *fcf, + struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); +int fuse_copy_file_range_backing(struct fuse_bpf_args *fa, + struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); +void *fuse_copy_file_range_finalize(struct fuse_bpf_args *fa, + struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); + +int fuse_fsync_initialize(struct fuse_bpf_args *fa, struct fuse_fsync_in *ffi, + struct file *file, loff_t start, loff_t end, int datasync); +int fuse_fsync_backing(struct fuse_bpf_args *fa, + struct file *file, loff_t start, loff_t end, int datasync); +void *fuse_fsync_finalize(struct fuse_bpf_args *fa, + struct file *file, loff_t start, loff_t end, int datasync); +int fuse_dir_fsync_initialize(struct fuse_bpf_args *fa, struct fuse_fsync_in *ffi, + struct file *file, loff_t start, loff_t end, int datasync); + +struct fuse_getxattr_io { + struct fuse_getxattr_in fgi; + struct fuse_getxattr_out fgo; +}; + +int fuse_getxattr_initialize( + struct fuse_bpf_args *fa, struct fuse_getxattr_io *fgio, + struct dentry *dentry, const char *name, void *value, + size_t size); +int fuse_getxattr_backing( + struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name, void *value, + size_t size); +void *fuse_getxattr_finalize( + struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name, void *value, + size_t size); + +int fuse_listxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_getxattr_io *fgio, + struct dentry *dentry, char *list, size_t size); +int fuse_listxattr_backing(struct fuse_bpf_args *fa, struct dentry *dentry, + char *list, size_t size); +void *fuse_listxattr_finalize(struct fuse_bpf_args *fa, struct dentry *dentry, + char *list, size_t size); + +int fuse_setxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_setxattr_in *fsxi, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +int fuse_setxattr_backing(struct fuse_bpf_args *fa, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags); +void *fuse_setxattr_finalize(struct fuse_bpf_args *fa, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags); + +int fuse_removexattr_initialize(struct fuse_bpf_args *fa, + struct fuse_dummy_io *unused, + struct dentry *dentry, const char *name); +int fuse_removexattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name); +void *fuse_removexattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name); + +struct fuse_read_iter_out { + uint64_t ret; +}; +struct fuse_file_read_iter_io { + struct fuse_read_in fri; + struct fuse_read_iter_out frio; +}; + +int fuse_file_read_iter_initialize( + struct fuse_bpf_args *fa, struct fuse_file_read_iter_io *fri, + struct kiocb *iocb, struct iov_iter *to); +int fuse_file_read_iter_backing(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *to); +void *fuse_file_read_iter_finalize(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *to); + +struct fuse_write_iter_out { + uint64_t ret; +}; +struct fuse_file_write_iter_io { + struct fuse_write_in fwi; + struct fuse_write_out fwo; + struct fuse_write_iter_out fwio; +}; + +int 
fuse_file_write_iter_initialize( + struct fuse_bpf_args *fa, struct fuse_file_write_iter_io *fwio, + struct kiocb *iocb, struct iov_iter *from); +int fuse_file_write_iter_backing(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *from); +void *fuse_file_write_iter_finalize(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *from); + +ssize_t fuse_backing_mmap(struct file *file, struct vm_area_struct *vma); + +int fuse_file_fallocate_initialize(struct fuse_bpf_args *fa, + struct fuse_fallocate_in *ffi, + struct file *file, int mode, loff_t offset, loff_t length); +int fuse_file_fallocate_backing(struct fuse_bpf_args *fa, + struct file *file, int mode, loff_t offset, loff_t length); +void *fuse_file_fallocate_finalize(struct fuse_bpf_args *fa, + struct file *file, int mode, loff_t offset, loff_t length); + +struct fuse_lookup_io { + struct fuse_entry_out feo; + struct fuse_entry_bpf feb; +}; + +int fuse_handle_backing(struct fuse_entry_bpf *feb, struct inode **backing_inode, + struct path *backing_path); +int fuse_handle_bpf_prog(struct fuse_entry_bpf *feb, struct inode *parent, + struct bpf_prog **bpf); + +int fuse_lookup_initialize(struct fuse_bpf_args *fa, struct fuse_lookup_io *feo, + struct inode *dir, struct dentry *entry, unsigned int flags); +int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, + struct dentry *entry, unsigned int flags); +struct dentry *fuse_lookup_finalize(struct fuse_bpf_args *fa, struct inode *dir, + struct dentry *entry, unsigned int flags); +int fuse_revalidate_backing(struct dentry *entry, unsigned int flags); + +int fuse_canonical_path_initialize(struct fuse_bpf_args *fa, + struct fuse_dummy_io *fdi, + const struct path *path, + struct path *canonical_path); +int fuse_canonical_path_backing(struct fuse_bpf_args *fa, const struct path *path, + struct path *canonical_path); +void *fuse_canonical_path_finalize(struct fuse_bpf_args *fa, + const struct path *path, + struct path *canonical_path); + +struct fuse_getattr_io { + struct fuse_getattr_in fgi; + struct fuse_attr_out fao; +}; +int fuse_getattr_initialize(struct fuse_bpf_args *fa, struct fuse_getattr_io *fgio, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags); +int fuse_getattr_backing(struct fuse_bpf_args *fa, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags); +void *fuse_getattr_finalize(struct fuse_bpf_args *fa, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags); + +struct fuse_setattr_io { + struct fuse_setattr_in fsi; + struct fuse_attr_out fao; +}; + +int fuse_setattr_initialize(struct fuse_bpf_args *fa, struct fuse_setattr_io *fsi, + struct dentry *dentry, struct iattr *attr, struct file *file); +int fuse_setattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, struct iattr *attr, struct file *file); +void *fuse_setattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, struct iattr *attr, struct file *file); + +int fuse_statfs_initialize(struct fuse_bpf_args *fa, struct fuse_statfs_out *fso, + struct dentry *dentry, struct kstatfs *buf); +int fuse_statfs_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, struct kstatfs *buf); +void *fuse_statfs_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, struct kstatfs *buf); + +int fuse_get_link_initialize(struct fuse_bpf_args *fa, struct fuse_dummy_io *dummy, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out); +int 
fuse_get_link_backing(struct fuse_bpf_args *fa,
+			  struct inode *inode, struct dentry *dentry,
+			  struct delayed_call *callback, const char **out);
+void *fuse_get_link_finalize(struct fuse_bpf_args *fa,
+			  struct inode *inode, struct dentry *dentry,
+			  struct delayed_call *callback, const char **out);
+
+int fuse_symlink_initialize(
+		struct fuse_bpf_args *fa, struct fuse_dummy_io *unused,
+		struct inode *dir, struct dentry *entry, const char *link, int len);
+int fuse_symlink_backing(
+		struct fuse_bpf_args *fa,
+		struct inode *dir, struct dentry *entry, const char *link, int len);
+void *fuse_symlink_finalize(
+		struct fuse_bpf_args *fa,
+		struct inode *dir, struct dentry *entry, const char *link, int len);
+
+struct fuse_read_io {
+	struct fuse_read_in fri;
+	struct fuse_read_out fro;
+};
+
+int fuse_readdir_initialize(struct fuse_bpf_args *fa, struct fuse_read_io *frio,
+			struct file *file, struct dir_context *ctx,
+			bool *force_again, bool *allow_force, bool is_continued);
+int fuse_readdir_backing(struct fuse_bpf_args *fa,
+			struct file *file, struct dir_context *ctx,
+			bool *force_again, bool *allow_force, bool is_continued);
+void *fuse_readdir_finalize(struct fuse_bpf_args *fa,
+			struct file *file, struct dir_context *ctx,
+			bool *force_again, bool *allow_force, bool is_continued);
+
+int fuse_access_initialize(struct fuse_bpf_args *fa, struct fuse_access_in *fai,
+			struct inode *inode, int mask);
+int fuse_access_backing(struct fuse_bpf_args *fa, struct inode *inode, int mask);
+void *fuse_access_finalize(struct fuse_bpf_args *fa, struct inode *inode, int mask);
+
+/*
+ * FUSE caches dentries and attributes with separate timeout. The
+ * time in jiffies until the dentry/attributes are valid is stored in
+ * dentry->d_fsdata and fuse_inode->i_time respectively.
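+ *
+ * For example, a server-provided timeout of 1s / 500000000ns maps to
+ * get_jiffies_64() + timespec64_to_jiffies(&ts) with ts = {1, 500000000},
+ * i.e. "now + 1.5s" in jiffies (see time_to_jiffies() below); a timeout
+ * of zero maps to 0, which compares as already expired and so forces
+ * revalidation on the next use.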
+ */
+
+/*
+ * Calculate the time in jiffies until a dentry/attributes are valid
+ */
+static inline u64 time_to_jiffies(u64 sec, u32 nsec)
+{
+	if (sec || nsec) {
+		struct timespec64 ts = {
+			sec,
+			min_t(u32, nsec, NSEC_PER_SEC - 1)
+		};
+
+		return get_jiffies_64() + timespec64_to_jiffies(&ts);
+	} else
+		return 0;
+}
+
+static inline u64 attr_timeout(struct fuse_attr_out *o)
+{
+	return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+static inline bool update_mtime(unsigned int ivalid, bool trust_local_mtime)
+{
+	/* Always update if mtime is explicitly set */
+	if (ivalid & ATTR_MTIME_SET)
+		return true;
+
+	/* Or if kernel i_mtime is the official one */
+	if (trust_local_mtime)
+		return true;
+
+	/* If it's an open(O_TRUNC) or an ftruncate(), don't update */
+	if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
+		return false;
+
+	/* In all other cases update */
+	return true;
+}
+
+void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
+		   struct kstat *stat);
+
+static inline void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
+				  struct fuse_setattr_in *arg, bool trust_local_cmtime)
+{
+	unsigned int ivalid = iattr->ia_valid;
+
+	if (ivalid & ATTR_MODE)
+		arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
+	if (ivalid & ATTR_UID)
+		arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
+	if (ivalid & ATTR_GID)
+		arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
+	if (ivalid & ATTR_SIZE)
+		arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
+	if (ivalid & ATTR_ATIME) {
+		arg->valid |= FATTR_ATIME;
+		arg->atime = iattr->ia_atime.tv_sec;
+		arg->atimensec = iattr->ia_atime.tv_nsec;
+		if (!(ivalid & ATTR_ATIME_SET))
+			arg->valid |= FATTR_ATIME_NOW;
+	}
+	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
+		arg->valid |= FATTR_MTIME;
+		arg->mtime = iattr->ia_mtime.tv_sec;
+		arg->mtimensec = iattr->ia_mtime.tv_nsec;
+		if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
+			arg->valid |= FATTR_MTIME_NOW;
+	}
+	if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
+		arg->valid |= FATTR_CTIME;
+		arg->ctime = iattr->ia_ctime.tv_sec;
+		arg->ctimensec = iattr->ia_ctime.tv_nsec;
+	}
+}
+
+static inline int finalize_attr(struct inode *inode, struct fuse_attr_out *outarg,
+				u64 attr_version, struct kstat *stat)
+{
+	int err = 0;
+
+	if (fuse_invalid_attr(&outarg->attr) ||
+	    ((inode->i_mode ^ outarg->attr.mode) & S_IFMT)) {
+		fuse_make_bad(inode);
+		err = -EIO;
+	} else {
+		fuse_change_attributes(inode, &outarg->attr,
+				       attr_timeout(outarg),
+				       attr_version);
+		if (stat)
+			fuse_fillattr(inode, &outarg->attr, stat);
+	}
+	return err;
+}
+
+static inline void convert_statfs_to_fuse(struct fuse_kstatfs *attr, struct kstatfs *stbuf)
+{
+	attr->bsize = stbuf->f_bsize;
+	attr->frsize = stbuf->f_frsize;
+	attr->blocks = stbuf->f_blocks;
+	attr->bfree = stbuf->f_bfree;
+	attr->bavail = stbuf->f_bavail;
+	attr->files = stbuf->f_files;
+	attr->ffree = stbuf->f_ffree;
+	attr->namelen = stbuf->f_namelen;
+	/* fsid is left zero */
+}
+
+static inline void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
+{
+	stbuf->f_type = FUSE_SUPER_MAGIC;
+	stbuf->f_bsize = attr->bsize;
+	stbuf->f_frsize = attr->frsize;
+	stbuf->f_blocks = attr->blocks;
+	stbuf->f_bfree = attr->bfree;
+	stbuf->f_bavail = attr->bavail;
+	stbuf->f_files = attr->files;
+	stbuf->f_ffree = attr->ffree;
+	stbuf->f_namelen = attr->namelen;
+	/* fsid is left zero */
+}
+
+#ifdef CONFIG_FUSE_BPF
+struct fuse_err_ret {
+	void
*result; + bool ret; +}; + +int __init fuse_bpf_init(void); +void __exit fuse_bpf_cleanup(void); + +ssize_t fuse_bpf_simple_request(struct fuse_mount *fm, struct fuse_bpf_args *args); + +/* + * expression statement to wrap the backing filter logic + * struct inode *inode: inode with bpf and backing inode + * typedef io: (typically complex) type whose components fuse_args can point to. + * An instance of this type is created locally and passed to initialize + * void initialize(struct fuse_bpf_args *fa, io *in_out, args...): function that sets + * up fa and io based on args + * int backing(struct fuse_bpf_args *fa, args...): function that actually performs + * the backing io operation + * void *finalize(struct fuse_bpf_args *, args...): function that performs any final + * work needed to commit the backing io + */ +#define fuse_bpf_backing(inode, io, initialize, backing, finalize, \ + args...) \ +({ \ + struct fuse_err_ret fer = {0}; \ + int ext_flags; \ + struct fuse_inode *fuse_inode = get_fuse_inode(inode); \ + struct fuse_mount *fm = get_fuse_mount(inode); \ + io feo = {0}; \ + struct fuse_bpf_args fa = {0}, fa_backup = {0}; \ + bool locked; \ + ssize_t res; \ + void *err; \ + int i; \ + bool initialized = false; \ + \ + do { \ + if (!fuse_inode || !fuse_inode->backing_inode) \ + break; \ + \ + err = ERR_PTR(initialize(&fa, &feo, args)); \ + if (err) { \ + fer = (struct fuse_err_ret) { \ + err, \ + true, \ + }; \ + break; \ + } \ + initialized = true; \ + \ + fa_backup = fa; \ + fa.opcode |= FUSE_PREFILTER; \ + for (i = 0; i < fa.in_numargs; ++i) \ + fa.out_args[i] = (struct fuse_bpf_arg) { \ + .size = fa.in_args[i].size, \ + .value = (void *)fa.in_args[i].value, \ + }; \ + fa.out_numargs = fa.in_numargs; \ + \ + ext_flags = fuse_inode->bpf ? \ + bpf_prog_run(fuse_inode->bpf, &fa) : \ + FUSE_BPF_BACKING; \ + if (ext_flags < 0) { \ + fer = (struct fuse_err_ret) { \ + ERR_PTR(ext_flags), \ + true, \ + }; \ + break; \ + } \ + \ + if (ext_flags & FUSE_BPF_USER_FILTER) { \ + locked = fuse_lock_inode(inode); \ + res = fuse_bpf_simple_request(fm, &fa); \ + fuse_unlock_inode(inode, locked); \ + if (res < 0) { \ + fer = (struct fuse_err_ret) { \ + ERR_PTR(res), \ + true, \ + }; \ + break; \ + } \ + } \ + \ + if (!(ext_flags & FUSE_BPF_BACKING)) \ + break; \ + \ + fa.opcode &= ~FUSE_PREFILTER; \ + for (i = 0; i < fa.in_numargs; ++i) \ + fa.in_args[i] = (struct fuse_bpf_in_arg) { \ + .size = fa.out_args[i].size, \ + .value = fa.out_args[i].value, \ + }; \ + for (i = 0; i < fa_backup.out_numargs; ++i) \ + fa.out_args[i] = (struct fuse_bpf_arg) { \ + .size = fa_backup.out_args[i].size, \ + .value = fa_backup.out_args[i].value, \ + }; \ + fa.out_numargs = fa_backup.out_numargs; \ + \ + fer = (struct fuse_err_ret) { \ + ERR_PTR(backing(&fa, args)), \ + true, \ + }; \ + if (IS_ERR(fer.result)) \ + fa.error_in = PTR_ERR(fer.result); \ + if (!(ext_flags & FUSE_BPF_POST_FILTER)) \ + break; \ + \ + fa.opcode |= FUSE_POSTFILTER; \ + for (i = 0; i < fa.out_numargs; ++i) \ + fa.in_args[fa.in_numargs++] = \ + (struct fuse_bpf_in_arg) { \ + .size = fa.out_args[i].size, \ + .value = fa.out_args[i].value, \ + }; \ + ext_flags = bpf_prog_run(fuse_inode->bpf, &fa); \ + if (ext_flags < 0) { \ + fer = (struct fuse_err_ret) { \ + ERR_PTR(ext_flags), \ + true, \ + }; \ + break; \ + } \ + if (!(ext_flags & FUSE_BPF_USER_FILTER)) \ + break; \ + \ + fa.out_args[0].size = fa_backup.out_args[0].size; \ + fa.out_args[1].size = fa_backup.out_args[1].size; \ + fa.out_numargs = fa_backup.out_numargs; \ + locked = 
fuse_lock_inode(inode); \ + res = fuse_bpf_simple_request(fm, &fa); \ + fuse_unlock_inode(inode, locked); \ + if (res < 0) { \ + fer.result = ERR_PTR(res); \ + break; \ + } \ + } while (false); \ + \ + if (initialized && fer.ret) { \ + err = finalize(&fa, args); \ + if (err) \ + fer.result = err; \ + } \ + \ + fer; \ +}) + +#endif /* CONFIG_FUSE_BPF */ + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 50365143f50e..1278988518ab 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -27,6 +27,7 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; @@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh, "Global limit for the maximum congestion threshold an " "unprivileged user can set"); -#define FUSE_SUPER_MAGIC 0x65735546 - #define FUSE_DEFAULT_BLKSIZE 512 /** Maximum number of outstanding background requests */ @@ -79,6 +78,10 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->i_time = 0; fi->inval_mask = 0; +#ifdef CONFIG_FUSE_BPF + fi->backing_inode = NULL; + fi->bpf = NULL; +#endif fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; @@ -121,6 +124,12 @@ static void fuse_evict_inode(struct inode *inode) /* Will write inode on close/munmap and in all other dirtiers */ WARN_ON(inode->i_state & I_DIRTY_INODE); +#ifdef CONFIG_FUSE_BPF + iput(fi->backing_inode); + if (fi->bpf) + bpf_prog_put(fi->bpf); + fi->bpf = NULL; +#endif truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { @@ -163,6 +172,28 @@ static ino_t fuse_squash_ino(u64 ino64) return ino; } +static void fuse_fill_attr_from_inode(struct fuse_attr *attr, + const struct inode *inode) +{ + *attr = (struct fuse_attr){ + .ino = inode->i_ino, + .size = inode->i_size, + .blocks = inode->i_blocks, + .atime = inode->i_atime.tv_sec, + .mtime = inode->i_mtime.tv_sec, + .ctime = inode->i_ctime.tv_sec, + .atimensec = inode->i_atime.tv_nsec, + .mtimensec = inode->i_mtime.tv_nsec, + .ctimensec = inode->i_ctime.tv_nsec, + .mode = inode->i_mode, + .nlink = inode->i_nlink, + .uid = inode->i_uid.val, + .gid = inode->i_gid.val, + .rdev = inode->i_rdev, + .blksize = 1u << inode->i_blkbits, + }; +} + void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid) { @@ -297,28 +328,104 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { fuse_init_common(inode); - init_special_inode(inode, inode->i_mode, - new_decode_dev(attr->rdev)); + init_special_inode(inode, inode->i_mode, attr->rdev); } else BUG(); } +struct fuse_inode_identifier { + u64 nodeid; + struct inode *backing_inode; +}; + static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { - u64 nodeid = *(u64 *) _nodeidp; - if (get_node_id(inode) == nodeid) - return 1; - else - return 0; + struct fuse_inode_identifier *fii = + (struct fuse_inode_identifier *) _nodeidp; + struct fuse_inode *fi = get_fuse_inode(inode); + + return fii->nodeid == fi->nodeid; +} + +static int fuse_inode_backing_eq(struct inode *inode, void *_nodeidp) +{ + struct fuse_inode_identifier *fii = + (struct fuse_inode_identifier *) _nodeidp; + struct fuse_inode *fi = get_fuse_inode(inode); + + return fii->nodeid == fi->nodeid +#ifdef CONFIG_FUSE_BPF + && fii->backing_inode == 
fi->backing_inode +#endif + ; } static int fuse_inode_set(struct inode *inode, void *_nodeidp) { - u64 nodeid = *(u64 *) _nodeidp; - get_fuse_inode(inode)->nodeid = nodeid; + struct fuse_inode_identifier *fii = + (struct fuse_inode_identifier *) _nodeidp; + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->nodeid = fii->nodeid; + return 0; } +static int fuse_inode_backing_set(struct inode *inode, void *_nodeidp) +{ + struct fuse_inode_identifier *fii = + (struct fuse_inode_identifier *) _nodeidp; + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->nodeid = fii->nodeid; +#ifdef CONFIG_FUSE_BPF + fi->backing_inode = fii->backing_inode; + if (fi->backing_inode) + ihold(fi->backing_inode); +#endif + + return 0; +} + +struct inode *fuse_iget_backing(struct super_block *sb, u64 nodeid, + struct inode *backing_inode) +{ + struct inode *inode; + struct fuse_inode *fi; + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_inode_identifier fii = { + .nodeid = nodeid, + .backing_inode = backing_inode, + }; + struct fuse_attr attr; + unsigned long hash = (unsigned long) backing_inode; + + if (nodeid) + hash = nodeid; + + fuse_fill_attr_from_inode(&attr, backing_inode); + inode = iget5_locked(sb, hash, fuse_inode_backing_eq, + fuse_inode_backing_set, &fii); + if (!inode) + return NULL; + + if ((inode->i_state & I_NEW)) { + inode->i_flags |= S_NOATIME; + if (!fc->writeback_cache) + inode->i_flags |= S_NOCMTIME; + fuse_init_common(inode); + unlock_new_inode(inode); + } + + fi = get_fuse_inode(inode); + fuse_init_inode(inode, &attr); + spin_lock(&fi->lock); + fi->nlookup++; + spin_unlock(&fi->lock); + + return inode; +} + struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version) @@ -326,6 +433,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, struct inode *inode; struct fuse_inode *fi; struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_inode_identifier fii = { + .nodeid = nodeid, + }; /* * Auto mount points get their node id from the submount root, which is @@ -347,7 +457,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, } retry: - inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); + inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &fii); if (!inode) return NULL; @@ -379,13 +489,16 @@ struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, { struct fuse_mount *fm_iter; struct inode *inode; + struct fuse_inode_identifier fii = { + .nodeid = nodeid, + }; WARN_ON(!rwsem_is_locked(&fc->killsb)); list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { if (!fm_iter->sb) continue; - inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); + inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &fii); if (inode) { if (fm) *fm = fm_iter; @@ -466,20 +579,6 @@ static void fuse_send_destroy(struct fuse_mount *fm) } } -static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) -{ - stbuf->f_type = FUSE_SUPER_MAGIC; - stbuf->f_bsize = attr->bsize; - stbuf->f_frsize = attr->frsize; - stbuf->f_blocks = attr->blocks; - stbuf->f_bfree = attr->bfree; - stbuf->f_bavail = attr->bavail; - stbuf->f_files = attr->files; - stbuf->f_ffree = attr->ffree; - stbuf->f_namelen = attr->namelen; - /* fsid is left zero */ -} - static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -487,12 +586,24 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) FUSE_ARGS(args); 
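+	/*
+	 * args/outarg below serve only the daemon round trip; with
+	 * CONFIG_FUSE_BPF a backing filesystem may answer statfs first
+	 * and return before they are touched.
+	 */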
struct fuse_statfs_out outarg; int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; +#endif if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } +#ifdef CONFIG_FUSE_BPF + fer = fuse_bpf_backing(dentry->d_inode, struct fuse_statfs_out, + fuse_statfs_initialize, fuse_statfs_backing, + fuse_statfs_finalize, + dentry, buf); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + memset(&outarg, 0, sizeof(outarg)); args.in_numargs = 0; args.opcode = FUSE_STATFS; @@ -609,6 +720,9 @@ enum { OPT_ALLOW_OTHER, OPT_MAX_READ, OPT_BLKSIZE, + OPT_ROOT_BPF, + OPT_ROOT_DIR, + OPT_NO_DAEMON, OPT_ERR }; @@ -623,6 +737,9 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { fsparam_u32 ("max_read", OPT_MAX_READ), fsparam_u32 ("blksize", OPT_BLKSIZE), fsparam_string ("subtype", OPT_SUBTYPE), + fsparam_u32 ("root_bpf", OPT_ROOT_BPF), + fsparam_u32 ("root_dir", OPT_ROOT_DIR), + fsparam_flag ("no_daemon", OPT_NO_DAEMON), {} }; @@ -706,6 +823,26 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) ctx->blksize = result.uint_32; break; + case OPT_ROOT_BPF: + ctx->root_bpf = bpf_prog_get_type_dev(result.uint_32, + BPF_PROG_TYPE_FUSE, false); + if (IS_ERR(ctx->root_bpf)) { + ctx->root_bpf = NULL; + return invalfc(fsc, "Unable to open bpf program"); + } + break; + + case OPT_ROOT_DIR: + ctx->root_dir = fget(result.uint_32); + if (!ctx->root_dir) + return invalfc(fsc, "Unable to open root directory"); + break; + + case OPT_NO_DAEMON: + ctx->no_daemon = true; + ctx->fd_present = true; + break; + default: return -EINVAL; } @@ -718,6 +855,10 @@ static void fuse_free_fsc(struct fs_context *fsc) struct fuse_fs_context *ctx = fsc->fs_private; if (ctx) { + if (ctx->root_dir) + fput(ctx->root_dir); + if (ctx->root_bpf) + bpf_prog_put(ctx->root_bpf); kfree(ctx->subtype); kfree(ctx); } @@ -783,6 +924,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); spin_lock_init(&fc->bg_lock); + spin_lock_init(&fc->passthrough_req_lock); init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); @@ -791,6 +933,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); INIT_LIST_HEAD(&fc->devices); + idr_init(&fc->passthrough_req); atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; @@ -841,15 +984,34 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, + unsigned int mode, + struct bpf_prog *root_bpf, + struct file *backing_fd) { struct fuse_attr attr; - memset(&attr, 0, sizeof(attr)); + struct inode *inode; + memset(&attr, 0, sizeof(attr)); attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, 1, 0, &attr, 0, 0); + inode = fuse_iget(sb, 1, 0, &attr, 0, 0); + if (!inode) + return NULL; + +#ifdef CONFIG_FUSE_BPF + get_fuse_inode(inode)->bpf = root_bpf; + if (root_bpf) + bpf_prog_inc(root_bpf); + + if (backing_fd) { + get_fuse_inode(inode)->backing_inode = backing_fd->f_inode; + ihold(backing_fd->f_inode); + } +#endif + + return inode; } struct fuse_inode_handle { @@ -864,11 +1026,14 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, struct inode *inode; struct dentry *entry; int 
err = -ESTALE; + struct fuse_inode_identifier fii = { + .nodeid = handle->nodeid, + }; if (handle->nodeid == 0) goto out_err; - inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &fii); if (!inode) { struct fuse_entry_out outarg; const struct qstr name = QSTR_INIT(".", 1); @@ -877,7 +1042,7 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, goto out_err; err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, - &inode); + NULL, &inode); if (err && err != -ENOENT) goto out_err; if (err || !inode) { @@ -971,13 +1136,14 @@ static struct dentry *fuse_get_parent(struct dentry *child) struct inode *inode; struct dentry *parent; struct fuse_entry_out outarg; + const struct qstr name = QSTR_INIT("..", 2); int err; if (!fc->export_support) return ERR_PTR(-ESTALE); err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), - &dotdot_name, &outarg, &inode); + &name, &outarg, NULL, &inode); if (err) { if (err == -ENOENT) return ERR_PTR(-ESTALE); @@ -1150,6 +1316,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } + if (arg->flags & FUSE_PASSTHROUGH) { + fc->passthrough = 1; + /* Prevent further stacking */ + fm->sb->s_stack_depth = + FILESYSTEM_MAX_STACK_DEPTH; + } if (arg->flags & FUSE_SETXATTR_EXT) fc->setxattr_ext = 1; } else { @@ -1195,6 +1367,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | + FUSE_PASSTHROUGH | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) @@ -1218,14 +1391,26 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.nocreds = true; ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) + if (unlikely(fm->fc->no_daemon) || fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) process_init_reply(fm, &ia->args, -ENOTCONN); } EXPORT_SYMBOL_GPL(fuse_send_init); +static int free_fuse_passthrough(int id, void *p, void *data) +{ + struct fuse_passthrough *passthrough = (struct fuse_passthrough *)p; + + fuse_passthrough_release(passthrough); + kfree(p); + + return 0; +} + void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); + idr_for_each(&fc->passthrough_req, free_fuse_passthrough, NULL); + idr_destroy(&fc->passthrough_req); kfree_rcu(fc, rcu); } EXPORT_SYMBOL_GPL(fuse_free_conn); @@ -1330,28 +1515,6 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static void fuse_fill_attr_from_inode(struct fuse_attr *attr, - const struct fuse_inode *fi) -{ - *attr = (struct fuse_attr){ - .ino = fi->inode.i_ino, - .size = fi->inode.i_size, - .blocks = fi->inode.i_blocks, - .atime = fi->inode.i_atime.tv_sec, - .mtime = fi->inode.i_mtime.tv_sec, - .ctime = fi->inode.i_ctime.tv_sec, - .atimensec = fi->inode.i_atime.tv_nsec, - .mtimensec = fi->inode.i_mtime.tv_nsec, - .ctimensec = fi->inode.i_ctime.tv_nsec, - .mode = fi->inode.i_mode, - .nlink = fi->inode.i_nlink, - .uid = fi->inode.i_uid.val, - .gid = fi->inode.i_gid.val, - .rdev = fi->inode.i_rdev, - .blksize = 1u << fi->inode.i_blkbits, - }; -} - static void fuse_sb_defaults(struct super_block *sb) { sb->s_magic = FUSE_SUPER_MAGIC; @@ -1395,7 +1558,7 @@ static int fuse_fill_super_submount(struct super_block *sb, if (parent_sb->s_subtype && !sb->s_subtype) 
return -ENOMEM; - fuse_fill_attr_from_inode(&root_attr, parent_fi); + fuse_fill_attr_from_inode(&root_attr, &parent_fi->inode); root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); /* * This inode is just a duplicate, so it is not looked up and @@ -1522,13 +1685,16 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->destroy = ctx->destroy; fc->no_control = ctx->no_control; fc->no_force_umount = ctx->no_force_umount; + fc->no_daemon = ctx->no_daemon; err = -ENOMEM; - root = fuse_get_root_inode(sb, ctx->rootmode); + root = fuse_get_root_inode(sb, ctx->rootmode, ctx->root_bpf, + ctx->root_dir); sb->s_d_op = &fuse_root_dentry_operations; root_dentry = d_make_root(root); if (!root_dentry) goto err_dev_free; + fuse_init_dentry_root(root_dentry, ctx->root_dir); /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = &fuse_dentry_operations; @@ -1567,18 +1733,20 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) struct fuse_fs_context *ctx = fsc->fs_private; int err; - if (!ctx->file || !ctx->rootmode_present || - !ctx->user_id_present || !ctx->group_id_present) - return -EINVAL; + if (!ctx->no_daemon) { + if (!ctx->file || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; - /* - * Require mount to happen from the same user namespace which - * opened /dev/fuse to prevent potential attacks. - */ - if ((ctx->file->f_op != &fuse_dev_operations) || - (ctx->file->f_cred->user_ns != sb->s_user_ns)) - return -EINVAL; - ctx->fudptr = &ctx->file->private_data; + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if ((ctx->file->f_op != &fuse_dev_operations) || + (ctx->file->f_cred->user_ns != sb->s_user_ns)) + return -EINVAL; + ctx->fudptr = &ctx->file->private_data; + } err = fuse_fill_super_common(sb, ctx); if (err) @@ -1859,6 +2027,57 @@ static void fuse_fs_cleanup(void) static struct kobject *fuse_kobj; +static ssize_t fuse_bpf_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) +{ + return sysfs_emit(buff, "supported\n"); +} + +static struct kobj_attribute fuse_bpf_attr = + __ATTR_RO(fuse_bpf); + +static struct attribute *bpf_features[] = { + &fuse_bpf_attr.attr, + NULL, +}; + +static const struct attribute_group bpf_features_group = { + .name = "features", + .attrs = bpf_features, +}; + +/* + * TODO Remove this once fuse-bpf is upstreamed + * + * bpf_prog_type_fuse exports the bpf_prog_type_fuse 'constant', which cannot be + * constant until the code is upstreamed + */ +static ssize_t bpf_prog_type_fuse_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) +{ + return sysfs_emit(buff, "%d\n", BPF_PROG_TYPE_FUSE); +} + +static struct kobj_attribute bpf_prog_type_fuse_attr = + __ATTR_RO(bpf_prog_type_fuse); + +static struct attribute *bpf_attributes[] = { + &bpf_prog_type_fuse_attr.attr, + NULL, +}; + +static const struct attribute_group bpf_attr_group = { + .attrs = bpf_attributes, +}; + +static const struct attribute_group *attribute_groups[] = { + &bpf_features_group, + &bpf_attr_group, + NULL +}; + +/* TODO remove to here */ + static int fuse_sysfs_init(void) { int err; @@ -1873,8 +2092,15 @@ static int fuse_sysfs_init(void) if (err) goto out_fuse_unregister; + /* TODO Remove when BPF_PROG_TYPE_FUSE is upstreamed */ + err = sysfs_create_groups(fuse_kobj, attribute_groups); + if (err) + goto out_fuse_remove_mount_point; + return 0; + out_fuse_remove_mount_point: + 
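
The two sysfs attribute groups registered above are both hung off fuse_kobj: the named "features" group yields features/fuse_bpf, and the unnamed group places bpf_prog_type_fuse directly under the fuse kobject. Assuming fuse_kobj sits at the usual /sys/fs/fuse, userspace can probe for fuse-bpf support and discover the (temporarily non-constant) program type id like this; the sketch adds nothing beyond what the show callbacks above emit.

/* Hedged sketch: read the fuse-bpf markers exposed by the hunk above. */
#include <stdio.h>

static int read_first_line(const char *path, char *buf, int n)
{
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (!fgets(buf, n, f)) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return 0;
}

int main(void)
{
	char val[32];

	if (!read_first_line("/sys/fs/fuse/features/fuse_bpf", val, sizeof(val)))
		printf("fuse_bpf: %s", val);            /* prints "supported" */
	if (!read_first_line("/sys/fs/fuse/bpf_prog_type_fuse", val, sizeof(val)))
		printf("BPF_PROG_TYPE_FUSE = %s", val); /* id to pass to bpf(2) */
	return 0;
}
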
sysfs_remove_mount_point(fuse_kobj, "connections"); out_fuse_unregister: kobject_put(fuse_kobj); out_err: @@ -1883,6 +2109,7 @@ static int fuse_sysfs_init(void) static void fuse_sysfs_cleanup(void) { + sysfs_remove_groups(fuse_kobj, attribute_groups); sysfs_remove_mount_point(fuse_kobj, "connections"); kobject_put(fuse_kobj); } @@ -1911,11 +2138,21 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; +#ifdef CONFIG_FUSE_BPF + res = fuse_bpf_init(); + if (res) + goto err_ctl_cleanup; +#endif + sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); return 0; +#ifdef CONFIG_FUSE_BPF + err_ctl_cleanup: + fuse_ctl_cleanup(); +#endif err_sysfs_cleanup: fuse_sysfs_cleanup(); err_dev_cleanup: @@ -1933,6 +2170,9 @@ static void __exit fuse_exit(void) fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); +#ifdef CONFIG_FUSE_BPF + fuse_bpf_cleanup(); +#endif fuse_dev_cleanup(); } diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c new file mode 100644 index 000000000000..6031e72f3b5a --- /dev/null +++ b/fs/fuse/passthrough.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "fuse_i.h" + +#include +#include +#include + +#define PASSTHROUGH_IOCB_MASK \ + (IOCB_APPEND | IOCB_DSYNC | IOCB_HIPRI | IOCB_NOWAIT | IOCB_SYNC) + +struct fuse_aio_req { + struct kiocb iocb; + struct kiocb *iocb_fuse; +}; + +static void fuse_file_accessed(struct file *dst_file, struct file *src_file) +{ + struct inode *dst_inode; + struct inode *src_inode; + + if (dst_file->f_flags & O_NOATIME) + return; + + dst_inode = file_inode(dst_file); + src_inode = file_inode(src_file); + + if ((!timespec64_equal(&dst_inode->i_mtime, &src_inode->i_mtime) || + !timespec64_equal(&dst_inode->i_ctime, &src_inode->i_ctime))) { + dst_inode->i_mtime = src_inode->i_mtime; + dst_inode->i_ctime = src_inode->i_ctime; + } + + touch_atime(&dst_file->f_path); +} + +void fuse_copyattr(struct file *dst_file, struct file *src_file) +{ + struct inode *dst = file_inode(dst_file); + struct inode *src = file_inode(src_file); + + dst->i_atime = src->i_atime; + dst->i_mtime = src->i_mtime; + dst->i_ctime = src->i_ctime; + i_size_write(dst, i_size_read(src)); +} + +static void fuse_aio_cleanup_handler(struct fuse_aio_req *aio_req) +{ + struct kiocb *iocb = &aio_req->iocb; + struct kiocb *iocb_fuse = aio_req->iocb_fuse; + + if (iocb->ki_flags & IOCB_WRITE) { + __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, + SB_FREEZE_WRITE); + file_end_write(iocb->ki_filp); + fuse_copyattr(iocb_fuse->ki_filp, iocb->ki_filp); + } + + iocb_fuse->ki_pos = iocb->ki_pos; + kfree(aio_req); +} + +static void fuse_aio_rw_complete(struct kiocb *iocb, long res, long res2) +{ + struct fuse_aio_req *aio_req = + container_of(iocb, struct fuse_aio_req, iocb); + struct kiocb *iocb_fuse = aio_req->iocb_fuse; + + fuse_aio_cleanup_handler(aio_req); + iocb_fuse->ki_complete(iocb_fuse, res, res2); +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb_fuse, + struct iov_iter *iter) +{ + ssize_t ret; + const struct cred *old_cred; + struct file *fuse_filp = iocb_fuse->ki_filp; + struct fuse_file *ff = fuse_filp->private_data; + struct file *passthrough_filp = ff->passthrough.filp; + + if (!iov_iter_count(iter)) + return 0; + + old_cred = override_creds(ff->passthrough.cred); + if (is_sync_kiocb(iocb_fuse)) { + ret = vfs_iter_read(passthrough_filp, iter, &iocb_fuse->ki_pos, + iocb_to_rw_flags(iocb_fuse->ki_flags, + PASSTHROUGH_IOCB_MASK)); + } else { + struct fuse_aio_req *aio_req; + + aio_req = 
kmalloc(sizeof(struct fuse_aio_req), GFP_KERNEL); + if (!aio_req) { + ret = -ENOMEM; + goto out; + } + + aio_req->iocb_fuse = iocb_fuse; + kiocb_clone(&aio_req->iocb, iocb_fuse, passthrough_filp); + aio_req->iocb.ki_complete = fuse_aio_rw_complete; + ret = call_read_iter(passthrough_filp, &aio_req->iocb, iter); + if (ret != -EIOCBQUEUED) + fuse_aio_cleanup_handler(aio_req); + } +out: + revert_creds(old_cred); + + fuse_file_accessed(fuse_filp, passthrough_filp); + + return ret; +} + +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse, + struct iov_iter *iter) +{ + ssize_t ret; + const struct cred *old_cred; + struct file *fuse_filp = iocb_fuse->ki_filp; + struct fuse_file *ff = fuse_filp->private_data; + struct inode *fuse_inode = file_inode(fuse_filp); + struct file *passthrough_filp = ff->passthrough.filp; + struct inode *passthrough_inode = file_inode(passthrough_filp); + + if (!iov_iter_count(iter)) + return 0; + + inode_lock(fuse_inode); + + fuse_copyattr(fuse_filp, passthrough_filp); + + old_cred = override_creds(ff->passthrough.cred); + if (is_sync_kiocb(iocb_fuse)) { + file_start_write(passthrough_filp); + ret = vfs_iter_write(passthrough_filp, iter, &iocb_fuse->ki_pos, + iocb_to_rw_flags(iocb_fuse->ki_flags, + PASSTHROUGH_IOCB_MASK)); + file_end_write(passthrough_filp); + if (ret > 0) + fuse_copyattr(fuse_filp, passthrough_filp); + } else { + struct fuse_aio_req *aio_req; + + aio_req = kmalloc(sizeof(struct fuse_aio_req), GFP_KERNEL); + if (!aio_req) { + ret = -ENOMEM; + goto out; + } + + file_start_write(passthrough_filp); + __sb_writers_release(passthrough_inode->i_sb, SB_FREEZE_WRITE); + + aio_req->iocb_fuse = iocb_fuse; + kiocb_clone(&aio_req->iocb, iocb_fuse, passthrough_filp); + aio_req->iocb.ki_complete = fuse_aio_rw_complete; + ret = call_write_iter(passthrough_filp, &aio_req->iocb, iter); + if (ret != -EIOCBQUEUED) + fuse_aio_cleanup_handler(aio_req); + } +out: + revert_creds(old_cred); + inode_unlock(fuse_inode); + + return ret; +} + +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + const struct cred *old_cred; + struct fuse_file *ff = file->private_data; + struct file *passthrough_filp = ff->passthrough.filp; + + if (!passthrough_filp->f_op->mmap) + return -ENODEV; + + if (WARN_ON(file != vma->vm_file)) + return -EIO; + + vma->vm_file = get_file(passthrough_filp); + + old_cred = override_creds(ff->passthrough.cred); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ret) + fput(passthrough_filp); + else + fput(file); + + fuse_file_accessed(file, passthrough_filp); + + return ret; +} + +int fuse_passthrough_open(struct fuse_dev *fud, u32 lower_fd) +{ + int res; + struct file *passthrough_filp; + struct fuse_conn *fc = fud->fc; + struct inode *passthrough_inode; + struct super_block *passthrough_sb; + struct fuse_passthrough *passthrough; + + if (!fc->passthrough) + return -EPERM; + + passthrough_filp = fget(lower_fd); + if (!passthrough_filp) { + pr_err("FUSE: invalid file descriptor for passthrough.\n"); + return -EBADF; + } + + if (!passthrough_filp->f_op->read_iter || + !passthrough_filp->f_op->write_iter) { + pr_err("FUSE: passthrough file misses file operations.\n"); + res = -EBADF; + goto err_free_file; + } + + passthrough_inode = file_inode(passthrough_filp); + passthrough_sb = passthrough_inode->i_sb; + if (passthrough_sb->s_stack_depth >= FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("FUSE: fs stacking depth exceeded for passthrough\n"); + res = -EINVAL; + goto err_free_file; + } + + passthrough = 
kmalloc(sizeof(struct fuse_passthrough), GFP_KERNEL); + if (!passthrough) { + res = -ENOMEM; + goto err_free_file; + } + + passthrough->filp = passthrough_filp; + passthrough->cred = prepare_creds(); + + idr_preload(GFP_KERNEL); + spin_lock(&fc->passthrough_req_lock); + res = idr_alloc(&fc->passthrough_req, passthrough, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->passthrough_req_lock); + idr_preload_end(); + + if (res > 0) + return res; + + fuse_passthrough_release(passthrough); + kfree(passthrough); + +err_free_file: + fput(passthrough_filp); + + return res; +} + +int fuse_passthrough_setup(struct fuse_conn *fc, struct fuse_file *ff, + struct fuse_open_out *openarg) +{ + struct fuse_passthrough *passthrough; + int passthrough_fh = openarg->passthrough_fh; + + if (!fc->passthrough) + return -EPERM; + + /* Default case, passthrough is not requested */ + if (passthrough_fh <= 0) + return -EINVAL; + + spin_lock(&fc->passthrough_req_lock); + passthrough = idr_remove(&fc->passthrough_req, passthrough_fh); + spin_unlock(&fc->passthrough_req_lock); + + if (!passthrough) + return -EINVAL; + + ff->passthrough = *passthrough; + kfree(passthrough); + + return 0; +} + +void fuse_passthrough_release(struct fuse_passthrough *passthrough) +{ + if (passthrough->filp) { + fput(passthrough->filp); + passthrough->filp = NULL; + } + if (passthrough->cred) { + put_cred(passthrough->cred); + passthrough->cred = NULL; + } +} diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index d5294e663df5..05166864e1c2 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -20,6 +20,8 @@ static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) if (!fc->do_readdirplus) return false; + if (fi->nodeid == 0) + return false; if (!fc->readdirplus_auto) return true; if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) @@ -579,6 +581,26 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + bool allow_force; + bool force_again = false; + bool is_continued = false; + +again: + fer = fuse_bpf_backing(inode, struct fuse_read_io, + fuse_readdir_initialize, fuse_readdir_backing, + fuse_readdir_finalize, + file, ctx, &force_again, &allow_force, is_continued); + if (force_again && !IS_ERR(fer.result)) { + is_continued = true; + goto again; + } + + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 94fc874f5de7..9901b3c5e7ff 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -972,7 +972,7 @@ static struct virtio_driver virtio_fs_driver = { #endif }; -static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq, bool sync) __releases(fiq->lock) { struct fuse_forget_link *link; @@ -1007,7 +1007,8 @@ __releases(fiq->lock) kfree(link); } -static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq, + bool sync) __releases(fiq->lock) { /* @@ -1222,7 +1223,8 @@ out: return ret; } -static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq, + bool sync) __releases(fiq->lock) { unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 61dfaf7b7d20..6c95d0d9a82f 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c 
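
The passthrough lifecycle above has three steps: fuse_passthrough_open() pins the lower file and allocates an idr slot in fc->passthrough_req, the daemon echoes that slot id back as passthrough_fh in its FUSE_OPEN reply, and fuse_passthrough_setup() removes the slot and attaches the lower file to the fuse_file. A daemon-side sketch follows; the FUSE_DEV_IOC_PASSTHROUGH_OPEN ioctl that reaches fuse_passthrough_open(), and the passthrough_fh field of fuse_open_out, belong to other hunks of this series and are assumptions here.

/* Hedged daemon-side sketch: register a lower fd for passthrough and hand
 * the resulting handle back in the FUSE_OPEN reply.  The ioctl name and
 * the passthrough_fh field are assumed from the rest of this series. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fuse.h> /* assumed to carry the additions from this series */

static int reply_open_with_passthrough(int fuse_dev_fd, const char *lower_path,
					struct fuse_open_out *out)
{
	__u32 lower_fd = open(lower_path, O_RDWR);
	int handle;

	if ((int)lower_fd < 0)
		return -1;

	/* kernel side: fget(lower_fd) + idr_alloc() into fc->passthrough_req;
	 * a return value > 0 is the slot fuse_passthrough_setup() will claim */
	handle = ioctl(fuse_dev_fd, FUSE_DEV_IOC_PASSTHROUGH_OPEN, &lower_fd);
	if (handle > 0)
		out->passthrough_fh = handle;

	close(lower_fd); /* the kernel holds its own reference now */
	return handle > 0 ? 0 : -1;
}
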
@@ -116,6 +116,17 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) struct fuse_getxattr_out outarg; ssize_t ret; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_getxattr_io, + fuse_listxattr_initialize, + fuse_listxattr_backing, fuse_listxattr_finalize, + entry, list, size); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; @@ -184,6 +195,17 @@ static int fuse_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_getxattr_io, + fuse_getxattr_initialize, fuse_getxattr_backing, + fuse_getxattr_finalize, + dentry, name, value, size); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; @@ -196,6 +218,24 @@ static int fuse_xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + if (value) + fer = fuse_bpf_backing(inode, struct fuse_setxattr_in, + fuse_setxattr_initialize, fuse_setxattr_backing, + fuse_setxattr_finalize, dentry, name, value, + size, flags); + else + fer = fuse_bpf_backing(inode, struct fuse_dummy_io, + fuse_removexattr_initialize, + fuse_removexattr_backing, + fuse_removexattr_finalize, + dentry, name); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 28d0eb23e18e..3e7b2ebdd157 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -260,6 +260,7 @@ static void __exit exit_gfs2_fs(void) MODULE_DESCRIPTION("Global File System"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(init_gfs2_fs); module_exit(exit_gfs2_fs); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 12d9bae39363..6432d65a0872 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -29,6 +29,7 @@ static struct kmem_cache *hfs_inode_cachep; MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); static int hfs_sync_fs(struct super_block *sb, int wait) { diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b9e3db3f855f..4c9997172b1f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -617,6 +617,7 @@ out: MODULE_AUTHOR("Brad Boyer"); MODULE_DESCRIPTION("Extended Macintosh Filesystem"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); static struct kmem_cache *hfsplus_inode_cachep; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a7dbfc892022..d7598d70f329 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -791,3 +791,4 @@ static void __exit exit_hpfs_fs(void) module_init(init_hpfs_fs) module_exit(exit_hpfs_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/incfs/Kconfig b/fs/incfs/Kconfig new file mode 100644 index 000000000000..5f1500072cd7 --- /dev/null +++ b/fs/incfs/Kconfig @@ -0,0 +1,13 @@ +config INCREMENTAL_FS + tristate "Incremental file system support" + depends on BLOCK + select DECOMPRESS_LZ4 + select DECOMPRESS_ZSTD + select CRYPTO_SHA256 + help + Incremental FS is a read-only virtual file system that facilitates execution + of programs while their binaries are still being lazily downloaded over the + network, USB or pigeon post. 
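
Every CONFIG_FUSE_BPF hook in this series, including the three xattr hooks above, follows one shape: call fuse_bpf_backing() with an operation-specific io struct and an initialize/backing/finalize triple, then short-circuit the normal daemon round trip when fer.ret is set. A schematic of the pattern follows; fuse_frobnicate_* are placeholder names, not part of the patch.

/* Schematic only: the hook pattern used throughout this series.  The real
 * triples (fuse_getxattr_*, fuse_statfs_*, ...) appear in the hunks above;
 * fuse_frobnicate_* is a placeholder. */
#ifdef CONFIG_FUSE_BPF
	struct fuse_err_ret fer;

	fer = fuse_bpf_backing(inode, struct fuse_frobnicate_io,
			       fuse_frobnicate_initialize, /* marshal args   */
			       fuse_frobnicate_backing,    /* run backing op */
			       fuse_frobnicate_finalize,   /* fix up results */
			       dentry, some_arg);
	/* fer.ret != 0 means a backing inode handled (or failed) the
	 * operation; return its result instead of talking to the daemon */
	if (fer.ret)
		return PTR_ERR(fer.result);
#endif
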
+ + To compile this file system support as a module, choose M here: the + module will be called incrementalfs. diff --git a/fs/incfs/Makefile b/fs/incfs/Makefile new file mode 100644 index 000000000000..05795d12c874 --- /dev/null +++ b/fs/incfs/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INCREMENTAL_FS) += incrementalfs.o + +incrementalfs-y := \ + data_mgmt.o \ + format.o \ + integrity.o \ + main.o \ + pseudo_files.o \ + sysfs.o \ + vfs.o + +incrementalfs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/incfs/OWNERS b/fs/incfs/OWNERS new file mode 100644 index 000000000000..1b97669c9b15 --- /dev/null +++ b/fs/incfs/OWNERS @@ -0,0 +1,2 @@ +akailash@google.com +paullawrence@google.com diff --git a/fs/incfs/data_mgmt.c b/fs/incfs/data_mgmt.c new file mode 100644 index 000000000000..5abb20a1098c --- /dev/null +++ b/fs/incfs/data_mgmt.c @@ -0,0 +1,1889 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "data_mgmt.h" +#include "format.h" +#include "integrity.h" +#include "sysfs.h" +#include "verity.h" + +static int incfs_scan_metadata_chain(struct data_file *df); + +static void log_wake_up_all(struct work_struct *work) +{ + struct delayed_work *dw = container_of(work, struct delayed_work, work); + struct read_log *rl = container_of(dw, struct read_log, ml_wakeup_work); + wake_up_all(&rl->ml_notif_wq); +} + +static void zstd_free_workspace(struct work_struct *work) +{ + struct delayed_work *dw = container_of(work, struct delayed_work, work); + struct mount_info *mi = + container_of(dw, struct mount_info, mi_zstd_cleanup_work); + + mutex_lock(&mi->mi_zstd_workspace_mutex); + kvfree(mi->mi_zstd_workspace); + mi->mi_zstd_workspace = NULL; + mi->mi_zstd_stream = NULL; + mutex_unlock(&mi->mi_zstd_workspace_mutex); +} + +struct mount_info *incfs_alloc_mount_info(struct super_block *sb, + struct mount_options *options, + struct path *backing_dir_path) +{ + struct mount_info *mi = NULL; + int error = 0; + struct incfs_sysfs_node *node; + + mi = kzalloc(sizeof(*mi), GFP_NOFS); + if (!mi) + return ERR_PTR(-ENOMEM); + + mi->mi_sb = sb; + mi->mi_backing_dir_path = *backing_dir_path; + mi->mi_owner = get_current_cred(); + path_get(&mi->mi_backing_dir_path); + mutex_init(&mi->mi_dir_struct_mutex); + init_waitqueue_head(&mi->mi_pending_reads_notif_wq); + init_waitqueue_head(&mi->mi_log.ml_notif_wq); + init_waitqueue_head(&mi->mi_blocks_written_notif_wq); + atomic_set(&mi->mi_blocks_written, 0); + INIT_DELAYED_WORK(&mi->mi_log.ml_wakeup_work, log_wake_up_all); + spin_lock_init(&mi->mi_log.rl_lock); + spin_lock_init(&mi->pending_read_lock); + INIT_LIST_HEAD(&mi->mi_reads_list_head); + spin_lock_init(&mi->mi_per_uid_read_timeouts_lock); + mutex_init(&mi->mi_zstd_workspace_mutex); + INIT_DELAYED_WORK(&mi->mi_zstd_cleanup_work, zstd_free_workspace); + mutex_init(&mi->mi_le_mutex); + + node = incfs_add_sysfs_node(options->sysfs_name, mi); + if (IS_ERR(node)) { + error = PTR_ERR(node); + goto err; + } + mi->mi_sysfs_node = node; + + error = incfs_realloc_mount_info(mi, options); + if (error) + goto err; + + return mi; + +err: + incfs_free_mount_info(mi); + return ERR_PTR(error); +} + +int incfs_realloc_mount_info(struct mount_info *mi, + struct mount_options *options) +{ + void *new_buffer = NULL; + void *old_buffer; + size_t new_buffer_size = 0; + + if (options->read_log_pages != mi->mi_options.read_log_pages) { + 
struct read_log_state log_state; + /* + * Even though having two buffers allocated at once isn't + * usually good, allocating a multipage buffer under a spinlock + * is even worse, so let's optimize for the shorter lock + * duration. It's not end of the world if we fail to increase + * the buffer size anyway. + */ + if (options->read_log_pages > 0) { + new_buffer_size = PAGE_SIZE * options->read_log_pages; + new_buffer = kzalloc(new_buffer_size, GFP_NOFS); + if (!new_buffer) + return -ENOMEM; + } + + spin_lock(&mi->mi_log.rl_lock); + old_buffer = mi->mi_log.rl_ring_buf; + mi->mi_log.rl_ring_buf = new_buffer; + mi->mi_log.rl_size = new_buffer_size; + log_state = (struct read_log_state){ + .generation_id = mi->mi_log.rl_head.generation_id + 1, + }; + mi->mi_log.rl_head = log_state; + mi->mi_log.rl_tail = log_state; + spin_unlock(&mi->mi_log.rl_lock); + + kfree(old_buffer); + } + + if (options->sysfs_name && !mi->mi_sysfs_node) + mi->mi_sysfs_node = incfs_add_sysfs_node(options->sysfs_name, + mi); + else if (!options->sysfs_name && mi->mi_sysfs_node) { + incfs_free_sysfs_node(mi->mi_sysfs_node); + mi->mi_sysfs_node = NULL; + } else if (options->sysfs_name && + strcmp(options->sysfs_name, + kobject_name(&mi->mi_sysfs_node->isn_sysfs_node))) { + incfs_free_sysfs_node(mi->mi_sysfs_node); + mi->mi_sysfs_node = incfs_add_sysfs_node(options->sysfs_name, + mi); + } + + if (IS_ERR(mi->mi_sysfs_node)) { + int err = PTR_ERR(mi->mi_sysfs_node); + + mi->mi_sysfs_node = NULL; + return err; + } + + mi->mi_options = *options; + return 0; +} + +void incfs_free_mount_info(struct mount_info *mi) +{ + int i; + if (!mi) + return; + + flush_delayed_work(&mi->mi_log.ml_wakeup_work); + flush_delayed_work(&mi->mi_zstd_cleanup_work); + + dput(mi->mi_index_dir); + dput(mi->mi_incomplete_dir); + path_put(&mi->mi_backing_dir_path); + mutex_destroy(&mi->mi_dir_struct_mutex); + mutex_destroy(&mi->mi_zstd_workspace_mutex); + put_cred(mi->mi_owner); + kfree(mi->mi_log.rl_ring_buf); + for (i = 0; i < ARRAY_SIZE(mi->pseudo_file_xattr); ++i) + kfree(mi->pseudo_file_xattr[i].data); + kfree(mi->mi_per_uid_read_timeouts); + incfs_free_sysfs_node(mi->mi_sysfs_node); + kfree(mi); +} + +static void data_file_segment_init(struct data_file_segment *segment) +{ + init_waitqueue_head(&segment->new_data_arrival_wq); + init_rwsem(&segment->rwsem); + INIT_LIST_HEAD(&segment->reads_list_head); +} + +char *file_id_to_str(incfs_uuid_t id) +{ + char *result = kmalloc(1 + sizeof(id.bytes) * 2, GFP_NOFS); + char *end; + + if (!result) + return NULL; + + end = bin2hex(result, id.bytes, sizeof(id.bytes)); + *end = 0; + return result; +} + +struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name) +{ + struct inode *inode; + struct dentry *result = NULL; + + if (!parent) + return ERR_PTR(-EFAULT); + + inode = d_inode(parent); + inode_lock_nested(inode, I_MUTEX_PARENT); + result = lookup_one_len(name, parent, strlen(name)); + inode_unlock(inode); + + if (IS_ERR(result)) + pr_warn("%s err:%ld\n", __func__, PTR_ERR(result)); + + return result; +} + +static struct data_file *handle_mapped_file(struct mount_info *mi, + struct data_file *df) +{ + char *file_id_str; + struct dentry *index_file_dentry; + struct path path; + struct file *bf; + struct data_file *result = NULL; + const struct cred *old_cred; + + file_id_str = file_id_to_str(df->df_id); + if (!file_id_str) + return ERR_PTR(-ENOENT); + + index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir, + file_id_str); + kfree(file_id_str); + if (!index_file_dentry) + return 
ERR_PTR(-ENOENT); + if (IS_ERR(index_file_dentry)) + return (struct data_file *)index_file_dentry; + if (!d_really_is_positive(index_file_dentry)) { + result = ERR_PTR(-ENOENT); + goto out; + } + + path = (struct path) { + .mnt = mi->mi_backing_dir_path.mnt, + .dentry = index_file_dentry + }; + + old_cred = override_creds(mi->mi_owner); + bf = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE, + current_cred()); + revert_creds(old_cred); + + if (IS_ERR(bf)) { + result = (struct data_file *)bf; + goto out; + } + + result = incfs_open_data_file(mi, bf); + fput(bf); + if (IS_ERR(result)) + goto out; + + result->df_mapped_offset = df->df_metadata_off; + +out: + dput(index_file_dentry); + return result; +} + +struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf) +{ + struct data_file *df = NULL; + struct backing_file_context *bfc = NULL; + int md_records; + u64 size; + int error = 0; + int i; + + if (!bf || !mi) + return ERR_PTR(-EFAULT); + + if (!S_ISREG(bf->f_inode->i_mode)) + return ERR_PTR(-EBADF); + + bfc = incfs_alloc_bfc(mi, bf); + if (IS_ERR(bfc)) + return ERR_CAST(bfc); + + df = kzalloc(sizeof(*df), GFP_NOFS); + if (!df) { + error = -ENOMEM; + goto out; + } + + mutex_init(&df->df_enable_verity); + + df->df_backing_file_context = bfc; + df->df_mount_info = mi; + for (i = 0; i < ARRAY_SIZE(df->df_segments); i++) + data_file_segment_init(&df->df_segments[i]); + + error = incfs_read_file_header(bfc, &df->df_metadata_off, &df->df_id, + &size, &df->df_header_flags); + + if (error) + goto out; + + df->df_size = size; + if (size > 0) + df->df_data_block_count = get_blocks_count_for_size(size); + + if (df->df_header_flags & INCFS_FILE_MAPPED) { + struct data_file *mapped_df = handle_mapped_file(mi, df); + + incfs_free_data_file(df); + return mapped_df; + } + + md_records = incfs_scan_metadata_chain(df); + if (md_records < 0) + error = md_records; + +out: + if (error) { + incfs_free_bfc(bfc); + if (df) + df->df_backing_file_context = NULL; + incfs_free_data_file(df); + return ERR_PTR(error); + } + return df; +} + +void incfs_free_data_file(struct data_file *df) +{ + u32 data_blocks_written, hash_blocks_written; + + if (!df) + return; + + data_blocks_written = atomic_read(&df->df_data_blocks_written); + hash_blocks_written = atomic_read(&df->df_hash_blocks_written); + + if (data_blocks_written != df->df_initial_data_blocks_written || + hash_blocks_written != df->df_initial_hash_blocks_written) { + struct backing_file_context *bfc = df->df_backing_file_context; + int error = -1; + + if (bfc && !mutex_lock_interruptible(&bfc->bc_mutex)) { + error = incfs_write_status_to_backing_file( + df->df_backing_file_context, + df->df_status_offset, + data_blocks_written, + hash_blocks_written); + mutex_unlock(&bfc->bc_mutex); + } + + if (error) + /* Nothing can be done, just warn */ + pr_warn("incfs: failed to write status to backing file\n"); + } + + incfs_free_mtree(df->df_hash_tree); + incfs_free_bfc(df->df_backing_file_context); + kfree(df->df_signature); + kfree(df->df_verity_file_digest.data); + kfree(df->df_verity_signature); + mutex_destroy(&df->df_enable_verity); + kfree(df); +} + +int make_inode_ready_for_data_ops(struct mount_info *mi, + struct inode *inode, + struct file *backing_file) +{ + struct inode_info *node = get_incfs_node(inode); + struct data_file *df = NULL; + int err = 0; + + inode_lock(inode); + if (S_ISREG(inode->i_mode)) { + if (!node->n_file) { + df = incfs_open_data_file(mi, backing_file); + + if (IS_ERR(df)) + err = PTR_ERR(df); + else + node->n_file = 
df; + } + } else + err = -EBADF; + inode_unlock(inode); + return err; +} + +struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf) +{ + struct dir_file *dir = NULL; + + if (!S_ISDIR(bf->f_inode->i_mode)) + return ERR_PTR(-EBADF); + + dir = kzalloc(sizeof(*dir), GFP_NOFS); + if (!dir) + return ERR_PTR(-ENOMEM); + + dir->backing_dir = get_file(bf); + dir->mount_info = mi; + return dir; +} + +void incfs_free_dir_file(struct dir_file *dir) +{ + if (!dir) + return; + if (dir->backing_dir) + fput(dir->backing_dir); + kfree(dir); +} + +static ssize_t zstd_decompress_safe(struct mount_info *mi, + struct mem_range src, struct mem_range dst) +{ + ssize_t result; + ZSTD_inBuffer inbuf = {.src = src.data, .size = src.len}; + ZSTD_outBuffer outbuf = {.dst = dst.data, .size = dst.len}; + + result = mutex_lock_interruptible(&mi->mi_zstd_workspace_mutex); + if (result) + return result; + + if (!mi->mi_zstd_stream) { + unsigned int workspace_size = ZSTD_DStreamWorkspaceBound( + INCFS_DATA_FILE_BLOCK_SIZE); + void *workspace = kvmalloc(workspace_size, GFP_NOFS); + ZSTD_DStream *stream; + + if (!workspace) { + result = -ENOMEM; + goto out; + } + + stream = ZSTD_initDStream(INCFS_DATA_FILE_BLOCK_SIZE, workspace, + workspace_size); + if (!stream) { + kvfree(workspace); + result = -EIO; + goto out; + } + + mi->mi_zstd_workspace = workspace; + mi->mi_zstd_stream = stream; + } + + result = ZSTD_decompressStream(mi->mi_zstd_stream, &outbuf, &inbuf) ? + -EBADMSG : outbuf.pos; + + mod_delayed_work(system_wq, &mi->mi_zstd_cleanup_work, + msecs_to_jiffies(5000)); + +out: + mutex_unlock(&mi->mi_zstd_workspace_mutex); + return result; +} + +static ssize_t decompress(struct mount_info *mi, + struct mem_range src, struct mem_range dst, int alg) +{ + int result; + + switch (alg) { + case INCFS_BLOCK_COMPRESSED_LZ4: + result = LZ4_decompress_safe(src.data, dst.data, src.len, + dst.len); + if (result < 0) + return -EBADMSG; + return result; + + case INCFS_BLOCK_COMPRESSED_ZSTD: + return zstd_decompress_safe(mi, src, dst); + + default: + WARN_ON(true); + return -EOPNOTSUPP; + } +} + +static void log_read_one_record(struct read_log *rl, struct read_log_state *rs) +{ + union log_record *record = + (union log_record *)((u8 *)rl->rl_ring_buf + rs->next_offset); + size_t record_size; + + switch (record->full_record.type) { + case FULL: + rs->base_record = record->full_record; + record_size = sizeof(record->full_record); + break; + + case SAME_FILE: + rs->base_record.block_index = + record->same_file.block_index; + rs->base_record.absolute_ts_us += + record->same_file.relative_ts_us; + rs->base_record.uid = record->same_file.uid; + record_size = sizeof(record->same_file); + break; + + case SAME_FILE_CLOSE_BLOCK: + rs->base_record.block_index += + record->same_file_close_block.block_index_delta; + rs->base_record.absolute_ts_us += + record->same_file_close_block.relative_ts_us; + record_size = sizeof(record->same_file_close_block); + break; + + case SAME_FILE_CLOSE_BLOCK_SHORT: + rs->base_record.block_index += + record->same_file_close_block_short.block_index_delta; + rs->base_record.absolute_ts_us += + record->same_file_close_block_short.relative_ts_tens_us * 10; + record_size = sizeof(record->same_file_close_block_short); + break; + + case SAME_FILE_NEXT_BLOCK: + ++rs->base_record.block_index; + rs->base_record.absolute_ts_us += + record->same_file_next_block.relative_ts_us; + record_size = sizeof(record->same_file_next_block); + break; + + case SAME_FILE_NEXT_BLOCK_SHORT: + ++rs->base_record.block_index; + 
rs->base_record.absolute_ts_us += + record->same_file_next_block_short.relative_ts_tens_us * 10; + record_size = sizeof(record->same_file_next_block_short); + break; + } + + rs->next_offset += record_size; + if (rs->next_offset > rl->rl_size - sizeof(*record)) { + rs->next_offset = 0; + ++rs->current_pass_no; + } + ++rs->current_record_no; +} + +static void log_block_read(struct mount_info *mi, incfs_uuid_t *id, + int block_index) +{ + struct read_log *log = &mi->mi_log; + struct read_log_state *head, *tail; + s64 now_us; + s64 relative_us; + union log_record record; + size_t record_size; + uid_t uid = current_uid().val; + int block_delta; + bool same_file, same_uid; + bool next_block, close_block, very_close_block; + bool close_time, very_close_time, very_very_close_time; + + /* + * This may read the old value, but it's OK to delay the logging start + * right after the configuration update. + */ + if (READ_ONCE(log->rl_size) == 0) + return; + + now_us = ktime_to_us(ktime_get()); + + spin_lock(&log->rl_lock); + if (log->rl_size == 0) { + spin_unlock(&log->rl_lock); + return; + } + + head = &log->rl_head; + tail = &log->rl_tail; + relative_us = now_us - head->base_record.absolute_ts_us; + + same_file = !memcmp(id, &head->base_record.file_id, + sizeof(incfs_uuid_t)); + same_uid = uid == head->base_record.uid; + + block_delta = block_index - head->base_record.block_index; + next_block = block_delta == 1; + very_close_block = block_delta >= S8_MIN && block_delta <= S8_MAX; + close_block = block_delta >= S16_MIN && block_delta <= S16_MAX; + + very_very_close_time = relative_us < (1 << 5) * 10; + very_close_time = relative_us < (1 << 13); + close_time = relative_us < (1 << 16); + + if (same_file && same_uid && next_block && very_very_close_time) { + record.same_file_next_block_short = + (struct same_file_next_block_short){ + .type = SAME_FILE_NEXT_BLOCK_SHORT, + .relative_ts_tens_us = div_s64(relative_us, 10), + }; + record_size = sizeof(struct same_file_next_block_short); + } else if (same_file && same_uid && next_block && very_close_time) { + record.same_file_next_block = (struct same_file_next_block){ + .type = SAME_FILE_NEXT_BLOCK, + .relative_ts_us = relative_us, + }; + record_size = sizeof(struct same_file_next_block); + } else if (same_file && same_uid && very_close_block && + very_very_close_time) { + record.same_file_close_block_short = + (struct same_file_close_block_short){ + .type = SAME_FILE_CLOSE_BLOCK_SHORT, + .relative_ts_tens_us = div_s64(relative_us, 10), + .block_index_delta = block_delta, + }; + record_size = sizeof(struct same_file_close_block_short); + } else if (same_file && same_uid && close_block && very_close_time) { + record.same_file_close_block = (struct same_file_close_block){ + .type = SAME_FILE_CLOSE_BLOCK, + .relative_ts_us = relative_us, + .block_index_delta = block_delta, + }; + record_size = sizeof(struct same_file_close_block); + } else if (same_file && close_time) { + record.same_file = (struct same_file){ + .type = SAME_FILE, + .block_index = block_index, + .relative_ts_us = relative_us, + .uid = uid, + }; + record_size = sizeof(struct same_file); + } else { + record.full_record = (struct full_record){ + .type = FULL, + .block_index = block_index, + .file_id = *id, + .absolute_ts_us = now_us, + .uid = uid, + }; + head->base_record.file_id = *id; + record_size = sizeof(struct full_record); + } + + head->base_record.block_index = block_index; + head->base_record.absolute_ts_us = now_us; + + /* Advance tail beyond area we are going to overwrite */ + while 
(tail->current_pass_no < head->current_pass_no && + tail->next_offset < head->next_offset + record_size) + log_read_one_record(log, tail); + + memcpy(((u8 *)log->rl_ring_buf) + head->next_offset, &record, + record_size); + head->next_offset += record_size; + if (head->next_offset > log->rl_size - sizeof(record)) { + head->next_offset = 0; + ++head->current_pass_no; + } + ++head->current_record_no; + + spin_unlock(&log->rl_lock); + schedule_delayed_work(&log->ml_wakeup_work, msecs_to_jiffies(16)); +} + +static int validate_hash_tree(struct backing_file_context *bfc, struct file *f, + int block_index, struct mem_range data, u8 *buf) +{ + struct data_file *df = get_incfs_data_file(f); + u8 stored_digest[INCFS_MAX_HASH_SIZE] = {}; + u8 calculated_digest[INCFS_MAX_HASH_SIZE] = {}; + struct mtree *tree = NULL; + struct incfs_df_signature *sig = NULL; + int digest_size; + int hash_block_index = block_index; + int lvl; + int res; + loff_t hash_block_offset[INCFS_MAX_MTREE_LEVELS]; + size_t hash_offset_in_block[INCFS_MAX_MTREE_LEVELS]; + int hash_per_block; + pgoff_t file_pages; + + /* + * Memory barrier to make sure tree is fully present if added via enable + * verity + */ + tree = smp_load_acquire(&df->df_hash_tree); + sig = df->df_signature; + if (!tree || !sig) + return 0; + + digest_size = tree->alg->digest_size; + hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / digest_size; + for (lvl = 0; lvl < tree->depth; lvl++) { + loff_t lvl_off = tree->hash_level_suboffset[lvl]; + + hash_block_offset[lvl] = + lvl_off + round_down(hash_block_index * digest_size, + INCFS_DATA_FILE_BLOCK_SIZE); + hash_offset_in_block[lvl] = hash_block_index * digest_size % + INCFS_DATA_FILE_BLOCK_SIZE; + hash_block_index /= hash_per_block; + } + + memcpy(stored_digest, tree->root_hash, digest_size); + + file_pages = DIV_ROUND_UP(df->df_size, INCFS_DATA_FILE_BLOCK_SIZE); + for (lvl = tree->depth - 1; lvl >= 0; lvl--) { + pgoff_t hash_page = + file_pages + + hash_block_offset[lvl] / INCFS_DATA_FILE_BLOCK_SIZE; + struct page *page = find_get_page_flags( + f->f_inode->i_mapping, hash_page, FGP_ACCESSED); + + if (page && PageChecked(page)) { + u8 *addr = kmap_atomic(page); + + memcpy(stored_digest, addr + hash_offset_in_block[lvl], + digest_size); + + kunmap_atomic(addr); + put_page(page); + continue; + } + + if (page) + put_page(page); + + res = incfs_kread(bfc, buf, INCFS_DATA_FILE_BLOCK_SIZE, + hash_block_offset[lvl] + sig->hash_offset); + if (res < 0) + return res; + if (res != INCFS_DATA_FILE_BLOCK_SIZE) + return -EIO; + res = incfs_calc_digest(tree->alg, + range(buf, INCFS_DATA_FILE_BLOCK_SIZE), + range(calculated_digest, digest_size)); + if (res) + return res; + + if (memcmp(stored_digest, calculated_digest, digest_size)) { + int i; + bool zero = true; + + pr_warn("incfs: Hash mismatch lvl:%d blk:%d\n", + lvl, block_index); + for (i = 0; i < digest_size; i++) + if (stored_digest[i]) { + zero = false; + break; + } + + if (zero) + pr_debug("Note saved_digest all zero - did you forget to load the hashes?\n"); + return -EBADMSG; + } + + memcpy(stored_digest, buf + hash_offset_in_block[lvl], + digest_size); + + page = grab_cache_page(f->f_inode->i_mapping, hash_page); + if (page) { + u8 *addr = kmap_atomic(page); + + memcpy(addr, buf, INCFS_DATA_FILE_BLOCK_SIZE); + kunmap_atomic(addr); + SetPageChecked(page); + SetPageUptodate(page); + unlock_page(page); + put_page(page); + } + } + + res = incfs_calc_digest(tree->alg, data, + range(calculated_digest, digest_size)); + if (res) + return res; + + if (memcmp(stored_digest, 
calculated_digest, digest_size)) { + pr_debug("Leaf hash mismatch blk:%d\n", block_index); + return -EBADMSG; + } + + return 0; +} + +static struct data_file_segment *get_file_segment(struct data_file *df, + int block_index) +{ + int seg_idx = block_index % ARRAY_SIZE(df->df_segments); + + return &df->df_segments[seg_idx]; +} + +static bool is_data_block_present(struct data_file_block *block) +{ + return (block->db_backing_file_data_offset != 0) && + (block->db_stored_size != 0); +} + +static void convert_data_file_block(struct incfs_blockmap_entry *bme, + struct data_file_block *res_block) +{ + u16 flags = le16_to_cpu(bme->me_flags); + + res_block->db_backing_file_data_offset = + le16_to_cpu(bme->me_data_offset_hi); + res_block->db_backing_file_data_offset <<= 32; + res_block->db_backing_file_data_offset |= + le32_to_cpu(bme->me_data_offset_lo); + res_block->db_stored_size = le16_to_cpu(bme->me_data_size); + res_block->db_comp_alg = flags & INCFS_BLOCK_COMPRESSED_MASK; +} + +static int get_data_file_block(struct data_file *df, int index, + struct data_file_block *res_block) +{ + struct incfs_blockmap_entry bme = {}; + struct backing_file_context *bfc = NULL; + loff_t blockmap_off = 0; + int error = 0; + + if (!df || !res_block) + return -EFAULT; + + blockmap_off = df->df_blockmap_off; + bfc = df->df_backing_file_context; + + if (index < 0 || blockmap_off == 0) + return -EINVAL; + + error = incfs_read_blockmap_entry(bfc, index, blockmap_off, &bme); + if (error) + return error; + + convert_data_file_block(&bme, res_block); + return 0; +} + +static int check_room_for_one_range(u32 size, u32 size_out) +{ + if (size_out + sizeof(struct incfs_filled_range) > size) + return -ERANGE; + return 0; +} + +static int copy_one_range(struct incfs_filled_range *range, void __user *buffer, + u32 size, u32 *size_out) +{ + int error = check_room_for_one_range(size, *size_out); + if (error) + return error; + + if (copy_to_user(((char __user *)buffer) + *size_out, range, + sizeof(*range))) + return -EFAULT; + + *size_out += sizeof(*range); + return 0; +} + +#define READ_BLOCKMAP_ENTRIES 512 +int incfs_get_filled_blocks(struct data_file *df, + struct incfs_file_data *fd, + struct incfs_get_filled_blocks_args *arg) +{ + int error = 0; + bool in_range = false; + struct incfs_filled_range range; + void __user *buffer = u64_to_user_ptr(arg->range_buffer); + u32 size = arg->range_buffer_size; + u32 end_index = + arg->end_index ? 
arg->end_index : df->df_total_block_count; + u32 *size_out = &arg->range_buffer_size_out; + int i = READ_BLOCKMAP_ENTRIES - 1; + int entries_read = 0; + struct incfs_blockmap_entry *bme; + int data_blocks_filled = 0; + int hash_blocks_filled = 0; + + *size_out = 0; + if (end_index > df->df_total_block_count) + end_index = df->df_total_block_count; + arg->total_blocks_out = df->df_total_block_count; + arg->data_blocks_out = df->df_data_block_count; + + if (atomic_read(&df->df_data_blocks_written) == + df->df_data_block_count) { + pr_debug("File marked full, fast get_filled_blocks"); + if (arg->start_index > end_index) { + arg->index_out = arg->start_index; + return 0; + } + arg->index_out = arg->start_index; + + error = check_room_for_one_range(size, *size_out); + if (error) + return error; + + range = (struct incfs_filled_range){ + .begin = arg->start_index, + .end = end_index, + }; + + error = copy_one_range(&range, buffer, size, size_out); + if (error) + return error; + arg->index_out = end_index; + return 0; + } + + bme = kzalloc(sizeof(*bme) * READ_BLOCKMAP_ENTRIES, + GFP_NOFS | __GFP_COMP); + if (!bme) + return -ENOMEM; + + for (arg->index_out = arg->start_index; arg->index_out < end_index; + ++arg->index_out) { + struct data_file_block dfb; + + if (++i == READ_BLOCKMAP_ENTRIES) { + entries_read = incfs_read_blockmap_entries( + df->df_backing_file_context, bme, + arg->index_out, READ_BLOCKMAP_ENTRIES, + df->df_blockmap_off); + if (entries_read < 0) { + error = entries_read; + break; + } + + i = 0; + } + + if (i >= entries_read) { + error = -EIO; + break; + } + + convert_data_file_block(bme + i, &dfb); + + if (is_data_block_present(&dfb)) { + if (arg->index_out >= df->df_data_block_count) + ++hash_blocks_filled; + else + ++data_blocks_filled; + } + + if (is_data_block_present(&dfb) == in_range) + continue; + + if (!in_range) { + error = check_room_for_one_range(size, *size_out); + if (error) + break; + in_range = true; + range.begin = arg->index_out; + } else { + range.end = arg->index_out; + error = copy_one_range(&range, buffer, size, size_out); + if (error) { + /* there will be another try out of the loop, + * it will reset the index_out if it fails too + */ + break; + } + in_range = false; + } + } + + if (in_range) { + range.end = arg->index_out; + error = copy_one_range(&range, buffer, size, size_out); + if (error) + arg->index_out = range.begin; + } + + if (arg->start_index == 0) { + fd->fd_get_block_pos = 0; + fd->fd_filled_data_blocks = 0; + fd->fd_filled_hash_blocks = 0; + } + + if (arg->start_index == fd->fd_get_block_pos) { + fd->fd_get_block_pos = arg->index_out + 1; + fd->fd_filled_data_blocks += data_blocks_filled; + fd->fd_filled_hash_blocks += hash_blocks_filled; + } + + if (fd->fd_get_block_pos == df->df_total_block_count + 1) { + if (fd->fd_filled_data_blocks > + atomic_read(&df->df_data_blocks_written)) + atomic_set(&df->df_data_blocks_written, + fd->fd_filled_data_blocks); + + if (fd->fd_filled_hash_blocks > + atomic_read(&df->df_hash_blocks_written)) + atomic_set(&df->df_hash_blocks_written, + fd->fd_filled_hash_blocks); + } + + kfree(bme); + return error; +} + +static bool is_read_done(struct pending_read *read) +{ + return atomic_read_acquire(&read->done) != 0; +} + +static void set_read_done(struct pending_read *read) +{ + atomic_set_release(&read->done, 1); +} + +/* + * Notifies a given data file about pending read from a given block. + * Returns a new pending read entry. 
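
incfs_get_filled_blocks() above reports filled ranges until the caller's buffer runs out (check_room_for_one_range() returns -ERANGE) and leaves index_out pointing at the resume position. A userspace sketch of draining the ranges follows; the argument fields mirror the ones the kernel code reads, but the INCFS_IOC_GET_FILLED_BLOCKS name and the UAPI header location are assumptions, since the incfs UAPI is not part of this excerpt.

/* Hedged sketch: enumerate the filled block ranges of one incfs file. */
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/incrementalfs.h> /* assumed UAPI location */

static void dump_filled_ranges(int fd)
{
	struct incfs_filled_range ranges[64];
	struct incfs_get_filled_blocks_args args = {
		.range_buffer = (uint64_t)(uintptr_t)ranges,
		.range_buffer_size = sizeof(ranges),
	};

	for (;;) {
		int err = ioctl(fd, INCFS_IOC_GET_FILLED_BLOCKS, &args);
		uint32_t i, n = args.range_buffer_size_out / sizeof(ranges[0]);

		for (i = 0; i < n; i++)
			printf("[%u, %u)\n", ranges[i].begin, ranges[i].end);

		if (!err)
			break;                    /* reached end_index */
		if (errno != ERANGE)
			break;                    /* real error */
		args.start_index = args.index_out; /* buffer filled: resume */
	}
}
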
+ */ +static struct pending_read *add_pending_read(struct data_file *df, + int block_index) +{ + struct pending_read *result = NULL; + struct data_file_segment *segment = NULL; + struct mount_info *mi = NULL; + + segment = get_file_segment(df, block_index); + mi = df->df_mount_info; + + result = kzalloc(sizeof(*result), GFP_NOFS); + if (!result) + return NULL; + + result->file_id = df->df_id; + result->block_index = block_index; + result->timestamp_us = ktime_to_us(ktime_get()); + result->uid = current_uid().val; + + spin_lock(&mi->pending_read_lock); + + result->serial_number = ++mi->mi_last_pending_read_number; + mi->mi_pending_reads_count++; + + list_add_rcu(&result->mi_reads_list, &mi->mi_reads_list_head); + list_add_rcu(&result->segment_reads_list, &segment->reads_list_head); + + spin_unlock(&mi->pending_read_lock); + + wake_up_all(&mi->mi_pending_reads_notif_wq); + return result; +} + +static void free_pending_read_entry(struct rcu_head *entry) +{ + struct pending_read *read; + + read = container_of(entry, struct pending_read, rcu); + + kfree(read); +} + +/* Notifies a given data file that pending read is completed. */ +static void remove_pending_read(struct data_file *df, struct pending_read *read) +{ + struct mount_info *mi = NULL; + + if (!df || !read) { + WARN_ON(!df); + WARN_ON(!read); + return; + } + + mi = df->df_mount_info; + + spin_lock(&mi->pending_read_lock); + + list_del_rcu(&read->mi_reads_list); + list_del_rcu(&read->segment_reads_list); + + mi->mi_pending_reads_count--; + + spin_unlock(&mi->pending_read_lock); + + /* Don't free. Wait for readers */ + call_rcu(&read->rcu, free_pending_read_entry); +} + +static void notify_pending_reads(struct mount_info *mi, + struct data_file_segment *segment, + int index) +{ + struct pending_read *entry = NULL; + + /* Notify pending reads waiting for this block. */ + rcu_read_lock(); + list_for_each_entry_rcu(entry, &segment->reads_list_head, + segment_reads_list) { + if (entry->block_index == index) + set_read_done(entry); + } + rcu_read_unlock(); + wake_up_all(&segment->new_data_arrival_wq); + + atomic_inc(&mi->mi_blocks_written); + wake_up_all(&mi->mi_blocks_written_notif_wq); +} + +static int wait_for_data_block(struct data_file *df, int block_index, + struct data_file_block *res_block, + struct incfs_read_data_file_timeouts *timeouts, + unsigned int *delayed_min_us) +{ + struct data_file_block block = {}; + struct data_file_segment *segment = NULL; + struct pending_read *read = NULL; + struct mount_info *mi = NULL; + int error; + int wait_res = 0; + unsigned int delayed_pending_us = 0; + bool delayed_pending = false; + + if (!df || !res_block) + return -EFAULT; + + if (block_index < 0 || block_index >= df->df_data_block_count) + return -EINVAL; + + if (df->df_blockmap_off <= 0 || !df->df_mount_info) + return -ENODATA; + + mi = df->df_mount_info; + segment = get_file_segment(df, block_index); + + error = down_read_killable(&segment->rwsem); + if (error) + return error; + + /* Look up the given block */ + error = get_data_file_block(df, block_index, &block); + + up_read(&segment->rwsem); + + if (error) + return error; + + /* If the block was found, just return it. No need to wait. 
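
The pending-read list above is a classic writer-locked, reader-lockless RCU structure: additions and removals serialize on mi->pending_read_lock, notify_pending_reads() walks the per-segment list under rcu_read_lock() only, and the final kfree() is deferred through call_rcu() so a concurrent walker can never touch freed memory. Distilled to its skeleton (pseudo-kernel C, summarizing the functions above rather than adding to them):

/* writer: publish under the spinlock */
spin_lock(&mi->pending_read_lock);
list_add_rcu(&read->mi_reads_list, &mi->mi_reads_list_head);
spin_unlock(&mi->pending_read_lock);

/* reader: lock-free traversal; set_read_done() uses a release store so
 * the waiter's atomic_read_acquire() in is_read_done() pairs with it */
rcu_read_lock();
list_for_each_entry_rcu(entry, &segment->reads_list_head, segment_reads_list)
	if (entry->block_index == index)
		set_read_done(entry);
rcu_read_unlock();

/* writer: retract, then defer the free past all in-flight readers */
spin_lock(&mi->pending_read_lock);
list_del_rcu(&read->mi_reads_list);
spin_unlock(&mi->pending_read_lock);
call_rcu(&read->rcu, free_pending_read_entry);
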
*/ + if (is_data_block_present(&block)) { + *res_block = block; + if (timeouts && timeouts->min_time_us) { + *delayed_min_us = timeouts->min_time_us; + goto out; + } + return 0; + } else { + /* If it's not found, create a pending read */ + if (timeouts && timeouts->max_pending_time_us) { + read = add_pending_read(df, block_index); + if (!read) + return -ENOMEM; + } else { + log_block_read(mi, &df->df_id, block_index); + return -ETIME; + } + } + + /* Rest of function only applies if timeouts != NULL */ + if (!timeouts) { + pr_warn("incfs: timeouts unexpectedly NULL\n"); + return -EFSCORRUPTED; + } + + /* Wait for notifications about block's arrival */ + wait_res = + wait_event_interruptible_timeout(segment->new_data_arrival_wq, + (is_read_done(read)), + usecs_to_jiffies(timeouts->max_pending_time_us)); + + /* Woke up, the pending read is no longer needed. */ + remove_pending_read(df, read); + + if (wait_res == 0) { + /* Wait has timed out */ + log_block_read(mi, &df->df_id, block_index); + return -ETIME; + } + if (wait_res < 0) { + /* + * Only ERESTARTSYS is really expected here when a signal + * comes while we wait. + */ + return wait_res; + } + + delayed_pending = true; + delayed_pending_us = timeouts->max_pending_time_us - + jiffies_to_usecs(wait_res); + if (timeouts->min_pending_time_us > delayed_pending_us) + *delayed_min_us = timeouts->min_pending_time_us - + delayed_pending_us; + + error = down_read_killable(&segment->rwsem); + if (error) + return error; + + /* + * Re-read blocks info now, it has just arrived and + * should be available. + */ + error = get_data_file_block(df, block_index, &block); + if (!error) { + if (is_data_block_present(&block)) + *res_block = block; + else { + /* + * Somehow wait finished successfully but block still + * can't be found. It's not normal. 
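
The min/max timeout bookkeeping in wait_for_data_block() is easiest to follow with concrete numbers; the values below are illustrative only.

/* Worked example of the arithmetic above:
 *   timeouts->min_pending_time_us =  2000000;   2s floor per pending read
 *   timeouts->max_pending_time_us = 10000000;   10s hard timeout
 * Suppose the block arrives quickly and wait_event_interruptible_timeout()
 * returns with 9.5s of the window unused (jiffies_to_usecs(wait_res)):
 *   delayed_pending_us = 10000000 - 9500000 =  500000   (0.5s really waited)
 *   *delayed_min_us    =  2000000 -  500000 = 1500000   (1.5s still owed)
 * The caller then tops the stall up to the configured minimum, and the
 * mi_reads_delayed_pending* / mi_reads_delayed_min* counters account for
 * the two components separately. */
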
+ */ + pr_warn("incfs: Wait succeeded but block not found.\n"); + error = -ENODATA; + } + } + up_read(&segment->rwsem); + +out: + if (error) + return error; + + if (delayed_pending) { + mi->mi_reads_delayed_pending++; + mi->mi_reads_delayed_pending_us += + delayed_pending_us; + } + + if (delayed_min_us && *delayed_min_us) { + mi->mi_reads_delayed_min++; + mi->mi_reads_delayed_min_us += *delayed_min_us; + } + + return 0; +} + +static int incfs_update_sysfs_error(struct file *file, int index, int result, + struct mount_info *mi, struct data_file *df) +{ + int error; + + if (result >= 0) + return 0; + + error = mutex_lock_interruptible(&mi->mi_le_mutex); + if (error) + return error; + + mi->mi_le_file_id = df->df_id; + mi->mi_le_time_us = ktime_to_us(ktime_get()); + mi->mi_le_page = index; + mi->mi_le_errno = result; + mi->mi_le_uid = current_uid().val; + mutex_unlock(&mi->mi_le_mutex); + + return 0; +} + +ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f, + int index, struct mem_range tmp, + struct incfs_read_data_file_timeouts *timeouts, + unsigned int *delayed_min_us) +{ + loff_t pos; + ssize_t result; + size_t bytes_to_read; + struct mount_info *mi = NULL; + struct backing_file_context *bfc = NULL; + struct data_file_block block = {}; + struct data_file *df = get_incfs_data_file(f); + + if (!dst.data || !df || !tmp.data) + return -EFAULT; + + if (tmp.len < 2 * INCFS_DATA_FILE_BLOCK_SIZE) + return -ERANGE; + + mi = df->df_mount_info; + bfc = df->df_backing_file_context; + + result = wait_for_data_block(df, index, &block, timeouts, + delayed_min_us); + if (result < 0) + goto out; + + pos = block.db_backing_file_data_offset; + if (block.db_comp_alg == COMPRESSION_NONE) { + bytes_to_read = min(dst.len, block.db_stored_size); + result = incfs_kread(bfc, dst.data, bytes_to_read, pos); + + /* Some data was read, but not enough */ + if (result >= 0 && result != bytes_to_read) + result = -EIO; + } else { + bytes_to_read = min(tmp.len, block.db_stored_size); + result = incfs_kread(bfc, tmp.data, bytes_to_read, pos); + if (result == bytes_to_read) { + result = + decompress(mi, range(tmp.data, bytes_to_read), + dst, block.db_comp_alg); + if (result < 0) { + const char *name = + bfc->bc_file->f_path.dentry->d_name.name; + + pr_warn_once("incfs: Decompression error. 
%s", + name); + } + } else if (result >= 0) { + /* Some data was read, but not enough */ + result = -EIO; + } + } + + if (result > 0) { + int err = validate_hash_tree(bfc, f, index, dst, tmp.data); + + if (err < 0) + result = err; + } + + if (result >= 0) + log_block_read(mi, &df->df_id, index); + +out: + if (result == -ETIME) + mi->mi_reads_failed_timed_out++; + else if (result == -EBADMSG) + mi->mi_reads_failed_hash_verification++; + else if (result < 0) + mi->mi_reads_failed_other++; + + incfs_update_sysfs_error(f, index, result, mi, df); + + return result; +} + +ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst, + struct data_file *df, size_t offset) +{ + struct backing_file_context *bfc = NULL; + struct incfs_df_signature *sig = NULL; + size_t to_read = dst.len; + + if (!dst.data || !df) + return -EFAULT; + + sig = df->df_signature; + bfc = df->df_backing_file_context; + + if (offset > sig->hash_size) + return -ERANGE; + + if (offset + to_read > sig->hash_size) + to_read = sig->hash_size - offset; + + return incfs_kread(bfc, dst.data, to_read, sig->hash_offset + offset); +} + +int incfs_process_new_data_block(struct data_file *df, + struct incfs_fill_block *block, u8 *data, + bool *complete) +{ + struct mount_info *mi = NULL; + struct backing_file_context *bfc = NULL; + struct data_file_segment *segment = NULL; + struct data_file_block existing_block = {}; + u16 flags = 0; + int error = 0; + + if (!df || !block) + return -EFAULT; + + bfc = df->df_backing_file_context; + mi = df->df_mount_info; + + if (block->block_index >= df->df_data_block_count) + return -ERANGE; + + segment = get_file_segment(df, block->block_index); + if (!segment) + return -EFAULT; + + if (block->compression == COMPRESSION_LZ4) + flags |= INCFS_BLOCK_COMPRESSED_LZ4; + else if (block->compression == COMPRESSION_ZSTD) + flags |= INCFS_BLOCK_COMPRESSED_ZSTD; + else if (block->compression) + return -EINVAL; + + error = down_read_killable(&segment->rwsem); + if (error) + return error; + + error = get_data_file_block(df, block->block_index, &existing_block); + + up_read(&segment->rwsem); + + if (error) + return error; + if (is_data_block_present(&existing_block)) + /* Block is already present, nothing to do here */ + return 0; + + error = down_write_killable(&segment->rwsem); + if (error) + return error; + + /* Recheck inside write lock */ + error = get_data_file_block(df, block->block_index, &existing_block); + if (error) + goto out_up_write; + + if (is_data_block_present(&existing_block)) + goto out_up_write; + + error = mutex_lock_interruptible(&bfc->bc_mutex); + if (error) + goto out_up_write; + + error = incfs_write_data_block_to_backing_file(bfc, + range(data, block->data_len), block->block_index, + df->df_blockmap_off, flags); + if (error) + goto out_mutex_unlock; + + if (atomic_inc_return(&df->df_data_blocks_written) + >= df->df_data_block_count) + *complete = true; + +out_mutex_unlock: + mutex_unlock(&bfc->bc_mutex); + if (!error) + notify_pending_reads(mi, segment, block->block_index); + +out_up_write: + up_write(&segment->rwsem); + + if (error) + pr_debug("%d error: %d\n", block->block_index, error); + return error; +} + +int incfs_read_file_signature(struct data_file *df, struct mem_range dst) +{ + struct backing_file_context *bfc = df->df_backing_file_context; + struct incfs_df_signature *sig; + int read_res = 0; + + if (!dst.data) + return -EFAULT; + + sig = df->df_signature; + if (!sig) + return 0; + + if (dst.len < sig->sig_size) + return -E2BIG; + + read_res = incfs_kread(bfc, dst.data, 
sig->sig_size, sig->sig_offset); + + if (read_res < 0) + return read_res; + + if (read_res != sig->sig_size) + return -EIO; + + return read_res; +} + +int incfs_process_new_hash_block(struct data_file *df, + struct incfs_fill_block *block, u8 *data) +{ + struct backing_file_context *bfc = NULL; + struct mount_info *mi = NULL; + struct mtree *hash_tree = NULL; + struct incfs_df_signature *sig = NULL; + loff_t hash_area_base = 0; + loff_t hash_area_size = 0; + int error = 0; + + if (!df || !block) + return -EFAULT; + + if (!(block->flags & INCFS_BLOCK_FLAGS_HASH)) + return -EINVAL; + + bfc = df->df_backing_file_context; + mi = df->df_mount_info; + + if (!df) + return -ENOENT; + + hash_tree = df->df_hash_tree; + sig = df->df_signature; + if (!hash_tree || !sig || sig->hash_offset == 0) + return -ENOTSUPP; + + hash_area_base = sig->hash_offset; + hash_area_size = sig->hash_size; + if (hash_area_size < block->block_index * INCFS_DATA_FILE_BLOCK_SIZE + + block->data_len) { + /* Hash block goes beyond dedicated hash area of this file. */ + return -ERANGE; + } + + error = mutex_lock_interruptible(&bfc->bc_mutex); + if (!error) { + error = incfs_write_hash_block_to_backing_file( + bfc, range(data, block->data_len), block->block_index, + hash_area_base, df->df_blockmap_off, df->df_size); + mutex_unlock(&bfc->bc_mutex); + } + if (!error) + atomic_inc(&df->df_hash_blocks_written); + + return error; +} + +static int process_blockmap_md(struct incfs_blockmap *bm, + struct metadata_handler *handler) +{ + struct data_file *df = handler->context; + int error = 0; + loff_t base_off = le64_to_cpu(bm->m_base_offset); + u32 block_count = le32_to_cpu(bm->m_block_count); + + if (!df) + return -EFAULT; + + if (df->df_data_block_count > block_count) + return -EBADMSG; + + df->df_total_block_count = block_count; + df->df_blockmap_off = base_off; + return error; +} + +static int process_file_signature_md(struct incfs_file_signature *sg, + struct metadata_handler *handler) +{ + struct data_file *df = handler->context; + struct mtree *hash_tree = NULL; + int error = 0; + struct incfs_df_signature *signature = + kzalloc(sizeof(*signature), GFP_NOFS); + void *buf = NULL; + ssize_t read; + + if (!signature) + return -ENOMEM; + + if (!df || !df->df_backing_file_context || + !df->df_backing_file_context->bc_file) { + error = -ENOENT; + goto out; + } + + signature->hash_offset = le64_to_cpu(sg->sg_hash_tree_offset); + signature->hash_size = le32_to_cpu(sg->sg_hash_tree_size); + signature->sig_offset = le64_to_cpu(sg->sg_sig_offset); + signature->sig_size = le32_to_cpu(sg->sg_sig_size); + + buf = kzalloc(signature->sig_size, GFP_NOFS); + if (!buf) { + error = -ENOMEM; + goto out; + } + + read = incfs_kread(df->df_backing_file_context, buf, + signature->sig_size, signature->sig_offset); + if (read < 0) { + error = read; + goto out; + } + + if (read != signature->sig_size) { + error = -EINVAL; + goto out; + } + + hash_tree = incfs_alloc_mtree(range(buf, signature->sig_size), + df->df_data_block_count); + if (IS_ERR(hash_tree)) { + error = PTR_ERR(hash_tree); + hash_tree = NULL; + goto out; + } + if (hash_tree->hash_tree_area_size != signature->hash_size) { + error = -EINVAL; + goto out; + } + if (signature->hash_size > 0 && + handler->md_record_offset <= signature->hash_offset) { + error = -EINVAL; + goto out; + } + if (handler->md_record_offset <= signature->sig_offset) { + error = -EINVAL; + goto out; + } + df->df_hash_tree = hash_tree; + hash_tree = NULL; + df->df_signature = signature; + signature = NULL; +out: + 
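
Blocks reach incfs_process_new_data_block()/incfs_process_new_hash_block() above from a userspace filler. A sketch of the feeding side follows: the incfs_fill_block fields match what the two functions consume, while the INCFS_IOC_FILL_BLOCKS ioctl name, the incfs_fill_blocks wrapper, and the header location come from the incfs UAPI, which is outside this excerpt, so treat them as assumptions.

/* Hedged sketch: feed one data or hash block into an incfs file. */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/incrementalfs.h> /* assumed UAPI location */

static int fill_one_block(int fd, uint32_t index, void *data, uint32_t len,
			  int is_hash)
{
	struct incfs_fill_block blk = {
		.block_index = index,
		.data = (uint64_t)(uintptr_t)data,
		.data_len = len,                 /* <= INCFS_DATA_FILE_BLOCK_SIZE */
		.compression = COMPRESSION_NONE, /* or _LZ4 / _ZSTD, see decompress() */
		.flags = is_hash ? INCFS_BLOCK_FLAGS_HASH : 0,
	};
	struct incfs_fill_blocks fill = {
		.count = 1,
		.fill_blocks = (uint64_t)(uintptr_t)&blk,
	};

	/* returns the number of blocks written, or -1 on error; a completed
	 * data file flips *complete in incfs_process_new_data_block() */
	return ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill);
}
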
incfs_free_mtree(hash_tree); + kfree(signature); + kfree(buf); + + return error; +} + +static int process_status_md(struct incfs_status *is, + struct metadata_handler *handler) +{ + struct data_file *df = handler->context; + + df->df_initial_data_blocks_written = + le32_to_cpu(is->is_data_blocks_written); + atomic_set(&df->df_data_blocks_written, + df->df_initial_data_blocks_written); + + df->df_initial_hash_blocks_written = + le32_to_cpu(is->is_hash_blocks_written); + atomic_set(&df->df_hash_blocks_written, + df->df_initial_hash_blocks_written); + + df->df_status_offset = handler->md_record_offset; + return 0; +} + +static int process_file_verity_signature_md( + struct incfs_file_verity_signature *vs, + struct metadata_handler *handler) +{ + struct data_file *df = handler->context; + struct incfs_df_verity_signature *verity_signature; + + if (!df) + return -EFAULT; + + verity_signature = kzalloc(sizeof(*verity_signature), GFP_NOFS); + if (!verity_signature) + return -ENOMEM; + + verity_signature->offset = le64_to_cpu(vs->vs_offset); + verity_signature->size = le32_to_cpu(vs->vs_size); + if (verity_signature->size > FS_VERITY_MAX_SIGNATURE_SIZE) { + kfree(verity_signature); + return -EFAULT; + } + + df->df_verity_signature = verity_signature; + return 0; +} + +static int incfs_scan_metadata_chain(struct data_file *df) +{ + struct metadata_handler *handler = NULL; + int result = 0; + int records_count = 0; + int error = 0; + struct backing_file_context *bfc = NULL; + int nondata_block_count; + + if (!df || !df->df_backing_file_context) + return -EFAULT; + + bfc = df->df_backing_file_context; + + handler = kzalloc(sizeof(*handler), GFP_NOFS); + if (!handler) + return -ENOMEM; + + handler->md_record_offset = df->df_metadata_off; + handler->context = df; + handler->handle_blockmap = process_blockmap_md; + handler->handle_signature = process_file_signature_md; + handler->handle_status = process_status_md; + handler->handle_verity_signature = process_file_verity_signature_md; + + while (handler->md_record_offset > 0) { + error = incfs_read_next_metadata_record(bfc, handler); + if (error) { + pr_warn("incfs: Error during reading incfs-metadata record. Offset: %lld Record #%d Error code: %d\n", + handler->md_record_offset, records_count + 1, + -error); + break; + } + records_count++; + } + if (error) { + pr_warn("incfs: Error %d after reading %d incfs-metadata records.\n", + -error, records_count); + result = error; + } else + result = records_count; + + nondata_block_count = df->df_total_block_count - + df->df_data_block_count; + if (df->df_hash_tree) { + int hash_block_count = get_blocks_count_for_size( + df->df_hash_tree->hash_tree_area_size); + + /* + * Files that were created with a hash tree have the hash tree + * included in the block map, i.e. nondata_block_count == + * hash_block_count. Files whose hash tree was added by + * FS_IOC_ENABLE_VERITY will still have the original block + * count, i.e. nondata_block_count == 0. + */ + if (nondata_block_count != hash_block_count && + nondata_block_count != 0) + result = -EINVAL; + } else if (nondata_block_count != 0) { + result = -EINVAL; + } + + kfree(handler); + return result; +} + +/* + * Quickly checks if there are pending reads with a serial number larger + * than a given one. 
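The metadata scan above is the canonical consumer of the backing file's append-only record list: start at the offset recorded in the file header, follow h_next_md_offset until it reaches zero, and dispatch each record by type. A standalone sketch of that traversal pattern, with a simplified stand-in record header and pread() in place of incfs_kread():

/*
 * Minimal sketch of the metadata-chain walk. The record layout here is
 * a simplified stand-in, not the on-disk incfs format.
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct md_header {                     /* stand-in record header */
	uint8_t  type;
	uint16_t record_size;
	uint64_t next_md_offset;       /* 0 terminates the chain */
} __attribute__((packed));

static int scan_chain(int fd, uint64_t first_md_off)
{
	struct md_header hdr;
	uint64_t off = first_md_off;
	int count = 0;

	while (off != 0) {
		if (pread(fd, &hdr, sizeof(hdr), off) != sizeof(hdr))
			return -1;     /* short read: corrupt chain */
		/* Links must move forward, which bounds the loop. */
		if (hdr.next_md_offset != 0 && hdr.next_md_offset <= off)
			return -1;
		printf("record #%d type %u at %llu\n", ++count, hdr.type,
		       (unsigned long long)off);
		off = hdr.next_md_offset;
	}
	return count;
}

The forward-only check mirrors the guard in incfs_read_next_metadata_record() and guarantees termination even on a corrupted backing file.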
+ */ +bool incfs_fresh_pending_reads_exist(struct mount_info *mi, int last_number) +{ + bool result = false; + + spin_lock(&mi->pending_read_lock); + result = (mi->mi_last_pending_read_number > last_number) && + (mi->mi_pending_reads_count > 0); + spin_unlock(&mi->pending_read_lock); + return result; +} + +int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound, + struct incfs_pending_read_info *reads, + struct incfs_pending_read_info2 *reads2, + int reads_size, int *new_max_sn) +{ + int reported_reads = 0; + struct pending_read *entry = NULL; + + if (!mi) + return -EFAULT; + + if (reads_size <= 0) + return 0; + + if (!incfs_fresh_pending_reads_exist(mi, sn_lowerbound)) + return 0; + + rcu_read_lock(); + + list_for_each_entry_rcu(entry, &mi->mi_reads_list_head, mi_reads_list) { + if (entry->serial_number <= sn_lowerbound) + continue; + + if (reads) { + reads[reported_reads].file_id = entry->file_id; + reads[reported_reads].block_index = entry->block_index; + reads[reported_reads].serial_number = + entry->serial_number; + reads[reported_reads].timestamp_us = + entry->timestamp_us; + } + + if (reads2) { + reads2[reported_reads].file_id = entry->file_id; + reads2[reported_reads].block_index = entry->block_index; + reads2[reported_reads].serial_number = + entry->serial_number; + reads2[reported_reads].timestamp_us = + entry->timestamp_us; + reads2[reported_reads].uid = entry->uid; + } + + if (entry->serial_number > *new_max_sn) + *new_max_sn = entry->serial_number; + + reported_reads++; + if (reported_reads >= reads_size) + break; + } + + rcu_read_unlock(); + + return reported_reads; +} + +struct read_log_state incfs_get_log_state(struct mount_info *mi) +{ + struct read_log *log = &mi->mi_log; + struct read_log_state result; + + spin_lock(&log->rl_lock); + result = log->rl_head; + spin_unlock(&log->rl_lock); + return result; +} + +int incfs_get_uncollected_logs_count(struct mount_info *mi, + const struct read_log_state *state) +{ + struct read_log *log = &mi->mi_log; + u32 generation; + u64 head_no, tail_no; + + spin_lock(&log->rl_lock); + tail_no = log->rl_tail.current_record_no; + head_no = log->rl_head.current_record_no; + generation = log->rl_head.generation_id; + spin_unlock(&log->rl_lock); + + if (generation != state->generation_id) + return head_no - tail_no; + else + return head_no - max_t(u64, tail_no, state->current_record_no); +} + +int incfs_collect_logged_reads(struct mount_info *mi, + struct read_log_state *state, + struct incfs_pending_read_info *reads, + struct incfs_pending_read_info2 *reads2, + int reads_size) +{ + int dst_idx; + struct read_log *log = &mi->mi_log; + struct read_log_state *head, *tail; + + spin_lock(&log->rl_lock); + head = &log->rl_head; + tail = &log->rl_tail; + + if (state->generation_id != head->generation_id) { + pr_debug("read ptr is wrong generation: %u/%u", + state->generation_id, head->generation_id); + + *state = (struct read_log_state){ + .generation_id = head->generation_id, + }; + } + + if (state->current_record_no < tail->current_record_no) { + pr_debug("read ptr is behind, moving: %u/%u -> %u/%u\n", + (u32)state->next_offset, + (u32)state->current_pass_no, + (u32)tail->next_offset, (u32)tail->current_pass_no); + + *state = *tail; + } + + for (dst_idx = 0; dst_idx < reads_size; dst_idx++) { + if (state->current_record_no == head->current_record_no) + break; + + log_read_one_record(log, state); + + if (reads) + reads[dst_idx] = (struct incfs_pending_read_info) { + .file_id = state->base_record.file_id, + .block_index = 
state->base_record.block_index, + .serial_number = state->current_record_no, + .timestamp_us = + state->base_record.absolute_ts_us, + }; + + if (reads2) + reads2[dst_idx] = (struct incfs_pending_read_info2) { + .file_id = state->base_record.file_id, + .block_index = state->base_record.block_index, + .serial_number = state->current_record_no, + .timestamp_us = + state->base_record.absolute_ts_us, + .uid = state->base_record.uid, + }; + } + + spin_unlock(&log->rl_lock); + return dst_idx; +} + diff --git a/fs/incfs/data_mgmt.h b/fs/incfs/data_mgmt.h new file mode 100644 index 000000000000..8bd5c2fcf6dc --- /dev/null +++ b/fs/incfs/data_mgmt.h @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ +#ifndef _INCFS_DATA_MGMT_H +#define _INCFS_DATA_MGMT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" +#include "pseudo_files.h" + +#define SEGMENTS_PER_FILE 3 + +enum LOG_RECORD_TYPE { + FULL, + SAME_FILE, + SAME_FILE_CLOSE_BLOCK, + SAME_FILE_CLOSE_BLOCK_SHORT, + SAME_FILE_NEXT_BLOCK, + SAME_FILE_NEXT_BLOCK_SHORT, +}; + +struct full_record { + enum LOG_RECORD_TYPE type : 3; /* FULL */ + u32 block_index : 29; + incfs_uuid_t file_id; + u64 absolute_ts_us; + uid_t uid; +} __packed; /* 32 bytes */ + +struct same_file { + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE */ + u32 block_index : 29; + uid_t uid; + u16 relative_ts_us; /* max 2^16 us ~= 64 ms */ +} __packed; /* 10 bytes */ + +struct same_file_close_block { + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK */ + u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */ + s16 block_index_delta; +} __packed; /* 4 bytes */ + +struct same_file_close_block_short { + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK_SHORT */ + u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */ + s8 block_index_delta; +} __packed; /* 2 bytes */ + +struct same_file_next_block { + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK */ + u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */ +} __packed; /* 2 bytes */ + +struct same_file_next_block_short { + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK_SHORT */ + u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */ +} __packed; /* 1 byte */ + +union log_record { + struct full_record full_record; + struct same_file same_file; + struct same_file_close_block same_file_close_block; + struct same_file_close_block_short same_file_close_block_short; + struct same_file_next_block same_file_next_block; + struct same_file_next_block_short same_file_next_block_short; +}; + +struct read_log_state { + /* Log buffer generation id, incremented on configuration changes */ + u32 generation_id; + + /* Offset in rl_ring_buf to write into. */ + u32 next_offset; + + /* Current number of writer passes over rl_ring_buf */ + u32 current_pass_no; + + /* Current full_record to diff against */ + struct full_record base_record; + + /* Current record number counting from configuration change */ + u64 current_record_no; +}; + +/* A ring buffer to save records about data blocks which were recently read. 
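The six record types above trade generality for size: a reader touching consecutive blocks of the same file within a few hundred microseconds costs one byte per event instead of thirty-two. A sketch of the selection logic implied by the field widths; the exact fallback order of the real encoder is an assumption here:

#include <stdbool.h>
#include <stdint.h>

enum rec_type { REC_FULL, REC_SAME_FILE, REC_CLOSE_BLOCK,
		REC_CLOSE_BLOCK_SHORT, REC_NEXT_BLOCK, REC_NEXT_BLOCK_SHORT };

static enum rec_type pick_record(bool same_file, bool same_uid,
				 int32_t block_delta, uint64_t ts_delta_us)
{
	if (!same_file || ts_delta_us >= (1u << 16))
		return REC_FULL;                      /* 32 B: full state */
	if (same_uid) {
		/* SHORT forms store tens of us in 5 bits: < 320 us. */
		if (block_delta == 1 && ts_delta_us < 320)
			return REC_NEXT_BLOCK_SHORT;  /* 1 B */
		if (block_delta == 1 && ts_delta_us < (1u << 13))
			return REC_NEXT_BLOCK;        /* 2 B */
		if (block_delta >= -128 && block_delta <= 127 &&
		    ts_delta_us < 320)
			return REC_CLOSE_BLOCK_SHORT; /* 2 B */
		if (block_delta >= -32768 && block_delta <= 32767 &&
		    ts_delta_us < (1u << 13))
			return REC_CLOSE_BLOCK;       /* 4 B */
	}
	return REC_SAME_FILE;                         /* 10 B: uid + index */
}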
*/ +struct read_log { + void *rl_ring_buf; + + int rl_size; + + struct read_log_state rl_head; + + struct read_log_state rl_tail; + + /* A lock to protect the above fields */ + spinlock_t rl_lock; + + /* A queue of waiters who want to be notified about reads */ + wait_queue_head_t ml_notif_wq; + + /* A work item to wake up those waiters without slowing down readers */ + struct delayed_work ml_wakeup_work; +}; + +struct mount_options { + unsigned int read_timeout_ms; + unsigned int readahead_pages; + unsigned int read_log_pages; + unsigned int read_log_wakeup_count; + bool report_uid; + char *sysfs_name; +}; + +struct mount_info { + struct super_block *mi_sb; + + struct path mi_backing_dir_path; + + struct dentry *mi_index_dir; + /* For stacking mounts, if true, this indicates if the index dir needs + * to be freed for this SB otherwise it was created by lower level SB */ + bool mi_index_free; + + struct dentry *mi_incomplete_dir; + /* For stacking mounts, if true, this indicates if the incomplete dir + * needs to be freed for this SB. Similar to mi_index_free */ + bool mi_incomplete_free; + + const struct cred *mi_owner; + + struct mount_options mi_options; + + /* This mutex is to be taken before create, rename, delete */ + struct mutex mi_dir_struct_mutex; + + /* + * A queue of waiters who want to be notified about new pending reads. + */ + wait_queue_head_t mi_pending_reads_notif_wq; + + /* + * Protects - RCU safe: + * - reads_list_head + * - mi_pending_reads_count + * - mi_last_pending_read_number + * - data_file_segment.reads_list_head + */ + spinlock_t pending_read_lock; + + /* List of active pending_read objects */ + struct list_head mi_reads_list_head; + + /* Total number of items in reads_list_head */ + int mi_pending_reads_count; + + /* + * Last serial number that was assigned to a pending read. + * 0 means no pending reads have been seen yet. + */ + int mi_last_pending_read_number; + + /* Temporary buffer for read logger. 
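The head/tail pair above is what incfs_get_uncollected_logs_count() compares against a reader's saved position: record numbers grow monotonically, and a bumped generation_id invalidates any saved position outright. The same arithmetic with simplified stand-in types:

#include <stdint.h>

struct log_pos {
	uint32_t generation_id;
	uint64_t record_no;      /* monotonically increasing */
};

static uint64_t uncollected(struct log_pos head, struct log_pos tail,
			    struct log_pos reader)
{
	uint64_t from;

	if (reader.generation_id != head.generation_id)
		return head.record_no - tail.record_no;  /* all unread */
	/* A slow reader may have been lapped by the ring's tail. */
	from = tail.record_no > reader.record_no
		? tail.record_no : reader.record_no;
	return head.record_no - from;
}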
*/ + struct read_log mi_log; + + /* SELinux needs special xattrs on our pseudo files */ + struct mem_range pseudo_file_xattr[PSEUDO_FILE_COUNT]; + + /* A queue of waiters who want to be notified about blocks_written */ + wait_queue_head_t mi_blocks_written_notif_wq; + + /* Number of blocks written since mount */ + atomic_t mi_blocks_written; + + /* Per UID read timeouts */ + spinlock_t mi_per_uid_read_timeouts_lock; + struct incfs_per_uid_read_timeouts *mi_per_uid_read_timeouts; + int mi_per_uid_read_timeouts_size; + + /* zstd workspace */ + struct mutex mi_zstd_workspace_mutex; + void *mi_zstd_workspace; + ZSTD_DStream *mi_zstd_stream; + struct delayed_work mi_zstd_cleanup_work; + + /* sysfs node */ + struct incfs_sysfs_node *mi_sysfs_node; + + /* Last error information */ + struct mutex mi_le_mutex; + incfs_uuid_t mi_le_file_id; + u64 mi_le_time_us; + u32 mi_le_page; + u32 mi_le_errno; + uid_t mi_le_uid; + + /* Number of reads timed out */ + u32 mi_reads_failed_timed_out; + + /* Number of reads failed because hash verification failed */ + u32 mi_reads_failed_hash_verification; + + /* Number of reads failed for another reason */ + u32 mi_reads_failed_other; + + /* Number of reads delayed because page had to be fetched */ + u32 mi_reads_delayed_pending; + + /* Total time waiting for pages to be fetched */ + u64 mi_reads_delayed_pending_us; + + /* + * Number of reads delayed because of per-uid min_time_us or + * min_pending_time_us settings + */ + u32 mi_reads_delayed_min; + + /* Total time waiting because of per-uid min_time_us or + * min_pending_time_us settings. + * + * Note that if a read is initially delayed because we have to wait for + * the page, then further delayed because of min_pending_time_us + * setting, this counter gets incremented by only the further delay + * time. + */ + u64 mi_reads_delayed_min_us; +}; + +struct data_file_block { + loff_t db_backing_file_data_offset; + + size_t db_stored_size; + + enum incfs_compression_alg db_comp_alg; +}; + +struct pending_read { + incfs_uuid_t file_id; + + s64 timestamp_us; + + atomic_t done; + + int block_index; + + int serial_number; + + uid_t uid; + + struct list_head mi_reads_list; + + struct list_head segment_reads_list; + + struct rcu_head rcu; +}; + +struct data_file_segment { + wait_queue_head_t new_data_arrival_wq; + + /* Protects reads and writes from the blockmap */ + struct rw_semaphore rwsem; + + /* List of active pending_read objects belonging to this segment */ + /* Protected by mount_info.pending_reads_mutex */ + struct list_head reads_list_head; +}; + +/* + * Extra info associated with a file. Just a few bytes set by a user. + */ +struct file_attr { + loff_t fa_value_offset; + + size_t fa_value_size; + + u32 fa_crc; +}; + + +struct data_file { + struct backing_file_context *df_backing_file_context; + + struct mount_info *df_mount_info; + + incfs_uuid_t df_id; + + /* + * Array of segments used to reduce lock contention for the file. + * Segment is chosen for a block depends on the block's index. + */ + struct data_file_segment df_segments[SEGMENTS_PER_FILE]; + + /* Base offset of the first metadata record. */ + loff_t df_metadata_off; + + /* Base offset of the block map. 
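The df_segments array described below spreads block-level locking across SEGMENTS_PER_FILE independent rw_semaphores, so fills and reads of distant blocks rarely contend. get_file_segment() itself is not visible in this excerpt; a plausible implementation consistent with its callers, modulo any bounds checking the real helper performs, would simply bucket by block index:

/* Hypothetical sketch; the real get_file_segment() may differ. */
static struct data_file_segment *get_file_segment(struct data_file *df,
						  int block_index)
{
	if (block_index < 0)
		return NULL;
	return &df->df_segments[block_index % SEGMENTS_PER_FILE];
}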
*/ + loff_t df_blockmap_off; + + /* File size in bytes */ + loff_t df_size; + + /* File header flags */ + u32 df_header_flags; + + /* File size in DATA_FILE_BLOCK_SIZE blocks */ + int df_data_block_count; + + /* Total number of blocks, data + hash */ + int df_total_block_count; + + /* For mapped files, the offset into the actual file */ + loff_t df_mapped_offset; + + /* Number of data blocks written to file */ + atomic_t df_data_blocks_written; + + /* Number of data blocks in the status block */ + u32 df_initial_data_blocks_written; + + /* Number of hash blocks written to file */ + atomic_t df_hash_blocks_written; + + /* Number of hash blocks in the status block */ + u32 df_initial_hash_blocks_written; + + /* Offset to status metadata header */ + loff_t df_status_offset; + + /* + * Mutex acquired while enabling verity. Note that df_hash_tree is set + * by enable verity. + * + * The backing file mutex bc_mutex may be taken while this mutex is + * held. + */ + struct mutex df_enable_verity; + + /* + * Set either at construction time or during enabling verity. In the + * latter case, set via smp_store_release, so use smp_load_acquire to + * read it. + */ + struct mtree *df_hash_tree; + + /* Guaranteed set if df_hash_tree is set. */ + struct incfs_df_signature *df_signature; + + /* + * The verity file digest, set when verity is enabled and the file has + * been opened + */ + struct mem_range df_verity_file_digest; + + struct incfs_df_verity_signature *df_verity_signature; +}; + +struct dir_file { + struct mount_info *mount_info; + + struct file *backing_dir; +}; + +struct inode_info { + struct mount_info *n_mount_info; /* A mount, this file belongs to */ + + struct inode *n_backing_inode; + + struct data_file *n_file; + + struct inode n_vfs_inode; +}; + +struct dentry_info { + struct path backing_path; +}; + +enum FILL_PERMISSION { + CANT_FILL = 0, + CAN_FILL = 1, +}; + +struct incfs_file_data { + /* Does this file handle have INCFS_IOC_FILL_BLOCKS permission */ + enum FILL_PERMISSION fd_fill_permission; + + /* If INCFS_IOC_GET_FILLED_BLOCKS has been called, where are we */ + int fd_get_block_pos; + + /* And how many filled blocks are there up to that point */ + int fd_filled_data_blocks; + int fd_filled_hash_blocks; +}; + +struct mount_info *incfs_alloc_mount_info(struct super_block *sb, + struct mount_options *options, + struct path *backing_dir_path); + +int incfs_realloc_mount_info(struct mount_info *mi, + struct mount_options *options); + +void incfs_free_mount_info(struct mount_info *mi); + +char *file_id_to_str(incfs_uuid_t id); +struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name); +struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf); +void incfs_free_data_file(struct data_file *df); + +struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf); +void incfs_free_dir_file(struct dir_file *dir); + +struct incfs_read_data_file_timeouts { + u32 min_time_us; + u32 min_pending_time_us; + u32 max_pending_time_us; +}; + +ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f, + int index, struct mem_range tmp, + struct incfs_read_data_file_timeouts *timeouts, + unsigned int *delayed_min_us); + +ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst, + struct data_file *df, size_t offset); + +int incfs_get_filled_blocks(struct data_file *df, + struct incfs_file_data *fd, + struct incfs_get_filled_blocks_args *arg); + +int incfs_read_file_signature(struct data_file *df, struct mem_range dst); + +int 
incfs_process_new_data_block(struct data_file *df, + struct incfs_fill_block *block, u8 *data, + bool *complete); + +int incfs_process_new_hash_block(struct data_file *df, + struct incfs_fill_block *block, u8 *data); + +bool incfs_fresh_pending_reads_exist(struct mount_info *mi, int last_number); + +/* + * Collects pending reads and saves them into the array (reads/reads_size). + * Only reads with serial_number > sn_lowerbound are reported. + * Returns how many reads were saved into the array. + */ +int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound, + struct incfs_pending_read_info *reads, + struct incfs_pending_read_info2 *reads2, + int reads_size, int *new_max_sn); + +int incfs_collect_logged_reads(struct mount_info *mi, + struct read_log_state *start_state, + struct incfs_pending_read_info *reads, + struct incfs_pending_read_info2 *reads2, + int reads_size); +struct read_log_state incfs_get_log_state(struct mount_info *mi); +int incfs_get_uncollected_logs_count(struct mount_info *mi, + const struct read_log_state *state); + +static inline struct inode_info *get_incfs_node(struct inode *inode) +{ + if (!inode) + return NULL; + + if (inode->i_sb->s_magic != INCFS_MAGIC_NUMBER) { + /* This inode doesn't belong to us. */ + pr_warn_once("incfs: %s on an alien inode.", __func__); + return NULL; + } + + return container_of(inode, struct inode_info, n_vfs_inode); +} + +static inline struct data_file *get_incfs_data_file(struct file *f) +{ + struct inode_info *node = NULL; + + if (!f) + return NULL; + + if (!S_ISREG(f->f_inode->i_mode)) + return NULL; + + node = get_incfs_node(f->f_inode); + if (!node) + return NULL; + + return node->n_file; +} + +static inline struct dir_file *get_incfs_dir_file(struct file *f) +{ + if (!f) + return NULL; + + if (!S_ISDIR(f->f_inode->i_mode)) + return NULL; + + return (struct dir_file *)f->private_data; +} + +/* + * Make sure that inode_info.n_file is initialized and inode can be used + * for reading and writing data from/to the backing file. 
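get_incfs_node() above relies on struct inode being embedded inside inode_info, so container_of() recovers the filesystem-private wrapper from the plain VFS pointer with pure pointer arithmetic. A self-contained illustration of the pattern, using a simplified container_of macro:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long i_ino; };  /* stand-in for struct inode */

struct node_info {
	int private_state;
	struct vfs_inode vfs_inode;         /* embedded, as in incfs */
};

int main(void)
{
	struct node_info n = { .private_state = 42 };
	struct vfs_inode *ino = &n.vfs_inode;   /* what the VFS hands us */
	struct node_info *back =
		container_of(ino, struct node_info, vfs_inode);

	printf("%d\n", back->private_state);    /* prints 42 */
	return 0;
}

The magic-number check in get_incfs_node() exists because container_of() is unconditional arithmetic: handed an inode from another filesystem it would happily return garbage.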
+ */ +int make_inode_ready_for_data_ops(struct mount_info *mi, + struct inode *inode, + struct file *backing_file); + +static inline struct dentry_info *get_incfs_dentry(const struct dentry *d) +{ + if (!d) + return NULL; + + return (struct dentry_info *)d->d_fsdata; +} + +static inline void get_incfs_backing_path(const struct dentry *d, + struct path *path) +{ + struct dentry_info *di = get_incfs_dentry(d); + + if (!di) { + *path = (struct path) {}; + return; + } + + *path = di->backing_path; + path_get(path); +} + +static inline int get_blocks_count_for_size(u64 size) +{ + if (size == 0) + return 0; + return 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; +} + +#endif /* _INCFS_DATA_MGMT_H */ diff --git a/fs/incfs/format.c b/fs/incfs/format.c new file mode 100644 index 000000000000..6ffcf334f105 --- /dev/null +++ b/fs/incfs/format.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018 Google LLC + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "format.h" +#include "data_mgmt.h" + +struct backing_file_context *incfs_alloc_bfc(struct mount_info *mi, + struct file *backing_file) +{ + struct backing_file_context *result = NULL; + + result = kzalloc(sizeof(*result), GFP_NOFS); + if (!result) + return ERR_PTR(-ENOMEM); + + result->bc_file = get_file(backing_file); + result->bc_cred = mi->mi_owner; + mutex_init(&result->bc_mutex); + return result; +} + +void incfs_free_bfc(struct backing_file_context *bfc) +{ + if (!bfc) + return; + + if (bfc->bc_file) + fput(bfc->bc_file); + + mutex_destroy(&bfc->bc_mutex); + kfree(bfc); +} + +static loff_t incfs_get_end_offset(struct file *f) +{ + /* + * This function assumes that file size and the end-offset + * are the same. This is not always true. + */ + return i_size_read(file_inode(f)); +} + +/* + * Truncate the tail of the file to the given length. + * Used to rollback partially successful multistep writes. + */ +static int truncate_backing_file(struct backing_file_context *bfc, + loff_t new_end) +{ + struct inode *inode = NULL; + struct dentry *dentry = NULL; + loff_t old_end = 0; + struct iattr attr; + int result = 0; + + if (!bfc) + return -EFAULT; + + LOCK_REQUIRED(bfc->bc_mutex); + + if (!bfc->bc_file) + return -EFAULT; + + old_end = incfs_get_end_offset(bfc->bc_file); + if (old_end == new_end) + return 0; + if (old_end < new_end) + return -EINVAL; + + inode = bfc->bc_file->f_inode; + dentry = bfc->bc_file->f_path.dentry; + + attr.ia_size = new_end; + attr.ia_valid = ATTR_SIZE; + + inode_lock(inode); + result = notify_change(&init_user_ns, dentry, &attr, NULL); + inode_unlock(inode); + + return result; +} + +static int write_to_bf(struct backing_file_context *bfc, const void *buf, + size_t count, loff_t pos) +{ + ssize_t res = incfs_kwrite(bfc, buf, count, pos); + + if (res < 0) + return res; + if (res != count) + return -EIO; + return 0; +} + +static int append_zeros_no_fallocate(struct backing_file_context *bfc, + size_t file_size, size_t len) +{ + u8 buffer[256] = {}; + size_t i; + + for (i = 0; i < len; i += sizeof(buffer)) { + int to_write = len - i > sizeof(buffer) + ? sizeof(buffer) : len - i; + int err = write_to_bf(bfc, buffer, to_write, file_size + i); + + if (err) + return err; + } + + return 0; +} + +/* Append a given number of zero bytes to the end of the backing file. 
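The chunked fallback above and the vfs_fallocate() fast path in append_zeros() just below combine into a common idiom: allocate one byte at the new end of the file so the kernel zero-extends it, and only write zeroes by hand when the filesystem lacks fallocate support. A userspace analogue using posix_fallocate(), with error handling trimmed (note that glibc may itself emulate posix_fallocate() with writes on some filesystems):

#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int append_zeros_demo(int fd, off_t len)
{
	struct stat st;
	char buf[256];
	off_t i;

	if (fstat(fd, &st) != 0)
		return -1;
	/* Fast path: one byte at the new end zero-extends the file. */
	if (posix_fallocate(fd, st.st_size + len - 1, 1) == 0)
		return 0;
	/* Fallback: explicit zero writes in small chunks. */
	memset(buf, 0, sizeof(buf));
	for (i = 0; i < len; i += sizeof(buf)) {
		ssize_t n = (len - i) > (off_t)sizeof(buf)
			    ? (ssize_t)sizeof(buf) : (ssize_t)(len - i);

		if (pwrite(fd, buf, n, st.st_size + i) != n)
			return -1;
	}
	return 0;
}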
*/ +static int append_zeros(struct backing_file_context *bfc, size_t len) +{ + loff_t file_size = 0; + loff_t new_last_byte_offset = 0; + int result; + + if (!bfc) + return -EFAULT; + + if (len == 0) + return 0; + + LOCK_REQUIRED(bfc->bc_mutex); + + /* + * Allocate only one byte at the new desired end of the file. + * It will increase file size and create a zeroed area of + * a given size. + */ + file_size = incfs_get_end_offset(bfc->bc_file); + new_last_byte_offset = file_size + len - 1; + result = vfs_fallocate(bfc->bc_file, 0, new_last_byte_offset, 1); + if (result != -EOPNOTSUPP) + return result; + + return append_zeros_no_fallocate(bfc, file_size, len); +} + +/* + * Append a given metadata record to the backing file and update a previous + * record to add the new record the the metadata list. + */ +static int append_md_to_backing_file(struct backing_file_context *bfc, + struct incfs_md_header *record) +{ + int result = 0; + loff_t record_offset; + loff_t file_pos; + __le64 new_md_offset; + size_t record_size; + + if (!bfc || !record) + return -EFAULT; + + if (bfc->bc_last_md_record_offset < 0) + return -EINVAL; + + LOCK_REQUIRED(bfc->bc_mutex); + + record_size = le16_to_cpu(record->h_record_size); + file_pos = incfs_get_end_offset(bfc->bc_file); + record->h_next_md_offset = 0; + + /* Write the metadata record to the end of the backing file */ + record_offset = file_pos; + new_md_offset = cpu_to_le64(record_offset); + result = write_to_bf(bfc, record, record_size, file_pos); + if (result) + return result; + + /* Update next metadata offset in a previous record or a superblock. */ + if (bfc->bc_last_md_record_offset) { + /* + * Find a place in the previous md record where new record's + * offset needs to be saved. + */ + file_pos = bfc->bc_last_md_record_offset + + offsetof(struct incfs_md_header, h_next_md_offset); + } else { + /* + * No metadata yet, file a place to update in the + * file_header. + */ + file_pos = offsetof(struct incfs_file_header, + fh_first_md_offset); + } + result = write_to_bf(bfc, &new_md_offset, sizeof(new_md_offset), + file_pos); + if (result) + return result; + + bfc->bc_last_md_record_offset = record_offset; + return result; +} + +/* + * Reserve 0-filled space for the blockmap body, and append + * incfs_blockmap metadata record pointing to it. + */ +int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc, + u32 block_count) +{ + struct incfs_blockmap blockmap = {}; + int result = 0; + loff_t file_end = 0; + size_t map_size = block_count * sizeof(struct incfs_blockmap_entry); + + if (!bfc) + return -EFAULT; + + blockmap.m_header.h_md_entry_type = INCFS_MD_BLOCK_MAP; + blockmap.m_header.h_record_size = cpu_to_le16(sizeof(blockmap)); + blockmap.m_header.h_next_md_offset = cpu_to_le64(0); + blockmap.m_block_count = cpu_to_le32(block_count); + + LOCK_REQUIRED(bfc->bc_mutex); + + /* Reserve 0-filled space for the blockmap body in the backing file. */ + file_end = incfs_get_end_offset(bfc->bc_file); + result = append_zeros(bfc, map_size); + if (result) + return result; + + /* Write blockmap metadata record pointing to the body written above. 
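append_md_to_backing_file() above is the only writer of the metadata list, and its two-step append keeps the list consistent: the new record lands at EOF first, and only then is the previous record's next-offset field (or the header's first-record field, for an empty list) patched to publish it. A standalone sketch; the two field offsets are assumptions for illustration, and endianness conversion is omitted:

#include <stdint.h>
#include <unistd.h>

#define NEXT_OFF_FIELD	8   /* assumed offset of next link in a record */
#define FIRST_OFF_FIELD	16  /* assumed offset of first link in header */

static int append_md(int fd, uint64_t *last_rec_off, uint64_t eof,
		     const void *rec, uint16_t rec_size)
{
	uint64_t link_pos = *last_rec_off
		? *last_rec_off + NEXT_OFF_FIELD  /* previous record */
		: FIRST_OFF_FIELD;                /* empty list: header */

	/* Step 1: the record itself, unreachable until linked. */
	if (pwrite(fd, rec, rec_size, eof) != rec_size)
		return -1;
	/* Step 2: publish it by patching the predecessor's link. */
	if (pwrite(fd, &eof, sizeof(eof), link_pos) != sizeof(eof))
		return -1;
	*last_rec_off = eof;
	return 0;
}

A crash between the two writes leaves an unlinked record at EOF, which a later truncate or append simply overwrites; the list itself never points at torn data.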
*/ + blockmap.m_base_offset = cpu_to_le64(file_end); + result = append_md_to_backing_file(bfc, &blockmap.m_header); + if (result) + /* Error, rollback file changes */ + truncate_backing_file(bfc, file_end); + + return result; +} + +int incfs_write_signature_to_backing_file(struct backing_file_context *bfc, + struct mem_range sig, u32 tree_size, + loff_t *tree_offset, loff_t *sig_offset) +{ + struct incfs_file_signature sg = {}; + int result = 0; + loff_t rollback_pos = 0; + loff_t tree_area_pos = 0; + size_t alignment = 0; + + if (!bfc) + return -EFAULT; + + LOCK_REQUIRED(bfc->bc_mutex); + + rollback_pos = incfs_get_end_offset(bfc->bc_file); + + sg.sg_header.h_md_entry_type = INCFS_MD_SIGNATURE; + sg.sg_header.h_record_size = cpu_to_le16(sizeof(sg)); + sg.sg_header.h_next_md_offset = cpu_to_le64(0); + if (sig.data != NULL && sig.len > 0) { + sg.sg_sig_size = cpu_to_le32(sig.len); + sg.sg_sig_offset = cpu_to_le64(rollback_pos); + + result = write_to_bf(bfc, sig.data, sig.len, rollback_pos); + if (result) + goto err; + } + + tree_area_pos = incfs_get_end_offset(bfc->bc_file); + if (tree_size > 0) { + if (tree_size > 5 * INCFS_DATA_FILE_BLOCK_SIZE) { + /* + * If hash tree is big enough, it makes sense to + * align in the backing file for faster access. + */ + loff_t offset = round_up(tree_area_pos, PAGE_SIZE); + + alignment = offset - tree_area_pos; + tree_area_pos = offset; + } + + /* + * If root hash is not the only hash in the tree. + * reserve 0-filled space for the tree. + */ + result = append_zeros(bfc, tree_size + alignment); + if (result) + goto err; + + sg.sg_hash_tree_size = cpu_to_le32(tree_size); + sg.sg_hash_tree_offset = cpu_to_le64(tree_area_pos); + } + + /* Write a hash tree metadata record pointing to the hash tree above. */ + result = append_md_to_backing_file(bfc, &sg.sg_header); +err: + if (result) + /* Error, rollback file changes */ + truncate_backing_file(bfc, rollback_pos); + else { + if (tree_offset) + *tree_offset = tree_area_pos; + if (sig_offset) + *sig_offset = rollback_pos; + } + + return result; +} + +static int write_new_status_to_backing_file(struct backing_file_context *bfc, + u32 data_blocks_written, + u32 hash_blocks_written) +{ + int result; + loff_t rollback_pos; + struct incfs_status is = { + .is_header = { + .h_md_entry_type = INCFS_MD_STATUS, + .h_record_size = cpu_to_le16(sizeof(is)), + }, + .is_data_blocks_written = cpu_to_le32(data_blocks_written), + .is_hash_blocks_written = cpu_to_le32(hash_blocks_written), + }; + + LOCK_REQUIRED(bfc->bc_mutex); + rollback_pos = incfs_get_end_offset(bfc->bc_file); + result = append_md_to_backing_file(bfc, &is.is_header); + if (result) + truncate_backing_file(bfc, rollback_pos); + + return result; +} + +int incfs_write_status_to_backing_file(struct backing_file_context *bfc, + loff_t status_offset, + u32 data_blocks_written, + u32 hash_blocks_written) +{ + struct incfs_status is; + int result; + + if (!bfc) + return -EFAULT; + + if (status_offset == 0) + return write_new_status_to_backing_file(bfc, + data_blocks_written, hash_blocks_written); + + result = incfs_kread(bfc, &is, sizeof(is), status_offset); + if (result != sizeof(is)) + return -EIO; + + is.is_data_blocks_written = cpu_to_le32(data_blocks_written); + is.is_hash_blocks_written = cpu_to_le32(hash_blocks_written); + result = incfs_kwrite(bfc, &is, sizeof(is), status_offset); + if (result != sizeof(is)) + return -EIO; + + return 0; +} + +int incfs_write_verity_signature_to_backing_file( + struct backing_file_context *bfc, struct mem_range signature, + 
loff_t *offset) +{ + struct incfs_file_verity_signature vs = {}; + int result; + loff_t pos; + + /* No verity signature section is equivalent to an empty section */ + if (signature.data == NULL || signature.len == 0) + return 0; + + pos = incfs_get_end_offset(bfc->bc_file); + + vs = (struct incfs_file_verity_signature) { + .vs_header = (struct incfs_md_header) { + .h_md_entry_type = INCFS_MD_VERITY_SIGNATURE, + .h_record_size = cpu_to_le16(sizeof(vs)), + .h_next_md_offset = cpu_to_le64(0), + }, + .vs_size = cpu_to_le32(signature.len), + .vs_offset = cpu_to_le64(pos), + }; + + result = write_to_bf(bfc, signature.data, signature.len, pos); + if (result) + goto err; + + result = append_md_to_backing_file(bfc, &vs.vs_header); + if (result) + goto err; + + *offset = pos; +err: + if (result) + /* Error, rollback file changes */ + truncate_backing_file(bfc, pos); + return result; +} + +/* + * Write a backing file header + * It should always be called only on empty file. + * fh.fh_first_md_offset is 0 for now, but will be updated + * once first metadata record is added. + */ +int incfs_write_fh_to_backing_file(struct backing_file_context *bfc, + incfs_uuid_t *uuid, u64 file_size) +{ + struct incfs_file_header fh = {}; + loff_t file_pos = 0; + + if (!bfc) + return -EFAULT; + + fh.fh_magic = cpu_to_le64(INCFS_MAGIC_NUMBER); + fh.fh_version = cpu_to_le64(INCFS_FORMAT_CURRENT_VER); + fh.fh_header_size = cpu_to_le16(sizeof(fh)); + fh.fh_first_md_offset = cpu_to_le64(0); + fh.fh_data_block_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE); + + fh.fh_file_size = cpu_to_le64(file_size); + fh.fh_uuid = *uuid; + + LOCK_REQUIRED(bfc->bc_mutex); + + file_pos = incfs_get_end_offset(bfc->bc_file); + if (file_pos != 0) + return -EEXIST; + + return write_to_bf(bfc, &fh, sizeof(fh), file_pos); +} + +/* + * Write a backing file header for a mapping file + * It should always be called only on empty file. + */ +int incfs_write_mapping_fh_to_backing_file(struct backing_file_context *bfc, + incfs_uuid_t *uuid, u64 file_size, u64 offset) +{ + struct incfs_file_header fh = {}; + loff_t file_pos = 0; + + if (!bfc) + return -EFAULT; + + fh.fh_magic = cpu_to_le64(INCFS_MAGIC_NUMBER); + fh.fh_version = cpu_to_le64(INCFS_FORMAT_CURRENT_VER); + fh.fh_header_size = cpu_to_le16(sizeof(fh)); + fh.fh_original_offset = cpu_to_le64(offset); + fh.fh_data_block_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE); + + fh.fh_mapped_file_size = cpu_to_le64(file_size); + fh.fh_original_uuid = *uuid; + fh.fh_flags = cpu_to_le32(INCFS_FILE_MAPPED); + + LOCK_REQUIRED(bfc->bc_mutex); + + file_pos = incfs_get_end_offset(bfc->bc_file); + if (file_pos != 0) + return -EEXIST; + + return write_to_bf(bfc, &fh, sizeof(fh), file_pos); +} + +/* Write a given data block and update file's blockmap to point it. */ +int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc, + struct mem_range block, int block_index, + loff_t bm_base_off, u16 flags) +{ + struct incfs_blockmap_entry bm_entry = {}; + int result = 0; + loff_t data_offset = 0; + loff_t bm_entry_off = + bm_base_off + sizeof(struct incfs_blockmap_entry) * block_index; + + if (!bfc) + return -EFAULT; + + if (block.len >= (1 << 16) || block_index < 0) + return -EINVAL; + + LOCK_REQUIRED(bfc->bc_mutex); + + data_offset = incfs_get_end_offset(bfc->bc_file); + if (data_offset <= bm_entry_off) { + /* Blockmap entry is beyond the file's end. It is not normal. */ + return -EINVAL; + } + + /* Write the block data at the end of the backing file. 
*/ + result = write_to_bf(bfc, block.data, block.len, data_offset); + if (result) + return result; + + /* Update the blockmap to point to the newly written data. */ + bm_entry.me_data_offset_lo = cpu_to_le32((u32)data_offset); + bm_entry.me_data_offset_hi = cpu_to_le16((u16)(data_offset >> 32)); + bm_entry.me_data_size = cpu_to_le16((u16)block.len); + bm_entry.me_flags = cpu_to_le16(flags); + + return write_to_bf(bfc, &bm_entry, sizeof(bm_entry), + bm_entry_off); +} + +int incfs_write_hash_block_to_backing_file(struct backing_file_context *bfc, + struct mem_range block, + int block_index, + loff_t hash_area_off, + loff_t bm_base_off, + loff_t file_size) +{ + struct incfs_blockmap_entry bm_entry = {}; + int result; + loff_t data_offset = 0; + loff_t file_end = 0; + loff_t bm_entry_off = + bm_base_off + + sizeof(struct incfs_blockmap_entry) * + (block_index + get_blocks_count_for_size(file_size)); + + if (!bfc) + return -EFAULT; + + LOCK_REQUIRED(bfc->bc_mutex); + + data_offset = hash_area_off + block_index * INCFS_DATA_FILE_BLOCK_SIZE; + file_end = incfs_get_end_offset(bfc->bc_file); + if (data_offset + block.len > file_end) { + /* Block is located beyond the file's end. It is not normal. */ + return -EINVAL; + } + + result = write_to_bf(bfc, block.data, block.len, data_offset); + if (result) + return result; + + bm_entry.me_data_offset_lo = cpu_to_le32((u32)data_offset); + bm_entry.me_data_offset_hi = cpu_to_le16((u16)(data_offset >> 32)); + bm_entry.me_data_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE); + + return write_to_bf(bfc, &bm_entry, sizeof(bm_entry), bm_entry_off); +} + +int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index, + loff_t bm_base_off, + struct incfs_blockmap_entry *bm_entry) +{ + int error = incfs_read_blockmap_entries(bfc, bm_entry, block_index, 1, + bm_base_off); + + if (error < 0) + return error; + + if (error == 0) + return -EIO; + + if (error != 1) + return -EFAULT; + + return 0; +} + +int incfs_read_blockmap_entries(struct backing_file_context *bfc, + struct incfs_blockmap_entry *entries, + int start_index, int blocks_number, + loff_t bm_base_off) +{ + loff_t bm_entry_off = + bm_base_off + sizeof(struct incfs_blockmap_entry) * start_index; + const size_t bytes_to_read = sizeof(struct incfs_blockmap_entry) + * blocks_number; + int result = 0; + + if (!bfc || !entries) + return -EFAULT; + + if (start_index < 0 || bm_base_off <= 0) + return -ENODATA; + + result = incfs_kread(bfc, entries, bytes_to_read, bm_entry_off); + if (result < 0) + return result; + return result / sizeof(*entries); +} + +int incfs_read_file_header(struct backing_file_context *bfc, + loff_t *first_md_off, incfs_uuid_t *uuid, + u64 *file_size, u32 *flags) +{ + ssize_t bytes_read = 0; + struct incfs_file_header fh = {}; + + if (!bfc || !first_md_off) + return -EFAULT; + + bytes_read = incfs_kread(bfc, &fh, sizeof(fh), 0); + if (bytes_read < 0) + return bytes_read; + + if (bytes_read < sizeof(fh)) + return -EBADMSG; + + if (le64_to_cpu(fh.fh_magic) != INCFS_MAGIC_NUMBER) + return -EILSEQ; + + if (le64_to_cpu(fh.fh_version) > INCFS_FORMAT_CURRENT_VER) + return -EILSEQ; + + if (le16_to_cpu(fh.fh_data_block_size) != INCFS_DATA_FILE_BLOCK_SIZE) + return -EILSEQ; + + if (le16_to_cpu(fh.fh_header_size) != sizeof(fh)) + return -EILSEQ; + + if (first_md_off) + *first_md_off = le64_to_cpu(fh.fh_first_md_offset); + if (uuid) + *uuid = fh.fh_uuid; + if (file_size) + *file_size = le64_to_cpu(fh.fh_file_size); + if (flags) + *flags = le32_to_cpu(fh.fh_flags); + return 0; +} + +/* + 
* Read through metadata records from the backing file one by one + * and call provided metadata handlers. + */ +int incfs_read_next_metadata_record(struct backing_file_context *bfc, + struct metadata_handler *handler) +{ + const ssize_t max_md_size = INCFS_MAX_METADATA_RECORD_SIZE; + ssize_t bytes_read = 0; + size_t md_record_size = 0; + loff_t next_record = 0; + int res = 0; + struct incfs_md_header *md_hdr = NULL; + + if (!bfc || !handler) + return -EFAULT; + + if (handler->md_record_offset == 0) + return -EPERM; + + memset(&handler->md_buffer, 0, max_md_size); + bytes_read = incfs_kread(bfc, &handler->md_buffer, max_md_size, + handler->md_record_offset); + if (bytes_read < 0) + return bytes_read; + if (bytes_read < sizeof(*md_hdr)) + return -EBADMSG; + + md_hdr = &handler->md_buffer.md_header; + next_record = le64_to_cpu(md_hdr->h_next_md_offset); + md_record_size = le16_to_cpu(md_hdr->h_record_size); + + if (md_record_size > max_md_size) { + pr_warn("incfs: The record is too large. Size: %zu", + md_record_size); + return -EBADMSG; + } + + if (bytes_read < md_record_size) { + pr_warn("incfs: The record hasn't been fully read."); + return -EBADMSG; + } + + if (next_record <= handler->md_record_offset && next_record != 0) { + pr_warn("incfs: Next record (%lld) points back in file.", + next_record); + return -EBADMSG; + } + + switch (md_hdr->h_md_entry_type) { + case INCFS_MD_NONE: + break; + case INCFS_MD_BLOCK_MAP: + if (handler->handle_blockmap) + res = handler->handle_blockmap( + &handler->md_buffer.blockmap, handler); + break; + case INCFS_MD_FILE_ATTR: + /* + * File attrs no longer supported, ignore section for + * compatibility + */ + break; + case INCFS_MD_SIGNATURE: + if (handler->handle_signature) + res = handler->handle_signature( + &handler->md_buffer.signature, handler); + break; + case INCFS_MD_STATUS: + if (handler->handle_status) + res = handler->handle_status( + &handler->md_buffer.status, handler); + break; + case INCFS_MD_VERITY_SIGNATURE: + if (handler->handle_verity_signature) + res = handler->handle_verity_signature( + &handler->md_buffer.verity_signature, handler); + break; + default: + res = -ENOTSUPP; + break; + } + + if (!res) { + if (next_record == 0) { + /* + * Zero offset for the next record means that the last + * metadata record has just been processed. + */ + bfc->bc_last_md_record_offset = + handler->md_record_offset; + } + handler->md_prev_record_offset = handler->md_record_offset; + handler->md_record_offset = next_record; + } + return res; +} + +ssize_t incfs_kread(struct backing_file_context *bfc, void *buf, size_t size, + loff_t pos) +{ + const struct cred *old_cred = override_creds(bfc->bc_cred); + int ret = kernel_read(bfc->bc_file, buf, size, &pos); + + revert_creds(old_cred); + return ret; +} + +ssize_t incfs_kwrite(struct backing_file_context *bfc, const void *buf, + size_t size, loff_t pos) +{ + const struct cred *old_cred = override_creds(bfc->bc_cred); + int ret = kernel_write(bfc->bc_file, buf, size, &pos); + + revert_creds(old_cred); + return ret; +} diff --git a/fs/incfs/format.h b/fs/incfs/format.h new file mode 100644 index 000000000000..14e475b79a16 --- /dev/null +++ b/fs/incfs/format.h @@ -0,0 +1,407 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2018 Google LLC + */ + +/* + * Overview + * -------- + * The backbone of the incremental-fs ondisk format is an append only linked + * list of metadata blocks. Each metadata block contains an offset of the next + * one. 
These blocks describe files and directories on the + * file system. They also represent actions of adding and removing file names + * (hard links). + * + * Every time incremental-fs instance is mounted, it reads through this list + * to recreate filesystem's state in memory. An offset of the first record in + * the metadata list is stored in the superblock at the beginning of the backing + * file. + * + * Most of the backing file is taken by data areas and blockmaps. + * Since data blocks can be compressed and have different sizes, + * single per-file data area can't be pre-allocated. That's why blockmaps are + * needed in order to find a location and size of each data block in + * the backing file. Each time a file is created, a corresponding block map is + * allocated to store future offsets of data blocks. + * + * Whenever a data block is given by data loader to incremental-fs: + * - A data area with the given block is appended to the end of + * the backing file. + * - A record in the blockmap for the given block index is updated to reflect + * its location, size, and compression algorithm. + + * Metadata records + * ---------------- + * incfs_blockmap - metadata record that specifies size and location + * of a blockmap area for a given file. This area + * contains an array of incfs_blockmap_entry-s. + * incfs_file_signature - metadata record that specifies where file signature + * and its hash tree can be found in the backing file. + * + * incfs_file_attr - metadata record that specifies where additional file + * attributes blob can be found. + * + * Metadata header + * --------------- + * incfs_md_header - header of a metadata record. It's always a part + * of other structures and served purpose of metadata + * bookkeeping. + * + * +-----------------------------------------------+ ^ + * | incfs_md_header | | + * | 1. type of body(BLOCKMAP, FILE_ATTR..) | | + * | 2. size of the whole record header + body | | + * | 3. CRC the whole record header + body | | + * | 4. offset of the previous md record |]------+ + * | 5. offset of the next md record (md link) |]---+ + * +-----------------------------------------------+ | + * | Metadata record body with useful data | | + * +-----------------------------------------------+ | + * +---> + * + * Other ondisk structures + * ----------------------- + * incfs_super_block - backing file header + * incfs_blockmap_entry - a record in a blockmap area that describes size + * and location of a data block. + * Data blocks dont have any particular structure, they are written to the + * backing file in a raw form as they come from a data loader. + * + * Backing file layout + * ------------------- + * + * + * +-------------------------------------------+ + * | incfs_file_header |]---+ + * +-------------------------------------------+ | + * | metadata |<---+ + * | incfs_file_signature |]---+ + * +-------------------------------------------+ | + * ......................... | + * +-------------------------------------------+ | metadata + * +------->| blockmap area | | list links + * | | [incfs_blockmap_entry] | | + * | | [incfs_blockmap_entry] | | + * | | [incfs_blockmap_entry] | | + * | +--[| [incfs_blockmap_entry] | | + * | | | [incfs_blockmap_entry] | | + * | | | [incfs_blockmap_entry] | | + * | | +-------------------------------------------+ | + * | | ......................... 
| + * | | +-------------------------------------------+ | + * | | | metadata |<---+ + * +----|--[| incfs_blockmap |]---+ + * | +-------------------------------------------+ | + * | ......................... | + * | +-------------------------------------------+ | + * +-->| data block | | + * +-------------------------------------------+ | + * ......................... | + * +-------------------------------------------+ | + * | metadata |<---+ + * | incfs_file_attr | + * +-------------------------------------------+ + */ +#ifndef _INCFS_FORMAT_H +#define _INCFS_FORMAT_H +#include +#include +#include + +#include "internal.h" + +#define INCFS_MAX_NAME_LEN 255 +#define INCFS_FORMAT_V1 1 +#define INCFS_FORMAT_CURRENT_VER INCFS_FORMAT_V1 + +enum incfs_metadata_type { + INCFS_MD_NONE = 0, + INCFS_MD_BLOCK_MAP = 1, + INCFS_MD_FILE_ATTR = 2, + INCFS_MD_SIGNATURE = 3, + INCFS_MD_STATUS = 4, + INCFS_MD_VERITY_SIGNATURE = 5, +}; + +enum incfs_file_header_flags { + INCFS_FILE_MAPPED = 1 << 1, +}; + +/* Header included at the beginning of all metadata records on the disk. */ +struct incfs_md_header { + __u8 h_md_entry_type; + + /* + * Size of the metadata record. + * (e.g. inode, dir entry etc) not just this struct. + */ + __le16 h_record_size; + + /* + * Was: CRC32 of the metadata record. + * (e.g. inode, dir entry etc) not just this struct. + */ + __le32 h_unused1; + + /* Offset of the next metadata entry if any */ + __le64 h_next_md_offset; + + /* Was: Offset of the previous metadata entry if any */ + __le64 h_unused2; + +} __packed; + +/* Backing file header */ +struct incfs_file_header { + /* Magic number: INCFS_MAGIC_NUMBER */ + __le64 fh_magic; + + /* Format version: INCFS_FORMAT_CURRENT_VER */ + __le64 fh_version; + + /* sizeof(incfs_file_header) */ + __le16 fh_header_size; + + /* INCFS_DATA_FILE_BLOCK_SIZE */ + __le16 fh_data_block_size; + + /* File flags, from incfs_file_header_flags */ + __le32 fh_flags; + + union { + /* Standard incfs file */ + struct { + /* Offset of the first metadata record */ + __le64 fh_first_md_offset; + + /* Full size of the file's content */ + __le64 fh_file_size; + + /* File uuid */ + incfs_uuid_t fh_uuid; + }; + + /* Mapped file - INCFS_FILE_MAPPED set in fh_flags */ + struct { + /* Offset in original file */ + __le64 fh_original_offset; + + /* Full size of the file's content */ + __le64 fh_mapped_file_size; + + /* Original file's uuid */ + incfs_uuid_t fh_original_uuid; + }; + }; +} __packed; + +enum incfs_block_map_entry_flags { + INCFS_BLOCK_COMPRESSED_LZ4 = 1, + INCFS_BLOCK_COMPRESSED_ZSTD = 2, + + /* Reserve 3 bits for compression alg */ + INCFS_BLOCK_COMPRESSED_MASK = 7, +}; + +/* Block map entry pointing to an actual location of the data block. */ +struct incfs_blockmap_entry { + /* Offset of the actual data block. Lower 32 bits */ + __le32 me_data_offset_lo; + + /* Offset of the actual data block. Higher 16 bits */ + __le16 me_data_offset_hi; + + /* How many bytes the data actually occupies in the backing file */ + __le16 me_data_size; + + /* Block flags from incfs_block_map_entry_flags */ + __le16 me_flags; +} __packed; + +/* Metadata record for locations of file blocks. Type = INCFS_MD_BLOCK_MAP */ +struct incfs_blockmap { + struct incfs_md_header m_header; + + /* Base offset of the array of incfs_blockmap_entry */ + __le64 m_base_offset; + + /* Size of the map entry array in blocks */ + __le32 m_block_count; +} __packed; + +/* + * Metadata record for file signature. 
Type = INCFS_MD_SIGNATURE + * + * The signature stored here is the APK V4 signature data blob. See the + * definition of incfs_new_file_args::signature_info for an explanation of this + * blob. Specifically, it contains the root hash, but it does *not* contain + * anything that the kernel treats as a signature. + * + * When FS_IOC_ENABLE_VERITY is called on a file without this record, an APK V4 + * signature blob and a hash tree are added to the file, and then this metadata + * record is created to record their locations. + */ +struct incfs_file_signature { + struct incfs_md_header sg_header; + + __le32 sg_sig_size; /* The size of the signature. */ + + __le64 sg_sig_offset; /* Signature's offset in the backing file */ + + __le32 sg_hash_tree_size; /* The size of the hash tree. */ + + __le64 sg_hash_tree_offset; /* Hash tree offset in the backing file */ +} __packed; + +/* In memory version of above */ +struct incfs_df_signature { + u32 sig_size; + u64 sig_offset; + u32 hash_size; + u64 hash_offset; +}; + +struct incfs_status { + struct incfs_md_header is_header; + + __le32 is_data_blocks_written; /* Number of data blocks written */ + + __le32 is_hash_blocks_written; /* Number of hash blocks written */ + + __le32 is_dummy[6]; /* Spare fields */ +} __packed; + +/* + * Metadata record for verity signature. Type = INCFS_MD_VERITY_SIGNATURE + * + * This record will only exist for verity-enabled files with signatures. Verity + * enabled files without signatures do not have this record. This signature is + * checked by fs-verity identically to any other fs-verity signature. + */ +struct incfs_file_verity_signature { + struct incfs_md_header vs_header; + + /* The size of the signature */ + __le32 vs_size; + + /* Signature's offset in the backing file */ + __le64 vs_offset; +} __packed; + +/* In memory version of above */ +struct incfs_df_verity_signature { + u32 size; + u64 offset; +}; + +/* State of the backing file. */ +struct backing_file_context { + /* Protects writes to bc_file */ + struct mutex bc_mutex; + + /* File object to read data from */ + struct file *bc_file; + + /* + * Offset of the last known metadata record in the backing file. + * 0 means there are no metadata records. 
+ */ + loff_t bc_last_md_record_offset; + + /* + * Credentials to set before reads/writes + * Note that this is a pointer to the mount_info mi_owner field so + * there is no need to get/put the creds + */ + const struct cred *bc_cred; +}; + +struct metadata_handler { + loff_t md_record_offset; + loff_t md_prev_record_offset; + void *context; + + union { + struct incfs_md_header md_header; + struct incfs_blockmap blockmap; + struct incfs_file_signature signature; + struct incfs_status status; + struct incfs_file_verity_signature verity_signature; + } md_buffer; + + int (*handle_blockmap)(struct incfs_blockmap *bm, + struct metadata_handler *handler); + int (*handle_signature)(struct incfs_file_signature *sig, + struct metadata_handler *handler); + int (*handle_status)(struct incfs_status *sig, + struct metadata_handler *handler); + int (*handle_verity_signature)(struct incfs_file_verity_signature *s, + struct metadata_handler *handler); +}; +#define INCFS_MAX_METADATA_RECORD_SIZE \ + sizeof_field(struct metadata_handler, md_buffer) + +/* Backing file context management */ +struct mount_info; +struct backing_file_context *incfs_alloc_bfc(struct mount_info *mi, + struct file *backing_file); + +void incfs_free_bfc(struct backing_file_context *bfc); + +/* Writing stuff */ +int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc, + u32 block_count); + +int incfs_write_fh_to_backing_file(struct backing_file_context *bfc, + incfs_uuid_t *uuid, u64 file_size); + +int incfs_write_mapping_fh_to_backing_file(struct backing_file_context *bfc, + incfs_uuid_t *uuid, u64 file_size, u64 offset); + +int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc, + struct mem_range block, + int block_index, loff_t bm_base_off, + u16 flags); + +int incfs_write_hash_block_to_backing_file(struct backing_file_context *bfc, + struct mem_range block, + int block_index, + loff_t hash_area_off, + loff_t bm_base_off, + loff_t file_size); + +int incfs_write_signature_to_backing_file(struct backing_file_context *bfc, + struct mem_range sig, u32 tree_size, + loff_t *tree_offset, loff_t *sig_offset); + +int incfs_write_status_to_backing_file(struct backing_file_context *bfc, + loff_t status_offset, + u32 data_blocks_written, + u32 hash_blocks_written); +int incfs_write_verity_signature_to_backing_file( + struct backing_file_context *bfc, struct mem_range signature, + loff_t *offset); + +/* Reading stuff */ +int incfs_read_file_header(struct backing_file_context *bfc, + loff_t *first_md_off, incfs_uuid_t *uuid, + u64 *file_size, u32 *flags); + +int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index, + loff_t bm_base_off, + struct incfs_blockmap_entry *bm_entry); + +int incfs_read_blockmap_entries(struct backing_file_context *bfc, + struct incfs_blockmap_entry *entries, + int start_index, int blocks_number, + loff_t bm_base_off); + +int incfs_read_next_metadata_record(struct backing_file_context *bfc, + struct metadata_handler *handler); + +ssize_t incfs_kread(struct backing_file_context *bfc, void *buf, size_t size, + loff_t pos); +ssize_t incfs_kwrite(struct backing_file_context *bfc, const void *buf, + size_t size, loff_t pos); + +#endif /* _INCFS_FORMAT_H */ diff --git a/fs/incfs/integrity.c b/fs/incfs/integrity.c new file mode 100644 index 000000000000..93e764313ea0 --- /dev/null +++ b/fs/incfs/integrity.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ +#include +#include +#include +#include + +#include "integrity.h" 
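The metadata_handler above sizes its scratch buffer as a union of every record type (INCFS_MAX_METADATA_RECORD_SIZE is sizeof_field() of that union), so a single read suffices for any record and dispatch is a switch on the leading type byte. The idea in miniature, with stand-in record types:

#include <stdint.h>
#include <stdio.h>

struct rec_a { uint8_t type; uint32_t payload; };
struct rec_b { uint8_t type; uint64_t payload[4]; };

union any_rec {
	uint8_t type;           /* common initial member */
	struct rec_a a;
	struct rec_b b;
};

/* Large enough for whichever record actually arrives. */
#define MAX_RECORD_SIZE sizeof(union any_rec)

static void dispatch(const union any_rec *r)
{
	switch (r->type) {
	case 1:
		printf("rec_a: %u\n", (unsigned int)r->a.payload);
		break;
	case 2:
		printf("rec_b: %llu\n",
		       (unsigned long long)r->b.payload[0]);
		break;
	default:
		break;          /* unknown types are skipped, as in incfs */
	}
}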
+ +struct incfs_hash_alg *incfs_get_hash_alg(enum incfs_hash_tree_algorithm id) +{ + static struct incfs_hash_alg sha256 = { + .name = "sha256", + .digest_size = SHA256_DIGEST_SIZE, + .id = INCFS_HASH_TREE_SHA256 + }; + struct incfs_hash_alg *result = NULL; + struct crypto_shash *shash; + + if (id == INCFS_HASH_TREE_SHA256) { + BUILD_BUG_ON(INCFS_MAX_HASH_SIZE < SHA256_DIGEST_SIZE); + result = &sha256; + } + + if (result == NULL) + return ERR_PTR(-ENOENT); + + /* pairs with cmpxchg_release() below */ + shash = smp_load_acquire(&result->shash); + if (shash) + return result; + + shash = crypto_alloc_shash(result->name, 0, 0); + if (IS_ERR(shash)) { + int err = PTR_ERR(shash); + + pr_err("Can't allocate hash alg %s, error code:%d", + result->name, err); + return ERR_PTR(err); + } + + /* pairs with smp_load_acquire() above */ + if (cmpxchg_release(&result->shash, NULL, shash) != NULL) + crypto_free_shash(shash); + + return result; +} + +struct signature_info { + u32 version; + enum incfs_hash_tree_algorithm hash_algorithm; + u8 log2_blocksize; + struct mem_range salt; + struct mem_range root_hash; +}; + +static bool read_u32(u8 **p, u8 *top, u32 *result) +{ + if (*p + sizeof(u32) > top) + return false; + + *result = le32_to_cpu(*(__le32 *)*p); + *p += sizeof(u32); + return true; +} + +static bool read_u8(u8 **p, u8 *top, u8 *result) +{ + if (*p + sizeof(u8) > top) + return false; + + *result = *(u8 *)*p; + *p += sizeof(u8); + return true; +} + +static bool read_mem_range(u8 **p, u8 *top, struct mem_range *range) +{ + u32 len; + + if (!read_u32(p, top, &len) || *p + len > top) + return false; + + range->len = len; + range->data = *p; + *p += len; + return true; +} + +static int incfs_parse_signature(struct mem_range signature, + struct signature_info *si) +{ + u8 *p = signature.data; + u8 *top = signature.data + signature.len; + u32 hash_section_size; + + if (signature.len > INCFS_MAX_SIGNATURE_SIZE) + return -EINVAL; + + if (!read_u32(&p, top, &si->version) || + si->version != INCFS_SIGNATURE_VERSION) + return -EINVAL; + + if (!read_u32(&p, top, &hash_section_size) || + p + hash_section_size > top) + return -EINVAL; + top = p + hash_section_size; + + if (!read_u32(&p, top, &si->hash_algorithm) || + si->hash_algorithm != INCFS_HASH_TREE_SHA256) + return -EINVAL; + + if (!read_u8(&p, top, &si->log2_blocksize) || si->log2_blocksize != 12) + return -EINVAL; + + if (!read_mem_range(&p, top, &si->salt)) + return -EINVAL; + + if (!read_mem_range(&p, top, &si->root_hash)) + return -EINVAL; + + if (p != top) + return -EINVAL; + + return 0; +} + +struct mtree *incfs_alloc_mtree(struct mem_range signature, + int data_block_count) +{ + int error; + struct signature_info si; + struct mtree *result = NULL; + struct incfs_hash_alg *hash_alg = NULL; + int hash_per_block; + int lvl; + int total_blocks = 0; + int blocks_in_level[INCFS_MAX_MTREE_LEVELS]; + int blocks = data_block_count; + + if (data_block_count <= 0) + return ERR_PTR(-EINVAL); + + error = incfs_parse_signature(signature, &si); + if (error) + return ERR_PTR(error); + + hash_alg = incfs_get_hash_alg(si.hash_algorithm); + if (IS_ERR(hash_alg)) + return ERR_PTR(PTR_ERR(hash_alg)); + + if (si.root_hash.len < hash_alg->digest_size) + return ERR_PTR(-EINVAL); + + result = kzalloc(sizeof(*result), GFP_NOFS); + if (!result) + return ERR_PTR(-ENOMEM); + + result->alg = hash_alg; + hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / result->alg->digest_size; + + /* Calculating tree geometry. */ + /* First pass: calculate how many blocks in each tree level. 
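With 4096-byte blocks and 32-byte SHA-256 digests, one hash block covers 128 children, so each tree level is a 128-fold reduction of the level below it. A standalone version of the geometry loop, worked for a 4 GiB file:

#include <stdio.h>

#define BLOCK_SIZE	4096
#define DIGEST_SIZE	32
#define HASH_PER_BLK	(BLOCK_SIZE / DIGEST_SIZE)   /* 128 */

int main(void)
{
	int blocks = 1 << 20;   /* 1Mi data blocks = 4 GiB file */
	int total = 0, lvl = 0;

	while (blocks > 1) {
		/* Each level holds ceil(n / 128) blocks of the one below. */
		blocks = (blocks + HASH_PER_BLK - 1) / HASH_PER_BLK;
		total += blocks;
		lvl++;
	}
	/* 8192 + 64 + 1 = 8257 hash blocks over 3 levels (~32 MiB). */
	printf("levels=%d hash_blocks=%d\n", lvl, total);
	return 0;
}

The hash area is therefore well under 1% of the data it protects, and the INCFS_MAX_HASH_AREA_SIZE cap is generous for any realistic file.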
*/ + for (lvl = 0; blocks > 1; lvl++) { + if (lvl >= INCFS_MAX_MTREE_LEVELS) { + pr_err("incfs: too much data in mtree"); + goto err; + } + + blocks = (blocks + hash_per_block - 1) / hash_per_block; + blocks_in_level[lvl] = blocks; + total_blocks += blocks; + } + result->depth = lvl; + result->hash_tree_area_size = total_blocks * INCFS_DATA_FILE_BLOCK_SIZE; + if (result->hash_tree_area_size > INCFS_MAX_HASH_AREA_SIZE) + goto err; + + blocks = 0; + /* Second pass: calculate offset of each level. 0th level goes last. */ + for (lvl = 0; lvl < result->depth; lvl++) { + u32 suboffset; + + blocks += blocks_in_level[lvl]; + suboffset = (total_blocks - blocks) + * INCFS_DATA_FILE_BLOCK_SIZE; + + result->hash_level_suboffset[lvl] = suboffset; + } + + /* Root hash is stored separately from the rest of the tree. */ + memcpy(result->root_hash, si.root_hash.data, hash_alg->digest_size); + return result; + +err: + kfree(result); + return ERR_PTR(-E2BIG); +} + +void incfs_free_mtree(struct mtree *tree) +{ + kfree(tree); +} + +int incfs_calc_digest(struct incfs_hash_alg *alg, struct mem_range data, + struct mem_range digest) +{ + SHASH_DESC_ON_STACK(desc, alg->shash); + + if (!alg || !alg->shash || !data.data || !digest.data) + return -EFAULT; + + if (alg->digest_size > digest.len) + return -EINVAL; + + desc->tfm = alg->shash; + + if (data.len < INCFS_DATA_FILE_BLOCK_SIZE) { + int err; + void *buf = kzalloc(INCFS_DATA_FILE_BLOCK_SIZE, GFP_NOFS); + + if (!buf) + return -ENOMEM; + + memcpy(buf, data.data, data.len); + err = crypto_shash_digest(desc, buf, INCFS_DATA_FILE_BLOCK_SIZE, + digest.data); + kfree(buf); + return err; + } + return crypto_shash_digest(desc, data.data, data.len, digest.data); +} + diff --git a/fs/incfs/integrity.h b/fs/incfs/integrity.h new file mode 100644 index 000000000000..cf79b64da736 --- /dev/null +++ b/fs/incfs/integrity.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ +#ifndef _INCFS_INTEGRITY_H +#define _INCFS_INTEGRITY_H +#include +#include +#include + +#include + +#include "internal.h" + +#define INCFS_MAX_MTREE_LEVELS 8 +#define INCFS_MAX_HASH_AREA_SIZE (1280 * 1024 * 1024) + +struct incfs_hash_alg { + const char *name; + int digest_size; + enum incfs_hash_tree_algorithm id; + + struct crypto_shash *shash; +}; + +/* Merkle tree structure. */ +struct mtree { + struct incfs_hash_alg *alg; + + u8 root_hash[INCFS_MAX_HASH_SIZE]; + + /* Offset of each hash level in the hash area. 
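 * The single top-level block is stored first and the widest level 0
 * (direct hashes of the data blocks) last: e.g. levels of 2048, 16 and
 * 1 blocks give suboffsets of 17 * 4096, 4096 and 0 respectively, while
 * the root hash itself lives outside the area, in root_hash above.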
*/ + u32 hash_level_suboffset[INCFS_MAX_MTREE_LEVELS]; + + u32 hash_tree_area_size; + + /* Number of levels in hash_level_suboffset */ + int depth; +}; + +struct incfs_hash_alg *incfs_get_hash_alg(enum incfs_hash_tree_algorithm id); + +struct mtree *incfs_alloc_mtree(struct mem_range signature, + int data_block_count); + +void incfs_free_mtree(struct mtree *tree); + +size_t incfs_get_mtree_depth(enum incfs_hash_tree_algorithm alg, loff_t size); + +size_t incfs_get_mtree_hash_count(enum incfs_hash_tree_algorithm alg, + loff_t size); + +int incfs_calc_digest(struct incfs_hash_alg *alg, struct mem_range data, + struct mem_range digest); + +#endif /* _INCFS_INTEGRITY_H */ diff --git a/fs/incfs/internal.h b/fs/incfs/internal.h new file mode 100644 index 000000000000..c2df8bf82d6f --- /dev/null +++ b/fs/incfs/internal.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2018 Google LLC + */ +#ifndef _INCFS_INTERNAL_H +#define _INCFS_INTERNAL_H +#include + +struct mem_range { + u8 *data; + size_t len; +}; + +static inline struct mem_range range(u8 *data, size_t len) +{ + return (struct mem_range){ .data = data, .len = len }; +} + +#define LOCK_REQUIRED(lock) WARN_ON_ONCE(!mutex_is_locked(&lock)) + +#define EFSCORRUPTED EUCLEAN + +#endif /* _INCFS_INTERNAL_H */ diff --git a/fs/incfs/main.c b/fs/incfs/main.c new file mode 100644 index 000000000000..213faa5e9117 --- /dev/null +++ b/fs/incfs/main.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018 Google LLC + */ +#include +#include +#include + +#include + +#include "sysfs.h" +#include "vfs.h" + +static struct file_system_type incfs_fs_type = { + .owner = THIS_MODULE, + .name = INCFS_NAME, + .mount = incfs_mount_fs, + .kill_sb = incfs_kill_sb, + .fs_flags = 0 +}; + +static int __init init_incfs_module(void) +{ + int err = 0; + + err = incfs_init_sysfs(); + if (err) + return err; + + err = register_filesystem(&incfs_fs_type); + if (err) + incfs_cleanup_sysfs(); + + return err; +} + +static void __exit cleanup_incfs_module(void) +{ + incfs_cleanup_sysfs(); + unregister_filesystem(&incfs_fs_type); +} + +module_init(init_incfs_module); +module_exit(cleanup_incfs_module); + +MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); +MODULE_AUTHOR("Eugene Zemtsov "); +MODULE_DESCRIPTION("Incremental File System"); diff --git a/fs/incfs/pseudo_files.c b/fs/incfs/pseudo_files.c new file mode 100644 index 000000000000..57c3353666ee --- /dev/null +++ b/fs/incfs/pseudo_files.c @@ -0,0 +1,1393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pseudo_files.h" + +#include "data_mgmt.h" +#include "format.h" +#include "integrity.h" +#include "vfs.h" + +#define READ_WRITE_FILE_MODE 0666 + +static bool is_pseudo_filename(struct mem_range name); + +/******************************************************************************* + * .pending_reads pseudo file definition + ******************************************************************************/ +#define INCFS_PENDING_READS_INODE 2 +static const char pending_reads_file_name[] = INCFS_PENDING_READS_FILENAME; + +/* State of an open .pending_reads file, unique for each file descriptor. */ +struct pending_reads_state { + /* A serial number of the last pending read obtained from this file. 
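 * New reads wake pollers on this file; each read() then drains records
 * newer than this serial number, as struct incfs_pending_read_info (or
 * incfs_pending_read_info2 on report_uid mounts). A hedged userspace
 * sketch (mount path, buffer size and error handling illustrative only):
 *
 *	struct incfs_pending_read_info reads[16];
 *	int fd = open("/mnt/incfs/.pending_reads", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, -1) > 0) {
 *		ssize_t n = read(fd, reads, sizeof(reads));
 *
 *		for (int i = 0; i < n / sizeof(reads[0]); i++)
 *			handle_read(&reads[i]);	// hypothetical helper
 *	}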
*/ + int last_pending_read_sn; +}; + +static ssize_t pending_reads_read(struct file *f, char __user *buf, size_t len, + loff_t *ppos) +{ + struct pending_reads_state *pr_state = f->private_data; + struct mount_info *mi = get_mount_info(file_superblock(f)); + bool report_uid; + unsigned long page = 0; + struct incfs_pending_read_info *reads_buf = NULL; + struct incfs_pending_read_info2 *reads_buf2 = NULL; + size_t record_size; + size_t reads_to_collect; + int last_known_read_sn = READ_ONCE(pr_state->last_pending_read_sn); + int new_max_sn = last_known_read_sn; + int reads_collected = 0; + ssize_t result = 0; + + if (!mi) + return -EFAULT; + + report_uid = mi->mi_options.report_uid; + record_size = report_uid ? sizeof(*reads_buf2) : sizeof(*reads_buf); + reads_to_collect = len / record_size; + + if (!incfs_fresh_pending_reads_exist(mi, last_known_read_sn)) + return 0; + + page = get_zeroed_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + if (report_uid) + reads_buf2 = (struct incfs_pending_read_info2 *) page; + else + reads_buf = (struct incfs_pending_read_info *) page; + + reads_to_collect = + min_t(size_t, PAGE_SIZE / record_size, reads_to_collect); + + reads_collected = incfs_collect_pending_reads(mi, last_known_read_sn, + reads_buf, reads_buf2, reads_to_collect, + &new_max_sn); + + if (reads_collected < 0) { + result = reads_collected; + goto out; + } + + /* + * Just to make sure that we don't accidentally copy more data + * to reads buffer than userspace can handle. + */ + reads_collected = min_t(size_t, reads_collected, reads_to_collect); + result = reads_collected * record_size; + + /* Copy reads info to the userspace buffer */ + if (copy_to_user(buf, (void *)page, result)) { + result = -EFAULT; + goto out; + } + + WRITE_ONCE(pr_state->last_pending_read_sn, new_max_sn); + *ppos = 0; + +out: + free_page(page); + return result; +} + +static __poll_t pending_reads_poll(struct file *file, poll_table *wait) +{ + struct pending_reads_state *state = file->private_data; + struct mount_info *mi = get_mount_info(file_superblock(file)); + __poll_t ret = 0; + + poll_wait(file, &mi->mi_pending_reads_notif_wq, wait); + if (incfs_fresh_pending_reads_exist(mi, + state->last_pending_read_sn)) + ret = EPOLLIN | EPOLLRDNORM; + + return ret; +} + +static int pending_reads_open(struct inode *inode, struct file *file) +{ + struct pending_reads_state *state = NULL; + + state = kzalloc(sizeof(*state), GFP_NOFS); + if (!state) + return -ENOMEM; + + file->private_data = state; + return 0; +} + +static int pending_reads_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static long ioctl_permit_fill(struct file *f, void __user *arg) +{ + struct incfs_permit_fill __user *usr_permit_fill = arg; + struct incfs_permit_fill permit_fill; + long error = 0; + struct file *file = NULL; + struct incfs_file_data *fd; + + if (copy_from_user(&permit_fill, usr_permit_fill, sizeof(permit_fill))) + return -EFAULT; + + file = fget(permit_fill.file_descriptor); + if (IS_ERR_OR_NULL(file)) { + if (!file) + return -ENOENT; + + return PTR_ERR(file); + } + + if (file->f_op != &incfs_file_ops) { + error = -EPERM; + goto out; + } + + if (file->f_inode->i_sb != f->f_inode->i_sb) { + error = -EPERM; + goto out; + } + + fd = file->private_data; + + switch (fd->fd_fill_permission) { + case CANT_FILL: + fd->fd_fill_permission = CAN_FILL; + break; + + case CAN_FILL: + pr_debug("CAN_FILL already set"); + break; + + default: + pr_warn("Invalid file private data"); + error = -EFAULT; + goto out; 
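	/*
	 * Editorial note: this ioctl is how the control process delegates
	 * fill rights. A loader opens the target file itself, then passes
	 * that descriptor in through the .pending_reads fd (sketch; the
	 * fill ioctls themselves are defined in the uapi header):
	 *
	 *	struct incfs_permit_fill permit = {
	 *		.file_descriptor = data_fd,
	 *	};
	 *
	 *	ioctl(pending_reads_fd, INCFS_IOC_PERMIT_FILL, &permit);
	 *
	 * Until this succeeds, fd_fill_permission stays CANT_FILL, gating
	 * fill writes through data_fd.
	 */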
+ } + +out: + fput(file); + return error; +} + +static int chmod(struct dentry *dentry, umode_t mode) +{ + struct inode *inode = dentry->d_inode; + struct inode *delegated_inode = NULL; + struct iattr newattrs; + int error; + +retry_deleg: + inode_lock(inode); + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + error = notify_change(&init_user_ns, dentry, &newattrs, &delegated_inode); + inode_unlock(inode); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + return error; +} + +static bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs) +{ + if (lhs.len != rhs.len) + return false; + return memcmp(lhs.data, rhs.data, lhs.len) == 0; +} + +static int validate_name(char *file_name) +{ + struct mem_range name = range(file_name, strlen(file_name)); + int i = 0; + + if (name.len > INCFS_MAX_NAME_LEN) + return -ENAMETOOLONG; + + if (is_pseudo_filename(name)) + return -EINVAL; + + for (i = 0; i < name.len; i++) + if (name.data[i] == '/') + return -EINVAL; + + return 0; +} + +static int dir_relative_path_resolve( + struct mount_info *mi, + const char __user *relative_path, + struct path *result_path, + struct path *base_path) +{ + int dir_fd = get_unused_fd_flags(0); + struct file *dir_f = NULL; + int error = 0; + + if (!base_path) + base_path = &mi->mi_backing_dir_path; + + if (dir_fd < 0) + return dir_fd; + + dir_f = dentry_open(base_path, O_RDONLY | O_NOATIME, current_cred()); + + if (IS_ERR(dir_f)) { + error = PTR_ERR(dir_f); + goto out; + } + fd_install(dir_fd, dir_f); + + if (!relative_path) { + /* No relative path given, just return the base dir. */ + *result_path = *base_path; + path_get(result_path); + goto out; + } + + error = user_path_at_empty(dir_fd, relative_path, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, result_path, NULL); + +out: + close_fd(dir_fd); + if (error) + pr_debug("Error: %d\n", error); + return error; +} + +static struct mem_range incfs_copy_signature_info_from_user(u8 __user *original, + u64 size) +{ + u8 *result; + + if (!original) + return range(NULL, 0); + + if (size > INCFS_MAX_SIGNATURE_SIZE) + return range(ERR_PTR(-EFAULT), 0); + + result = kzalloc(size, GFP_NOFS | __GFP_COMP); + if (!result) + return range(ERR_PTR(-ENOMEM), 0); + + if (copy_from_user(result, original, size)) { + kfree(result); + return range(ERR_PTR(-EFAULT), 0); + } + + return range(result, size); +} + +static int init_new_file(struct mount_info *mi, struct dentry *dentry, + incfs_uuid_t *uuid, u64 size, struct mem_range attr, + u8 __user *user_signature_info, u64 signature_size) +{ + struct path path = {}; + struct file *new_file; + int error = 0; + struct backing_file_context *bfc = NULL; + u32 block_count; + struct mem_range raw_signature = { NULL }; + struct mtree *hash_tree = NULL; + + if (!mi || !dentry || !uuid) + return -EFAULT; + + /* Resize newly created file to its true size. 
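 * (The helpers invoked below then lay the backing file out in order:
 * file header, an optional signature record with space reserved for the
 * hash tree, and finally a block map sized for data plus hash blocks.)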
*/ + path = (struct path) { + .mnt = mi->mi_backing_dir_path.mnt, + .dentry = dentry + }; + + new_file = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE, + current_cred()); + + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out; + } + + bfc = incfs_alloc_bfc(mi, new_file); + fput(new_file); + if (IS_ERR(bfc)) { + error = PTR_ERR(bfc); + bfc = NULL; + goto out; + } + + mutex_lock(&bfc->bc_mutex); + error = incfs_write_fh_to_backing_file(bfc, uuid, size); + if (error) + goto out; + + block_count = (u32)get_blocks_count_for_size(size); + + if (user_signature_info) { + raw_signature = incfs_copy_signature_info_from_user( + user_signature_info, signature_size); + + if (IS_ERR(raw_signature.data)) { + error = PTR_ERR(raw_signature.data); + raw_signature.data = NULL; + goto out; + } + + hash_tree = incfs_alloc_mtree(raw_signature, block_count); + if (IS_ERR(hash_tree)) { + error = PTR_ERR(hash_tree); + hash_tree = NULL; + goto out; + } + + error = incfs_write_signature_to_backing_file(bfc, + raw_signature, hash_tree->hash_tree_area_size, + NULL, NULL); + if (error) + goto out; + + block_count += get_blocks_count_for_size( + hash_tree->hash_tree_area_size); + } + + if (block_count) + error = incfs_write_blockmap_to_backing_file(bfc, block_count); + + if (error) + goto out; + +out: + if (bfc) { + mutex_unlock(&bfc->bc_mutex); + incfs_free_bfc(bfc); + } + incfs_free_mtree(hash_tree); + kfree(raw_signature.data); + + if (error) + pr_debug("incfs: %s error: %d\n", __func__, error); + return error; +} + +static void notify_create(struct file *pending_reads_file, + const char __user *dir_name, const char *file_name, + const char *file_id_str, bool incomplete_file) +{ + struct mount_info *mi = + get_mount_info(file_superblock(pending_reads_file)); + struct path base_path = { + .mnt = pending_reads_file->f_path.mnt, + .dentry = pending_reads_file->f_path.dentry->d_parent, + }; + struct path dir_path = {}; + struct dentry *file = NULL; + struct dentry *dir = NULL; + int error; + + error = dir_relative_path_resolve(mi, dir_name, &dir_path, &base_path); + if (error) + goto out; + + file = incfs_lookup_dentry(dir_path.dentry, file_name); + if (IS_ERR(file)) { + error = PTR_ERR(file); + file = NULL; + goto out; + } + + fsnotify_create(d_inode(dir_path.dentry), file); + + if (file_id_str) { + dir = incfs_lookup_dentry(base_path.dentry, INCFS_INDEX_NAME); + if (IS_ERR(dir)) { + error = PTR_ERR(dir); + dir = NULL; + goto out; + } + + dput(file); + file = incfs_lookup_dentry(dir, file_id_str); + if (IS_ERR(file)) { + error = PTR_ERR(file); + file = NULL; + goto out; + } + + fsnotify_create(d_inode(dir), file); + + if (incomplete_file) { + dput(dir); + dir = incfs_lookup_dentry(base_path.dentry, + INCFS_INCOMPLETE_NAME); + if (IS_ERR(dir)) { + error = PTR_ERR(dir); + dir = NULL; + goto out; + } + + dput(file); + file = incfs_lookup_dentry(dir, file_id_str); + if (IS_ERR(file)) { + error = PTR_ERR(file); + file = NULL; + goto out; + } + + fsnotify_create(d_inode(dir), file); + } + } +out: + if (error) + pr_warn("%s failed with error %d\n", __func__, error); + + dput(dir); + dput(file); + path_put(&dir_path); +} + +static long ioctl_create_file(struct file *file, + struct incfs_new_file_args __user *usr_args) +{ + struct mount_info *mi = get_mount_info(file_superblock(file)); + struct incfs_new_file_args args; + char *file_id_str = NULL; + struct dentry *index_file_dentry = NULL; + struct dentry *named_file_dentry = NULL; + struct dentry *incomplete_file_dentry = NULL; + struct path parent_dir_path = 
{}; + struct inode *index_dir_inode = NULL; + __le64 size_attr_value = 0; + char *file_name = NULL; + char *attr_value = NULL; + int error = 0; + bool locked = false; + bool index_linked = false; + bool name_linked = false; + bool incomplete_linked = false; + + if (!mi || !mi->mi_index_dir || !mi->mi_incomplete_dir) { + error = -EFAULT; + goto out; + } + + if (copy_from_user(&args, usr_args, sizeof(args)) > 0) { + error = -EFAULT; + goto out; + } + + file_name = strndup_user(u64_to_user_ptr(args.file_name), PATH_MAX); + if (IS_ERR(file_name)) { + error = PTR_ERR(file_name); + file_name = NULL; + goto out; + } + + error = validate_name(file_name); + if (error) + goto out; + + file_id_str = file_id_to_str(args.file_id); + if (!file_id_str) { + error = -ENOMEM; + goto out; + } + + error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex); + if (error) + goto out; + locked = true; + + /* Find a directory to put the file into. */ + error = dir_relative_path_resolve(mi, + u64_to_user_ptr(args.directory_path), + &parent_dir_path, NULL); + if (error) + goto out; + + if (parent_dir_path.dentry == mi->mi_index_dir) { + /* Can't create a file directly inside .index */ + error = -EBUSY; + goto out; + } + + if (parent_dir_path.dentry == mi->mi_incomplete_dir) { + /* Can't create a file directly inside .incomplete */ + error = -EBUSY; + goto out; + } + + /* Look up a dentry in the parent dir. It should be negative. */ + named_file_dentry = incfs_lookup_dentry(parent_dir_path.dentry, + file_name); + if (!named_file_dentry) { + error = -EFAULT; + goto out; + } + if (IS_ERR(named_file_dentry)) { + error = PTR_ERR(named_file_dentry); + named_file_dentry = NULL; + goto out; + } + if (d_really_is_positive(named_file_dentry)) { + /* File with this path already exists. */ + error = -EEXIST; + goto out; + } + + /* Look up a dentry in the incomplete dir. It should be negative. */ + incomplete_file_dentry = incfs_lookup_dentry(mi->mi_incomplete_dir, + file_id_str); + if (!incomplete_file_dentry) { + error = -EFAULT; + goto out; + } + if (IS_ERR(incomplete_file_dentry)) { + error = PTR_ERR(incomplete_file_dentry); + incomplete_file_dentry = NULL; + goto out; + } + if (d_really_is_positive(incomplete_file_dentry)) { + /* File with this path already exists. */ + error = -EEXIST; + goto out; + } + + /* Look up a dentry in the .index dir. It should be negative. */ + index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir, file_id_str); + if (!index_file_dentry) { + error = -EFAULT; + goto out; + } + if (IS_ERR(index_file_dentry)) { + error = PTR_ERR(index_file_dentry); + index_file_dentry = NULL; + goto out; + } + if (d_really_is_positive(index_file_dentry)) { + /* File with this ID already exists in index. */ + error = -EEXIST; + goto out; + } + + /* Creating a file in the .index dir. */ + index_dir_inode = d_inode(mi->mi_index_dir); + inode_lock_nested(index_dir_inode, I_MUTEX_PARENT); + error = vfs_create(&init_user_ns, index_dir_inode, index_file_dentry, + args.mode | 0222, true); + inode_unlock(index_dir_inode); + + if (error) + goto out; + if (!d_really_is_positive(index_file_dentry)) { + error = -EINVAL; + goto out; + } + + error = chmod(index_file_dentry, args.mode | 0222); + if (error) { + pr_debug("incfs: chmod err: %d\n", error); + goto out; + } + + /* Save the file's ID as an xattr for easy fetching in future. 
*/ + error = vfs_setxattr(&init_user_ns, index_file_dentry, INCFS_XATTR_ID_NAME, + file_id_str, strlen(file_id_str), XATTR_CREATE); + if (error) { + pr_debug("incfs: vfs_setxattr err:%d\n", error); + goto out; + } + + /* Save the file's size as an xattr for easy fetching in future. */ + size_attr_value = cpu_to_le64(args.size); + error = vfs_setxattr(&init_user_ns, index_file_dentry, INCFS_XATTR_SIZE_NAME, + (char *)&size_attr_value, sizeof(size_attr_value), + XATTR_CREATE); + if (error) { + pr_debug("incfs: vfs_setxattr err:%d\n", error); + goto out; + } + + /* Save the file's attribute as an xattr */ + if (args.file_attr_len && args.file_attr) { + if (args.file_attr_len > INCFS_MAX_FILE_ATTR_SIZE) { + error = -E2BIG; + goto out; + } + + attr_value = kmalloc(args.file_attr_len, GFP_NOFS); + if (!attr_value) { + error = -ENOMEM; + goto out; + } + + if (copy_from_user(attr_value, + u64_to_user_ptr(args.file_attr), + args.file_attr_len) > 0) { + error = -EFAULT; + goto out; + } + + error = vfs_setxattr(&init_user_ns, index_file_dentry, + INCFS_XATTR_METADATA_NAME, + attr_value, args.file_attr_len, + XATTR_CREATE); + + if (error) + goto out; + } + + /* Initializing a newly created file. */ + error = init_new_file(mi, index_file_dentry, &args.file_id, args.size, + range(attr_value, args.file_attr_len), + u64_to_user_ptr(args.signature_info), + args.signature_size); + if (error) + goto out; + index_linked = true; + + /* Linking a file with its real name from the requested dir. */ + error = incfs_link(index_file_dentry, named_file_dentry); + if (error) + goto out; + name_linked = true; + + if (args.size) { + /* Linking a file with its incomplete entry */ + error = incfs_link(index_file_dentry, incomplete_file_dentry); + if (error) + goto out; + incomplete_linked = true; + } + + notify_create(file, u64_to_user_ptr(args.directory_path), file_name, + file_id_str, args.size != 0); + +out: + if (error) { + pr_debug("incfs: %s err:%d\n", __func__, error); + if (index_linked) + incfs_unlink(index_file_dentry); + if (name_linked) + incfs_unlink(named_file_dentry); + if (incomplete_linked) + incfs_unlink(incomplete_file_dentry); + } + + kfree(file_id_str); + kfree(file_name); + kfree(attr_value); + dput(named_file_dentry); + dput(index_file_dentry); + dput(incomplete_file_dentry); + path_put(&parent_dir_path); + if (locked) + mutex_unlock(&mi->mi_dir_struct_mutex); + + return error; +} + +static int init_new_mapped_file(struct mount_info *mi, struct dentry *dentry, + incfs_uuid_t *uuid, u64 size, u64 offset) +{ + struct path path = {}; + struct file *new_file; + int error = 0; + struct backing_file_context *bfc = NULL; + + if (!mi || !dentry || !uuid) + return -EFAULT; + + /* Resize newly created file to its true size. 
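 * (A mapped file carries no data, hash or block-map records of its own:
 * its backing file holds only a header pointing at the
 * [offset, offset + size) range of the source file identified by uuid.)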
*/ + path = (struct path) { + .mnt = mi->mi_backing_dir_path.mnt, + .dentry = dentry + }; + new_file = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE, + current_cred()); + + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out; + } + + bfc = incfs_alloc_bfc(mi, new_file); + fput(new_file); + if (IS_ERR(bfc)) { + error = PTR_ERR(bfc); + bfc = NULL; + goto out; + } + + mutex_lock(&bfc->bc_mutex); + error = incfs_write_mapping_fh_to_backing_file(bfc, uuid, size, offset); + if (error) + goto out; + +out: + if (bfc) { + mutex_unlock(&bfc->bc_mutex); + incfs_free_bfc(bfc); + } + + if (error) + pr_debug("incfs: %s error: %d\n", __func__, error); + return error; +} + +static long ioctl_create_mapped_file(struct file *file, void __user *arg) +{ + struct mount_info *mi = get_mount_info(file_superblock(file)); + struct incfs_create_mapped_file_args __user *args_usr_ptr = arg; + struct incfs_create_mapped_file_args args = {}; + char *file_name; + int error = 0; + struct path parent_dir_path = {}; + char *source_file_name = NULL; + struct dentry *source_file_dentry = NULL; + u64 source_file_size; + struct dentry *file_dentry = NULL; + struct inode *parent_inode; + __le64 size_attr_value; + + if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0) + return -EINVAL; + + file_name = strndup_user(u64_to_user_ptr(args.file_name), PATH_MAX); + if (IS_ERR(file_name)) { + error = PTR_ERR(file_name); + file_name = NULL; + goto out; + } + + error = validate_name(file_name); + if (error) + goto out; + + if (args.source_offset % INCFS_DATA_FILE_BLOCK_SIZE) { + error = -EINVAL; + goto out; + } + + /* Validate file mapping is in range */ + source_file_name = file_id_to_str(args.source_file_id); + if (!source_file_name) { + pr_warn("Failed to alloc source_file_name\n"); + error = -ENOMEM; + goto out; + } + + source_file_dentry = incfs_lookup_dentry(mi->mi_index_dir, + source_file_name); + if (!source_file_dentry) { + pr_warn("Source file does not exist\n"); + error = -EINVAL; + goto out; + } + if (IS_ERR(source_file_dentry)) { + pr_warn("Error opening source file\n"); + error = PTR_ERR(source_file_dentry); + source_file_dentry = NULL; + goto out; + } + if (!d_really_is_positive(source_file_dentry)) { + pr_warn("Source file dentry negative\n"); + error = -EINVAL; + goto out; + } + + error = vfs_getxattr(&init_user_ns, source_file_dentry, INCFS_XATTR_SIZE_NAME, + (char *)&size_attr_value, sizeof(size_attr_value)); + if (error < 0) + goto out; + + if (error != sizeof(size_attr_value)) { + pr_warn("Mapped file has no size attr\n"); + error = -EINVAL; + goto out; + } + + source_file_size = le64_to_cpu(size_attr_value); + if (args.source_offset + args.size > source_file_size) { + pr_warn("Mapped file out of range\n"); + error = -EINVAL; + goto out; + } + + /* Find a directory to put the file into. */ + error = dir_relative_path_resolve(mi, + u64_to_user_ptr(args.directory_path), + &parent_dir_path, NULL); + if (error) + goto out; + + if (parent_dir_path.dentry == mi->mi_index_dir) { + /* Can't create a file directly inside .index */ + error = -EBUSY; + goto out; + } + + /* Look up a dentry in the parent dir. It should be negative. 
*/ + file_dentry = incfs_lookup_dentry(parent_dir_path.dentry, + file_name); + if (!file_dentry) { + error = -EFAULT; + goto out; + } + if (IS_ERR(file_dentry)) { + error = PTR_ERR(file_dentry); + file_dentry = NULL; + goto out; + } + if (d_really_is_positive(file_dentry)) { + error = -EEXIST; + goto out; + } + + parent_inode = d_inode(parent_dir_path.dentry); + inode_lock_nested(parent_inode, I_MUTEX_PARENT); + error = vfs_create(&init_user_ns, parent_inode, file_dentry, + args.mode | 0222, true); + inode_unlock(parent_inode); + if (error) + goto out; + + error = chmod(file_dentry, args.mode | 0222); + if (error) { + pr_debug("incfs: chmod err: %d\n", error); + goto delete_file; + } + + /* Save the file's size as an xattr for easy fetching in future. */ + size_attr_value = cpu_to_le64(args.size); + error = vfs_setxattr(&init_user_ns, file_dentry, INCFS_XATTR_SIZE_NAME, + (char *)&size_attr_value, sizeof(size_attr_value), + XATTR_CREATE); + if (error) { + pr_debug("incfs: vfs_setxattr err:%d\n", error); + goto delete_file; + } + + error = init_new_mapped_file(mi, file_dentry, &args.source_file_id, + args.size, args.source_offset); + if (error) + goto delete_file; + + notify_create(file, u64_to_user_ptr(args.directory_path), file_name, + NULL, false); + + goto out; + +delete_file: + incfs_unlink(file_dentry); + +out: + dput(file_dentry); + dput(source_file_dentry); + path_put(&parent_dir_path); + kfree(file_name); + kfree(source_file_name); + return error; +} + +static long ioctl_get_read_timeouts(struct mount_info *mi, void __user *arg) +{ + struct incfs_get_read_timeouts_args __user *args_usr_ptr = arg; + struct incfs_get_read_timeouts_args args = {}; + int error = 0; + struct incfs_per_uid_read_timeouts *buffer; + int size; + + if (copy_from_user(&args, args_usr_ptr, sizeof(args))) + return -EINVAL; + + if (args.timeouts_array_size_out > INCFS_DATA_FILE_BLOCK_SIZE) + return -EINVAL; + + buffer = kzalloc(args.timeouts_array_size_out, GFP_NOFS); + if (!buffer) + return -ENOMEM; + + spin_lock(&mi->mi_per_uid_read_timeouts_lock); + size = mi->mi_per_uid_read_timeouts_size; + if (args.timeouts_array_size < size) + error = -E2BIG; + else if (size) + memcpy(buffer, mi->mi_per_uid_read_timeouts, size); + spin_unlock(&mi->mi_per_uid_read_timeouts_lock); + + args.timeouts_array_size_out = size; + if (!error && size) + if (copy_to_user(u64_to_user_ptr(args.timeouts_array), buffer, + size)) + error = -EFAULT; + + if (!error || error == -E2BIG) + if (copy_to_user(args_usr_ptr, &args, sizeof(args)) > 0) + error = -EFAULT; + + kfree(buffer); + return error; +} + +static long ioctl_set_read_timeouts(struct mount_info *mi, void __user *arg) +{ + struct incfs_set_read_timeouts_args __user *args_usr_ptr = arg; + struct incfs_set_read_timeouts_args args = {}; + int error = 0; + int size; + struct incfs_per_uid_read_timeouts *buffer = NULL, *tmp; + int i; + + if (copy_from_user(&args, args_usr_ptr, sizeof(args))) + return -EINVAL; + + size = args.timeouts_array_size; + if (size) { + if (size > INCFS_DATA_FILE_BLOCK_SIZE || + size % sizeof(*buffer) != 0) + return -EINVAL; + + buffer = kzalloc(size, GFP_NOFS); + if (!buffer) + return -ENOMEM; + + if (copy_from_user(buffer, u64_to_user_ptr(args.timeouts_array), + size)) { + error = -EINVAL; + goto out; + } + + for (i = 0; i < size / sizeof(*buffer); ++i) { + struct incfs_per_uid_read_timeouts *t = &buffer[i]; + + if (t->min_pending_time_us > t->max_pending_time_us) { + error = -EINVAL; + goto out; + } + } + } + + spin_lock(&mi->mi_per_uid_read_timeouts_lock); + 
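	/*
	 * Swap the new array in under the lock and let the old one come
	 * back out in 'buffer', so it is kfree()d below only after the
	 * lock is dropped; readers always see either the complete old or
	 * the complete new array.
	 */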
mi->mi_per_uid_read_timeouts_size = size; + tmp = mi->mi_per_uid_read_timeouts; + mi->mi_per_uid_read_timeouts = buffer; + buffer = tmp; + spin_unlock(&mi->mi_per_uid_read_timeouts_lock); + +out: + kfree(buffer); + return error; +} + +static long ioctl_get_last_read_error(struct mount_info *mi, void __user *arg) +{ + struct incfs_get_last_read_error_args __user *args_usr_ptr = arg; + struct incfs_get_last_read_error_args args = {}; + int error; + + error = mutex_lock_interruptible(&mi->mi_le_mutex); + if (error) + return error; + + args.file_id_out = mi->mi_le_file_id; + args.time_us_out = mi->mi_le_time_us; + args.page_out = mi->mi_le_page; + args.errno_out = mi->mi_le_errno; + args.uid_out = mi->mi_le_uid; + + mutex_unlock(&mi->mi_le_mutex); + if (copy_to_user(args_usr_ptr, &args, sizeof(args)) > 0) + error = -EFAULT; + + return error; +} + +static long pending_reads_dispatch_ioctl(struct file *f, unsigned int req, + unsigned long arg) +{ + struct mount_info *mi = get_mount_info(file_superblock(f)); + + switch (req) { + case INCFS_IOC_CREATE_FILE: + return ioctl_create_file(f, (void __user *)arg); + case INCFS_IOC_PERMIT_FILL: + return ioctl_permit_fill(f, (void __user *)arg); + case INCFS_IOC_CREATE_MAPPED_FILE: + return ioctl_create_mapped_file(f, (void __user *)arg); + case INCFS_IOC_GET_READ_TIMEOUTS: + return ioctl_get_read_timeouts(mi, (void __user *)arg); + case INCFS_IOC_SET_READ_TIMEOUTS: + return ioctl_set_read_timeouts(mi, (void __user *)arg); + case INCFS_IOC_GET_LAST_READ_ERROR: + return ioctl_get_last_read_error(mi, (void __user *)arg); + default: + return -EINVAL; + } +} + +static const struct file_operations incfs_pending_reads_file_ops = { + .read = pending_reads_read, + .poll = pending_reads_poll, + .open = pending_reads_open, + .release = pending_reads_release, + .llseek = noop_llseek, + .unlocked_ioctl = pending_reads_dispatch_ioctl, + .compat_ioctl = pending_reads_dispatch_ioctl +}; + +/******************************************************************************* + * .log pseudo file definition + ******************************************************************************/ +#define INCFS_LOG_INODE 3 +static const char log_file_name[] = INCFS_LOG_FILENAME; + +/* State of an open .log file, unique for each file descriptor. */ +struct log_file_state { + struct read_log_state state; +}; + +static ssize_t log_read(struct file *f, char __user *buf, size_t len, + loff_t *ppos) +{ + struct log_file_state *log_state = f->private_data; + struct mount_info *mi = get_mount_info(file_superblock(f)); + int total_reads_collected = 0; + int rl_size; + ssize_t result = 0; + bool report_uid; + unsigned long page = 0; + struct incfs_pending_read_info *reads_buf = NULL; + struct incfs_pending_read_info2 *reads_buf2 = NULL; + size_t record_size; + ssize_t reads_to_collect; + ssize_t reads_per_page; + + if (!mi) + return -EFAULT; + + report_uid = mi->mi_options.report_uid; + record_size = report_uid ? 
sizeof(*reads_buf2) : sizeof(*reads_buf); + reads_to_collect = len / record_size; + reads_per_page = PAGE_SIZE / record_size; + + rl_size = READ_ONCE(mi->mi_log.rl_size); + if (rl_size == 0) + return 0; + + page = __get_free_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + if (report_uid) + reads_buf2 = (struct incfs_pending_read_info2 *)page; + else + reads_buf = (struct incfs_pending_read_info *)page; + + reads_to_collect = min_t(ssize_t, rl_size, reads_to_collect); + while (reads_to_collect > 0) { + struct read_log_state next_state; + int reads_collected; + + memcpy(&next_state, &log_state->state, sizeof(next_state)); + reads_collected = incfs_collect_logged_reads( + mi, &next_state, reads_buf, reads_buf2, + min_t(ssize_t, reads_to_collect, reads_per_page)); + if (reads_collected <= 0) { + result = total_reads_collected ? + total_reads_collected * record_size : + reads_collected; + goto out; + } + if (copy_to_user(buf, (void *)page, + reads_collected * record_size)) { + result = total_reads_collected ? + total_reads_collected * record_size : + -EFAULT; + goto out; + } + + memcpy(&log_state->state, &next_state, sizeof(next_state)); + total_reads_collected += reads_collected; + buf += reads_collected * record_size; + reads_to_collect -= reads_collected; + } + + result = total_reads_collected * record_size; + *ppos = 0; +out: + free_page(page); + return result; +} + +static __poll_t log_poll(struct file *file, poll_table *wait) +{ + struct log_file_state *log_state = file->private_data; + struct mount_info *mi = get_mount_info(file_superblock(file)); + int count; + __poll_t ret = 0; + + poll_wait(file, &mi->mi_log.ml_notif_wq, wait); + count = incfs_get_uncollected_logs_count(mi, &log_state->state); + if (count >= mi->mi_options.read_log_wakeup_count) + ret = EPOLLIN | EPOLLRDNORM; + + return ret; +} + +static int log_open(struct inode *inode, struct file *file) +{ + struct log_file_state *log_state = NULL; + struct mount_info *mi = get_mount_info(file_superblock(file)); + + log_state = kzalloc(sizeof(*log_state), GFP_NOFS); + if (!log_state) + return -ENOMEM; + + log_state->state = incfs_get_log_state(mi); + file->private_data = log_state; + return 0; +} + +static int log_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static const struct file_operations incfs_log_file_ops = { + .read = log_read, + .poll = log_poll, + .open = log_open, + .release = log_release, + .llseek = noop_llseek, +}; + +/******************************************************************************* + * .blocks_written pseudo file definition + ******************************************************************************/ +#define INCFS_BLOCKS_WRITTEN_INODE 4 +static const char blocks_written_file_name[] = INCFS_BLOCKS_WRITTEN_FILENAME; + +/* State of an open .blocks_written file, unique for each file descriptor. 
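 * Each open descriptor remembers the last value it reported, so read()
 * returns data only when mi_blocks_written has moved on; the -1 set at
 * open guarantees the very first read reports the current count.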
*/ +struct blocks_written_file_state { + unsigned long blocks_written; +}; + +static ssize_t blocks_written_read(struct file *f, char __user *buf, size_t len, + loff_t *ppos) +{ + struct mount_info *mi = get_mount_info(file_superblock(f)); + struct blocks_written_file_state *state = f->private_data; + unsigned long blocks_written; + char string[21]; + int result = 0; + + if (!mi) + return -EFAULT; + + blocks_written = atomic_read(&mi->mi_blocks_written); + if (state->blocks_written == blocks_written) + return 0; + + result = snprintf(string, sizeof(string), "%lu", blocks_written); + if (result > len) + result = len; + if (copy_to_user(buf, string, result)) + return -EFAULT; + + state->blocks_written = blocks_written; + return result; +} + +static __poll_t blocks_written_poll(struct file *f, poll_table *wait) +{ + struct mount_info *mi = get_mount_info(file_superblock(f)); + struct blocks_written_file_state *state = f->private_data; + unsigned long blocks_written; + + if (!mi) + return 0; + + poll_wait(f, &mi->mi_blocks_written_notif_wq, wait); + blocks_written = atomic_read(&mi->mi_blocks_written); + if (state->blocks_written == blocks_written) + return 0; + + return EPOLLIN | EPOLLRDNORM; +} + +static int blocks_written_open(struct inode *inode, struct file *file) +{ + struct blocks_written_file_state *state = + kzalloc(sizeof(*state), GFP_NOFS); + + if (!state) + return -ENOMEM; + + state->blocks_written = -1; + file->private_data = state; + return 0; +} + +static int blocks_written_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static const struct file_operations incfs_blocks_written_file_ops = { + .read = blocks_written_read, + .poll = blocks_written_poll, + .open = blocks_written_open, + .release = blocks_written_release, + .llseek = noop_llseek, +}; + +/******************************************************************************* + * Generic inode lookup functionality + ******************************************************************************/ + +const struct mem_range incfs_pseudo_file_names[] = { + { .data = (u8 *)pending_reads_file_name, + .len = ARRAY_SIZE(pending_reads_file_name) - 1 }, + { .data = (u8 *)log_file_name, .len = ARRAY_SIZE(log_file_name) - 1 }, + { .data = (u8 *)blocks_written_file_name, + .len = ARRAY_SIZE(blocks_written_file_name) - 1 } +}; + +const unsigned long incfs_pseudo_file_inodes[] = { INCFS_PENDING_READS_INODE, + INCFS_LOG_INODE, + INCFS_BLOCKS_WRITTEN_INODE }; + +static const struct file_operations *const pseudo_file_operations[] = { + &incfs_pending_reads_file_ops, &incfs_log_file_ops, + &incfs_blocks_written_file_ops +}; + +static bool is_pseudo_filename(struct mem_range name) +{ + int i = 0; + + for (; i < ARRAY_SIZE(incfs_pseudo_file_names); ++i) + if (incfs_equal_ranges(incfs_pseudo_file_names[i], name)) + return true; + return false; +} + +static bool get_pseudo_inode(int ino, struct inode *inode) +{ + int i = 0; + + for (; i < ARRAY_SIZE(incfs_pseudo_file_inodes); ++i) + if (ino == incfs_pseudo_file_inodes[i]) + break; + if (i == ARRAY_SIZE(incfs_pseudo_file_inodes)) + return false; + + inode->i_ctime = (struct timespec64){}; + inode->i_mtime = inode->i_ctime; + inode->i_atime = inode->i_ctime; + inode->i_size = 0; + inode->i_ino = ino; + inode->i_private = NULL; + inode_init_owner(&init_user_ns, inode, NULL, S_IFREG | READ_WRITE_FILE_MODE); + inode->i_op = &incfs_file_inode_ops; + inode->i_fop = pseudo_file_operations[i]; + return true; +} + +struct inode_search { + unsigned long ino; +}; 
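[Editorial aside: the pseudo-file tables above are parallel arrays indexed together, so a compile-time guard can keep them and PSEUDO_FILE_COUNT in step. A sketch, not part of the original patch:

static_assert(ARRAY_SIZE(incfs_pseudo_file_names) == PSEUDO_FILE_COUNT);
static_assert(ARRAY_SIZE(incfs_pseudo_file_inodes) == PSEUDO_FILE_COUNT);
static_assert(ARRAY_SIZE(pseudo_file_operations) == PSEUDO_FILE_COUNT);
]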
+ +static int inode_test(struct inode *inode, void *opaque) +{ + struct inode_search *search = opaque; + + return inode->i_ino == search->ino; +} + +static int inode_set(struct inode *inode, void *opaque) +{ + struct inode_search *search = opaque; + + if (get_pseudo_inode(search->ino, inode)) + return 0; + + /* Unknown inode requested. */ + return -EINVAL; +} + +static struct inode *fetch_inode(struct super_block *sb, unsigned long ino) +{ + struct inode_search search = { + .ino = ino + }; + struct inode *inode = iget5_locked(sb, search.ino, inode_test, + inode_set, &search); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + + return inode; +} + +int dir_lookup_pseudo_files(struct super_block *sb, struct dentry *dentry) +{ + struct mem_range name_range = + range((u8 *)dentry->d_name.name, dentry->d_name.len); + unsigned long ino; + struct inode *inode; + int i = 0; + + for (; i < ARRAY_SIZE(incfs_pseudo_file_names); ++i) + if (incfs_equal_ranges(incfs_pseudo_file_names[i], name_range)) + break; + if (i == ARRAY_SIZE(incfs_pseudo_file_names)) + return -ENOENT; + + ino = incfs_pseudo_file_inodes[i]; + + inode = fetch_inode(sb, ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_add(dentry, inode); + return 0; +} + +int emit_pseudo_files(struct dir_context *ctx) +{ + loff_t i = ctx->pos; + + for (; i < ARRAY_SIZE(incfs_pseudo_file_names); ++i) { + if (!dir_emit(ctx, incfs_pseudo_file_names[i].data, + incfs_pseudo_file_names[i].len, + incfs_pseudo_file_inodes[i], DT_REG)) + return -EINVAL; + + ctx->pos++; + } + return 0; +} diff --git a/fs/incfs/pseudo_files.h b/fs/incfs/pseudo_files.h new file mode 100644 index 000000000000..188721837253 --- /dev/null +++ b/fs/incfs/pseudo_files.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2020 Google LLC + */ + +#ifndef _INCFS_PSEUDO_FILES_H +#define _INCFS_PSEUDO_FILES_H + +#include "internal.h" + +#define PSEUDO_FILE_COUNT 3 +#define INCFS_START_INO_RANGE 10 + +extern const struct mem_range incfs_pseudo_file_names[PSEUDO_FILE_COUNT]; +extern const unsigned long incfs_pseudo_file_inodes[PSEUDO_FILE_COUNT]; + +int dir_lookup_pseudo_files(struct super_block *sb, struct dentry *dentry); +int emit_pseudo_files(struct dir_context *ctx); + +#endif diff --git a/fs/incfs/sysfs.c b/fs/incfs/sysfs.c new file mode 100644 index 000000000000..5c7e0fd4496c --- /dev/null +++ b/fs/incfs/sysfs.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + */ +#include +#include + +#include + +#include "sysfs.h" +#include "data_mgmt.h" +#include "vfs.h" + +/****************************************************************************** + * Define sys/fs/incrementalfs & sys/fs/incrementalfs/features + *****************************************************************************/ +#define INCFS_NODE_FEATURES "features" +#define INCFS_NODE_INSTANCES "instances" + +static struct kobject *sysfs_root; +static struct kobject *features_node; +static struct kobject *instances_node; + +#define DECLARE_FEATURE_FLAG(name) \ + static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buff) \ +{ \ + return sysfs_emit(buff, "supported\n"); \ +} \ + \ +static struct kobj_attribute name##_attr = __ATTR_RO(name) + +DECLARE_FEATURE_FLAG(corefs); +DECLARE_FEATURE_FLAG(zstd); +DECLARE_FEATURE_FLAG(v2); +DECLARE_FEATURE_FLAG(bugfix_throttling); + +static struct attribute *attributes[] = { + &corefs_attr.attr, + &zstd_attr.attr, + &v2_attr.attr, + 
&bugfix_throttling_attr.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = attributes, +}; + +int __init incfs_init_sysfs(void) +{ + int res = -ENOMEM; + + sysfs_root = kobject_create_and_add(INCFS_NAME, fs_kobj); + if (!sysfs_root) + return -ENOMEM; + + instances_node = kobject_create_and_add(INCFS_NODE_INSTANCES, + sysfs_root); + if (!instances_node) + goto err_put_root; + + features_node = kobject_create_and_add(INCFS_NODE_FEATURES, + sysfs_root); + if (!features_node) + goto err_put_instances; + + res = sysfs_create_group(features_node, &attr_group); + if (res) + goto err_put_features; + + return 0; + +err_put_features: + kobject_put(features_node); +err_put_instances: + kobject_put(instances_node); +err_put_root: + kobject_put(sysfs_root); + + return res; +} + +void incfs_cleanup_sysfs(void) +{ + if (features_node) { + sysfs_remove_group(features_node, &attr_group); + kobject_put(features_node); + } + + kobject_put(instances_node); + kobject_put(sysfs_root); +} + +/****************************************************************************** + * Define sys/fs/incrementalfs/instances// + *****************************************************************************/ +#define __DECLARE_STATUS_FLAG(name) \ +static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buff) \ +{ \ + struct incfs_sysfs_node *node = container_of(kobj, \ + struct incfs_sysfs_node, isn_sysfs_node); \ + \ + return sysfs_emit(buff, "%d\n", node->isn_mi->mi_##name); \ +} \ + \ +static struct kobj_attribute name##_attr = __ATTR_RO(name) + +#define __DECLARE_STATUS_FLAG64(name) \ +static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buff) \ +{ \ + struct incfs_sysfs_node *node = container_of(kobj, \ + struct incfs_sysfs_node, isn_sysfs_node); \ + \ + return sysfs_emit(buff, "%lld\n", node->isn_mi->mi_##name); \ +} \ + \ +static struct kobj_attribute name##_attr = __ATTR_RO(name) + +__DECLARE_STATUS_FLAG(reads_failed_timed_out); +__DECLARE_STATUS_FLAG(reads_failed_hash_verification); +__DECLARE_STATUS_FLAG(reads_failed_other); +__DECLARE_STATUS_FLAG(reads_delayed_pending); +__DECLARE_STATUS_FLAG64(reads_delayed_pending_us); +__DECLARE_STATUS_FLAG(reads_delayed_min); +__DECLARE_STATUS_FLAG64(reads_delayed_min_us); + +static struct attribute *mount_attributes[] = { + &reads_failed_timed_out_attr.attr, + &reads_failed_hash_verification_attr.attr, + &reads_failed_other_attr.attr, + &reads_delayed_pending_attr.attr, + &reads_delayed_pending_us_attr.attr, + &reads_delayed_min_attr.attr, + &reads_delayed_min_us_attr.attr, + NULL, +}; + +static void incfs_sysfs_release(struct kobject *kobj) +{ + struct incfs_sysfs_node *node = container_of(kobj, + struct incfs_sysfs_node, isn_sysfs_node); + + complete(&node->isn_completion); +} + +static const struct attribute_group mount_attr_group = { + .attrs = mount_attributes, +}; + +static struct kobj_type incfs_kobj_node_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = &incfs_sysfs_release, +}; + +struct incfs_sysfs_node *incfs_add_sysfs_node(const char *name, + struct mount_info *mi) +{ + struct incfs_sysfs_node *node = NULL; + int error; + + if (!name) + return NULL; + + node = kzalloc(sizeof(*node), GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + + node->isn_mi = mi; + + init_completion(&node->isn_completion); + kobject_init(&node->isn_sysfs_node, &incfs_kobj_node_ktype); + error = kobject_add(&node->isn_sysfs_node, instances_node, "%s", name); + if (error) + goto err; + + 
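	/*
	 * From this point the kobject owns the node's lifetime: failures
	 * must unwind through kobject_put() so the release callback runs
	 * and completes isn_completion (see the note at the err label
	 * below), rather than freeing the node directly.
	 */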
error = sysfs_create_group(&node->isn_sysfs_node, &mount_attr_group); + if (error) + goto err; + + return node; + +err: + /* + * Note kobject_put always calls release, so incfs_sysfs_release will + * free node + */ + kobject_put(&node->isn_sysfs_node); + return ERR_PTR(error); +} + +void incfs_free_sysfs_node(struct incfs_sysfs_node *node) +{ + if (!node) + return; + + sysfs_remove_group(&node->isn_sysfs_node, &mount_attr_group); + kobject_put(&node->isn_sysfs_node); + wait_for_completion_interruptible(&node->isn_completion); + kfree(node); +} diff --git a/fs/incfs/sysfs.h b/fs/incfs/sysfs.h new file mode 100644 index 000000000000..65bf55463738 --- /dev/null +++ b/fs/incfs/sysfs.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + */ +#ifndef _INCFS_SYSFS_H +#define _INCFS_SYSFS_H + +struct incfs_sysfs_node { + struct kobject isn_sysfs_node; + + struct completion isn_completion; + + struct mount_info *isn_mi; +}; + +int incfs_init_sysfs(void); +void incfs_cleanup_sysfs(void); +struct incfs_sysfs_node *incfs_add_sysfs_node(const char *name, + struct mount_info *mi); +void incfs_free_sysfs_node(struct incfs_sysfs_node *node); + +#endif diff --git a/fs/incfs/verity.c b/fs/incfs/verity.c new file mode 100644 index 000000000000..562a8e774178 --- /dev/null +++ b/fs/incfs/verity.c @@ -0,0 +1,851 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ + +/* + * fs-verity integration into incfs + * + * Since incfs has its own merkle tree implementation, most of fs-verity code + * is not needed. The key part that is needed is the signature check, since + * that is based on the private /proc/sys/fs/verity/require_signatures value + * and a private keyring. Thus the first change is to modify verity code to + * export a version of fsverity_verify_signature. + * + * fs-verity integration then consists of the following modifications: + * + * 1. Add the (optional) verity signature to the incfs file format + * 2. Add a pointer to the digest of the fs-verity descriptor struct to the + * data_file struct that incfs attaches to each file inode. + * 3. Add the following ioclts: + * - FS_IOC_ENABLE_VERITY + * - FS_IOC_GETFLAGS + * - FS_IOC_MEASURE_VERITY + * 4. When FS_IOC_ENABLE_VERITY is called on a non-verity file, the + * fs-verity descriptor struct is populated and digested. If it passes the + * signature check or the signature is NULL and + * fs.verity.require_signatures=0, then the S_VERITY flag is set and the + * xattr incfs.verity is set. If the signature is non-NULL, an + * INCFS_MD_VERITY_SIGNATURE is added to the backing file containing the + * signature. + * 5. When a file with an incfs.verity xattr's inode is initialized, the + * inode’s S_VERITY flag is set. + * 6. When a file with the S_VERITY flag set on its inode is opened, the + * data_file is checked for its verity digest. If the file doesn’t have a + * digest, the file’s digest is calculated as above, checked, and set, or the + * open is denied if it is not valid. + * 7. FS_IOC_GETFLAGS simply returns the value of the S_VERITY flag + * 8. FS_IOC_MEASURE_VERITY simply returns the cached digest + * 9. The final complication is that if FS_IOC_ENABLE_VERITY is called on a file + * which doesn’t have a merkle tree, the merkle tree is calculated before the + * rest of the process is completed. 
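 *
 * For illustration, enabling verity from userspace is the stock
 * fs-verity sequence; incfs additionally insists on SHA-256, a
 * block_size equal to PAGE_SIZE and an empty salt (see
 * incfs_ioctl_enable_verity() below). A sketch:
 *
 *	struct fsverity_enable_arg arg = {
 *		.version = 1,
 *		.hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
 *		.block_size = 4096,	// PAGE_SIZE on most configurations
 *	};
 *
 *	ioctl(fd, FS_IOC_ENABLE_VERITY, &arg);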
+ */ + +#include +#include +#include +#include + +#include "verity.h" + +#include "data_mgmt.h" +#include "format.h" +#include "integrity.h" +#include "vfs.h" + +#define FS_VERITY_MAX_SIGNATURE_SIZE 16128 + +static int incfs_get_root_hash(struct file *filp, u8 *root_hash) +{ + struct data_file *df = get_incfs_data_file(filp); + + if (!df) + return -EINVAL; + + memcpy(root_hash, df->df_hash_tree->root_hash, + df->df_hash_tree->alg->digest_size); + + return 0; +} + +static int incfs_end_enable_verity(struct file *filp, u8 *sig, size_t sig_size) +{ + struct inode *inode = file_inode(filp); + struct mem_range signature = { + .data = sig, + .len = sig_size, + }; + struct data_file *df = get_incfs_data_file(filp); + struct backing_file_context *bfc; + int error; + struct incfs_df_verity_signature *vs = NULL; + loff_t offset; + + if (!df || !df->df_backing_file_context) + return -EFSCORRUPTED; + + if (sig) { + vs = kzalloc(sizeof(*vs), GFP_NOFS); + if (!vs) + return -ENOMEM; + } + + bfc = df->df_backing_file_context; + error = mutex_lock_interruptible(&bfc->bc_mutex); + if (error) + goto out; + + error = incfs_write_verity_signature_to_backing_file(bfc, signature, + &offset); + mutex_unlock(&bfc->bc_mutex); + if (error) + goto out; + + /* + * Set verity xattr so we can set S_VERITY without opening backing file + */ + error = vfs_setxattr(&init_user_ns, bfc->bc_file->f_path.dentry, + INCFS_XATTR_VERITY_NAME, NULL, 0, XATTR_CREATE); + if (error) { + pr_warn("incfs: error setting verity xattr: %d\n", error); + goto out; + } + + if (sig) { + *vs = (struct incfs_df_verity_signature) { + .size = signature.len, + .offset = offset, + }; + + df->df_verity_signature = vs; + vs = NULL; + } + + inode_set_flags(inode, S_VERITY, S_VERITY); + +out: + kfree(vs); + return error; +} + +static int incfs_compute_file_digest(struct incfs_hash_alg *alg, + struct fsverity_descriptor *desc, + u8 *digest) +{ + SHASH_DESC_ON_STACK(d, alg->shash); + + d->tfm = alg->shash; + return crypto_shash_digest(d, (u8 *)desc, sizeof(*desc), digest); +} + +static enum incfs_hash_tree_algorithm incfs_convert_fsverity_hash_alg( + int hash_alg) +{ + switch (hash_alg) { + case FS_VERITY_HASH_ALG_SHA256: + return INCFS_HASH_TREE_SHA256; + default: + return -EINVAL; + } +} + +static struct mem_range incfs_get_verity_digest(struct inode *inode) +{ + struct inode_info *node = get_incfs_node(inode); + struct data_file *df; + struct mem_range verity_file_digest; + + if (!node) { + pr_warn("Invalid inode\n"); + return range(NULL, 0); + } + + df = node->n_file; + + /* + * Pairs with the cmpxchg_release() in incfs_set_verity_digest(). + * I.e., another task may publish ->df_verity_file_digest concurrently, + * executing a RELEASE barrier. We need to use smp_load_acquire() here + * to safely ACQUIRE the memory the other task published. + */ + verity_file_digest.data = smp_load_acquire( + &df->df_verity_file_digest.data); + verity_file_digest.len = df->df_verity_file_digest.len; + return verity_file_digest; +} + +static void incfs_set_verity_digest(struct inode *inode, + struct mem_range verity_file_digest) +{ + struct inode_info *node = get_incfs_node(inode); + struct data_file *df; + + if (!node) { + pr_warn("Invalid inode\n"); + kfree(verity_file_digest.data); + return; + } + + df = node->n_file; + df->df_verity_file_digest.len = verity_file_digest.len; + + /* + * Multiple tasks may race to set ->df_verity_file_digest.data, so use + * cmpxchg_release(). This pairs with the smp_load_acquire() in + * incfs_get_verity_digest(). 
I.e., here we publish + * ->df_verity_file_digest.data, with a RELEASE barrier so that other + * tasks can ACQUIRE it. + */ + if (cmpxchg_release(&df->df_verity_file_digest.data, NULL, + verity_file_digest.data) != NULL) + /* Lost the race, so free the file_digest we allocated. */ + kfree(verity_file_digest.data); +} + +/* + * Calculate the digest of the fsverity_descriptor. The signature (if present) + * is also checked. + */ +static struct mem_range incfs_calc_verity_digest_from_desc( + const struct inode *inode, + struct fsverity_descriptor *desc, + u8 *signature, size_t sig_size) +{ + enum incfs_hash_tree_algorithm incfs_hash_alg; + struct mem_range verity_file_digest; + int err; + struct incfs_hash_alg *hash_alg; + + incfs_hash_alg = incfs_convert_fsverity_hash_alg(desc->hash_algorithm); + if (incfs_hash_alg < 0) + return range(ERR_PTR(incfs_hash_alg), 0); + + hash_alg = incfs_get_hash_alg(incfs_hash_alg); + if (IS_ERR(hash_alg)) + return range((u8 *)hash_alg, 0); + + verity_file_digest = range(kzalloc(hash_alg->digest_size, GFP_KERNEL), + hash_alg->digest_size); + if (!verity_file_digest.data) + return range(ERR_PTR(-ENOMEM), 0); + + err = incfs_compute_file_digest(hash_alg, desc, + verity_file_digest.data); + if (err) { + pr_err("Error %d computing file digest", err); + goto out; + } + pr_debug("Computed file digest: %s:%*phN\n", + hash_alg->name, (int) verity_file_digest.len, + verity_file_digest.data); + + err = __fsverity_verify_signature(inode, signature, sig_size, + verity_file_digest.data, + desc->hash_algorithm); +out: + if (err) { + kfree(verity_file_digest.data); + verity_file_digest = range(ERR_PTR(err), 0); + } + return verity_file_digest; +} + +static struct fsverity_descriptor *incfs_get_fsverity_descriptor( + struct file *filp, int hash_algorithm) +{ + struct inode *inode = file_inode(filp); + struct fsverity_descriptor *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + int err; + + if (!desc) + return ERR_PTR(-ENOMEM); + + *desc = (struct fsverity_descriptor) { + .version = 1, + .hash_algorithm = hash_algorithm, + .log_blocksize = ilog2(INCFS_DATA_FILE_BLOCK_SIZE), + .data_size = cpu_to_le64(inode->i_size), + }; + + err = incfs_get_root_hash(filp, desc->root_hash); + if (err) { + kfree(desc); + return ERR_PTR(err); + } + + return desc; +} + +static struct mem_range incfs_calc_verity_digest( + struct inode *inode, struct file *filp, + u8 *signature, size_t signature_size, + int hash_algorithm) +{ + struct fsverity_descriptor *desc = incfs_get_fsverity_descriptor(filp, + hash_algorithm); + struct mem_range verity_file_digest; + + if (IS_ERR(desc)) + return range((u8 *)desc, 0); + verity_file_digest = incfs_calc_verity_digest_from_desc(inode, desc, + signature, signature_size); + kfree(desc); + return verity_file_digest; +} + +static int incfs_build_merkle_tree(struct file *f, struct data_file *df, + struct backing_file_context *bfc, + struct mtree *hash_tree, loff_t hash_offset, + struct incfs_hash_alg *alg, struct mem_range hash) +{ + int error = 0; + int limit, lvl, i, result; + struct mem_range buf = {.len = INCFS_DATA_FILE_BLOCK_SIZE}; + struct mem_range tmp = {.len = 2 * INCFS_DATA_FILE_BLOCK_SIZE}; + + buf.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(buf.len)); + tmp.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(tmp.len)); + if (!buf.data || !tmp.data) { + error = -ENOMEM; + goto out; + } + + /* + * lvl - 1 is the level we are reading, lvl the level we are writing + * lvl == -1 means actual blocks + * lvl == hash_tree->depth means root hash + */ + limit = 
df->df_data_block_count; + for (lvl = 0; lvl <= hash_tree->depth; lvl++) { + for (i = 0; i < limit; ++i) { + loff_t hash_level_offset; + struct mem_range partial_buf = buf; + + if (lvl == 0) + result = incfs_read_data_file_block(partial_buf, + f, i, tmp, NULL, NULL); + else { + hash_level_offset = hash_offset + + hash_tree->hash_level_suboffset[lvl - 1]; + + result = incfs_kread(bfc, partial_buf.data, + partial_buf.len, + hash_level_offset + i * + INCFS_DATA_FILE_BLOCK_SIZE); + } + + if (result < 0) { + error = result; + goto out; + } + + partial_buf.len = result; + error = incfs_calc_digest(alg, partial_buf, hash); + if (error) + goto out; + + /* + * last level - only one hash to take and it is stored + * in the incfs signature record + */ + if (lvl == hash_tree->depth) + break; + + hash_level_offset = hash_offset + + hash_tree->hash_level_suboffset[lvl]; + + result = incfs_kwrite(bfc, hash.data, hash.len, + hash_level_offset + hash.len * i); + + if (result < 0) { + error = result; + goto out; + } + + if (result != hash.len) { + error = -EIO; + goto out; + } + } + limit = DIV_ROUND_UP(limit, + INCFS_DATA_FILE_BLOCK_SIZE / hash.len); + } + +out: + free_pages((unsigned long)tmp.data, get_order(tmp.len)); + free_pages((unsigned long)buf.data, get_order(buf.len)); + return error; +} + +/* + * incfs files have a signature record that is separate from the + * verity_signature record. The signature record does not actually contain a + * signature, rather it contains the size/offset of the hash tree, and a binary + * blob which contains the root hash and potentially a signature. + * + * If the file was created with a signature record, then this function simply + * returns. + * + * Otherwise it will create a signature record with a minimal binary blob as + * defined by the structure below, create space for the hash tree and then + * populate it using incfs_build_merkle_tree + */ +static int incfs_add_signature_record(struct file *f) +{ + /* See incfs_parse_signature */ + struct { + __le32 version; + __le32 size_of_hash_info_section; + struct { + __le32 hash_algorithm; + u8 log2_blocksize; + __le32 salt_size; + u8 salt[0]; + __le32 hash_size; + u8 root_hash[32]; + } __packed hash_section; + __le32 size_of_signing_info_section; + u8 signing_info_section[0]; + } __packed sig = { + .version = cpu_to_le32(INCFS_SIGNATURE_VERSION), + .size_of_hash_info_section = + cpu_to_le32(sizeof(sig.hash_section)), + .hash_section = { + .hash_algorithm = cpu_to_le32(INCFS_HASH_TREE_SHA256), + .log2_blocksize = ilog2(INCFS_DATA_FILE_BLOCK_SIZE), + .hash_size = cpu_to_le32(SHA256_DIGEST_SIZE), + }, + }; + + struct data_file *df = get_incfs_data_file(f); + struct mtree *hash_tree = NULL; + struct backing_file_context *bfc; + int error; + loff_t hash_offset, sig_offset; + struct incfs_hash_alg *alg = incfs_get_hash_alg(INCFS_HASH_TREE_SHA256); + u8 hash_buf[INCFS_MAX_HASH_SIZE]; + int hash_size = alg->digest_size; + struct mem_range hash = range(hash_buf, hash_size); + int result; + struct incfs_df_signature *signature = NULL; + + if (!df) + return -EINVAL; + + if (df->df_header_flags & INCFS_FILE_MAPPED) + return -EINVAL; + + /* Already signed? 
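 * (The signature record and the in-memory tree are only ever created
 * together, so finding exactly one of df_signature/df_hash_tree below
 * means the backing file is corrupt, hence -EFSCORRUPTED.)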
*/ + if (df->df_signature && df->df_hash_tree) + return 0; + + if (df->df_signature || df->df_hash_tree) + return -EFSCORRUPTED; + + /* Add signature metadata record to file */ + hash_tree = incfs_alloc_mtree(range((u8 *)&sig, sizeof(sig)), + df->df_data_block_count); + if (IS_ERR(hash_tree)) + return PTR_ERR(hash_tree); + + bfc = df->df_backing_file_context; + if (!bfc) { + error = -EFSCORRUPTED; + goto out; + } + + error = mutex_lock_interruptible(&bfc->bc_mutex); + if (error) + goto out; + + error = incfs_write_signature_to_backing_file(bfc, + range((u8 *)&sig, sizeof(sig)), + hash_tree->hash_tree_area_size, + &hash_offset, &sig_offset); + mutex_unlock(&bfc->bc_mutex); + if (error) + goto out; + + /* Populate merkle tree */ + error = incfs_build_merkle_tree(f, df, bfc, hash_tree, hash_offset, alg, + hash); + if (error) + goto out; + + /* Update signature metadata record */ + memcpy(sig.hash_section.root_hash, hash.data, alg->digest_size); + result = incfs_kwrite(bfc, &sig, sizeof(sig), sig_offset); + if (result < 0) { + error = result; + goto out; + } + + if (result != sizeof(sig)) { + error = -EIO; + goto out; + } + + /* Update in-memory records */ + memcpy(hash_tree->root_hash, hash.data, alg->digest_size); + signature = kzalloc(sizeof(*signature), GFP_NOFS); + if (!signature) { + error = -ENOMEM; + goto out; + } + *signature = (struct incfs_df_signature) { + .hash_offset = hash_offset, + .hash_size = hash_tree->hash_tree_area_size, + .sig_offset = sig_offset, + .sig_size = sizeof(sig), + }; + df->df_signature = signature; + signature = NULL; + + /* + * Use memory barrier to prevent readpage seeing the hash tree until + * it's fully there + */ + smp_store_release(&df->df_hash_tree, hash_tree); + hash_tree = NULL; + +out: + kfree(signature); + kfree(hash_tree); + return error; +} + +static int incfs_enable_verity(struct file *filp, + const struct fsverity_enable_arg *arg) +{ + struct inode *inode = file_inode(filp); + struct data_file *df = get_incfs_data_file(filp); + u8 *signature = NULL; + struct mem_range verity_file_digest = range(NULL, 0); + int err; + + if (!df) + return -EFSCORRUPTED; + + err = mutex_lock_interruptible(&df->df_enable_verity); + if (err) + return err; + + if (IS_VERITY(inode)) { + err = -EEXIST; + goto out; + } + + err = incfs_add_signature_record(filp); + if (err) + goto out; + + /* Get the signature if the user provided one */ + if (arg->sig_size) { + signature = memdup_user(u64_to_user_ptr(arg->sig_ptr), + arg->sig_size); + if (IS_ERR(signature)) { + err = PTR_ERR(signature); + signature = NULL; + goto out; + } + } + + verity_file_digest = incfs_calc_verity_digest(inode, filp, signature, + arg->sig_size, arg->hash_algorithm); + if (IS_ERR(verity_file_digest.data)) { + err = PTR_ERR(verity_file_digest.data); + verity_file_digest.data = NULL; + goto out; + } + + err = incfs_end_enable_verity(filp, signature, arg->sig_size); + if (err) + goto out; + + /* Successfully enabled verity */ + incfs_set_verity_digest(inode, verity_file_digest); + verity_file_digest.data = NULL; +out: + mutex_unlock(&df->df_enable_verity); + kfree(signature); + kfree(verity_file_digest.data); + if (err) + pr_err("%s failed with err %d\n", __func__, err); + return err; +} + +int incfs_ioctl_enable_verity(struct file *filp, const void __user *uarg) +{ + struct inode *inode = file_inode(filp); + struct fsverity_enable_arg arg; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (arg.version != 1) + return -EINVAL; + + if (arg.__reserved1 || + 
memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2))) + return -EINVAL; + + if (arg.hash_algorithm != FS_VERITY_HASH_ALG_SHA256) + return -EINVAL; + + if (arg.block_size != PAGE_SIZE) + return -EINVAL; + + if (arg.salt_size) + return -EINVAL; + + if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE) + return -EMSGSIZE; + + if (S_ISDIR(inode->i_mode)) + return -EISDIR; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + return incfs_enable_verity(filp, &arg); +} + +static u8 *incfs_get_verity_signature(struct file *filp, size_t *sig_size) +{ + struct data_file *df = get_incfs_data_file(filp); + struct incfs_df_verity_signature *vs; + u8 *signature; + int res; + + if (!df || !df->df_backing_file_context) + return ERR_PTR(-EFSCORRUPTED); + + vs = df->df_verity_signature; + if (!vs) { + *sig_size = 0; + return NULL; + } + + if (!vs->size) { + *sig_size = 0; + return ERR_PTR(-EFSCORRUPTED); + } + + signature = kzalloc(vs->size, GFP_KERNEL); + if (!signature) + return ERR_PTR(-ENOMEM); + + res = incfs_kread(df->df_backing_file_context, + signature, vs->size, vs->offset); + + if (res < 0) + goto err_out; + + if (res != vs->size) { + res = -EINVAL; + goto err_out; + } + + *sig_size = vs->size; + return signature; + +err_out: + kfree(signature); + return ERR_PTR(res); +} + +/* Ensure data_file->df_verity_file_digest is populated */ +static int ensure_verity_info(struct inode *inode, struct file *filp) +{ + struct mem_range verity_file_digest; + u8 *signature = NULL; + size_t sig_size; + int err = 0; + + /* See if this file's verity file digest is already cached */ + verity_file_digest = incfs_get_verity_digest(inode); + if (verity_file_digest.data) + return 0; + + signature = incfs_get_verity_signature(filp, &sig_size); + if (IS_ERR(signature)) + return PTR_ERR(signature); + + verity_file_digest = incfs_calc_verity_digest(inode, filp, signature, + sig_size, + FS_VERITY_HASH_ALG_SHA256); + if (IS_ERR(verity_file_digest.data)) { + err = PTR_ERR(verity_file_digest.data); + goto out; + } + + incfs_set_verity_digest(inode, verity_file_digest); + +out: + kfree(signature); + return err; +} + +/** + * incfs_fsverity_file_open() - prepare to open a file that may be + * verity-enabled + * @inode: the inode being opened + * @filp: the struct file being set up + * + * When opening a verity file, set up data_file->df_verity_file_digest if not + * already done. Note that incfs does not allow opening for writing, so there is + * no need for that check. + * + * Return: 0 on success, -errno on failure + */ +int incfs_fsverity_file_open(struct inode *inode, struct file *filp) +{ + if (IS_VERITY(inode)) + return ensure_verity_info(inode, filp); + + return 0; +} + +int incfs_ioctl_measure_verity(struct file *filp, void __user *_uarg) +{ + struct inode *inode = file_inode(filp); + struct mem_range verity_file_digest = incfs_get_verity_digest(inode); + struct fsverity_digest __user *uarg = _uarg; + struct fsverity_digest arg; + + if (!verity_file_digest.data || !verity_file_digest.len) + return -ENODATA; /* not a verity file */ + + /* + * The user specifies the digest_size their buffer has space for; we can + * return the digest if it fits in the available space. We write back + * the actual size, which may be shorter than the user-specified size. 
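+ * This matches fs/verity's FS_IOC_MEASURE_VERITY semantics: a buffer that + * is too small fails with -EOVERFLOW rather than truncating the digest. + * Userspace sketch (hypothetical, error handling omitted): + * + *	struct fsverity_digest *d = malloc(sizeof(*d) + 64); + * + *	d->digest_size = 64; + *	ioctl(fd, FS_IOC_MEASURE_VERITY, d); + * + * after which d->digest holds d->digest_size bytes (SHA-256 here, so 32).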
+ */ + + if (get_user(arg.digest_size, &uarg->digest_size)) + return -EFAULT; + if (arg.digest_size < verity_file_digest.len) + return -EOVERFLOW; + + memset(&arg, 0, sizeof(arg)); + arg.digest_algorithm = FS_VERITY_HASH_ALG_SHA256; + arg.digest_size = verity_file_digest.len; + + if (copy_to_user(uarg, &arg, sizeof(arg))) + return -EFAULT; + + if (copy_to_user(uarg->digest, verity_file_digest.data, + verity_file_digest.len)) + return -EFAULT; + + return 0; +} + +static int incfs_read_merkle_tree(struct file *filp, void __user *buf, + u64 start_offset, int length) +{ + struct mem_range tmp_buf; + size_t offset; + int retval = 0; + int err = 0; + struct data_file *df = get_incfs_data_file(filp); + + if (!df) + return -EINVAL; + + tmp_buf = (struct mem_range) { + .data = kzalloc(INCFS_DATA_FILE_BLOCK_SIZE, GFP_NOFS), + .len = INCFS_DATA_FILE_BLOCK_SIZE, + }; + if (!tmp_buf.data) + return -ENOMEM; + + for (offset = start_offset; offset < start_offset + length; + offset += tmp_buf.len) { + err = incfs_read_merkle_tree_blocks(tmp_buf, df, offset); + + if (err < 0) + break; + + if (err != tmp_buf.len) + break; + + if (copy_to_user(buf, tmp_buf.data, tmp_buf.len)) { + err = -EFAULT; + break; + } + + buf += tmp_buf.len; + retval += tmp_buf.len; + } + + kfree(tmp_buf.data); + return retval ? retval : err; +} + +static int incfs_read_descriptor(struct file *filp, + void __user *buf, u64 offset, int length) +{ + int err; + struct fsverity_descriptor *desc = incfs_get_fsverity_descriptor(filp, + FS_VERITY_HASH_ALG_SHA256); + + if (IS_ERR(desc)) + return PTR_ERR(desc); + length = min_t(u64, length, sizeof(*desc)); + err = copy_to_user(buf, desc, length); + kfree(desc); + return err ? -EFAULT : length; +} + +static int incfs_read_signature(struct file *filp, + void __user *buf, u64 offset, int length) +{ + size_t sig_size; + u8 *signature; + int err; + + signature = incfs_get_verity_signature(filp, &sig_size); + if (IS_ERR(signature)) + return PTR_ERR(signature); + + if (!signature) + return -ENODATA; + + length = min_t(u64, length, sig_size); + err = copy_to_user(buf, signature, length); + kfree(signature); + return err ? -EFAULT : length; +} + +int incfs_ioctl_read_verity_metadata(struct file *filp, + const void __user *uarg) +{ + struct fsverity_read_metadata_arg arg; + int length; + void __user *buf; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (arg.__reserved) + return -EINVAL; + + /* offset + length must not overflow. */ + if (arg.offset + arg.length < arg.offset) + return -EINVAL; + + /* Ensure that the return value will fit in INT_MAX. */ + length = min_t(u64, arg.length, INT_MAX); + + buf = u64_to_user_ptr(arg.buf_ptr); + + switch (arg.metadata_type) { + case FS_VERITY_METADATA_TYPE_MERKLE_TREE: + return incfs_read_merkle_tree(filp, buf, arg.offset, length); + case FS_VERITY_METADATA_TYPE_DESCRIPTOR: + return incfs_read_descriptor(filp, buf, arg.offset, length); + case FS_VERITY_METADATA_TYPE_SIGNATURE: + return incfs_read_signature(filp, buf, arg.offset, length); + default: + return -EINVAL; + } +} diff --git a/fs/incfs/verity.h b/fs/incfs/verity.h new file mode 100644 index 000000000000..8fcdbc8b93d6 --- /dev/null +++ b/fs/incfs/verity.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2020 Google LLC + */ + +#ifndef _INCFS_VERITY_H +#define _INCFS_VERITY_H + +/* Arbitrary limit to bound the kmalloc() size. Can be changed. 
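The 16128-byte value appears to match the internal limit of the same name in fs/verity. 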
*/ +#define FS_VERITY_MAX_SIGNATURE_SIZE 16128 + +#ifdef CONFIG_FS_VERITY + +int incfs_ioctl_enable_verity(struct file *filp, const void __user *uarg); +int incfs_ioctl_measure_verity(struct file *filp, void __user *_uarg); + +int incfs_fsverity_file_open(struct inode *inode, struct file *filp); +int incfs_ioctl_read_verity_metadata(struct file *filp, + const void __user *uarg); + +#else /* !CONFIG_FS_VERITY */ + +static inline int incfs_ioctl_enable_verity(struct file *filp, + const void __user *uarg) +{ + return -EOPNOTSUPP; +} + +static inline int incfs_ioctl_measure_verity(struct file *filp, + void __user *_uarg) +{ + return -EOPNOTSUPP; +} + +static inline int incfs_fsverity_file_open(struct inode *inode, + struct file *filp) +{ + return -EOPNOTSUPP; +} + +static inline int incfs_ioctl_read_verity_metadata(struct file *filp, + const void __user *uarg) +{ + return -EOPNOTSUPP; +} + +#endif /* !CONFIG_FS_VERITY */ + +#endif /* _INCFS_VERITY_H */ diff --git a/fs/incfs/vfs.c b/fs/incfs/vfs.c new file mode 100644 index 000000000000..61d62167ad2e --- /dev/null +++ b/fs/incfs/vfs.c @@ -0,0 +1,1986 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018 Google LLC + */ + +#include <linux/backing-dev.h> +#include <linux/blkdev.h> +#include <linux/compat.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fs_stack.h> +#include <linux/fsnotify.h> +#include <linux/fsverity.h> +#include <linux/namei.h> +#include <linux/pagemap.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/xattr.h> + +#include <uapi/linux/incrementalfs.h> + +#include "vfs.h" + +#include "data_mgmt.h" +#include "format.h" +#include "internal.h" +#include "pseudo_files.h" +#include "sysfs.h" +#include "verity.h" + +static int incfs_remount_fs(struct super_block *sb, int *flags, char *data); + +static int dentry_revalidate(struct dentry *dentry, unsigned int flags); +static void dentry_release(struct dentry *d); + +static int iterate_incfs_dir(struct file *file, struct dir_context *ctx); +static struct dentry *dir_lookup(struct inode *dir_inode, + struct dentry *dentry, unsigned int flags); +static int dir_mkdir(struct user_namespace *ns, struct inode *dir, + struct dentry *dentry, umode_t mode); +static int dir_unlink(struct inode *dir, struct dentry *dentry); +static int dir_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry); +static int dir_rmdir(struct inode *dir, struct dentry *dentry); +static int dir_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); + +static int file_open(struct inode *inode, struct file *file); +static int file_release(struct inode *inode, struct file *file); +static int read_single_page(struct file *f, struct page *page); +static long dispatch_ioctl(struct file *f, unsigned int req, unsigned long arg); + +#ifdef CONFIG_COMPAT +static long incfs_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +#endif + +static struct inode *alloc_inode(struct super_block *sb); +static void free_inode(struct inode *inode); +static void evict_inode(struct inode *inode); + +static int incfs_setattr(struct user_namespace *ns, struct dentry *dentry, + struct iattr *ia); +static int incfs_getattr(struct user_namespace *ns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags); +static ssize_t incfs_getxattr(struct dentry *d, const char *name, + void *value, size_t size); +static ssize_t incfs_setxattr(struct user_namespace *ns, struct dentry *d, + const char *name, const void *value, size_t size, + int flags); +static ssize_t incfs_listxattr(struct dentry *d, char *list, size_t size); + +static int show_options(struct seq_file *, struct dentry *); + +static const struct 
super_operations incfs_super_ops = { + .statfs = simple_statfs, + .remount_fs = incfs_remount_fs, + .alloc_inode = alloc_inode, + .destroy_inode = free_inode, + .evict_inode = evict_inode, + .show_options = show_options +}; + +static int dir_rename_wrap(struct user_namespace *ns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + return dir_rename(old_dir, old_dentry, new_dir, new_dentry, flags); +} + +static const struct inode_operations incfs_dir_inode_ops = { + .lookup = dir_lookup, + .mkdir = dir_mkdir, + .rename = dir_rename_wrap, + .unlink = dir_unlink, + .link = dir_link, + .rmdir = dir_rmdir, + .setattr = incfs_setattr, +}; + +static const struct file_operations incfs_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = iterate_incfs_dir, + .open = file_open, + .release = file_release, +}; + +static const struct dentry_operations incfs_dentry_ops = { + .d_revalidate = dentry_revalidate, + .d_release = dentry_release +}; + +static const struct address_space_operations incfs_address_space_ops = { + .readpage = read_single_page, + /* .readpages = readpages */ +}; + +static vm_fault_t incfs_fault(struct vm_fault *vmf) +{ + vmf->flags &= ~FAULT_FLAG_ALLOW_RETRY; + return filemap_fault(vmf); +} + +static const struct vm_operations_struct incfs_file_vm_ops = { + .fault = incfs_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, +}; + +/* This is used for a general mmap of a disk file */ + +static int incfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &incfs_file_vm_ops; + return 0; +} + +const struct file_operations incfs_file_ops = { + .open = file_open, + .release = file_release, + .read_iter = generic_file_read_iter, + .mmap = incfs_file_mmap, + .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, + .unlocked_ioctl = dispatch_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = incfs_compat_ioctl, +#endif +}; + +const struct inode_operations incfs_file_inode_ops = { + .setattr = incfs_setattr, + .getattr = incfs_getattr, + .listxattr = incfs_listxattr +}; + +static int incfs_handler_getxattr(const struct xattr_handler *xh, + struct dentry *d, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return incfs_getxattr(d, name, buffer, size); +} + +static int incfs_handler_setxattr(const struct xattr_handler *xh, + struct user_namespace *ns, + struct dentry *d, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) +{ + return incfs_setxattr(ns, d, name, buffer, size, flags); +} + +static const struct xattr_handler incfs_xattr_handler = { + .prefix = "", /* AKA all attributes */ + .get = incfs_handler_getxattr, + .set = incfs_handler_setxattr, +}; + +static const struct xattr_handler *incfs_xattr_ops[] = { + &incfs_xattr_handler, + NULL, +}; + +struct inode_search { + unsigned long ino; + + struct dentry *backing_dentry; + + size_t size; + + bool verity; +}; + +enum parse_parameter { + Opt_read_timeout, + Opt_readahead_pages, + Opt_rlog_pages, + Opt_rlog_wakeup_cnt, + Opt_report_uid, + Opt_sysfs_name, + Opt_err +}; + +static const match_table_t option_tokens = { + { Opt_read_timeout, "read_timeout_ms=%u" }, + { Opt_readahead_pages, "readahead=%u" }, + { Opt_rlog_pages, "rlog_pages=%u" }, + { Opt_rlog_wakeup_cnt, 
"rlog_wakeup_cnt=%u" }, + { Opt_report_uid, "report_uid" }, + { Opt_sysfs_name, "sysfs_name=%s" }, + { Opt_err, NULL } +}; + +static void free_options(struct mount_options *opts) +{ + kfree(opts->sysfs_name); + opts->sysfs_name = NULL; +} + +static int parse_options(struct mount_options *opts, char *str) +{ + substring_t args[MAX_OPT_ARGS]; + int value; + char *position; + + if (opts == NULL) + return -EFAULT; + + *opts = (struct mount_options) { + .read_timeout_ms = 1000, /* Default: 1s */ + .readahead_pages = 10, + .read_log_pages = 2, + .read_log_wakeup_count = 10, + }; + + if (str == NULL || *str == 0) + return 0; + + while ((position = strsep(&str, ",")) != NULL) { + int token; + + if (!*position) + continue; + + token = match_token(position, option_tokens, args); + + switch (token) { + case Opt_read_timeout: + if (match_int(&args[0], &value)) + return -EINVAL; + if (value > 3600000) + return -EINVAL; + opts->read_timeout_ms = value; + break; + case Opt_readahead_pages: + if (match_int(&args[0], &value)) + return -EINVAL; + opts->readahead_pages = value; + break; + case Opt_rlog_pages: + if (match_int(&args[0], &value)) + return -EINVAL; + opts->read_log_pages = value; + break; + case Opt_rlog_wakeup_cnt: + if (match_int(&args[0], &value)) + return -EINVAL; + opts->read_log_wakeup_count = value; + break; + case Opt_report_uid: + opts->report_uid = true; + break; + case Opt_sysfs_name: + opts->sysfs_name = match_strdup(&args[0]); + break; + default: + free_options(opts); + return -EINVAL; + } + } + + return 0; +} + +/* Read file size from the attribute. Quicker than reading the header */ +static u64 read_size_attr(struct dentry *backing_dentry) +{ + __le64 attr_value; + ssize_t bytes_read; + + bytes_read = vfs_getxattr(&init_user_ns, backing_dentry, INCFS_XATTR_SIZE_NAME, + (char *)&attr_value, sizeof(attr_value)); + + if (bytes_read != sizeof(attr_value)) + return 0; + + return le64_to_cpu(attr_value); +} + +/* Read verity flag from the attribute. 
Quicker than reading the header */ +static bool read_verity_attr(struct dentry *backing_dentry) +{ + return vfs_getxattr(&init_user_ns, backing_dentry, INCFS_XATTR_VERITY_NAME, NULL, 0) + >= 0; +} + +static int inode_test(struct inode *inode, void *opaque) +{ + struct inode_search *search = opaque; + struct inode_info *node = get_incfs_node(inode); + struct inode *backing_inode = d_inode(search->backing_dentry); + + if (!node) + return 0; + + return node->n_backing_inode == backing_inode && + inode->i_ino == search->ino; +} + +static int inode_set(struct inode *inode, void *opaque) +{ + struct inode_search *search = opaque; + struct inode_info *node = get_incfs_node(inode); + struct dentry *backing_dentry = search->backing_dentry; + struct inode *backing_inode = d_inode(backing_dentry); + + fsstack_copy_attr_all(inode, backing_inode); + if (S_ISREG(inode->i_mode)) { + u64 size = search->size; + + inode->i_size = size; + inode->i_blocks = get_blocks_count_for_size(size); + inode->i_mapping->a_ops = &incfs_address_space_ops; + inode->i_op = &incfs_file_inode_ops; + inode->i_fop = &incfs_file_ops; + inode->i_mode &= ~0222; + if (search->verity) + inode_set_flags(inode, S_VERITY, S_VERITY); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_size = 0; + inode->i_blocks = 1; + inode->i_mapping->a_ops = &incfs_address_space_ops; + inode->i_op = &incfs_dir_inode_ops; + inode->i_fop = &incfs_dir_fops; + } else { + pr_warn_once("incfs: Unexpected inode type\n"); + return -EBADF; + } + + ihold(backing_inode); + node->n_backing_inode = backing_inode; + node->n_mount_info = get_mount_info(inode->i_sb); + inode->i_ctime = backing_inode->i_ctime; + inode->i_mtime = backing_inode->i_mtime; + inode->i_atime = backing_inode->i_atime; + inode->i_ino = backing_inode->i_ino; + if (backing_inode->i_ino < INCFS_START_INO_RANGE) { + pr_warn("incfs: ino conflict with backing FS %ld\n", + backing_inode->i_ino); + } + + return 0; +} + +static struct inode *fetch_regular_inode(struct super_block *sb, + struct dentry *backing_dentry) +{ + struct inode *backing_inode = d_inode(backing_dentry); + struct inode_search search = { + .ino = backing_inode->i_ino, + .backing_dentry = backing_dentry, + .size = read_size_attr(backing_dentry), + .verity = read_verity_attr(backing_dentry), + }; + struct inode *inode = iget5_locked(sb, search.ino, inode_test, + inode_set, &search); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + + return inode; +} + +static int iterate_incfs_dir(struct file *file, struct dir_context *ctx) +{ + struct dir_file *dir = get_incfs_dir_file(file); + int error = 0; + struct mount_info *mi = get_mount_info(file_superblock(file)); + bool root; + + if (!dir) { + error = -EBADF; + goto out; + } + + root = dir->backing_dir->f_inode + == d_inode(mi->mi_backing_dir_path.dentry); + + if (root) { + error = emit_pseudo_files(ctx); + if (error) + goto out; + } + + ctx->pos -= PSEUDO_FILE_COUNT; + error = iterate_dir(dir->backing_dir, ctx); + ctx->pos += PSEUDO_FILE_COUNT; + file->f_pos = dir->backing_dir->f_pos; +out: + if (error) + pr_warn("incfs: %s %s %d\n", __func__, + file->f_path.dentry->d_name.name, error); + return error; +} + +static int incfs_init_dentry(struct dentry *dentry, struct path *path) +{ + struct dentry_info *d_info = NULL; + + if (!dentry || !path) + return -EFAULT; + + d_info = kzalloc(sizeof(*d_info), GFP_NOFS); + if (!d_info) + return -ENOMEM; + + d_info->backing_path = *path; + path_get(path); + + dentry->d_fsdata = d_info; + return 0; 
+} + +static struct dentry *open_or_create_special_dir(struct dentry *backing_dir, + const char *name, + bool *created) +{ + struct dentry *index_dentry; + struct inode *backing_inode = d_inode(backing_dir); + int err = 0; + + index_dentry = incfs_lookup_dentry(backing_dir, name); + if (!index_dentry) { + return ERR_PTR(-EINVAL); + } else if (IS_ERR(index_dentry)) { + return index_dentry; + } else if (d_really_is_positive(index_dentry)) { + /* Index already exists. */ + *created = false; + return index_dentry; + } + + /* Index needs to be created. */ + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + err = vfs_mkdir(&init_user_ns, backing_inode, index_dentry, 0777); + inode_unlock(backing_inode); + + if (err) { + dput(index_dentry); + return ERR_PTR(err); + } + + if (!d_really_is_positive(index_dentry) || + unlikely(d_unhashed(index_dentry))) { + dput(index_dentry); + return ERR_PTR(-EINVAL); + } + + *created = true; + return index_dentry; +} + +static int read_single_page_timeouts(struct data_file *df, struct file *f, + int block_index, struct mem_range range, + struct mem_range tmp, + unsigned int *delayed_min_us) +{ + struct mount_info *mi = df->df_mount_info; + struct incfs_read_data_file_timeouts timeouts = { + .max_pending_time_us = U32_MAX, + }; + int uid = current_uid().val; + int i; + + spin_lock(&mi->mi_per_uid_read_timeouts_lock); + for (i = 0; i < mi->mi_per_uid_read_timeouts_size / + sizeof(*mi->mi_per_uid_read_timeouts); ++i) { + struct incfs_per_uid_read_timeouts *t = + &mi->mi_per_uid_read_timeouts[i]; + + if (t->uid == uid) { + timeouts.min_time_us = t->min_time_us; + timeouts.min_pending_time_us = t->min_pending_time_us; + timeouts.max_pending_time_us = t->max_pending_time_us; + break; + } + } + spin_unlock(&mi->mi_per_uid_read_timeouts_lock); + if (timeouts.max_pending_time_us == U32_MAX) { + u64 read_timeout_us = (u64)mi->mi_options.read_timeout_ms * + 1000; + + timeouts.max_pending_time_us = read_timeout_us <= U32_MAX ? 
+ read_timeout_us : U32_MAX; + } + + return incfs_read_data_file_block(range, f, block_index, tmp, + &timeouts, delayed_min_us); +} + +static int usleep_interruptible(u32 us) +{ + /* See: + * https://www.kernel.org/doc/Documentation/timers/timers-howto.txt + * for explanation + */ + if (us < 10) { + udelay(us); + return 0; + } else if (us < 20000) { + usleep_range(us, us + us / 10); + return 0; + } else + return msleep_interruptible(us / 1000); +} + +static int read_single_page(struct file *f, struct page *page) +{ + loff_t offset = 0; + loff_t size = 0; + ssize_t bytes_to_read = 0; + ssize_t read_result = 0; + struct data_file *df = get_incfs_data_file(f); + int result = 0; + void *page_start; + int block_index; + unsigned int delayed_min_us = 0; + + if (!df) { + SetPageError(page); + unlock_page(page); + return -EBADF; + } + + page_start = kmap(page); + offset = page_offset(page); + block_index = (offset + df->df_mapped_offset) / + INCFS_DATA_FILE_BLOCK_SIZE; + size = df->df_size; + + if (offset < size) { + struct mem_range tmp = { + .len = 2 * INCFS_DATA_FILE_BLOCK_SIZE + }; + tmp.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(tmp.len)); + if (!tmp.data) { + read_result = -ENOMEM; + goto err; + } + bytes_to_read = min_t(loff_t, size - offset, PAGE_SIZE); + + read_result = read_single_page_timeouts(df, f, block_index, + range(page_start, bytes_to_read), tmp, + &delayed_min_us); + + free_pages((unsigned long)tmp.data, get_order(tmp.len)); + } else { + bytes_to_read = 0; + read_result = 0; + } + +err: + if (read_result < 0) + result = read_result; + else if (read_result < PAGE_SIZE) + zero_user(page, read_result, PAGE_SIZE - read_result); + + if (result == 0) + SetPageUptodate(page); + else + SetPageError(page); + + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (delayed_min_us) + usleep_interruptible(delayed_min_us); + return result; +} + +int incfs_link(struct dentry *what, struct dentry *where) +{ + struct dentry *parent_dentry = dget_parent(where); + struct inode *pinode = d_inode(parent_dentry); + int error = 0; + + inode_lock_nested(pinode, I_MUTEX_PARENT); + error = vfs_link(what, &init_user_ns, pinode, where, NULL); + inode_unlock(pinode); + + dput(parent_dentry); + return error; +} + +int incfs_unlink(struct dentry *dentry) +{ + struct dentry *parent_dentry = dget_parent(dentry); + struct inode *pinode = d_inode(parent_dentry); + int error = 0; + + inode_lock_nested(pinode, I_MUTEX_PARENT); + error = vfs_unlink(&init_user_ns, pinode, dentry, NULL); + inode_unlock(pinode); + + dput(parent_dentry); + return error; +} + +static int incfs_rmdir(struct dentry *dentry) +{ + struct dentry *parent_dentry = dget_parent(dentry); + struct inode *pinode = d_inode(parent_dentry); + int error = 0; + + inode_lock_nested(pinode, I_MUTEX_PARENT); + error = vfs_rmdir(&init_user_ns, pinode, dentry); + inode_unlock(pinode); + + dput(parent_dentry); + return error; +} + +static void notify_unlink(struct dentry *dentry, const char *file_id_str, + const char *special_directory) +{ + struct dentry *root = dentry; + struct dentry *file = NULL; + struct dentry *dir = NULL; + int error = 0; + bool take_lock = root->d_parent != root->d_parent->d_parent; + + while (root != root->d_parent) + root = root->d_parent; + + if (take_lock) + dir = incfs_lookup_dentry(root, special_directory); + else + dir = lookup_one_len(special_directory, root, + strlen(special_directory)); + + if (IS_ERR(dir)) { + error = PTR_ERR(dir); + goto out; + } + if (d_is_negative(dir)) { + error = -ENOENT; + goto 
out; + } + + file = incfs_lookup_dentry(dir, file_id_str); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out; + } + if (d_is_negative(file)) { + error = -ENOENT; + goto out; + } + + fsnotify_unlink(d_inode(dir), file); + d_delete(file); + +out: + if (error) + pr_warn("%s failed with error %d\n", __func__, error); + + dput(dir); + dput(file); +} + +static void handle_file_completed(struct file *f, struct data_file *df) +{ + struct backing_file_context *bfc; + struct mount_info *mi = df->df_mount_info; + char *file_id_str = NULL; + struct dentry *incomplete_file_dentry = NULL; + const struct cred *old_cred = override_creds(mi->mi_owner); + int error; + + /* Truncate file to remove any preallocated space */ + bfc = df->df_backing_file_context; + if (bfc) { + struct file *f = bfc->bc_file; + + if (f) { + loff_t size = i_size_read(file_inode(f)); + + error = vfs_truncate(&f->f_path, size); + if (error) + /* No useful action on failure */ + pr_warn("incfs: Failed to truncate complete file: %d\n", + error); + } + } + + /* This is best effort - there is no useful action to take on failure */ + file_id_str = file_id_to_str(df->df_id); + if (!file_id_str) + goto out; + + incomplete_file_dentry = incfs_lookup_dentry( + df->df_mount_info->mi_incomplete_dir, + file_id_str); + if (!incomplete_file_dentry || IS_ERR(incomplete_file_dentry)) { + incomplete_file_dentry = NULL; + goto out; + } + + if (!d_really_is_positive(incomplete_file_dentry)) + goto out; + + vfs_fsync(df->df_backing_file_context->bc_file, 0); + error = incfs_unlink(incomplete_file_dentry); + if (error) { + pr_warn("incfs: Deleting incomplete file failed: %d\n", error); + goto out; + } + + notify_unlink(f->f_path.dentry, file_id_str, INCFS_INCOMPLETE_NAME); + +out: + dput(incomplete_file_dentry); + kfree(file_id_str); + revert_creds(old_cred); +} + +static long ioctl_fill_blocks(struct file *f, void __user *arg) +{ + struct incfs_fill_blocks __user *usr_fill_blocks = arg; + struct incfs_fill_blocks fill_blocks; + struct incfs_fill_block __user *usr_fill_block_array; + struct data_file *df = get_incfs_data_file(f); + struct incfs_file_data *fd = f->private_data; + const ssize_t data_buf_size = 2 * INCFS_DATA_FILE_BLOCK_SIZE; + u8 *data_buf = NULL; + ssize_t error = 0; + int i = 0; + bool complete = false; + + if (!df) + return -EBADF; + + if (!fd || fd->fd_fill_permission != CAN_FILL) + return -EPERM; + + if (copy_from_user(&fill_blocks, usr_fill_blocks, sizeof(fill_blocks))) + return -EFAULT; + + usr_fill_block_array = u64_to_user_ptr(fill_blocks.fill_blocks); + data_buf = (u8 *)__get_free_pages(GFP_NOFS | __GFP_COMP, + get_order(data_buf_size)); + if (!data_buf) + return -ENOMEM; + + for (i = 0; i < fill_blocks.count; i++) { + struct incfs_fill_block fill_block = {}; + + if (copy_from_user(&fill_block, &usr_fill_block_array[i], + sizeof(fill_block)) > 0) { + error = -EFAULT; + break; + } + + if (fill_block.data_len > data_buf_size) { + error = -E2BIG; + break; + } + + if (copy_from_user(data_buf, u64_to_user_ptr(fill_block.data), + fill_block.data_len) > 0) { + error = -EFAULT; + break; + } + fill_block.data = 0; /* To make sure nobody uses it. 
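The payload has already been copied into data_buf. 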
*/ + if (fill_block.flags & INCFS_BLOCK_FLAGS_HASH) { + error = incfs_process_new_hash_block(df, &fill_block, + data_buf); + } else { + error = incfs_process_new_data_block(df, &fill_block, + data_buf, &complete); + } + if (error) + break; + } + + if (data_buf) + free_pages((unsigned long)data_buf, get_order(data_buf_size)); + + if (complete) + handle_file_completed(f, df); + + /* + * Only report the error if no records were processed, otherwise + * just return how many were processed successfully. + */ + if (i == 0) + return error; + + return i; +} + +static long ioctl_read_file_signature(struct file *f, void __user *arg) +{ + struct incfs_get_file_sig_args __user *args_usr_ptr = arg; + struct incfs_get_file_sig_args args = {}; + u8 *sig_buffer = NULL; + size_t sig_buf_size = 0; + int error = 0; + int read_result = 0; + struct data_file *df = get_incfs_data_file(f); + + if (!df) + return -EINVAL; + + if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0) + return -EINVAL; + + sig_buf_size = args.file_signature_buf_size; + if (sig_buf_size > INCFS_MAX_SIGNATURE_SIZE) + return -E2BIG; + + sig_buffer = kzalloc(sig_buf_size, GFP_NOFS | __GFP_COMP); + if (!sig_buffer) + return -ENOMEM; + + read_result = incfs_read_file_signature(df, + range(sig_buffer, sig_buf_size)); + + if (read_result < 0) { + error = read_result; + goto out; + } + + if (copy_to_user(u64_to_user_ptr(args.file_signature), sig_buffer, + read_result)) { + error = -EFAULT; + goto out; + } + + args.file_signature_len_out = read_result; + if (copy_to_user(args_usr_ptr, &args, sizeof(args))) + error = -EFAULT; + +out: + kfree(sig_buffer); + + return error; +} + +static long ioctl_get_filled_blocks(struct file *f, void __user *arg) +{ + struct incfs_get_filled_blocks_args __user *args_usr_ptr = arg; + struct incfs_get_filled_blocks_args args = {}; + struct data_file *df = get_incfs_data_file(f); + struct incfs_file_data *fd = f->private_data; + int error; + + if (!df || !fd) + return -EINVAL; + + if (fd->fd_fill_permission != CAN_FILL) + return -EPERM; + + if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0) + return -EINVAL; + + error = incfs_get_filled_blocks(df, fd, &args); + + if (copy_to_user(args_usr_ptr, &args, sizeof(args))) + return -EFAULT; + + return error; +} + +static long ioctl_get_block_count(struct file *f, void __user *arg) +{ + struct incfs_get_block_count_args __user *args_usr_ptr = arg; + struct incfs_get_block_count_args args = {}; + struct data_file *df = get_incfs_data_file(f); + + if (!df) + return -EINVAL; + + args.total_data_blocks_out = df->df_data_block_count; + args.filled_data_blocks_out = atomic_read(&df->df_data_blocks_written); + args.total_hash_blocks_out = df->df_total_block_count - + df->df_data_block_count; + args.filled_hash_blocks_out = atomic_read(&df->df_hash_blocks_written); + + if (copy_to_user(args_usr_ptr, &args, sizeof(args))) + return -EFAULT; + + return 0; +} + +static int incfs_ioctl_get_flags(struct file *f, void __user *arg) +{ + u32 flags = IS_VERITY(file_inode(f)) ? 
FS_VERITY_FL : 0; + + return put_user(flags, (int __user *) arg); +} + +static long dispatch_ioctl(struct file *f, unsigned int req, unsigned long arg) +{ + switch (req) { + case INCFS_IOC_FILL_BLOCKS: + return ioctl_fill_blocks(f, (void __user *)arg); + case INCFS_IOC_READ_FILE_SIGNATURE: + return ioctl_read_file_signature(f, (void __user *)arg); + case INCFS_IOC_GET_FILLED_BLOCKS: + return ioctl_get_filled_blocks(f, (void __user *)arg); + case INCFS_IOC_GET_BLOCK_COUNT: + return ioctl_get_block_count(f, (void __user *)arg); + case FS_IOC_ENABLE_VERITY: + return incfs_ioctl_enable_verity(f, (const void __user *)arg); + case FS_IOC_GETFLAGS: + return incfs_ioctl_get_flags(f, (void __user *) arg); + case FS_IOC_MEASURE_VERITY: + return incfs_ioctl_measure_verity(f, (void __user *)arg); + case FS_IOC_READ_VERITY_METADATA: + return incfs_ioctl_read_verity_metadata(f, (void __user *)arg); + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +static long incfs_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case INCFS_IOC_FILL_BLOCKS: + case INCFS_IOC_READ_FILE_SIGNATURE: + case INCFS_IOC_GET_FILLED_BLOCKS: + case INCFS_IOC_GET_BLOCK_COUNT: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: + break; + default: + return -ENOIOCTLCMD; + } + return dispatch_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +static struct dentry *dir_lookup(struct inode *dir_inode, struct dentry *dentry, + unsigned int flags) +{ + struct mount_info *mi = get_mount_info(dir_inode->i_sb); + struct dentry *dir_dentry = NULL; + struct dentry *backing_dentry = NULL; + struct path dir_backing_path = {}; + struct inode_info *dir_info = get_incfs_node(dir_inode); + int err = 0; + + if (!mi || !dir_info || !dir_info->n_backing_inode) + return ERR_PTR(-EBADF); + + if (d_inode(mi->mi_backing_dir_path.dentry) == + dir_info->n_backing_inode) { + /* We do lookup in the FS root. Show pseudo files. */ + err = dir_lookup_pseudo_files(dir_inode->i_sb, dentry); + if (err != -ENOENT) + goto out; + err = 0; + } + + dir_dentry = dget_parent(dentry); + get_incfs_backing_path(dir_dentry, &dir_backing_path); + backing_dentry = incfs_lookup_dentry(dir_backing_path.dentry, + dentry->d_name.name); + + if (!backing_dentry || IS_ERR(backing_dentry)) { + err = IS_ERR(backing_dentry) + ? PTR_ERR(backing_dentry) + : -EFAULT; + backing_dentry = NULL; + goto out; + } else { + struct inode *inode = NULL; + struct path backing_path = { + .mnt = dir_backing_path.mnt, + .dentry = backing_dentry + }; + + err = incfs_init_dentry(dentry, &backing_path); + if (err) + goto out; + + if (!d_really_is_positive(backing_dentry)) { + /* + * No such entry found in the backing dir. + * Create a negative entry. + */ + d_add(dentry, NULL); + err = 0; + goto out; + } + + if (d_inode(backing_dentry)->i_sb != + dir_info->n_backing_inode->i_sb) { + /* + * Somehow after the path lookup we ended up in a + * different fs mount. If we keep going it's going + * to end badly. 
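+ * Treat it as a cross-device access and reject the lookup.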
+ */ + err = -EXDEV; + goto out; + } + + inode = fetch_regular_inode(dir_inode->i_sb, backing_dentry); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + d_add(dentry, inode); + } + +out: + dput(dir_dentry); + dput(backing_dentry); + path_put(&dir_backing_path); + if (err) + pr_debug("incfs: %s %s %d\n", __func__, + dentry->d_name.name, err); + return ERR_PTR(err); +} + +static int dir_mkdir(struct user_namespace *ns, struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct mount_info *mi = get_mount_info(dir->i_sb); + struct inode_info *dir_node = get_incfs_node(dir); + struct dentry *backing_dentry = NULL; + struct path backing_path = {}; + int err = 0; + + + if (!mi || !dir_node || !dir_node->n_backing_inode) + return -EBADF; + + err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex); + if (err) + return err; + + get_incfs_backing_path(dentry, &backing_path); + backing_dentry = backing_path.dentry; + + if (!backing_dentry) { + err = -EBADF; + goto path_err; + } + + if (backing_dentry->d_parent == mi->mi_index_dir) { + /* Can't create a subdir inside .index */ + err = -EBUSY; + goto out; + } + + if (backing_dentry->d_parent == mi->mi_incomplete_dir) { + /* Can't create a subdir inside .incomplete */ + err = -EBUSY; + goto out; + } + inode_lock_nested(dir_node->n_backing_inode, I_MUTEX_PARENT); + err = vfs_mkdir(ns, dir_node->n_backing_inode, backing_dentry, mode | 0222); + inode_unlock(dir_node->n_backing_inode); + if (!err) { + struct inode *inode = NULL; + + if (d_really_is_negative(backing_dentry) || + unlikely(d_unhashed(backing_dentry))) { + err = -EINVAL; + goto out; + } + + inode = fetch_regular_inode(dir->i_sb, backing_dentry); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + d_instantiate(dentry, inode); + } + +out: + if (d_really_is_negative(dentry)) + d_drop(dentry); + path_put(&backing_path); + +path_err: + mutex_unlock(&mi->mi_dir_struct_mutex); + if (err) + pr_debug("incfs: %s err:%d\n", __func__, err); + return err; +} + +/* + * Delete file referenced by backing_dentry and if appropriate its hardlink + * from .index and .incomplete + */ +static int file_delete(struct mount_info *mi, struct dentry *dentry, + struct dentry *backing_dentry, int nlink) +{ + struct dentry *index_file_dentry = NULL; + struct dentry *incomplete_file_dentry = NULL; + /* 2 chars per byte of file ID + 1 char for \0 */ + char file_id_str[2 * sizeof(incfs_uuid_t) + 1] = {0}; + ssize_t uuid_size = 0; + int error = 0; + + WARN_ON(!mutex_is_locked(&mi->mi_dir_struct_mutex)); + + if (nlink > 3) + goto just_unlink; + + uuid_size = vfs_getxattr(&init_user_ns, backing_dentry, INCFS_XATTR_ID_NAME, + file_id_str, 2 * sizeof(incfs_uuid_t)); + if (uuid_size < 0) { + error = uuid_size; + goto out; + } + + if (uuid_size != 2 * sizeof(incfs_uuid_t)) { + error = -EBADMSG; + goto out; + } + + index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir, file_id_str); + if (IS_ERR(index_file_dentry)) { + error = PTR_ERR(index_file_dentry); + index_file_dentry = NULL; + goto out; + } + + if (d_really_is_positive(index_file_dentry) && nlink > 0) + nlink--; + + if (nlink > 2) + goto just_unlink; + + incomplete_file_dentry = incfs_lookup_dentry(mi->mi_incomplete_dir, + file_id_str); + if (IS_ERR(incomplete_file_dentry)) { + error = PTR_ERR(incomplete_file_dentry); + incomplete_file_dentry = NULL; + goto out; + } + + if (d_really_is_positive(incomplete_file_dentry) && nlink > 0) + nlink--; + + if (nlink > 1) + goto just_unlink; + + if (d_really_is_positive(index_file_dentry)) { + 
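/* Drop the .index hardlink and notify any watchers. */ +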
error = incfs_unlink(index_file_dentry); + if (error) + goto out; + notify_unlink(dentry, file_id_str, INCFS_INDEX_NAME); + } + + if (d_really_is_positive(incomplete_file_dentry)) { + error = incfs_unlink(incomplete_file_dentry); + if (error) + goto out; + notify_unlink(dentry, file_id_str, INCFS_INCOMPLETE_NAME); + } + +just_unlink: + error = incfs_unlink(backing_dentry); + +out: + dput(index_file_dentry); + dput(incomplete_file_dentry); + if (error) + pr_debug("incfs: delete_file_from_index err:%d\n", error); + return error; +} + +static int dir_unlink(struct inode *dir, struct dentry *dentry) +{ + struct mount_info *mi = get_mount_info(dir->i_sb); + struct path backing_path = {}; + struct kstat stat; + int err = 0; + + if (!mi) + return -EBADF; + + err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex); + if (err) + return err; + + get_incfs_backing_path(dentry, &backing_path); + if (!backing_path.dentry) { + err = -EBADF; + goto path_err; + } + + if (backing_path.dentry->d_parent == mi->mi_index_dir) { + /* Direct unlink from .index are not allowed. */ + err = -EBUSY; + goto out; + } + + if (backing_path.dentry->d_parent == mi->mi_incomplete_dir) { + /* Direct unlink from .incomplete are not allowed. */ + err = -EBUSY; + goto out; + } + + err = vfs_getattr(&backing_path, &stat, STATX_NLINK, + AT_STATX_SYNC_AS_STAT); + if (err) + goto out; + + err = file_delete(mi, dentry, backing_path.dentry, stat.nlink); + + d_drop(dentry); +out: + path_put(&backing_path); +path_err: + if (err) + pr_debug("incfs: %s err:%d\n", __func__, err); + mutex_unlock(&mi->mi_dir_struct_mutex); + return err; +} + +static int dir_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct mount_info *mi = get_mount_info(dir->i_sb); + struct path backing_old_path = {}; + struct path backing_new_path = {}; + int error = 0; + + if (!mi) + return -EBADF; + + error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex); + if (error) + return error; + + get_incfs_backing_path(old_dentry, &backing_old_path); + get_incfs_backing_path(new_dentry, &backing_new_path); + + if (backing_new_path.dentry->d_parent == mi->mi_index_dir) { + /* Can't link to .index */ + error = -EBUSY; + goto out; + } + + if (backing_new_path.dentry->d_parent == mi->mi_incomplete_dir) { + /* Can't link to .incomplete */ + error = -EBUSY; + goto out; + } + + error = incfs_link(backing_old_path.dentry, backing_new_path.dentry); + if (!error) { + struct inode *inode = NULL; + struct dentry *bdentry = backing_new_path.dentry; + + if (d_really_is_negative(bdentry)) { + error = -EINVAL; + goto out; + } + + inode = fetch_regular_inode(dir->i_sb, bdentry); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto out; + } + d_instantiate(new_dentry, inode); + } + +out: + path_put(&backing_old_path); + path_put(&backing_new_path); + if (error) + pr_debug("incfs: %s err:%d\n", __func__, error); + mutex_unlock(&mi->mi_dir_struct_mutex); + return error; +} + +static int dir_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct mount_info *mi = get_mount_info(dir->i_sb); + struct path backing_path = {}; + int err = 0; + + if (!mi) + return -EBADF; + + err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex); + if (err) + return err; + + get_incfs_backing_path(dentry, &backing_path); + if (!backing_path.dentry) { + err = -EBADF; + goto path_err; + } + + if (backing_path.dentry == mi->mi_index_dir) { + /* Can't delete .index */ + err = -EBUSY; + goto out; + } + + if (backing_path.dentry == mi->mi_incomplete_dir) { + /* Can't 
delete .incomplete */ + err = -EBUSY; + goto out; + } + + err = incfs_rmdir(backing_path.dentry); + if (!err) + d_drop(dentry); +out: + path_put(&backing_path); + +path_err: + if (err) + pr_debug("incfs: %s err:%d\n", __func__, err); + mutex_unlock(&mi->mi_dir_struct_mutex); + return err; +} + +static int dir_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct mount_info *mi = get_mount_info(old_dir->i_sb); + struct dentry *backing_old_dentry; + struct dentry *backing_new_dentry; + struct dentry *backing_old_dir_dentry; + struct dentry *backing_new_dir_dentry; + struct inode *target_inode; + struct dentry *trap; + struct renamedata rd = {}; + int error = 0; + + error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex); + if (error) + return error; + + backing_old_dentry = get_incfs_dentry(old_dentry)->backing_path.dentry; + + if (!backing_old_dentry || backing_old_dentry == mi->mi_index_dir || + backing_old_dentry == mi->mi_incomplete_dir) { + /* Renaming .index or .incomplete not allowed */ + error = -EBUSY; + goto exit; + } + + backing_new_dentry = get_incfs_dentry(new_dentry)->backing_path.dentry; + dget(backing_old_dentry); + dget(backing_new_dentry); + + backing_old_dir_dentry = dget_parent(backing_old_dentry); + backing_new_dir_dentry = dget_parent(backing_new_dentry); + target_inode = d_inode(new_dentry); + + if (backing_old_dir_dentry == mi->mi_index_dir || + backing_old_dir_dentry == mi->mi_incomplete_dir) { + /* Direct moves from .index or .incomplete are not allowed. */ + error = -EBUSY; + goto out; + } + + trap = lock_rename(backing_old_dir_dentry, backing_new_dir_dentry); + + if (trap == backing_old_dentry) { + error = -EINVAL; + goto unlock_out; + } + if (trap == backing_new_dentry) { + error = -ENOTEMPTY; + goto unlock_out; + } + + rd.old_dir = d_inode(backing_old_dir_dentry); + rd.old_dentry = backing_old_dentry; + rd.new_dir = d_inode(backing_new_dir_dentry); + rd.new_dentry = backing_new_dentry; + rd.flags = flags; + rd.old_mnt_userns = &init_user_ns; + rd.new_mnt_userns = &init_user_ns; + rd.delegated_inode = NULL; + + error = vfs_rename(&rd); + if (error) + goto unlock_out; + if (target_inode) + fsstack_copy_attr_all(target_inode, + get_incfs_node(target_inode)->n_backing_inode); + fsstack_copy_attr_all(new_dir, d_inode(backing_new_dir_dentry)); + if (new_dir != old_dir) + fsstack_copy_attr_all(old_dir, d_inode(backing_old_dir_dentry)); + +unlock_out: + unlock_rename(backing_old_dir_dentry, backing_new_dir_dentry); + +out: + dput(backing_new_dir_dentry); + dput(backing_old_dir_dentry); + dput(backing_new_dentry); + dput(backing_old_dentry); + +exit: + mutex_unlock(&mi->mi_dir_struct_mutex); + if (error) + pr_debug("incfs: %s err:%d\n", __func__, error); + return error; +} + + +static int file_open(struct inode *inode, struct file *file) +{ + struct mount_info *mi = get_mount_info(inode->i_sb); + struct file *backing_file = NULL; + struct path backing_path = {}; + int err = 0; + int flags = O_NOATIME | O_LARGEFILE | + (S_ISDIR(inode->i_mode) ? 
O_RDONLY : O_RDWR); + const struct cred *old_cred; + + WARN_ON(file->private_data); + + if (!mi) + return -EBADF; + + get_incfs_backing_path(file->f_path.dentry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + old_cred = override_creds(mi->mi_owner); + backing_file = dentry_open(&backing_path, flags, current_cred()); + revert_creds(old_cred); + path_put(&backing_path); + + if (IS_ERR(backing_file)) { + err = PTR_ERR(backing_file); + backing_file = NULL; + goto out; + } + + if (S_ISREG(inode->i_mode)) { + struct incfs_file_data *fd = kzalloc(sizeof(*fd), GFP_NOFS); + + if (!fd) { + err = -ENOMEM; + goto out; + } + + *fd = (struct incfs_file_data) { + .fd_fill_permission = CANT_FILL, + }; + file->private_data = fd; + + err = make_inode_ready_for_data_ops(mi, inode, backing_file); + if (err) + goto out; + + err = incfs_fsverity_file_open(inode, file); + if (err) + goto out; + } else if (S_ISDIR(inode->i_mode)) { + struct dir_file *dir = NULL; + + dir = incfs_open_dir_file(mi, backing_file); + if (IS_ERR(dir)) + err = PTR_ERR(dir); + else + file->private_data = dir; + } else + err = -EBADF; + +out: + if (err) { + pr_debug("name:%s err: %d\n", + file->f_path.dentry->d_name.name, err); + if (S_ISREG(inode->i_mode)) + kfree(file->private_data); + else if (S_ISDIR(inode->i_mode)) + incfs_free_dir_file(file->private_data); + + file->private_data = NULL; + } + + if (backing_file) + fput(backing_file); + return err; +} + +static int file_release(struct inode *inode, struct file *file) +{ + if (S_ISREG(inode->i_mode)) { + kfree(file->private_data); + file->private_data = NULL; + } else if (S_ISDIR(inode->i_mode)) { + struct dir_file *dir = get_incfs_dir_file(file); + + incfs_free_dir_file(dir); + } + + return 0; +} + +static int dentry_revalidate(struct dentry *d, unsigned int flags) +{ + struct path backing_path = {}; + struct inode_info *info = get_incfs_node(d_inode(d)); + struct inode *binode = (info == NULL) ? NULL : info->n_backing_inode; + struct dentry *backing_dentry = NULL; + int result = 0; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + get_incfs_backing_path(d, &backing_path); + backing_dentry = backing_path.dentry; + if (!backing_dentry) + goto out; + + if (d_inode(backing_dentry) != binode) { + /* + * Backing inodes obtained via dentry and inode don't match. + * It indicates that most likely backing dir has changed + * directly bypassing Incremental FS interface. + */ + goto out; + } + + if (backing_dentry->d_flags & DCACHE_OP_REVALIDATE) { + result = backing_dentry->d_op->d_revalidate(backing_dentry, + flags); + } else + result = 1; + +out: + path_put(&backing_path); + return result; +} + +static void dentry_release(struct dentry *d) +{ + struct dentry_info *di = get_incfs_dentry(d); + + if (di) + path_put(&di->backing_path); + kfree(d->d_fsdata); + d->d_fsdata = NULL; +} + +static struct inode *alloc_inode(struct super_block *sb) +{ + struct inode_info *node = kzalloc(sizeof(*node), GFP_NOFS); + + /* TODO: add a slab-based cache here. 
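A dedicated kmem_cache with inode_init_once() as its constructor would avoid the per-inode kzalloc(). 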
*/ + if (!node) + return NULL; + inode_init_once(&node->n_vfs_inode); + return &node->n_vfs_inode; +} + +static void free_inode(struct inode *inode) +{ + struct inode_info *node = get_incfs_node(inode); + + kfree(node); +} + +static void evict_inode(struct inode *inode) +{ + struct inode_info *node = get_incfs_node(inode); + + if (node) { + if (node->n_backing_inode) { + iput(node->n_backing_inode); + node->n_backing_inode = NULL; + } + if (node->n_file) { + incfs_free_data_file(node->n_file); + node->n_file = NULL; + } + } + + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + +static int incfs_setattr(struct user_namespace *ns, struct dentry *dentry, + struct iattr *ia) +{ + struct dentry_info *di = get_incfs_dentry(dentry); + struct dentry *backing_dentry; + struct inode *backing_inode; + int error; + + if (ia->ia_valid & ATTR_SIZE) + return -EINVAL; + + if ((ia->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && + (ia->ia_valid & ATTR_MODE)) + return -EINVAL; + + if (!di) + return -EINVAL; + backing_dentry = di->backing_path.dentry; + if (!backing_dentry) + return -EINVAL; + + backing_inode = d_inode(backing_dentry); + + /* incfs files are readonly, but the backing files must be writeable */ + if (S_ISREG(backing_inode->i_mode)) { + if ((ia->ia_valid & ATTR_MODE) && (ia->ia_mode & 0222)) + return -EINVAL; + + ia->ia_mode |= 0222; + } + + inode_lock(d_inode(backing_dentry)); + error = notify_change(ns, backing_dentry, ia, NULL); + inode_unlock(d_inode(backing_dentry)); + + if (error) + return error; + + if (S_ISREG(backing_inode->i_mode)) + ia->ia_mode &= ~0222; + + return simple_setattr(ns, dentry, ia); +} + + +static int incfs_getattr(struct user_namespace *ns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(ns, inode, stat); + + if (inode->i_ino < INCFS_START_INO_RANGE) + return 0; + + stat->attributes &= ~STATX_ATTR_VERITY; + if (IS_VERITY(inode)) + stat->attributes |= STATX_ATTR_VERITY; + stat->attributes_mask |= STATX_ATTR_VERITY; + + if (request_mask & STATX_BLOCKS) { + struct kstat backing_kstat; + struct dentry_info *di = get_incfs_dentry(path->dentry); + int error = 0; + struct path *backing_path; + + if (!di) + return -EFSCORRUPTED; + backing_path = &di->backing_path; + error = vfs_getattr(backing_path, &backing_kstat, STATX_BLOCKS, + AT_STATX_SYNC_AS_STAT); + if (error) + return error; + + stat->blocks = backing_kstat.blocks; + } + + return 0; +} + +static ssize_t incfs_getxattr(struct dentry *d, const char *name, + void *value, size_t size) +{ + struct dentry_info *di = get_incfs_dentry(d); + struct mount_info *mi = get_mount_info(d->d_sb); + char *stored_value; + size_t stored_size; + int i; + + if (di && di->backing_path.dentry) + return vfs_getxattr(&init_user_ns, di->backing_path.dentry, name, value, size); + + if (strcmp(name, "security.selinux")) + return -ENODATA; + + for (i = 0; i < PSEUDO_FILE_COUNT; ++i) + if (!strcmp(d->d_iname, incfs_pseudo_file_names[i].data)) + break; + if (i == PSEUDO_FILE_COUNT) + return -ENODATA; + + stored_value = mi->pseudo_file_xattr[i].data; + stored_size = mi->pseudo_file_xattr[i].len; + if (!stored_value) + return -ENODATA; + + if (stored_size > size) + return -E2BIG; + + memcpy(value, stored_value, stored_size); + return stored_size; +} + + +static ssize_t incfs_setxattr(struct user_namespace *ns, struct dentry *d, + const char *name, const void *value, size_t size, + int flags) +{ + struct dentry_info *di 
= get_incfs_dentry(d); + struct mount_info *mi = get_mount_info(d->d_sb); + u8 **stored_value; + size_t *stored_size; + int i; + + if (di && di->backing_path.dentry) + return vfs_setxattr(ns, di->backing_path.dentry, name, value, + size, flags); + + if (strcmp(name, "security.selinux")) + return -ENODATA; + + if (size > INCFS_MAX_FILE_ATTR_SIZE) + return -E2BIG; + + for (i = 0; i < PSEUDO_FILE_COUNT; ++i) + if (!strcmp(d->d_iname, incfs_pseudo_file_names[i].data)) + break; + if (i == PSEUDO_FILE_COUNT) + return -ENODATA; + + stored_value = &mi->pseudo_file_xattr[i].data; + stored_size = &mi->pseudo_file_xattr[i].len; + kfree(*stored_value); + *stored_value = kzalloc(size, GFP_NOFS); + if (!*stored_value) + return -ENOMEM; + + memcpy(*stored_value, value, size); + *stored_size = size; + return 0; +} + +static ssize_t incfs_listxattr(struct dentry *d, char *list, size_t size) +{ + struct dentry_info *di = get_incfs_dentry(d); + + if (!di || !di->backing_path.dentry) + return -ENODATA; + + return vfs_listxattr(di->backing_path.dentry, list, size); +} + +struct dentry *incfs_mount_fs(struct file_system_type *type, int flags, + const char *dev_name, void *data) +{ + struct mount_options options = {}; + struct mount_info *mi = NULL; + struct path backing_dir_path = {}; + struct dentry *index_dir = NULL; + struct dentry *incomplete_dir = NULL; + struct super_block *src_fs_sb = NULL; + struct inode *root_inode = NULL; + struct super_block *sb = sget(type, NULL, set_anon_super, flags, NULL); + bool dir_created = false; + int error = 0; + + if (IS_ERR(sb)) + return ERR_CAST(sb); + + sb->s_op = &incfs_super_ops; + sb->s_d_op = &incfs_dentry_ops; + sb->s_flags |= S_NOATIME; + sb->s_magic = INCFS_MAGIC_NUMBER; + sb->s_time_gran = 1; + sb->s_blocksize = INCFS_DATA_FILE_BLOCK_SIZE; + sb->s_blocksize_bits = blksize_bits(sb->s_blocksize); + sb->s_xattr = incfs_xattr_ops; + + BUILD_BUG_ON(PAGE_SIZE != INCFS_DATA_FILE_BLOCK_SIZE); + + if (!dev_name) { + pr_err("incfs: Backing dir is not set, filesystem can't be mounted.\n"); + error = -ENOENT; + goto err_deactivate; + } + + error = parse_options(&options, (char *)data); + if (error != 0) { + pr_err("incfs: Options parsing error. %d\n", error); + goto err_deactivate; + } + + sb->s_bdi->ra_pages = options.readahead_pages; + + error = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, + &backing_dir_path); + if (error || backing_dir_path.dentry == NULL || + !d_really_is_positive(backing_dir_path.dentry)) { + pr_err("incfs: Error accessing: %s.\n", + dev_name); + if (!error) + error = -ENOENT; + goto err_free_opts; + } + src_fs_sb = backing_dir_path.dentry->d_sb; + sb->s_maxbytes = src_fs_sb->s_maxbytes; + sb->s_stack_depth = src_fs_sb->s_stack_depth + 1; + + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + error = -EINVAL; + goto err_put_path; + } + + mi = incfs_alloc_mount_info(sb, &options, &backing_dir_path); + if (IS_ERR_OR_NULL(mi)) { + error = mi ? PTR_ERR(mi) : -ENOMEM; + pr_err("incfs: Error allocating mount info. 
%d\n", error); + goto err_put_path; + } + + sb->s_fs_info = mi; + mi->mi_backing_dir_path = backing_dir_path; + index_dir = open_or_create_special_dir(backing_dir_path.dentry, + INCFS_INDEX_NAME, &dir_created); + if (IS_ERR_OR_NULL(index_dir)) { + error = PTR_ERR(index_dir); + pr_err("incfs: Can't find or create .index dir in %s\n", + dev_name); + /* No need to null index_dir since we don't put it */ + goto err_put_path; + } + + mi->mi_index_dir = index_dir; + mi->mi_index_free = dir_created; + + incomplete_dir = open_or_create_special_dir(backing_dir_path.dentry, + INCFS_INCOMPLETE_NAME, + &dir_created); + if (IS_ERR_OR_NULL(incomplete_dir)) { + error = PTR_ERR(incomplete_dir); + pr_err("incfs: Can't find or create .incomplete dir in %s\n", + dev_name); + /* No need to null incomplete_dir since we don't put it */ + goto err_put_path; + } + mi->mi_incomplete_dir = incomplete_dir; + mi->mi_incomplete_free = dir_created; + + root_inode = fetch_regular_inode(sb, backing_dir_path.dentry); + if (IS_ERR(root_inode)) { + error = PTR_ERR(root_inode); + goto err_put_path; + } + + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) { + error = -ENOMEM; + goto err_put_path; + } + error = incfs_init_dentry(sb->s_root, &backing_dir_path); + if (error) + goto err_put_path; + + path_put(&backing_dir_path); + sb->s_flags |= SB_ACTIVE; + + pr_debug("incfs: mount\n"); + return dget(sb->s_root); + +err_put_path: + path_put(&backing_dir_path); +err_free_opts: + free_options(&options); +err_deactivate: + deactivate_locked_super(sb); + pr_err("incfs: mount failed %d\n", error); + return ERR_PTR(error); +} + +static int incfs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct mount_options options; + struct mount_info *mi = get_mount_info(sb); + int err = 0; + + sync_filesystem(sb); + err = parse_options(&options, (char *)data); + if (err) + return err; + + if (options.report_uid != mi->mi_options.report_uid) { + pr_err("incfs: Can't change report_uid mount option on remount\n"); + err = -EOPNOTSUPP; + goto out; + } + + err = incfs_realloc_mount_info(mi, &options); + if (err) + goto out; + + pr_debug("incfs: remount\n"); + +out: + free_options(&options); + return err; +} + +void incfs_kill_sb(struct super_block *sb) +{ + struct mount_info *mi = sb->s_fs_info; + struct inode *dinode = NULL; + + pr_debug("incfs: unmount\n"); + + if (mi) { + if (mi->mi_backing_dir_path.dentry) + dinode = d_inode(mi->mi_backing_dir_path.dentry); + + if (dinode) { + if (mi->mi_index_dir && mi->mi_index_free) + vfs_rmdir(&init_user_ns, dinode, + mi->mi_index_dir); + + if (mi->mi_incomplete_dir && mi->mi_incomplete_free) + vfs_rmdir(&init_user_ns, dinode, + mi->mi_incomplete_dir); + } + + incfs_free_mount_info(mi); + sb->s_fs_info = NULL; + } + kill_anon_super(sb); +} + +static int show_options(struct seq_file *m, struct dentry *root) +{ + struct mount_info *mi = get_mount_info(root->d_sb); + + seq_printf(m, ",read_timeout_ms=%u", mi->mi_options.read_timeout_ms); + seq_printf(m, ",readahead=%u", mi->mi_options.readahead_pages); + if (mi->mi_options.read_log_pages != 0) { + seq_printf(m, ",rlog_pages=%u", mi->mi_options.read_log_pages); + seq_printf(m, ",rlog_wakeup_cnt=%u", + mi->mi_options.read_log_wakeup_count); + } + if (mi->mi_options.report_uid) + seq_puts(m, ",report_uid"); + + if (mi->mi_sysfs_node) + seq_printf(m, ",sysfs_name=%s", + kobject_name(&mi->mi_sysfs_node->isn_sysfs_node)); + return 0; +} diff --git a/fs/incfs/vfs.h b/fs/incfs/vfs.h new file mode 100644 index 000000000000..79fdf243733d --- 
/dev/null +++ b/fs/incfs/vfs.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2018 Google LLC + */ + +#ifndef _INCFS_VFS_H +#define _INCFS_VFS_H + +extern const struct file_operations incfs_file_ops; +extern const struct inode_operations incfs_file_inode_ops; + +void incfs_kill_sb(struct super_block *sb); +struct dentry *incfs_mount_fs(struct file_system_type *type, int flags, + const char *dev_name, void *data); +int incfs_link(struct dentry *what, struct dentry *where); +int incfs_unlink(struct dentry *dentry); + +static inline struct mount_info *get_mount_info(struct super_block *sb) +{ + struct mount_info *result = sb->s_fs_info; + + WARN_ON(!result); + return result; +} + +static inline struct super_block *file_superblock(struct file *f) +{ + struct inode *inode = file_inode(f); + + return inode->i_sb; +} + +#endif diff --git a/fs/inode.c b/fs/inode.c index 8279c700a2b7..18d16f159730 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -309,7 +309,7 @@ void drop_nlink(struct inode *inode) if (!inode->i_nlink) atomic_long_inc(&inode->i_sb->s_remove_count); } -EXPORT_SYMBOL(drop_nlink); +EXPORT_SYMBOL_NS(drop_nlink, ANDROID_GKI_VFS_EXPORT_ONLY); /** * clear_nlink - directly zero an inode's link count @@ -348,7 +348,7 @@ void set_nlink(struct inode *inode, unsigned int nlink) inode->__i_nlink = nlink; } } -EXPORT_SYMBOL(set_nlink); +EXPORT_SYMBOL_NS(set_nlink, ANDROID_GKI_VFS_EXPORT_ONLY); /** * inc_nlink - directly increment an inode's link count @@ -401,7 +401,7 @@ void inode_init_once(struct inode *inode) __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } -EXPORT_SYMBOL(inode_init_once); +EXPORT_SYMBOL_NS(inode_init_once, ANDROID_GKI_VFS_EXPORT_ONLY); static void init_once(void *foo) { @@ -425,7 +425,7 @@ void ihold(struct inode *inode) { WARN_ON(atomic_inc_return(&inode->i_count) < 2); } -EXPORT_SYMBOL(ihold); +EXPORT_SYMBOL_NS(ihold, ANDROID_GKI_VFS_EXPORT_ONLY); static void inode_lru_list_add(struct inode *inode) { @@ -505,7 +505,7 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } -EXPORT_SYMBOL(__insert_inode_hash); +EXPORT_SYMBOL_NS(__insert_inode_hash, ANDROID_GKI_VFS_EXPORT_ONLY); /** * __remove_inode_hash - remove an inode from the hash @@ -521,7 +521,7 @@ void __remove_inode_hash(struct inode *inode) spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } -EXPORT_SYMBOL(__remove_inode_hash); +EXPORT_SYMBOL_NS(__remove_inode_hash, ANDROID_GKI_VFS_EXPORT_ONLY); void clear_inode(struct inode *inode) { @@ -548,7 +548,7 @@ void clear_inode(struct inode *inode) /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } -EXPORT_SYMBOL(clear_inode); +EXPORT_SYMBOL_NS(clear_inode, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Free the inode passed in, removing it from the lists it is still connected @@ -1008,7 +1008,7 @@ void unlock_new_inode(struct inode *inode) wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } -EXPORT_SYMBOL(unlock_new_inode); +EXPORT_SYMBOL_NS(unlock_new_inode, ANDROID_GKI_VFS_EXPORT_ONLY); void discard_new_inode(struct inode *inode) { @@ -1165,7 +1165,7 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, } return inode; } -EXPORT_SYMBOL(iget5_locked); +EXPORT_SYMBOL_NS(iget5_locked, ANDROID_GKI_VFS_EXPORT_ONLY); /** * iget_locked - obtain an inode from a mounted file system @@ -1297,7 +1297,7 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) return 
res; } -EXPORT_SYMBOL(iunique); +EXPORT_SYMBOL_NS(iunique, ANDROID_GKI_VFS_EXPORT_ONLY); struct inode *igrab(struct inode *inode) { @@ -1380,7 +1380,7 @@ again: } return inode; } -EXPORT_SYMBOL(ilookup5); +EXPORT_SYMBOL_NS(ilookup5, ANDROID_GKI_VFS_EXPORT_ONLY); /** * ilookup - search for an inode in the inode cache @@ -1862,7 +1862,7 @@ void touch_atime(const struct path *path) skip_update: sb_end_write(inode->i_sb); } -EXPORT_SYMBOL(touch_atime); +EXPORT_SYMBOL_NS(touch_atime, ANDROID_GKI_VFS_EXPORT_ONLY); /* * The logic we want is @@ -1959,7 +1959,7 @@ int file_remove_privs(struct file *file) return error; } -EXPORT_SYMBOL(file_remove_privs); +EXPORT_SYMBOL_NS(file_remove_privs, ANDROID_GKI_VFS_EXPORT_ONLY); /** * file_update_time - update mtime and ctime time @@ -2140,7 +2140,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); } -EXPORT_SYMBOL(init_special_inode); +EXPORT_SYMBOL_NS(init_special_inode, ANDROID_GKI_VFS_EXPORT_ONLY); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards @@ -2173,7 +2173,7 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, inode_fsgid_set(inode, mnt_userns); inode->i_mode = mode; } -EXPORT_SYMBOL(inode_init_owner); +EXPORT_SYMBOL_NS(inode_init_owner, ANDROID_GKI_VFS_EXPORT_ONLY); /** * inode_owner_or_capable - check current task permissions to inode @@ -2237,7 +2237,7 @@ void inode_dio_wait(struct inode *inode) if (atomic_read(&inode->i_dio_count)) __inode_dio_wait(inode); } -EXPORT_SYMBOL(inode_dio_wait); +EXPORT_SYMBOL_NS(inode_dio_wait, ANDROID_GKI_VFS_EXPORT_ONLY); /* * inode_set_flags - atomically set some inode flags @@ -2261,7 +2261,7 @@ void inode_set_flags(struct inode *inode, unsigned int flags, WARN_ON_ONCE(flags & ~mask); set_mask_bits(&inode->i_flags, mask, flags); } -EXPORT_SYMBOL(inode_set_flags); +EXPORT_SYMBOL_NS(inode_set_flags, ANDROID_GKI_VFS_EXPORT_ONLY); void inode_nohighmem(struct inode *inode) { @@ -2298,7 +2298,7 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) WARN(1, "invalid file time granularity: %u", gran); return t; } -EXPORT_SYMBOL(timestamp_truncate); +EXPORT_SYMBOL_NS(timestamp_truncate, ANDROID_GKI_VFS_EXPORT_ONLY); /** * current_time - Return FS time diff --git a/fs/ioctl.c b/fs/ioctl.c index e0a3455f9a0f..88d195964c9a 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -149,7 +149,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, return 1; return (flags & FIEMAP_EXTENT_LAST) ? 
1 : 0; } -EXPORT_SYMBOL(fiemap_fill_next_extent); +EXPORT_SYMBOL_NS(fiemap_fill_next_extent, ANDROID_GKI_VFS_EXPORT_ONLY); /** * fiemap_prep - check validity of requested flags for fiemap @@ -194,7 +194,7 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = filemap_write_and_wait(inode->i_mapping); return ret; } -EXPORT_SYMBOL(fiemap_prep); +EXPORT_SYMBOL_NS(fiemap_prep, ANDROID_GKI_VFS_EXPORT_ONLY); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 468dcbba45bc..b50785429962 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -186,11 +187,14 @@ static void iomap_dio_bio_end_io(struct bio *bio) static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { + struct inode *inode = file_inode(dio->iocb->ki_filp); struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); + fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, + GFP_KERNEL); bio_set_dev(bio, iter->iomap.bdev); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; @@ -310,6 +314,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } bio = bio_alloc(GFP_KERNEL, nr_pages); + fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, + GFP_KERNEL); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_write_hint = dio->iocb->ki_hint; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 0c6eacfcbeef..7130a9bfcedb 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1613,3 +1613,4 @@ static void __exit exit_iso9660_fs(void) module_init(init_iso9660_fs) module_exit(exit_iso9660_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile index 126b4da6c7de..b64f93331643 100644 --- a/fs/jbd2/Makefile +++ b/fs/jbd2/Makefile @@ -3,6 +3,8 @@ # Makefile for the linux journaling routines. # +ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=ANDROID_GKI_VFS_EXPORT_ONLY + obj-$(CONFIG_JBD2) += jbd2.o jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 097ba728d516..07516bc5ce5d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -3195,6 +3195,7 @@ static void __exit journal_exit(void) } MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(journal_init); module_exit(journal_exit); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 81ca58c10b72..16dcc359fd35 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -439,3 +439,4 @@ MODULE_DESCRIPTION("The Journalling Flash File System, v2"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); // Actually dual-licensed, but it doesn't matter for // the sake of this tag. It's Free Software. 
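
Every hunk in this stretch applies the same two-sided pattern: core VFS exports move from EXPORT_SYMBOL() to EXPORT_SYMBOL_NS(..., ANDROID_GKI_VFS_EXPORT_ONLY), and each filesystem module that consumes them gains a matching MODULE_IMPORT_NS() line, as the jffs2 hunk continues directly below. Any other module that calls one of these namespaced symbols needs the same declaration or modpost rejects the link. A minimal sketch of such a consumer follows; the module name is hypothetical and not part of this patch:

#include <linux/module.h>
#include <linux/namei.h>
#include <linux/path.h>

/* Calls kern_path(), which this series moves into the GKI namespace. */
static int __init nsdemo_init(void)
{
	struct path root;
	int err = kern_path("/", LOOKUP_FOLLOW, &root);

	if (!err)
		path_put(&root);
	return err;
}

static void __exit nsdemo_exit(void)
{
}

module_init(nsdemo_init);
module_exit(nsdemo_exit);
MODULE_LICENSE("GPL");
/*
 * Without the import below, modpost fails with: module uses symbol
 * kern_path from namespace ANDROID_GKI_VFS_EXPORT_ONLY, but does not
 * import it.
 */
MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY);
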
+MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 9030aeaf0f88..820e107ce98d 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -37,6 +37,7 @@ MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); static struct kmem_cache *jfs_inode_cachep; diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index 976d09aaee70..6d7ec0d4f851 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -618,6 +618,7 @@ MODULE_AUTHOR("Namjae Jeon "); MODULE_VERSION(KSMBD_VERSION); MODULE_DESCRIPTION("Linux kernel CIFS/SMB SERVER"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_SOFTDEP("pre: ecb"); MODULE_SOFTDEP("pre: hmac"); MODULE_SOFTDEP("pre: md4"); diff --git a/fs/libfs.c b/fs/libfs.c index 7bb5d90319cc..2a75bf3b609e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -226,7 +226,7 @@ ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t { return -EISDIR; } -EXPORT_SYMBOL(generic_read_dir); +EXPORT_SYMBOL_NS(generic_read_dir, ANDROID_GKI_VFS_EXPORT_ONLY); const struct file_operations simple_dir_operations = { .open = dcache_dir_open, @@ -1124,7 +1124,7 @@ out: ret = err; return ret; } -EXPORT_SYMBOL(__generic_file_fsync); +EXPORT_SYMBOL_NS(__generic_file_fsync, ANDROID_GKI_VFS_EXPORT_ONLY); /** * generic_file_fsync - generic fsync implementation for simple filesystems @@ -1147,7 +1147,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } -EXPORT_SYMBOL(generic_file_fsync); +EXPORT_SYMBOL_NS(generic_file_fsync, ANDROID_GKI_VFS_EXPORT_ONLY); /** * generic_check_addressable - Check addressability of file system diff --git a/fs/minix/inode.c b/fs/minix/inode.c index d4bd94234ef7..9717e6f71f97 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -721,4 +721,5 @@ static void __exit exit_minix_fs(void) module_init(init_minix_fs) module_exit(exit_minix_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/mpage.c b/fs/mpage.c index 334e7d09aa65..2f48fe4eec95 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -392,7 +392,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) if (args.bio) mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); } -EXPORT_SYMBOL(mpage_readahead); +EXPORT_SYMBOL_NS(mpage_readahead, ANDROID_GKI_VFS_EXPORT_ONLY); /* * This isn't called much at all @@ -410,7 +410,7 @@ int mpage_readpage(struct page *page, get_block_t get_block) mpage_bio_submit(REQ_OP_READ, 0, args.bio); return 0; } -EXPORT_SYMBOL(mpage_readpage); +EXPORT_SYMBOL_NS(mpage_readpage, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Writing is not so simple. 
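
The series uses two equivalent spellings for the same namespacing: fs/jbd2/Makefile (and, further down, fs/verity/Makefile) set -DDEFAULT_SYMBOL_NAMESPACE=ANDROID_GKI_VFS_EXPORT_ONLY so every plain EXPORT_SYMBOL() in those objects already lands in the namespace, while files such as fs/libfs.c and fs/mpage.c above tag each symbol explicitly. A short illustration with a hypothetical helper, not taken from the patch:

/*
 * In a file built with
 *	ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=ANDROID_GKI_VFS_EXPORT_ONLY
 * the plain export below is equivalent to writing
 *	EXPORT_SYMBOL_NS(some_helper, ANDROID_GKI_VFS_EXPORT_ONLY);
 * symbol by symbol, as the libfs and mpage hunks do.
 */
int some_helper(void)
{
	return 0;
}
EXPORT_SYMBOL(some_helper);
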
diff --git a/fs/namei.c b/fs/namei.c index 81b31d9a063f..bacb0378748c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2573,7 +2573,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path) return ret; } -EXPORT_SYMBOL(kern_path); +EXPORT_SYMBOL_NS(kern_path, ANDROID_GKI_VFS_EXPORT_ONLY); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -3035,7 +3035,7 @@ int vfs_create(struct user_namespace *mnt_userns, struct inode *dir, fsnotify_create(dir, dentry); return error; } -EXPORT_SYMBOL(vfs_create); +EXPORT_SYMBOL_NS(vfs_create, ANDROID_GKI_VFS_EXPORT_ONLY); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), @@ -3939,7 +3939,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, fsnotify_mkdir(dir, dentry); return error; } -EXPORT_SYMBOL(vfs_mkdir); +EXPORT_SYMBOL_NS(vfs_mkdir, ANDROID_GKI_VFS_EXPORT_ONLY); int do_mkdirat(int dfd, struct filename *name, umode_t mode) { @@ -4035,7 +4035,7 @@ out: d_delete_notify(dir, dentry); return error; } -EXPORT_SYMBOL(vfs_rmdir); +EXPORT_SYMBOL_NS(vfs_rmdir, ANDROID_GKI_VFS_EXPORT_ONLY); int do_rmdir(int dfd, struct filename *name) { @@ -4170,7 +4170,7 @@ out: return error; } -EXPORT_SYMBOL(vfs_unlink); +EXPORT_SYMBOL_NS(vfs_unlink, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Make sure that the actual truncation of the file will occur outside its @@ -4440,7 +4440,7 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, fsnotify_link(dir, inode, new_dentry); return error; } -EXPORT_SYMBOL(vfs_link); +EXPORT_SYMBOL_NS(vfs_link, ANDROID_GKI_VFS_EXPORT_ONLY); /* * Hardlinks are often used in delicate situations. We avoid @@ -4714,7 +4714,7 @@ out: return error; } -EXPORT_SYMBOL(vfs_rename); +EXPORT_SYMBOL_NS(vfs_rename, ANDROID_GKI_VFS_EXPORT_ONLY); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) diff --git a/fs/namespace.c b/fs/namespace.c index d946298691ed..0cb0a7b1603b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -424,7 +424,7 @@ int mnt_want_write_file(struct file *file) sb_end_write(file_inode(file)->i_sb); return ret; } -EXPORT_SYMBOL_GPL(mnt_want_write_file); +EXPORT_SYMBOL_NS_GPL(mnt_want_write_file, ANDROID_GKI_VFS_EXPORT_ONLY); /** * __mnt_drop_write - give up write access to a mount @@ -467,7 +467,7 @@ void mnt_drop_write_file(struct file *file) __mnt_drop_write_file(file); sb_end_write(file_inode(file)->i_sb); } -EXPORT_SYMBOL(mnt_drop_write_file); +EXPORT_SYMBOL_NS(mnt_drop_write_file, ANDROID_GKI_VFS_EXPORT_ONLY); static inline int mnt_hold_writers(struct mount *mnt) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index ad5114e48009..bf662ff45564 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -583,6 +583,7 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, + .speculative = true, }; ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e4524635a129..c268af1d15c0 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2477,6 +2477,7 @@ static void __exit exit_nfs_fs(void) /* Not quite true; I just maintain it */ MODULE_AUTHOR("Olaf Kirch "); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_param(enable_ino64, bool, 0644); module_init(init_nfs_fs) diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index d09bcfd7db89..b422e8a09d25 100644 --- a/fs/nfs/nfs4super.c +++ 
b/fs/nfs/nfs4super.c @@ -309,6 +309,7 @@ static void __exit exit_nfs_v4(void) } MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(init_nfs_v4); module_exit(exit_nfs_v4); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index cb73c1292562..af8c402e9457 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1583,5 +1583,6 @@ static void __exit exit_nfsd(void) MODULE_AUTHOR("Olaf Kirch "); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(init_nfsd) module_exit(exit_nfsd) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 663e68c22db8..bdc885b92ac8 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -53,6 +53,7 @@ MODULE_AUTHOR("NTT Corp."); MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); static struct kmem_cache *nilfs_inode_cachep; struct kmem_cache *nilfs_transaction_cachep; diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 52ccd34b1e79..06c368ce3aa2 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -541,8 +541,8 @@ struct nls_table *load_nls_default(void) } EXPORT_SYMBOL(unregister_nls); -EXPORT_SYMBOL(unload_nls); -EXPORT_SYMBOL(load_nls); -EXPORT_SYMBOL(load_nls_default); +EXPORT_SYMBOL_NS(unload_nls, ANDROID_GKI_VFS_EXPORT_ONLY); +EXPORT_SYMBOL_NS(load_nls, ANDROID_GKI_VFS_EXPORT_ONLY); +EXPORT_SYMBOL_NS(load_nls_default, ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 162b3f160353..498b8a435d7e 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -578,3 +578,4 @@ module_init(init_nls_euc_jp) module_exit(exit_nls_euc_jp) MODULE_LICENSE("Dual BSD/GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index a80a741a8676..99ceec9085ec 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -80,3 +80,4 @@ module_init(init_nls_koi8_ru) module_exit(exit_nls_koi8_ru) MODULE_LICENSE("Dual BSD/GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 9fb7701d2f8a..bee298f854e4 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -712,6 +712,8 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, struct fsnotify_group *group; struct inode *inode; struct path path; + struct path alteredpath; + struct path *canonical_path = &path; struct fd f; int ret; unsigned flags = 0; @@ -758,13 +760,23 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (ret) goto fput_and_out; + /* support stacked filesystems */ + if (path.dentry && path.dentry->d_op) { + if (path.dentry->d_op->d_canonical_path) { + path.dentry->d_op->d_canonical_path(&path, + &alteredpath); + canonical_path = &alteredpath; + path_put(&path); + } + } + /* inode held in place by reference to path; group by fget on fd */ - inode = path.dentry->d_inode; + inode = canonical_path->dentry->d_inode; group = f.file->private_data; /* create/update an inode mark */ ret = inotify_update_watch(group, inode, mask); - path_put(&path); + path_put(canonical_path); fput_and_out: fdput(f); return ret; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 7f69422d5191..3ded93309139 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3187,6 +3187,7 @@ MODULE_AUTHOR("Anton Altaparmakov "); MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright 
(c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); #ifdef DEBUG module_param(debug_msgs, bint, 0); MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 33b1833ad525..a0360f200113 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -1494,6 +1494,7 @@ static void __exit exit_ntfs_fs(void) } MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_DESCRIPTION("ntfs3 read/write filesystem"); #ifdef CONFIG_NTFS3_FS_POSIX_ACL MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support"); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index fa0a14f199eb..308afb6655a6 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -616,6 +616,7 @@ static void __exit exit_dlmfs_fs(void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); module_init(init_dlmfs_fs) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a03f0cabff0b..f194d9057d3e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -69,6 +69,7 @@ static struct dentry *ocfs2_debugfs_root; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_DESCRIPTION("OCFS2 cluster file system"); struct mount_options diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 2a0e83236c01..52b6e1ab17ce 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -20,6 +20,7 @@ MODULE_AUTHOR("Bob Copeland "); MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) { diff --git a/fs/open.c b/fs/open.c index 5e322f188e83..6ba44b27eaf8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -35,6 +35,7 @@ #include #include "internal.h" +#include int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) @@ -809,6 +810,7 @@ static int do_dentry_open(struct file *f, error = -ENODEV; goto cleanup_all; } + trace_android_vh_check_file_open(f); error = security_file_open(f); if (error) @@ -1178,7 +1180,28 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) } return file; } -EXPORT_SYMBOL(filp_open); +EXPORT_SYMBOL_NS(filp_open, ANDROID_GKI_VFS_EXPORT_ONLY); + + +/* ANDROID: Allow drivers to open only block files from kernel mode */ +struct file *filp_open_block(const char *filename, int flags, umode_t mode) +{ + struct file *file; + + file = filp_open(filename, flags, mode); + if (IS_ERR(file)) + goto err_out; + + /* Drivers should only be allowed to open block devices */ + if (!S_ISBLK(file->f_mapping->host->i_mode)) { + filp_close(file, NULL); + file = ERR_PTR(-ENOTBLK); + } + +err_out: + return file; +} +EXPORT_SYMBOL_GPL(filp_open_block); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) @@ -1390,7 +1413,7 @@ int generic_file_open(struct inode * inode, struct file * filp) return 0; } -EXPORT_SYMBOL(generic_file_open); +EXPORT_SYMBOL_NS(generic_file_open, ANDROID_GKI_VFS_EXPORT_ONLY); /* * This is used by subsystems that don't want seekable diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 5ab741c60b7e..dee05164c470 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -36,6 +36,7 @@ int 
orangefs_dcache_timeout_msecs = 50; int orangefs_getattr_timeout_msecs = 50; MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_AUTHOR("ORANGEFS Development Team"); MODULE_DESCRIPTION("The Linux Kernel VFS interface to ORANGEFS"); MODULE_PARM_DESC(module_parm_debug_mask, "debugging level (see orangefs-debug.h for values)"); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index ef0bf98b620d..da7ee5f16794 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -1046,7 +1046,7 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) dput(parent); dput(next); } - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); return err; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index eca984d6484d..4bd2aed5fc9e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -569,7 +569,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, struct ovl_cattr *attr, bool origin) { int err; - const struct cred *old_cred; + const struct cred *old_cred, *hold_cred = NULL; struct cred *override_cred; struct dentry *parent = dentry->d_parent; @@ -610,13 +610,14 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; err = security_dentry_create_files_as(dentry, - attr->mode, &dentry->d_name, old_cred, + attr->mode, &dentry->d_name, + old_cred ? old_cred : current_cred(), override_cred); if (err) { put_cred(override_cred); goto out_revert_creds; } - put_cred(override_creds(override_cred)); + hold_cred = override_creds(override_cred); put_cred(override_cred); } @@ -626,7 +627,9 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, err = ovl_create_over_whiteout(dentry, inode, attr); out_revert_creds: - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred ?: hold_cred); + if (old_cred && hold_cred) + put_cred(hold_cred); return err; } @@ -703,7 +706,7 @@ static int ovl_set_link_redirect(struct dentry *dentry) old_cred = ovl_override_creds(dentry->d_sb); err = ovl_set_redirect(dentry, false); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); return err; } @@ -921,7 +924,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) err = ovl_remove_upper(dentry, is_dir, &list); else err = ovl_remove_and_whiteout(dentry, &list); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); if (!err) { if (is_dir) clear_nlink(dentry->d_inode); @@ -1295,7 +1298,7 @@ out_dput_old: out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: - revert_creds(old_cred); + ovl_revert_creds(old->d_sb, old_cred); if (update_nlink) ovl_nlink_end(new); out_drop_write: diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 28cb05ef018c..26c17ca2c232 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -15,6 +15,8 @@ #include #include "overlayfs.h" +#define OVL_IOCB_MASK (IOCB_DSYNC | IOCB_HIPRI | IOCB_NOWAIT | IOCB_SYNC) + struct ovl_aio_req { struct kiocb iocb; refcount_t ref; @@ -55,13 +57,14 @@ static struct file *ovl_open_realfile(const struct file *file, if (err) { realfile = ERR_PTR(err); } else { - if (!inode_owner_or_capable(&init_user_ns, realinode)) + if (old_cred && !inode_owner_or_capable(&init_user_ns, + realinode)) flags &= ~O_NOATIME; realfile = open_with_fake_path(&file->f_path, flags, realinode, current_cred()); } - revert_creds(old_cred); + ovl_revert_creds(inode->i_sb, old_cred); pr_debug("open(%p[%pD2/%c], 0%o) -> 
(%p, 0%o)\n", file, file, ovl_whatisit(inode, realinode), file->f_flags, @@ -205,7 +208,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) old_cred = ovl_override_creds(inode->i_sb); ret = vfs_llseek(real.file, offset, whence); - revert_creds(old_cred); + ovl_revert_creds(inode->i_sb, old_cred); file->f_pos = real.file->f_pos; ovl_inode_unlock(inode); @@ -237,22 +240,6 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -static rwf_t ovl_iocb_to_rwf(int ifl) -{ - rwf_t flags = 0; - - if (ifl & IOCB_NOWAIT) - flags |= RWF_NOWAIT; - if (ifl & IOCB_HIPRI) - flags |= RWF_HIPRI; - if (ifl & IOCB_DSYNC) - flags |= RWF_DSYNC; - if (ifl & IOCB_SYNC) - flags |= RWF_SYNC; - - return flags; -} - static inline void ovl_aio_put(struct ovl_aio_req *aio_req) { if (refcount_dec_and_test(&aio_req->ref)) { @@ -313,7 +300,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb->ki_flags)); + iocb_to_rw_flags(iocb->ki_flags, + OVL_IOCB_MASK)); } else { struct ovl_aio_req *aio_req; @@ -334,7 +322,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) ovl_aio_cleanup_handler(aio_req); } out: - revert_creds(old_cred); + ovl_revert_creds(file_inode(file)->i_sb, old_cred); ovl_file_accessed(file); out_fdput: fdput(real); @@ -378,7 +366,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (is_sync_kiocb(iocb)) { file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(ifl)); + iocb_to_rw_flags(ifl, OVL_IOCB_MASK)); file_end_write(real.file); /* Update size */ ovl_copyattr(inode); @@ -407,7 +395,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) ovl_aio_cleanup_handler(aio_req); } out: - revert_creds(old_cred); + ovl_revert_creds(file_inode(file)->i_sb, old_cred); out_fdput: fdput(real); @@ -452,7 +440,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, file_end_write(real.file); /* Update size */ ovl_copyattr(inode); - revert_creds(old_cred); + ovl_revert_creds(inode->i_sb, old_cred); fdput(real); out_unlock: @@ -479,7 +467,7 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) { old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fsync_range(real.file, start, end, datasync); - revert_creds(old_cred); + ovl_revert_creds(file_inode(file)->i_sb, old_cred); } fdput(real); @@ -503,7 +491,7 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma) old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = call_mmap(vma->vm_file, vma); - revert_creds(old_cred); + ovl_revert_creds(file_inode(file)->i_sb, old_cred); ovl_file_accessed(file); return ret; @@ -529,7 +517,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fallocate(real.file, mode, offset, len); - revert_creds(old_cred); + ovl_revert_creds(file_inode(file)->i_sb, old_cred); /* Update size */ ovl_copyattr(inode); @@ -554,7 +542,7 @@ static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fadvise(real.file, offset, len, advice); - revert_creds(old_cred); + 
ovl_revert_creds(file_inode(file)->i_sb, old_cred); fdput(real); @@ -613,7 +601,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, flags); break; } - revert_creds(old_cred); + ovl_revert_creds(file_inode(file_out)->i_sb, old_cred); /* Update size */ ovl_copyattr(inode_out); @@ -675,7 +663,7 @@ static int ovl_flush(struct file *file, fl_owner_t id) if (real.file->f_op->flush) { old_cred = ovl_override_creds(file_inode(file)->i_sb); err = real.file->f_op->flush(real.file, id); - revert_creds(old_cred); + ovl_revert_creds(file_inode(file)->i_sb, old_cred); } fdput(real); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index d41f0c8e0e2a..dee23f5f42b7 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -78,7 +78,7 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); err = notify_change(&init_user_ns, upperdentry, attr, NULL); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); if (!err) ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); @@ -270,7 +270,7 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->nlink = dentry->d_inode->i_nlink; out: - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); return err; } @@ -305,7 +305,7 @@ int ovl_permission(struct user_namespace *mnt_userns, mask |= MAY_READ; } err = inode_permission(&init_user_ns, realinode, mask); - revert_creds(old_cred); + ovl_revert_creds(inode->i_sb, old_cred); return err; } @@ -322,7 +322,7 @@ static const char *ovl_get_link(struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); p = vfs_get_link(ovl_dentry_real(dentry), done); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); return p; } @@ -353,7 +353,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, if (!value && !upperdentry) { old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); if (err < 0) goto out_drop_write; } @@ -374,7 +374,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, WARN_ON(flags != XATTR_REPLACE); err = vfs_removexattr(&init_user_ns, realdentry, name); } - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); /* copy c/mtime */ ovl_copyattr(inode); @@ -395,7 +395,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(&init_user_ns, realdentry, name, value, size); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); return res; } @@ -423,7 +423,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) old_cred = ovl_override_creds(dentry->d_sb); res = vfs_listxattr(realdentry, list, size); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); if (res <= 0 || size == 0) return res; @@ -461,7 +461,7 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) old_cred = ovl_override_creds(inode->i_sb); acl = get_acl(realinode, type); - revert_creds(old_cred); + ovl_revert_creds(inode->i_sb, old_cred); return acl; } @@ -495,7 +495,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, old_cred = ovl_override_creds(inode->i_sb); err = realinode->i_op->fiemap(realinode, fieinfo, start, len); - revert_creds(old_cred); + 
ovl_revert_creds(inode->i_sb, old_cred); return err; } @@ -566,7 +566,7 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns, err = ovl_set_protattr(inode, upperpath.dentry, fa); if (!err) err = ovl_real_fileattr_set(&upperpath, fa); - revert_creds(old_cred); + ovl_revert_creds(inode->i_sb, old_cred); /* * Merge real inode flags with inode flags read from @@ -628,7 +628,7 @@ int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) old_cred = ovl_override_creds(inode->i_sb); err = ovl_real_fileattr_get(&realpath, fa); ovl_fileattr_prot_flags(inode, fa); - revert_creds(old_cred); + ovl_revert_creds(inode->i_sb, old_cred); return err; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 1a9b515fc45d..0ecb6bab0f2a 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1106,7 +1106,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ovl_dentry_update_reval(dentry, upperdentry, DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); if (origin_path) { dput(origin_path->dentry); kfree(origin_path); @@ -1133,7 +1133,7 @@ out_put_upper: kfree(upperredirect); out: kfree(d.redirect); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); return ERR_PTR(err); } @@ -1185,7 +1185,7 @@ bool ovl_lower_positive(struct dentry *dentry) dput(this); } } - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); return positive; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 2df3e74cdf0f..020df39b5dea 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -280,6 +280,7 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); +void ovl_revert_creds(struct super_block *sb, const struct cred *oldcred); int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index b2d64f3c974b..ca55d0542d0b 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -20,6 +20,7 @@ struct ovl_config { bool metacopy; bool userxattr; bool ovl_volatile; + bool override_creds; }; struct ovl_sb { diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 150fdf3bc68d..c6038e6ad753 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -286,7 +286,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) } inode_unlock(dir->d_inode); } - revert_creds(old_cred); + ovl_revert_creds(rdd->dentry->d_sb, old_cred); return err; } @@ -789,7 +789,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) } err = 0; out: - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); return err; } @@ -841,7 +841,7 @@ static struct file *ovl_dir_open_realfile(const struct file *file, old_cred = ovl_override_creds(file_inode(file)->i_sb); res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); - revert_creds(old_cred); + ovl_revert_creds(file_inode(file)->i_sb, old_cred); return res; } @@ -967,7 +967,7 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) old_cred = ovl_override_creds(dentry->d_sb); err = ovl_dir_read_merged(dentry, list, &root); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); if (err) return err; diff --git 
a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b3675d13c1ac..8231271a1eaf 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -20,6 +20,7 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); struct ovl_dir_cache; @@ -53,6 +54,11 @@ module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); MODULE_PARM_DESC(xino_auto, "Auto enable xino feature"); +static bool __read_mostly ovl_override_creds_def = true; +module_param_named(override_creds, ovl_override_creds_def, bool, 0644); +MODULE_PARM_DESC(ovl_override_creds_def, + "Use mounter's credentials for accesses"); + static void ovl_entry_stack_free(struct ovl_entry *oe) { unsigned int i; @@ -388,6 +394,9 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_puts(m, ",volatile"); if (ofs->config.userxattr) seq_puts(m, ",userxattr"); + if (ofs->config.override_creds != ovl_override_creds_def) + seq_show_option(m, "override_creds", + ofs->config.override_creds ? "on" : "off"); return 0; } @@ -443,6 +452,8 @@ enum { OPT_METACOPY_ON, OPT_METACOPY_OFF, OPT_VOLATILE, + OPT_OVERRIDE_CREDS_ON, + OPT_OVERRIDE_CREDS_OFF, OPT_ERR, }; @@ -465,6 +476,8 @@ static const match_table_t ovl_tokens = { {OPT_METACOPY_ON, "metacopy=on"}, {OPT_METACOPY_OFF, "metacopy=off"}, {OPT_VOLATILE, "volatile"}, + {OPT_OVERRIDE_CREDS_ON, "override_creds=on"}, + {OPT_OVERRIDE_CREDS_OFF, "override_creds=off"}, {OPT_ERR, NULL} }; @@ -524,6 +537,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); if (!config->redirect_mode) return -ENOMEM; + config->override_creds = ovl_override_creds_def; while ((p = ovl_next_opt(&opt)) != NULL) { int token; @@ -625,6 +639,14 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->userxattr = true; break; + case OPT_OVERRIDE_CREDS_ON: + config->override_creds = true; + break; + + case OPT_OVERRIDE_CREDS_OFF: + config->override_creds = false; + break; + default: pr_err("unrecognized mount option \"%s\" or missing value\n", p); @@ -2151,7 +2173,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) kfree(splitlower); sb->s_root = root_dentry; - return 0; out_free_oe: diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 9d33ce385bef..32e4b24bc0e2 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -38,9 +38,18 @@ const struct cred *ovl_override_creds(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; + if (!ofs->config.override_creds) + return NULL; return override_creds(ofs->creator_cred); } +void ovl_revert_creds(struct super_block *sb, const struct cred *old_cred) +{ + if (old_cred) + revert_creds(old_cred); +} + + /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. @@ -912,7 +921,7 @@ int ovl_nlink_start(struct dentry *dentry) * value relative to the upper inode nlink in an upper inode xattr. 
*/ err = ovl_set_nlink_upper(dentry); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); out: if (err) @@ -930,7 +939,7 @@ void ovl_nlink_end(struct dentry *dentry) old_cred = ovl_override_creds(dentry->d_sb); ovl_cleanup_index(dentry); - revert_creds(old_cred); + ovl_revert_creds(dentry->d_sb, old_cred); } ovl_inode_unlock(inode); diff --git a/fs/proc/base.c b/fs/proc/base.c index 300d53ee7040..01825cba84aa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -3303,6 +3304,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_CPU_FREQ_TIMES + ONE("time_in_state", 0444, proc_time_in_state_show), +#endif #ifdef CONFIG_STACKLEAK_METRICS ONE("stack_depth", S_IRUGO, proc_stack_depth), #endif @@ -3646,6 +3650,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_CPU_FREQ_TIMES + ONE("time_in_state", 0444, proc_time_in_state_show), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6fa761c9cc78..2be7ffa2a03f 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -18,7 +18,7 @@ #endif #include #include "internal.h" - +#include void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { } @@ -108,6 +108,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); + show_val_kb(m, "SecPageTables: ", + global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", @@ -145,6 +147,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "CmaFree: ", global_zone_page_state(NR_FREE_CMA_PAGES)); #endif + trace_android_vh_meminfo_proc_show(m); hugetlb_report_meminfo(m); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c3b76746cce8..de8e525440eb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { + struct anon_vma_name *anon_name; + if (!mm) { name = "[vdso]"; goto done; @@ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (is_stack(vma)) + if (is_stack(vma)) { name = "[stack]"; + goto done; + } + + anon_name = anon_vma_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name->name); + } } done: diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 6093088de49f..2d4f78af83a1 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -358,6 +358,7 @@ static void __exit pstore_blk_exit(void) module_exit(pstore_blk_exit); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_AUTHOR("WeiXiong Liao "); MODULE_AUTHOR("Kees Cook "); MODULE_DESCRIPTION("pstore backend for block devices"); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f3fa3625d772..80bfa26b7772 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "internal.h" #define RAMOOPS_KERNMSG_HDR "====" @@ -633,6 +634,7 @@ static int ramoops_parse_dt(struct platform_device *pdev, 
{ struct device_node *of_node = pdev->dev.of_node; struct device_node *parent_node; + struct reserved_mem *rmem; struct resource *res; u32 value; int ret; @@ -641,13 +643,20 @@ static int ramoops_parse_dt(struct platform_device *pdev, res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) { - dev_err(&pdev->dev, - "failed to locate DT /reserved-memory resource\n"); - return -EINVAL; + rmem = of_reserved_mem_lookup(of_node); + if (rmem) { + pdata->mem_size = rmem->size; + pdata->mem_address = rmem->base; + } else { + dev_err(&pdev->dev, + "failed to locate DT /reserved-memory resource\n"); + return -EINVAL; + } + } else { + pdata->mem_size = resource_size(res); + pdata->mem_address = res->start; } - pdata->mem_size = resource_size(res); - pdata->mem_address = res->start; /* * Setting "unbuffered" is deprecated and will be ignored if * "mem_type" is also specified. diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3fb7fc819b4f..704fb7d09f0e 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -420,4 +420,5 @@ static void __exit exit_qnx4_fs(void) module_init(init_qnx4_fs) module_exit(exit_qnx4_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 61191f7bdf62..ac771f0e0238 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -679,3 +679,4 @@ static void __exit exit_qnx6_fs(void) module_init(init_qnx6_fs) module_exit(exit_qnx6_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/read_write.c b/fs/read_write.c index b4b15279b66b..3f68186207a9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -460,7 +460,7 @@ ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) return ret; return __kernel_read(file, buf, count, pos); } -EXPORT_SYMBOL(kernel_read); +EXPORT_SYMBOL_NS(kernel_read, ANDROID_GKI_VFS_EXPORT_ONLY); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { @@ -569,7 +569,7 @@ ssize_t kernel_write(struct file *file, const void *buf, size_t count, file_end_write(file); return ret; } -EXPORT_SYMBOL(kernel_write); +EXPORT_SYMBOL_NS(kernel_write, ANDROID_GKI_VFS_EXPORT_ONLY); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f7b05c6b3dcf..87258c2640a0 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2649,6 +2649,7 @@ MODULE_ALIAS_FS("reiserfs"); MODULE_DESCRIPTION("ReiserFS journaled filesystem"); MODULE_AUTHOR("Hans Reiser "); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(init_reiserfs_fs); module_exit(exit_reiserfs_fs); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 259f684d9236..c335658c2e0f 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -666,3 +666,4 @@ module_exit(exit_romfs_fs); MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/splice.c b/fs/splice.c index 5dbce4dcc1a7..4f1557673c60 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -326,7 +326,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, return ret; } -EXPORT_SYMBOL(generic_file_splice_read); +EXPORT_SYMBOL_NS(generic_file_splice_read, ANDROID_GKI_VFS_EXPORT_ONLY); const struct pipe_buf_operations default_pipe_buf_ops = { .release = generic_pipe_buf_release, @@ -725,7 +725,7 @@ done: return ret; } 
-EXPORT_SYMBOL(iter_file_splice_write); +EXPORT_SYMBOL_NS(iter_file_splice_write, ANDROID_GKI_VFS_EXPORT_ONLY); /** * generic_splice_sendpage - splice data from a pipe to a socket diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 60d6951915f4..ac80ce874a6d 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -584,3 +584,4 @@ module_exit(exit_squashfs_fs); MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); MODULE_AUTHOR("Phillip Lougher "); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/stat.c b/fs/stat.c index 246d138ec066..3e5be67027ef 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -57,7 +57,7 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; } -EXPORT_SYMBOL(generic_fillattr); +EXPORT_SYMBOL_NS(generic_fillattr, ANDROID_GKI_VFS_EXPORT_ONLY); /** * generic_fill_statx_attr - Fill in the statx attributes from the inode flags diff --git a/fs/super.c b/fs/super.c index 7fa3ee79ec89..6bdb475ccd4e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1385,7 +1385,7 @@ error_bdev: error: return ERR_PTR(error); } -EXPORT_SYMBOL(mount_bdev); +EXPORT_SYMBOL_NS(mount_bdev, ANDROID_GKI_VFS_EXPORT_ONLY); void kill_block_super(struct super_block *sb) { @@ -1399,7 +1399,7 @@ void kill_block_super(struct super_block *sb) blkdev_put(bdev, mode | FMODE_EXCL); } -EXPORT_SYMBOL(kill_block_super); +EXPORT_SYMBOL_NS(kill_block_super, ANDROID_GKI_VFS_EXPORT_ONLY); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, diff --git a/fs/sync.c b/fs/sync.c index c7690016453e..847f4c6c69f0 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -69,7 +69,7 @@ int sync_filesystem(struct super_block *sb) } return sync_blockdev(sb->s_bdev); } -EXPORT_SYMBOL(sync_filesystem); +EXPORT_SYMBOL_NS(sync_filesystem, ANDROID_GKI_VFS_EXPORT_ONLY); static void sync_inodes_one_sb(struct super_block *sb, void *arg) { @@ -211,6 +211,7 @@ static int do_fsync(unsigned int fd, int datasync) if (f.file) { ret = vfs_fsync(f.file, datasync); fdput(f); + inc_syscfs(current); } return ret; } diff --git a/fs/sysv/super.c b/fs/sysv/super.c index cc8e2ed155c8..e5383bff5dad 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -592,3 +592,4 @@ static void __exit exit_sysv_fs(void) module_init(init_sysv_fs) module_exit(exit_sysv_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 066e8344934d..8c11efca87fe 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -522,8 +522,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, if (unlikely(!inode)) return failed_creating(dentry); - /* Do not set bits for OTH */ - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_op = ops; inode->i_fop = &simple_dir_operations; inode->i_uid = d_inode(dentry->d_parent)->i_uid; diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..c57b46a352d8 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = { .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, - .max_namelen = UBIFS_MAX_NLEN, }; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 
6b45a037a047..a924974954f3 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1598,6 +1598,7 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, + .speculative = true, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index eb05038b7191..ad1d144d693a 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2475,6 +2475,7 @@ static void __exit ubifs_exit(void) module_exit(ubifs_exit); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_VERSION(__stringify(UBIFS_VERSION)); MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); MODULE_DESCRIPTION("UBIFS - UBI File System"); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c38066ce9ab0..b3c20d49b056 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -130,7 +130,7 @@ #define WORST_COMPR_FACTOR 2 #ifdef CONFIG_FS_ENCRYPTION -#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE +#define UBIFS_CIPHER_BLOCK_SIZE FSCRYPT_CONTENTS_ALIGNMENT #else #define UBIFS_CIPHER_BLOCK_SIZE 0 #endif diff --git a/fs/udf/super.c b/fs/udf/super.c index aa2f6093d3f6..2de83c93f5d0 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2546,5 +2546,6 @@ static unsigned int udf_count_free(struct super_block *sb) MODULE_AUTHOR("Ben Fennema"); MODULE_DESCRIPTION("Universal Disk Format Filesystem"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(init_udf_fs) module_exit(exit_udf_fs) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 00a01471ea05..5a6452c752cc 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1541,3 +1541,4 @@ static void __exit exit_ufs_fs(void) module_init(init_ufs_fs) module_exit(exit_ufs_fs) MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index b56e8e31d967..232861e8aad6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -877,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) vma = prev; else @@ -1436,7 +1437,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + anon_vma_name(vma)); if (prev) { vma = prev; goto next; @@ -1613,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; goto next; diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index 864c2fad23be..2bdfdfc826a5 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -163,6 +163,7 @@ static const struct vm_operations_struct vboxsf_file_vm_ops = { .close = vboxsf_vma_close, .fault = filemap_fault, .map_pages = filemap_map_pages, + .speculative = true, }; static int vboxsf_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 37dd3fe5b1e9..dd70aef05157 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -482,4 +482,5 @@ 
module_exit(vboxsf_fini); MODULE_DESCRIPTION("Oracle VM VirtualBox Module for Host File System Access"); MODULE_AUTHOR("Oracle Corporation"); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_ALIAS_FS("vboxsf"); diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index 24d1b54de807..54598cd80145 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -3,6 +3,7 @@ config FS_VERITY bool "FS Verity (read-only file-based authenticity protection)" select CRYPTO + select CRYPTO_HASH_INFO # SHA-256 is implied as it's intended to be the default hash algorithm. # To avoid bloat, other wanted algorithms must be selected explicitly. # Note that CRYPTO_SHA256 denotes the generic C implementation, but diff --git a/fs/verity/Makefile b/fs/verity/Makefile index 435559a4fa9e..4b8323450f90 100644 --- a/fs/verity/Makefile +++ b/fs/verity/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=ANDROID_GKI_VFS_EXPORT_ONLY + obj-$(CONFIG_FS_VERITY) += enable.o \ hash_algs.o \ init.o \ diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 60a4372aa4d7..d52872c808ff 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -202,7 +202,7 @@ static int enable_verity(struct file *filp, const struct fsverity_operations *vops = inode->i_sb->s_vop; struct merkle_tree_params params = { }; struct fsverity_descriptor *desc; - size_t desc_size = sizeof(*desc) + arg->sig_size; + size_t desc_size = struct_size(desc, signature, arg->sig_size); struct fsverity_info *vi; int err; @@ -281,7 +281,7 @@ static int enable_verity(struct file *filp, * from disk. This is simpler, and it serves as an extra check that the * metadata we're writing is valid before actually enabling verity. */ - vi = fsverity_create_info(inode, desc, desc_size); + vi = fsverity_create_info(inode, desc); if (IS_ERR(vi)) { err = PTR_ERR(vi); goto rollback; diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index a7920434bae5..41890ecf068c 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -14,7 +14,6 @@ #define pr_fmt(fmt) "fs-verity: " fmt -#include #include #include @@ -26,12 +25,6 @@ struct ahash_request; */ #define FS_VERITY_MAX_LEVELS 8 -/* - * Largest digest size among all hash algorithms supported by fs-verity. - * Currently assumed to be <= size of fsverity_descriptor::root_hash. - */ -#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE - /* A hash algorithm supported by fs-verity */ struct fsverity_hash_alg { struct crypto_ahash *tfm; /* hash tfm, allocated on demand */ @@ -39,6 +32,11 @@ struct fsverity_hash_alg { unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ mempool_t req_pool; /* mempool with a preallocated hash request */ + /* + * The HASH_ALGO_* constant for this algorithm. This is different from + * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. 
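One detail worth calling out in the fs/verity change above: "sizeof(*desc) + arg->sig_size" became "struct_size(desc, signature, arg->sig_size)". struct_size() saturates at SIZE_MAX on overflow, so an attacker-controlled sig_size makes the allocation fail cleanly instead of wrapping to a short buffer. Generic shape, with a hypothetical struct:

        #include <linux/overflow.h>
        #include <linux/slab.h>

        struct example_desc {
                __le32  sig_size;
                __u8    signature[];    /* flexible array member */
        };

        static struct example_desc *example_alloc(u32 sig_size)
        {
                struct example_desc *d;

                /* like sizeof(*d) + sig_size, but saturating on overflow */
                d = kzalloc(struct_size(d, signature, sig_size), GFP_KERNEL);
                return d;
        }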
+ */ + enum hash_algo algo_id; }; /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ @@ -122,16 +120,14 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const u8 *salt, size_t salt_size); struct fsverity_info *fsverity_create_info(const struct inode *inode, - struct fsverity_descriptor *desc, - size_t desc_size); + struct fsverity_descriptor *desc); void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); void fsverity_free_info(struct fsverity_info *vi); int fsverity_get_descriptor(struct inode *inode, - struct fsverity_descriptor **desc_ret, - size_t *desc_size_ret); + struct fsverity_descriptor **desc_ret); int __init fsverity_init_info_cache(void); void __init fsverity_exit_info_cache(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 71d0fccb6d4c..6f8170cf4ae7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { .name = "sha256", .digest_size = SHA256_DIGEST_SIZE, .block_size = SHA256_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA256, }, [FS_VERITY_HASH_ALG_SHA512] = { .name = "sha512", .digest_size = SHA512_DIGEST_SIZE, .block_size = SHA512_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA512, }, }; @@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void) */ BUG_ON(!is_power_of_2(alg->digest_size)); BUG_ON(!is_power_of_2(alg->block_size)); + + /* Verify that there is a valid mapping to HASH_ALGO_*. */ + BUG_ON(alg->algo_id == 0); + BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]); } } diff --git a/fs/verity/measure.c b/fs/verity/measure.c index f0d7b30c62db..5c79ea1b2468 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -57,3 +57,31 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg) return 0; } EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); + +/** + * fsverity_get_digest() - get a verity file's digest + * @inode: inode to get digest of + * @digest: (out) pointer to the digest + * @alg: (out) pointer to the hash algorithm enumeration + * + * Return the file hash algorithm and digest of an fsverity protected file. + * Assumption: before calling this, the file must have been opened. + * + * Return: 0 on success, -errno on failure + */ +int fsverity_get_digest(struct inode *inode, + u8 digest[FS_VERITY_MAX_DIGEST_SIZE], + enum hash_algo *alg) +{ + const struct fsverity_info *vi; + const struct fsverity_hash_alg *hash_alg; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + + hash_alg = vi->tree_params.hash_alg; + memcpy(digest, vi->file_digest, hash_alg->digest_size); + *alg = hash_alg->algo_id; + return 0; +} diff --git a/fs/verity/open.c b/fs/verity/open.c index 92df87f5fa38..81ff94442f7b 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -147,8 +147,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, - struct fsverity_descriptor *desc, - size_t desc_size) + struct fsverity_descriptor *desc) { struct fsverity_info *vi; int err; @@ -264,8 +263,7 @@ static bool validate_fsverity_descriptor(struct inode *inode, * the filesystem, and do basic validation of it. 
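The new algo_id field exists so that fsverity_get_digest() (added in fs/verity/measure.c above) can report a file digest in the kernel-wide HASH_ALGO_* terms that consumers such as integrity subsystems use. A caller sketch, using the lookup tables from <crypto/hash_info.h>; the function and pr_info() line are illustrative only, and FS_VERITY_MAX_DIGEST_SIZE is taken to be the public definition this series moves into <linux/fsverity.h>:

        #include <linux/fsverity.h>
        #include <crypto/hash_info.h>

        static int example_log_verity_digest(struct inode *inode)
        {
                u8 digest[FS_VERITY_MAX_DIGEST_SIZE];
                enum hash_algo alg;
                int err = fsverity_get_digest(inode, digest, &alg);

                if (err)
                        return err;     /* -ENODATA: not a verity file */
                pr_info("verity digest %s:%*phN\n",
                        hash_algo_name[alg], hash_digest_size[alg], digest);
                return 0;
        }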
*/ int fsverity_get_descriptor(struct inode *inode, - struct fsverity_descriptor **desc_ret, - size_t *desc_size_ret) + struct fsverity_descriptor **desc_ret) { int res; struct fsverity_descriptor *desc; @@ -297,7 +295,6 @@ int fsverity_get_descriptor(struct inode *inode, } *desc_ret = desc; - *desc_size_ret = res; return 0; } @@ -306,17 +303,16 @@ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); struct fsverity_descriptor *desc; - size_t desc_size; int err; if (vi) return 0; - err = fsverity_get_descriptor(inode, &desc, &desc_size); + err = fsverity_get_descriptor(inode, &desc); if (err) return err; - vi = fsverity_create_info(inode, desc, desc_size); + vi = fsverity_create_info(inode, desc); if (IS_ERR(vi)) { err = PTR_ERR(vi); goto out_free_desc; diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 7e2d0c7bdf0d..2aefc5565152 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -53,14 +53,14 @@ static int fsverity_read_merkle_tree(struct inode *inode, break; } - virt = kmap(page); + virt = kmap_local_page(page); if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { - kunmap(page); + kunmap_local(virt); put_page(page); err = -EFAULT; break; } - kunmap(page); + kunmap_local(virt); put_page(page); retval += bytes_to_copy; @@ -101,7 +101,7 @@ static int fsverity_read_descriptor(struct inode *inode, size_t desc_size; int res; - res = fsverity_get_descriptor(inode, &desc, &desc_size); + res = fsverity_get_descriptor(inode, &desc); if (res) return res; @@ -119,10 +119,9 @@ static int fsverity_read_signature(struct inode *inode, void __user *buf, u64 offset, int length) { struct fsverity_descriptor *desc; - size_t desc_size; int res; - res = fsverity_get_descriptor(inode, &desc, &desc_size); + res = fsverity_get_descriptor(inode, &desc); if (res) return res; diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 143a530a8008..e33f6c401373 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -40,11 +40,38 @@ static struct key *fsverity_keyring; int fsverity_verify_signature(const struct fsverity_info *vi, const u8 *signature, size_t sig_size) { - const struct inode *inode = vi->inode; - const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; + unsigned int digest_algorithm = + vi->tree_params.hash_alg - fsverity_hash_algs; + + return __fsverity_verify_signature(vi->inode, signature, sig_size, + vi->file_digest, digest_algorithm); +} + +/** + * __fsverity_verify_signature() - check a verity file's signature + * @inode: the file's inode + * @signature: the file's signature + * @sig_size: size of @signature. 
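The kmap()/kunmap() to kmap_local_page()/kunmap_local() conversion in read_metadata.c above follows the usual rules for local mappings: they are cheap, CPU-local, and must be released in reverse order of acquisition. The generic shape ("example_copy_from_page" is hypothetical):

        #include <linux/highmem.h>
        #include <linux/string.h>

        static void example_copy_from_page(struct page *page, void *dst,
                                           size_t offset, size_t len)
        {
                void *virt = kmap_local_page(page);

                memcpy(dst, virt + offset, len);
                kunmap_local(virt);     /* reverse order if nested */
        }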
Can be 0 if there is no signature + * @file_digest: the file's digest + * @digest_algorithm: the digest algorithm used + * + * Takes the file's digest and optional signature and verifies the signature + * against the digest and the fs-verity keyring if appropriate + * + * Return: 0 on success (signature valid or not required); -errno on failure + */ +int __fsverity_verify_signature(const struct inode *inode, const u8 *signature, + size_t sig_size, const u8 *file_digest, + unsigned int digest_algorithm) +{ struct fsverity_formatted_digest *d; + struct fsverity_hash_alg *hash_alg = fsverity_get_hash_alg(inode, + digest_algorithm); int err; + if (IS_ERR(hash_alg)) + return PTR_ERR(hash_alg); + if (sig_size == 0) { if (fsverity_require_signatures) { fsverity_err(inode, @@ -60,7 +87,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi, memcpy(d->magic, "FSVerity", 8); d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs); d->digest_size = cpu_to_le16(hash_alg->digest_size); - memcpy(d->digest, vi->file_digest, hash_alg->digest_size); + memcpy(d->digest, file_digest, hash_alg->digest_size); err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, signature, sig_size, fsverity_keyring, @@ -83,9 +110,10 @@ int fsverity_verify_signature(const struct fsverity_info *vi, } pr_debug("Valid signature for file digest %s:%*phN\n", - hash_alg->name, hash_alg->digest_size, vi->file_digest); + hash_alg->name, hash_alg->digest_size, file_digest); return 0; } +EXPORT_SYMBOL_GPL(__fsverity_verify_signature); #ifdef CONFIG_SYSCTL static struct ctl_table_header *fsverity_sysctl_header; diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 0adb970f4e73..da185bd12cd5 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -39,16 +39,6 @@ static void hash_at_level(const struct merkle_tree_params *params, (params->log_blocksize - params->log_arity); } -/* Extract a hash from a hash page */ -static void extract_hash(struct page *hpage, unsigned int hoffset, - unsigned int hsize, u8 *out) -{ - void *virt = kmap_atomic(hpage); - - memcpy(out, virt + hoffset, hsize); - kunmap_atomic(virt); -} - static inline int cmp_hashes(const struct fsverity_info *vi, const u8 *want_hash, const u8 *real_hash, pgoff_t index, int level) @@ -129,7 +119,7 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi, } if (PageChecked(hpage)) { - extract_hash(hpage, hoffset, hsize, _want_hash); + memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", @@ -158,7 +148,7 @@ descend: if (err) goto out; SetPageChecked(hpage); - extract_hash(hpage, hoffset, hsize, _want_hash); + memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); pr_debug("Verified hash page at level %d, now want %s:%*phN\n", @@ -210,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. Pages - * that fail verification are set to the Error state. Verification is skipped - * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * must be pagecache pages that are still locked and not yet uptodate. If a + * page fails verification, then bio->bi_status is set to an error status. 
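For reference, the buffer that __fsverity_verify_signature() hands to verify_pkcs7_signature() is the fixed fs-verity "formatted digest"; its layout, mirroring the memcpy() and cpu_to_le16() calls in the function body below, is the UAPI struct from include/uapi/linux/fsverity.h (reproduced here for reference):

        struct fsverity_formatted_digest {
                char    magic[8];               /* must be "FSVerity" */
                __le16  digest_algorithm;       /* FS_VERITY_HASH_ALG_* */
                __le16  digest_size;
                __u8    digest[];
        };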
* * This is a helper function for use by the ->readpages() method of filesystems * that issue bios to read data directly into the page cache. Filesystems that @@ -254,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio) unsigned long level0_ra_pages = min(max_ra_pages, params->level0_blocks - level0_index); - if (!PageError(page) && - !verify_page(inode, vi, req, page, level0_ra_pages)) - SetPageError(page); + if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } fsverity_free_hash_request(params->hash_alg, req); diff --git a/fs/xattr.c b/fs/xattr.c index 4c82f271f4aa..c17327f2239c 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -314,7 +314,7 @@ retry_deleg: return error; } -EXPORT_SYMBOL_GPL(vfs_setxattr); +EXPORT_SYMBOL_NS_GPL(vfs_setxattr, ANDROID_GKI_VFS_EXPORT_ONLY); static ssize_t xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, @@ -433,7 +433,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } -EXPORT_SYMBOL_GPL(vfs_getxattr); +EXPORT_SYMBOL_NS_GPL(vfs_getxattr, ANDROID_GKI_VFS_EXPORT_ONLY); ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) @@ -453,7 +453,7 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) } return error; } -EXPORT_SYMBOL_GPL(vfs_listxattr); +EXPORT_SYMBOL_NS_GPL(vfs_listxattr, ANDROID_GKI_VFS_EXPORT_ONLY); int __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 240eb932c014..2f12f8d62777 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1410,7 +1410,9 @@ xfs_filemap_map_pages( struct inode *inode = file_inode(vmf->vma->vm_file); vm_fault_t ret; + if (!xfs_ilock_nowait(XFS_I(inode), XFS_MMAPLOCK_SHARED)) + return (vmf->flags & FAULT_FLAG_SPECULATIVE) ?
+ VM_FAULT_RETRY : 0; -xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = filemap_map_pages(vmf, start_pgoff, end_pgoff); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); return ret; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index df1d6be61bfa..d0f149709f8f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2361,3 +2361,4 @@ module_exit(exit_xfs_fs); MODULE_AUTHOR("Silicon Graphics, Inc."); MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index d3e182c1a128..24e21af56444 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1897,5 +1897,6 @@ MODULE_AUTHOR("Damien Le Moal"); MODULE_DESCRIPTION("Zone file system for zoned block devices"); MODULE_LICENSE("GPL"); MODULE_ALIAS_FS("zonefs"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); module_init(zonefs_init); module_exit(zonefs_exit); diff --git a/include/OWNERS b/include/OWNERS new file mode 100644 index 000000000000..4b815c212f9d --- /dev/null +++ b/include/OWNERS @@ -0,0 +1 @@ +per-file net/**=file:/net/OWNERS diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 98954dda5734..7b15be070a04 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -61,6 +61,35 @@ #define __io_par(v) __io_ar(v) #endif +#if IS_ENABLED(CONFIG_TRACE_MMIO_ACCESS) && !(defined(__DISABLE_TRACE_MMIO__)) +#include <linux/tracepoint-defs.h> + +DECLARE_TRACEPOINT(rwmmio_write); +DECLARE_TRACEPOINT(rwmmio_post_write); +DECLARE_TRACEPOINT(rwmmio_read); +DECLARE_TRACEPOINT(rwmmio_post_read); + +void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, + unsigned long caller_addr); +void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, + unsigned long caller_addr); +void log_read_mmio(u8 width, const volatile void __iomem *addr, + unsigned long caller_addr); +void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, + unsigned long caller_addr); + +#else + +static inline void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, + unsigned long caller_addr) {} +static inline void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, + unsigned long caller_addr) {} +static inline void log_read_mmio(u8 width, const volatile void __iomem *addr, + unsigned long caller_addr) {} +static inline void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, + unsigned long caller_addr) {} + +#endif /* CONFIG_TRACE_MMIO_ACCESS */ /* * __raw_{read,write}{b,w,l,q}() access memory in native endianness.
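The log_*_mmio() hooks above bracket every generic accessor: the "pre" hook fires before the barrier-plus-access pair and the "post" hook after it, so a tracer attached to the rwmmio_* tracepoints can measure per-access latency and spot accesses that never complete. No driver changes are needed; once CONFIG_TRACE_MMIO_ACCESS is enabled, an ordinary access generates the events ("example_read_status" is hypothetical):

        static u32 example_read_status(void __iomem *base)
        {
                /* emits rwmmio_read, then rwmmio_post_read with the value */
                return readl(base + 0x04);
        }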
@@ -149,9 +178,11 @@ static inline u8 readb(const volatile void __iomem *addr) { u8 val; + log_read_mmio(8, addr, _THIS_IP_); __io_br(); val = __raw_readb(addr); __io_ar(val); + log_post_read_mmio(val, 8, addr, _THIS_IP_); return val; } #endif @@ -162,9 +193,11 @@ static inline u16 readw(const volatile void __iomem *addr) { u16 val; + log_read_mmio(16, addr, _THIS_IP_); __io_br(); val = __le16_to_cpu((__le16 __force)__raw_readw(addr)); __io_ar(val); + log_post_read_mmio(val, 16, addr, _THIS_IP_); return val; } #endif @@ -175,9 +208,11 @@ static inline u32 readl(const volatile void __iomem *addr) { u32 val; + log_read_mmio(32, addr, _THIS_IP_); __io_br(); val = __le32_to_cpu((__le32 __force)__raw_readl(addr)); __io_ar(val); + log_post_read_mmio(val, 32, addr, _THIS_IP_); return val; } #endif @@ -189,9 +224,11 @@ static inline u64 readq(const volatile void __iomem *addr) { u64 val; + log_read_mmio(64, addr, _THIS_IP_); __io_br(); val = __le64_to_cpu(__raw_readq(addr)); __io_ar(val); + log_post_read_mmio(val, 64, addr, _THIS_IP_); return val; } #endif @@ -201,9 +238,11 @@ static inline u64 readq(const volatile void __iomem *addr) #define writeb writeb static inline void writeb(u8 value, volatile void __iomem *addr) { + log_write_mmio(value, 8, addr, _THIS_IP_); __io_bw(); __raw_writeb(value, addr); __io_aw(); + log_post_write_mmio(value, 8, addr, _THIS_IP_); } #endif @@ -211,9 +250,11 @@ static inline void writeb(u8 value, volatile void __iomem *addr) #define writew writew static inline void writew(u16 value, volatile void __iomem *addr) { + log_write_mmio(value, 16, addr, _THIS_IP_); __io_bw(); __raw_writew((u16 __force)cpu_to_le16(value), addr); __io_aw(); + log_post_write_mmio(value, 16, addr, _THIS_IP_); } #endif @@ -221,9 +262,11 @@ static inline void writew(u16 value, volatile void __iomem *addr) #define writel writel static inline void writel(u32 value, volatile void __iomem *addr) { + log_write_mmio(value, 32, addr, _THIS_IP_); __io_bw(); __raw_writel((u32 __force)__cpu_to_le32(value), addr); __io_aw(); + log_post_write_mmio(value, 32, addr, _THIS_IP_); } #endif @@ -232,9 +275,11 @@ static inline void writel(u32 value, volatile void __iomem *addr) #define writeq writeq static inline void writeq(u64 value, volatile void __iomem *addr) { + log_write_mmio(value, 64, addr, _THIS_IP_); __io_bw(); __raw_writeq(__cpu_to_le64(value), addr); __io_aw(); + log_post_write_mmio(value, 64, addr, _THIS_IP_); } #endif #endif /* CONFIG_64BIT */ @@ -248,7 +293,12 @@ static inline void writeq(u64 value, volatile void __iomem *addr) #define readb_relaxed readb_relaxed static inline u8 readb_relaxed(const volatile void __iomem *addr) { - return __raw_readb(addr); + u8 val; + + log_read_mmio(8, addr, _THIS_IP_); + val = __raw_readb(addr); + log_post_read_mmio(val, 8, addr, _THIS_IP_); + return val; } #endif @@ -256,7 +306,12 @@ static inline u8 readb_relaxed(const volatile void __iomem *addr) #define readw_relaxed readw_relaxed static inline u16 readw_relaxed(const volatile void __iomem *addr) { - return __le16_to_cpu(__raw_readw(addr)); + u16 val; + + log_read_mmio(16, addr, _THIS_IP_); + val = __le16_to_cpu(__raw_readw(addr)); + log_post_read_mmio(val, 16, addr, _THIS_IP_); + return val; } #endif @@ -264,7 +319,12 @@ static inline u16 readw_relaxed(const volatile void __iomem *addr) #define readl_relaxed readl_relaxed static inline u32 readl_relaxed(const volatile void __iomem *addr) { - return __le32_to_cpu(__raw_readl(addr)); + u32 val; + + log_read_mmio(32, addr, _THIS_IP_); + val = 
__le32_to_cpu(__raw_readl(addr)); + log_post_read_mmio(val, 32, addr, _THIS_IP_); + return val; } #endif @@ -272,7 +332,12 @@ static inline u32 readl_relaxed(const volatile void __iomem *addr) #define readq_relaxed readq_relaxed static inline u64 readq_relaxed(const volatile void __iomem *addr) { - return __le64_to_cpu(__raw_readq(addr)); + u64 val; + + log_read_mmio(64, addr, _THIS_IP_); + val =__le64_to_cpu(__raw_readq(addr)); + log_post_read_mmio(val, 64, addr, _THIS_IP_); + return val; } #endif @@ -280,7 +345,9 @@ static inline u64 readq_relaxed(const volatile void __iomem *addr) #define writeb_relaxed writeb_relaxed static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) { + log_write_mmio(value, 8, addr, _THIS_IP_); __raw_writeb(value, addr); + log_post_write_mmio(value, 8, addr, _THIS_IP_); } #endif @@ -288,7 +355,9 @@ static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) #define writew_relaxed writew_relaxed static inline void writew_relaxed(u16 value, volatile void __iomem *addr) { + log_write_mmio(value, 16, addr, _THIS_IP_); __raw_writew(cpu_to_le16(value), addr); + log_post_write_mmio(value, 16, addr, _THIS_IP_); } #endif @@ -296,7 +365,9 @@ static inline void writew_relaxed(u16 value, volatile void __iomem *addr) #define writel_relaxed writel_relaxed static inline void writel_relaxed(u32 value, volatile void __iomem *addr) { + log_write_mmio(value, 32, addr, _THIS_IP_); __raw_writel(__cpu_to_le32(value), addr); + log_post_write_mmio(value, 32, addr, _THIS_IP_); } #endif @@ -304,7 +375,9 @@ static inline void writel_relaxed(u32 value, volatile void __iomem *addr) #define writeq_relaxed writeq_relaxed static inline void writeq_relaxed(u64 value, volatile void __iomem *addr) { + log_write_mmio(value, 64, addr, _THIS_IP_); __raw_writeq(__cpu_to_le64(value), addr); + log_post_write_mmio(value, 64, addr, _THIS_IP_); } #endif diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h new file mode 100644 index 000000000000..1d630f371f77 --- /dev/null +++ b/include/crypto/polyval.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common values for the Polyval hash algorithm + * + * Copyright 2021 Google LLC + */ + +#ifndef _CRYPTO_POLYVAL_H +#define _CRYPTO_POLYVAL_H + +#include +#include + +#define POLYVAL_BLOCK_SIZE 16 +#define POLYVAL_DIGEST_SIZE 16 + +void polyval_mul_non4k(u8 *op1, const u8 *op2); + +void polyval_update_non4k(const u8 *key, const u8 *in, + size_t nblocks, u8 *accumulator); + +#endif diff --git a/include/drm/drm_mode_object.h b/include/drm/drm_mode_object.h index 912f1e415685..08d7a7f0188f 100644 --- a/include/drm/drm_mode_object.h +++ b/include/drm/drm_mode_object.h @@ -60,7 +60,7 @@ struct drm_mode_object { void (*free_cb)(struct kref *kref); }; -#define DRM_OBJECT_MAX_PROPERTY 24 +#define DRM_OBJECT_MAX_PROPERTY 64 /** * struct drm_object_properties - property tracking for &drm_mode_object */ diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 51c19381108c..cd6d8f260eab 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -76,8 +76,6 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -bool kvm_timer_is_pending(struct kvm_vcpu *vcpu); - u64 kvm_phys_timer_read(void); void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); diff --git a/include/kvm/arm_hypercalls.h 
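The new <crypto/polyval.h> above exports the table-free POLYVAL helpers (the _non4k suffix appears to contrast with 4 KiB lookup-table implementations, in the style of gf128mul) used by the HCTR2 mode of operation. Each block update computes accumulator = (accumulator XOR block) * key in POLYVAL's GF(2^128) convention; a one-block sketch:

        #include <crypto/polyval.h>

        static void example_polyval_block(const u8 key[POLYVAL_BLOCK_SIZE],
                                          const u8 block[POLYVAL_BLOCK_SIZE])
        {
                u8 acc[POLYVAL_DIGEST_SIZE] = {};

                /* acc = (acc ^ block) * key */
                polyval_update_non4k(key, block, 1, acc);
        }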
b/include/kvm/arm_hypercalls.h index 0e2509d27910..1188f116cf4e 100644 --- a/include/kvm/arm_hypercalls.h +++ b/include/kvm/arm_hypercalls.h @@ -40,4 +40,12 @@ static inline void smccc_set_retval(struct kvm_vcpu *vcpu, vcpu_set_reg(vcpu, 3, a3); } +struct kvm_one_reg; + +void kvm_arm_init_hypercalls(struct kvm *kvm); +int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu); +int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); +int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); +int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); + #endif diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 90f21898aad8..c0b868ce6a8f 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -13,13 +13,6 @@ #define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) #define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1) -DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - -static __always_inline bool kvm_arm_support_pmu_v3(void) -{ - return static_branch_likely(&kvm_arm_pmu_available); -} - #ifdef CONFIG_HW_PERF_EVENTS struct kvm_pmc { @@ -27,15 +20,33 @@ struct kvm_pmc { struct perf_event *perf_event; }; +struct kvm_pmu_events { + u32 events_host; + u32 events_guest; +}; + struct kvm_pmu { - int irq_num; + struct irq_work overflow_work; + struct kvm_pmu_events events; struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS]; DECLARE_BITMAP(chained, ARMV8_PMU_MAX_COUNTER_PAIRS); + int irq_num; bool created; bool irq_level; - struct irq_work overflow_work; }; +struct arm_pmu_entry { + struct list_head entry; + struct arm_pmu *arm_pmu; +}; + +DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + +static __always_inline bool kvm_arm_support_pmu_v3(void) +{ + return static_branch_likely(&kvm_arm_pmu_available); +} + #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); @@ -61,10 +72,34 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); + +struct kvm_pmu_events *kvm_get_pmu_events(void); +void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); +void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); + +#define kvm_vcpu_has_pmu(vcpu) \ + (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) + +/* + * Updates the vcpu's view of the pmu events for this cpu. + * Must be called before every vcpu run after disabling interrupts, to ensure + * that an interrupt cannot fire and update the structure. 
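The kvm_pmu_update_vcpu_events() comment above pins down its calling convention; condensed, the run-loop prologue it implies looks like this (a hypothetical sketch, not the actual arm64 run loop):

        static void example_vcpu_run_prologue(struct kvm_vcpu *vcpu)
        {
                local_irq_disable();
                /*
                 * Snapshot the host/guest PMU event split; IRQs are off, so
                 * the snapshot cannot change underneath us before guest entry.
                 */
                kvm_pmu_update_vcpu_events(vcpu);
                /* ... enter the guest ... */
                local_irq_enable();
        }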
+ */ +#define kvm_pmu_update_vcpu_events(vcpu) \ + do { \ + if (!has_vhe() && kvm_vcpu_has_pmu(vcpu)) \ + vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ + } while (0) + #else struct kvm_pmu { }; +static inline bool kvm_arm_support_pmu_v3(void) +{ + return false; +} + #define kvm_arm_pmu_irq_initialized(v) (false) static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) @@ -117,6 +152,11 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return 0; } +#define kvm_vcpu_has_pmu(vcpu) ({ false; }) +static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} +static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} +static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} + #endif #endif diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h index 5b58bd2fe088..817093052396 100644 --- a/include/kvm/arm_psci.h +++ b/include/kvm/arm_psci.h @@ -13,14 +13,11 @@ #define KVM_ARM_PSCI_0_1 PSCI_VERSION(0, 1) #define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2) #define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0) +#define KVM_ARM_PSCI_1_1 PSCI_VERSION(1, 1) -#define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_0 +#define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_1 -/* - * We need the KVM pointer independently from the vcpu as we can call - * this from HYP, and need to apply kern_hyp_va on it... - */ -static inline int kvm_psci_version(struct kvm_vcpu *vcpu, struct kvm *kvm) +static inline int kvm_psci_version(struct kvm_vcpu *vcpu) { /* * Our PSCI implementation stays the same across versions from @@ -39,14 +36,36 @@ static inline int kvm_psci_version(struct kvm_vcpu *vcpu, struct kvm *kvm) return KVM_ARM_PSCI_0_1; } +/* Narrow the PSCI register arguments (r1 to r3) to 32 bits. */ +static inline void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) +{ + int i; + + /* + * Zero the input registers' upper 32 bits. They will be fully + * zeroed on exit, so we're fine changing them in place. 
+ */ + for (i = 1; i < 4; i++) + vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); +} + +static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, + unsigned long affinity) +{ + return !(affinity & ~MPIDR_HWID_BITMASK); +} + + +#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) + +static inline unsigned long psci_affinity_mask(unsigned long affinity_level) +{ + if (affinity_level <= 3) + return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); + + return 0; +} int kvm_psci_call(struct kvm_vcpu *vcpu); -struct kvm_one_reg; - -int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu); -int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); -int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); -int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); - #endif /* __KVM_ARM_PSCI_H__ */ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index e602d848fc1a..ffeffbc43ba3 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -5,9 +5,11 @@ #ifndef __KVM_ARM_VGIC_H #define __KVM_ARM_VGIC_H -#include +#include #include #include +#include +#include #include #include #include @@ -229,6 +231,9 @@ struct vgic_dist { /* Implementation revision as reported in the GICD_IIDR */ u32 implementation_rev; +#define KVM_VGIC_IMP_REV_2 2 /* GICv2 restorable groups */ +#define KVM_VGIC_IMP_REV_3 3 /* GICv3 GICR_CTLR.{IW,CES,RWP} */ +#define KVM_VGIC_IMP_REV_LATEST KVM_VGIC_IMP_REV_3 /* Userspace can write to GICv2 IGROUPR */ bool v2_groups_user_writable; @@ -342,11 +347,12 @@ struct vgic_cpu { struct vgic_io_device rd_iodev; struct vgic_redist_region *rdreg; u32 rdreg_index; + atomic_t syncr_busy; /* Contains the attributes and gpa of the LPI pending tables. 
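A worked example for psci_affinity_mask() above: MPIDR_LEVEL_BITS is 8, so AFFINITY_MASK(1) is ~0xffUL and a level-1 AFFINITY_INFO query compares everything above Aff0. Hypothetical helper:

        static bool example_affinity_match(unsigned long mpidr,
                                           unsigned long target)
        {
                unsigned long mask = psci_affinity_mask(1); /* keeps Aff1..Aff3 */

                /* e.g. MPIDR 0x010203 matches target 0x010200 at level 1 */
                return (mpidr & mask) == (target & mask);
        }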
*/ u64 pendbaser; - - bool lpis_enabled; + /* GICR_CTLR.{ENABLE_LPIS,RWP} */ + atomic_t ctlr; /* Cache guest priority bits */ u32 num_pri_bits; @@ -358,7 +364,7 @@ struct vgic_cpu { extern struct static_key_false vgic_v2_cpuif_trap; extern struct static_key_false vgic_v3_cpuif_trap; -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); +int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); int kvm_vgic_create(struct kvm *kvm, u32 type); @@ -378,8 +384,7 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); -void kvm_vgic_put(struct kvm_vcpu *vcpu); -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu); +void kvm_vgic_put(struct kvm_vcpu *vcpu, bool blocking); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) diff --git a/include/linux/OWNERS b/include/linux/OWNERS new file mode 100644 index 000000000000..68b6dedea64f --- /dev/null +++ b/include/linux/OWNERS @@ -0,0 +1,4 @@ +per-file bio.h=file:/block/OWNERS +per-file blk*.h=file:/block/OWNERS +per-file f2fs**=file:/fs/f2fs/OWNERS +per-file net**=file:/net/OWNERS diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index c68d87b87283..5970a430e2ae 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -17,6 +17,7 @@ #include #include #include +#include #define AMBA_NR_IRQS 9 #define AMBA_CID 0xb105f00d @@ -71,6 +72,8 @@ struct amba_device { struct amba_cs_uci_id uci; unsigned int irq[AMBA_NR_IRQS]; char *driver_override; + + ANDROID_KABI_RESERVE(1); }; struct amba_driver { @@ -79,6 +82,8 @@ struct amba_driver { void (*remove)(struct amba_device *); void (*shutdown)(struct amba_device *); const struct amba_id *id_table; + + ANDROID_KABI_RESERVE(1); }; /* diff --git a/include/linux/android_debug_symbols.h b/include/linux/android_debug_symbols.h new file mode 100644 index 000000000000..3750d361ce46 --- /dev/null +++ b/include/linux/android_debug_symbols.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. 
+ */ + +#ifndef _ANDROID_DEBUG_SYMBOLS_H +#define _ANDROID_DEBUG_SYMBOLS_H + +enum android_debug_symbol { + ADS_SDATA = 0, + ADS_BSS_END, + ADS_PER_CPU_START, + ADS_PER_CPU_END, + ADS_START_RO_AFTER_INIT, + ADS_END_RO_AFTER_INIT, + ADS_LINUX_BANNER, +#ifdef CONFIG_CMA + ADS_TOTAL_CMA, +#endif + ADS_SLAB_CACHES, + ADS_SLAB_MUTEX, + ADS_MIN_LOW_PFN, + ADS_MAX_PFN, + ADS_VMALLOC_NR_PAGES, + ADS_PCPU_NR_PAGES, +#ifdef CONFIG_PAGE_OWNER + ADS_PAGE_OWNER_ENABLED, +#endif +#ifdef CONFIG_SLUB_DEBUG + ADS_SLUB_DEBUG, +#endif +#ifdef CONFIG_SWAP + ADS_NR_SWAP_PAGES, +#endif +#ifdef CONFIG_MMU + ADS_MMAP_MIN_ADDR, +#endif + ADS_STACK_GUARD_GAP, +#ifdef CONFIG_SYSCTL + ADS_SYSCTL_LEGACY_VA_LAYOUT, +#endif + ADS_SHOW_MEM, + ADS_END +}; + +enum android_debug_per_cpu_symbol { + ADS_IRQ_STACK_PTR = 0, + ADS_DEBUG_PER_CPU_END +}; + +#ifdef CONFIG_ANDROID_DEBUG_SYMBOLS + +void *android_debug_symbol(enum android_debug_symbol symbol); +void *android_debug_per_cpu_symbol(enum android_debug_per_cpu_symbol symbol); + +void android_debug_for_each_module(int (*fn)(const char *mod_name, void *mod_addr, void *data), + void *data); + +#else /* !CONFIG_ANDROID_DEBUG_SYMBOLS */ + +static inline void *android_debug_symbol(enum android_debug_symbol symbol) +{ + return NULL; +} +static inline void *android_debug_per_cpu_symbol(enum android_debug_per_cpu_symbol symbol) +{ + return NULL; +} + +static inline void android_debug_for_each_module(int (*fn)(const char *mod_name, void *mod_addr, + void *data), void *data) {} +#endif /* CONFIG_ANDROID_DEBUG_SYMBOLS */ + +#endif /* _ANDROID_DEBUG_SYMBOLS_H */ diff --git a/include/linux/android_kabi.h b/include/linux/android_kabi.h new file mode 100644 index 000000000000..f6dd7f00b386 --- /dev/null +++ b/include/linux/android_kabi.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * android_kabi.h - Android kernel abi abstraction header + * + * Copyright (C) 2020 Google, Inc. + * + * Heavily influenced by rh_kabi.h which came from the RHEL/CENTOS kernel and + * was: + * Copyright (c) 2014 Don Zickus + * Copyright (c) 2015-2018 Jiri Benc + * Copyright (c) 2015 Sabrina Dubroca, Hannes Frederic Sowa + * Copyright (c) 2016-2018 Prarit Bhargava + * Copyright (c) 2017 Paolo Abeni, Larry Woodman + * + * These macros are to be used to try to help alleviate future kernel abi + * changes that will occur as LTS and other kernel patches are merged into the + * tree during a period in which the kernel abi is wishing to not be disturbed. + * + * There are two times these macros should be used: + * - Before the kernel abi is "frozen" + * Padding can be added to various kernel structures that have in the past + * been known to change over time. That will give "room" in the structure + * that can then be used when fields are added so that the structure size + * will not change. + * + * - After the kernel abi is "frozen" + * If a structure's field is changed to a type that is identical in size to + * the previous type, it can be changed with a union macro + * If a field is added to a structure, the padding fields can be used to add + * the new field in a "safe" way. 
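Sketch of a consumer of the android_debug_symbols API above, e.g. a hypothetical vendor debug module. The stubs return NULL when CONFIG_ANDROID_DEBUG_SYMBOLS is off, so callers must NULL-check; the assumption here is that ADS_LINUX_BANNER resolves to the address of the linux_banner string, as its name suggests:

        #include <linux/android_debug_symbols.h>
        #include <linux/printk.h>

        static void example_dump_banner(void)
        {
                const char *banner = android_debug_symbol(ADS_LINUX_BANNER);

                if (banner)
                        pr_info("%s", banner);
        }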
+ */ +#ifndef _ANDROID_KABI_H +#define _ANDROID_KABI_H + +#include + +/* + * Worker macros, don't use these, use the ones without a leading '_' + */ + +#define __ANDROID_KABI_CHECK_SIZE_ALIGN(_orig, _new) \ + union { \ + _Static_assert(sizeof(struct{_new;}) <= sizeof(struct{_orig;}), \ + __FILE__ ":" __stringify(__LINE__) ": " \ + __stringify(_new) \ + " is larger than " \ + __stringify(_orig) ); \ + _Static_assert(__alignof__(struct{_new;}) <= __alignof__(struct{_orig;}), \ + __FILE__ ":" __stringify(__LINE__) ": " \ + __stringify(_orig) \ + " is not aligned the same as " \ + __stringify(_new) ); \ + } + +#ifdef __GENKSYMS__ + +#define _ANDROID_KABI_REPLACE(_orig, _new) _orig + +#else + +#define _ANDROID_KABI_REPLACE(_orig, _new) \ + union { \ + _new; \ + struct { \ + _orig; \ + }; \ + __ANDROID_KABI_CHECK_SIZE_ALIGN(_orig, _new); \ + } + +#endif /* __GENKSYMS__ */ + +#define _ANDROID_KABI_RESERVE(n) u64 android_kabi_reserved##n + + +/* + * Macros to use _before_ the ABI is frozen + */ + +/* + * ANDROID_KABI_RESERVE + * Reserve some "padding" in a structure for potential future use. + * This normally placed at the end of a structure. + * number: the "number" of the padding variable in the structure. Start with + * 1 and go up. + */ +#ifdef CONFIG_ANDROID_KABI_RESERVE +#define ANDROID_KABI_RESERVE(number) _ANDROID_KABI_RESERVE(number) +#else +#define ANDROID_KABI_RESERVE(number) +#endif + + +/* + * Macros to use _after_ the ABI is frozen + */ + +/* + * ANDROID_KABI_USE(number, _new) + * Use a previous padding entry that was defined with ANDROID_KABI_RESERVE + * number: the previous "number" of the padding variable + * _new: the variable to use now instead of the padding variable + */ +#define ANDROID_KABI_USE(number, _new) \ + _ANDROID_KABI_REPLACE(_ANDROID_KABI_RESERVE(number), _new) + +/* + * ANDROID_KABI_USE2(number, _new1, _new2) + * Use a previous padding entry that was defined with ANDROID_KABI_RESERVE for + * two new variables that fit into 64 bits. This is good for when you do not + * want to "burn" a 64bit padding variable for a smaller variable size if not + * needed. + */ +#define ANDROID_KABI_USE2(number, _new1, _new2) \ + _ANDROID_KABI_REPLACE(_ANDROID_KABI_RESERVE(number), struct{ _new1; _new2; }) + + +#endif /* _ANDROID_KABI_H */ diff --git a/include/linux/android_vendor.h b/include/linux/android_vendor.h new file mode 100644 index 000000000000..af3014ccc82e --- /dev/null +++ b/include/linux/android_vendor.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * android_vendor.h - Android vendor data + * + * Copyright 2020 Google LLC + * + * These macros are to be used to reserve space in kernel data structures + * for use by vendor modules. + * + * These macros should be used before the kernel abi is "frozen". + * Fields can be added to various kernel structures that need space + * for functionality implemented in vendor modules. The use of + * these fields is vendor specific. + */ +#ifndef _ANDROID_VENDOR_H +#define _ANDROID_VENDOR_H + +/* + * ANDROID_VENDOR_DATA + * Reserve some "padding" in a structure for potential future use. + * This normally placed at the end of a structure. + * number: the "number" of the padding variable in the structure. Start with + * 1 and go up. 
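A worked example of the lifecycle described above, with a hypothetical struct shown twice for illustration. Before the freeze, padding is reserved; afterwards, a new member consumes a slot via ANDROID_KABI_USE() without changing the structure's size or member offsets, and without changing the symbol CRC thanks to the __GENKSYMS__ branch:

        /* before the KABI freeze */
        struct example_ops {
                int (*open)(void);
                ANDROID_KABI_RESERVE(1);
                ANDROID_KABI_RESERVE(2);
        };

        /* after the freeze: add a member without growing the struct */
        struct example_ops {
                int (*open)(void);
                ANDROID_KABI_USE(1, int (*flush)(void));
                ANDROID_KABI_RESERVE(2);
        };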
+ * + * ANDROID_VENDOR_DATA_ARRAY + * Same as ANDROID_VENDOR_DATA but allocates an array of u64 with + * the specified size + */ +#ifdef CONFIG_ANDROID_VENDOR_OEM_DATA +#define ANDROID_VENDOR_DATA(n) u64 android_vendor_data##n +#define ANDROID_VENDOR_DATA_ARRAY(n, s) u64 android_vendor_data##n[s] + +#define ANDROID_OEM_DATA(n) u64 android_oem_data##n +#define ANDROID_OEM_DATA_ARRAY(n, s) u64 android_oem_data##n[s] + +#define android_init_vendor_data(p, n) \ + memset(&p->android_vendor_data##n, 0, sizeof(p->android_vendor_data##n)) +#define android_init_oem_data(p, n) \ + memset(&p->android_oem_data##n, 0, sizeof(p->android_oem_data##n)) +#else +#define ANDROID_VENDOR_DATA(n) +#define ANDROID_VENDOR_DATA_ARRAY(n, s) +#define ANDROID_OEM_DATA(n) +#define ANDROID_OEM_DATA_ARRAY(n, s) + +#define android_init_vendor_data(p, n) +#define android_init_oem_data(p, n) +#endif + +#endif /* _ANDROID_VENDOR_H */ diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index f180240dc95f..c5ce6c5798d9 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -7,6 +7,7 @@ #include #include +#include void topology_normalize_cpu_scale(void); int topology_update_cpu_topology(void); @@ -67,6 +68,8 @@ struct cpu_topology { cpumask_t thread_sibling; cpumask_t core_sibling; cpumask_t llc_sibling; + + cpumask_t android_vendor_data1; }; #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY @@ -85,5 +88,6 @@ void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); #endif +extern bool topology_update_done; #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 220c8c60e021..b23906d0652f 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -112,6 +112,14 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 #define ARM_SMCCC_KVM_FUNC_PTP 1 +#define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 +#define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 +#define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO 5 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL 6 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP 7 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP 8 +#define ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH 9 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -134,10 +142,58 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_PTP) +#define ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_SHARE) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID \ + 
ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP) + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 505c679b6a9b..f0cb5b72b87b 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -11,6 +11,97 @@ #include #include +#define FFA_SMC(calling_convention, func_num) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, (calling_convention), \ + ARM_SMCCC_OWNER_STANDARD, (func_num)) + +#define FFA_SMC_32(func_num) FFA_SMC(ARM_SMCCC_SMC_32, (func_num)) +#define FFA_SMC_64(func_num) FFA_SMC(ARM_SMCCC_SMC_64, (func_num)) + +#define FFA_ERROR FFA_SMC_32(0x60) +#define FFA_SUCCESS FFA_SMC_32(0x61) +#define FFA_INTERRUPT FFA_SMC_32(0x62) +#define FFA_VERSION FFA_SMC_32(0x63) +#define FFA_FEATURES FFA_SMC_32(0x64) +#define FFA_RX_RELEASE FFA_SMC_32(0x65) +#define FFA_RXTX_MAP FFA_SMC_32(0x66) +#define FFA_FN64_RXTX_MAP FFA_SMC_64(0x66) +#define FFA_RXTX_UNMAP FFA_SMC_32(0x67) +#define FFA_PARTITION_INFO_GET FFA_SMC_32(0x68) +#define FFA_ID_GET FFA_SMC_32(0x69) +#define FFA_MSG_POLL FFA_SMC_32(0x6A) +#define FFA_MSG_WAIT FFA_SMC_32(0x6B) +#define FFA_YIELD FFA_SMC_32(0x6C) +#define FFA_RUN FFA_SMC_32(0x6D) +#define FFA_MSG_SEND FFA_SMC_32(0x6E) +#define FFA_MSG_SEND_DIRECT_REQ FFA_SMC_32(0x6F) +#define FFA_FN64_MSG_SEND_DIRECT_REQ FFA_SMC_64(0x6F) +#define FFA_MSG_SEND_DIRECT_RESP FFA_SMC_32(0x70) +#define FFA_FN64_MSG_SEND_DIRECT_RESP FFA_SMC_64(0x70) +#define FFA_MEM_DONATE FFA_SMC_32(0x71) +#define FFA_FN64_MEM_DONATE FFA_SMC_64(0x71) +#define FFA_MEM_LEND FFA_SMC_32(0x72) +#define FFA_FN64_MEM_LEND FFA_SMC_64(0x72) +#define FFA_MEM_SHARE FFA_SMC_32(0x73) +#define FFA_FN64_MEM_SHARE FFA_SMC_64(0x73) +#define FFA_MEM_RETRIEVE_REQ FFA_SMC_32(0x74) +#define FFA_FN64_MEM_RETRIEVE_REQ FFA_SMC_64(0x74) +#define FFA_MEM_RETRIEVE_RESP FFA_SMC_32(0x75) +#define FFA_MEM_RELINQUISH FFA_SMC_32(0x76) +#define FFA_MEM_RECLAIM FFA_SMC_32(0x77) +#define FFA_MEM_OP_PAUSE FFA_SMC_32(0x78) +#define FFA_MEM_OP_RESUME FFA_SMC_32(0x79) +#define FFA_MEM_FRAG_RX FFA_SMC_32(0x7A) +#define FFA_MEM_FRAG_TX FFA_SMC_32(0x7B) +#define FFA_NORMAL_WORLD_RESUME FFA_SMC_32(0x7C) + +/* + * For some calls it is necessary to use SMC64 to pass or return 64-bit values. + * For such calls FFA_FN_NATIVE(name) will choose the appropriate + * (native-width) function ID. + */ +#ifdef CONFIG_64BIT +#define FFA_FN_NATIVE(name) FFA_FN64_##name +#else +#define FFA_FN_NATIVE(name) FFA_##name +#endif + +/* FFA error codes. 
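The new vendor-hyp function IDs above are invoked with the standard SMCCC 1.1 helper; a guest-side sketch for the pKVM share hypercall. The argument layout is an assumption for illustration only, not taken from the patch:

        #include <linux/arm-smccc.h>

        static int example_share_page(phys_addr_t pa)
        {
                struct arm_smccc_res res;

                /* assumed: a0 = function ID, a1 = page address */
                arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID,
                                     pa, 0, 0, &res);
                return res.a0 == SMCCC_RET_SUCCESS ? 0 : -EPERM;
        }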
*/ +#define FFA_RET_SUCCESS (0) +#define FFA_RET_NOT_SUPPORTED (-1) +#define FFA_RET_INVALID_PARAMETERS (-2) +#define FFA_RET_NO_MEMORY (-3) +#define FFA_RET_BUSY (-4) +#define FFA_RET_INTERRUPTED (-5) +#define FFA_RET_DENIED (-6) +#define FFA_RET_RETRY (-7) +#define FFA_RET_ABORTED (-8) + +/* FFA version encoding */ +#define FFA_MAJOR_VERSION_MASK GENMASK(30, 16) +#define FFA_MINOR_VERSION_MASK GENMASK(15, 0) +#define FFA_MAJOR_VERSION(x) ((u16)(FIELD_GET(FFA_MAJOR_VERSION_MASK, (x)))) +#define FFA_MINOR_VERSION(x) ((u16)(FIELD_GET(FFA_MINOR_VERSION_MASK, (x)))) +#define FFA_PACK_VERSION_INFO(major, minor) \ + (FIELD_PREP(FFA_MAJOR_VERSION_MASK, (major)) | \ + FIELD_PREP(FFA_MINOR_VERSION_MASK, (minor))) +#define FFA_VERSION_1_0 FFA_PACK_VERSION_INFO(1, 0) + +/** + * FF-A specification mentions explicitly about '4K pages'. This should + * not be confused with the kernel PAGE_SIZE, which is the translation + * granule kernel is configured and may be one among 4K, 16K and 64K. + */ +#define FFA_PAGE_SIZE SZ_4K + +/* + * Minimum buffer size/alignment encodings returned by an FFA_FEATURES + * query for FFA_RXTX_MAP. + */ +#define FFA_FEAT_RXTX_MIN_SZ_4K 0 +#define FFA_FEAT_RXTX_MIN_SZ_64K 1 +#define FFA_FEAT_RXTX_MIN_SZ_16K 2 + /* FFA Bus/Device/Driver related */ struct ffa_device { int vm_id; @@ -156,11 +247,11 @@ struct ffa_mem_region_attributes { */ #define FFA_MEM_RETRIEVE_SELF_BORROWER BIT(0) u8 flag; - u32 composite_off; /* * Offset in bytes from the start of the outer `ffa_memory_region` to * an `struct ffa_mem_region_addr_range`. */ + u32 composite_off; u64 reserved; }; @@ -262,6 +353,8 @@ struct ffa_dev_ops { int (*memory_reclaim)(u64 g_handle, u32 flags); int (*memory_share)(struct ffa_device *dev, struct ffa_mem_ops_args *args); + int (*memory_lend)(struct ffa_device *dev, + struct ffa_mem_ops_args *args); }; #endif /* _LINUX_ARM_FFA_H */ diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 33207004cfde..b298388bc3b6 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -13,6 +13,7 @@ #include #include #include +#include struct page; struct device; @@ -164,6 +165,9 @@ struct bdi_writeback { struct rcu_head rcu; }; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct backing_dev_info { @@ -202,6 +206,9 @@ struct backing_dev_info { #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; enum { diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 338aa27e4773..6b72060c76bf 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -43,6 +43,7 @@ #include #include #include +#include /* * Balloon device information descriptor. 
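Quick sanity examples for the version and function-ID macros above: FFA_PACK_VERSION_INFO(1, 0) packs to 0x00010000 per the GENMASK field definitions, and FFA_FN_NATIVE() resolves at compile time:

        static void example_ffa_macros(void)
        {
                u32 ver = FFA_PACK_VERSION_INFO(1, 0); /* 0x00010000 */
                u32 func = FFA_FN_NATIVE(MEM_LEND);    /* FFA_FN64_MEM_LEND on 64-bit */

                WARN_ON(FFA_MAJOR_VERSION(ver) != 1 ||
                        FFA_MINOR_VERSION(ver) != 0);
                (void)func;
        }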
@@ -103,6 +104,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); + page_relinquish(page); } /* @@ -147,6 +149,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, { __SetPageOffline(page); list_add(&page->lru, &balloon->pages); + page_relinquish(page); } static inline void balloon_page_delete(struct page *page) diff --git a/include/linux/bio.h b/include/linux/bio.h index 00952e92eae1..dc18d864c4ca 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -10,6 +10,7 @@ /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include #include +#include #define BIO_DEBUG @@ -338,6 +339,10 @@ struct bio_integrity_payload { struct work_struct bip_work; /* I/O completion */ struct bio_vec *bip_vec; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; @@ -686,6 +691,11 @@ struct bio_set { * Hot un-plug notifier for the per-cpu cache, if used */ struct hlist_node cpuhp_dead; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; static inline bool bioset_initialized(struct bio_set *bs) diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h new file mode 100644 index 000000000000..8b30d04ef008 --- /dev/null +++ b/include/linux/blk-crypto-profile.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_BLK_CRYPTO_PROFILE_H +#define __LINUX_BLK_CRYPTO_PROFILE_H + +#include +#include + +struct blk_crypto_profile; + +/** + * struct blk_crypto_ll_ops - functions to control inline encryption hardware + * + * Low-level operations for controlling inline encryption hardware. This + * interface must be implemented by storage drivers that support inline + * encryption. All functions may sleep, are serialized by profile->lock, and + * are never called while profile->dev (if set) is runtime-suspended. + */ +struct blk_crypto_ll_ops { + + /** + * @keyslot_program: Program a key into the inline encryption hardware. + * + * Program @key into the specified @slot in the inline encryption + * hardware, overwriting any key that the keyslot may already contain. + * The keyslot is guaranteed to not be in-use by any I/O. + * + * This is required if the device has keyslots. Otherwise (i.e. if the + * device is a layered device, or if the device is real hardware that + * simply doesn't have the concept of keyslots) it is never called. + * + * Must return 0 on success, or -errno on failure. + */ + int (*keyslot_program)(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot); + + /** + * @keyslot_evict: Evict a key from the inline encryption hardware. + * + * If the device has keyslots, this function must evict the key from the + * specified @slot. The slot will contain @key, but there should be no + * need for the @key argument to be used as @slot should be sufficient. + * The keyslot is guaranteed to not be in-use by any I/O. + * + * If the device doesn't have keyslots itself, this function must evict + * @key from any underlying devices. @slot won't be valid in this case. + * + * If there are no keyslots and no underlying devices, this function + * isn't required. + * + * Must return 0 on success, or -errno on failure. 
+ */ + int (*keyslot_evict)(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot); + + /** + * @derive_sw_secret: Derive the software secret from a hardware-wrapped + * key in ephemerally-wrapped form. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * Must return 0 on success, -EBADMSG if the key is invalid, or another + * -errno code on other errors. + */ + int (*derive_sw_secret)(struct blk_crypto_profile *profile, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); +}; + +/** + * struct blk_crypto_profile - inline encryption profile for a device + * + * This struct contains a storage device's inline encryption capabilities (e.g. + * the supported crypto algorithms), driver-provided functions to control the + * inline encryption hardware (e.g. programming and evicting keys), and optional + * device-independent keyslot management data. + */ +struct blk_crypto_profile { + + /* public: Drivers must initialize the following fields. */ + + /** + * @ll_ops: Driver-provided functions to control the inline encryption + * hardware, e.g. program and evict keys. + */ + struct blk_crypto_ll_ops ll_ops; + + /** + * @max_dun_bytes_supported: The maximum number of bytes supported for + * specifying the data unit number (DUN). Specifically, the range of + * supported DUNs is 0 through (1 << (8 * max_dun_bytes_supported)) - 1. + */ + unsigned int max_dun_bytes_supported; + + /** + * @key_types_supported: A bitmask of the supported key types: + * BLK_CRYPTO_KEY_TYPE_STANDARD and/or BLK_CRYPTO_KEY_TYPE_HW_WRAPPED. + */ + unsigned int key_types_supported; + + /** + * @modes_supported: Array of bitmasks that specifies whether each + * combination of crypto mode and data unit size is supported. + * Specifically, the i'th bit of modes_supported[crypto_mode] is set if + * crypto_mode can be used with a data unit size of (1 << i). Note that + * only data unit sizes that are powers of 2 can be supported. + */ + unsigned int modes_supported[BLK_ENCRYPTION_MODE_MAX]; + + /** + * @dev: An optional device for runtime power management. If the driver + * provides this device, it will be runtime-resumed before any function + * in @ll_ops is called and will remain resumed during the call. + */ + struct device *dev; + + /* private: The following fields shouldn't be accessed by drivers. */ + + /* Number of keyslots, or 0 if not applicable */ + unsigned int num_slots; + + /* + * Serializes all calls to functions in @ll_ops as well as all changes + * to @slot_hashtable. This can also be taken in read mode to look up + * keyslots while ensuring that they can't be changed concurrently. + */ + struct rw_semaphore lock; + + /* List of idle slots, with least recently used slot at front */ + wait_queue_head_t idle_slots_wait_queue; + struct list_head idle_slots; + spinlock_t idle_slots_lock; + + /* + * Hash table which maps struct *blk_crypto_key to keyslots, so that we + * can find a key's keyslot in O(1) time rather than O(num_slots). + * Protected by 'lock'. 
+ */ + struct hlist_head *slot_hashtable; + unsigned int log_slot_ht_size; + + /* Per-keyslot data */ + struct blk_crypto_keyslot *slots; +}; + +int blk_crypto_profile_init(struct blk_crypto_profile *profile, + unsigned int num_slots); + +int devm_blk_crypto_profile_init(struct device *dev, + struct blk_crypto_profile *profile, + unsigned int num_slots); + +unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot); + +void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); + +void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); + +void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, + const struct blk_crypto_profile *child); + +bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, + const struct blk_crypto_profile *reference); + +void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, + const struct blk_crypto_profile *src); + +#endif /* __LINUX_BLK_CRYPTO_PROFILE_H */ diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 69b24fe92cbf..ef771c94f59a 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -13,10 +13,62 @@ enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_AES_256_XTS, BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, BLK_ENCRYPTION_MODE_ADIANTUM, + BLK_ENCRYPTION_MODE_SM4_XTS, BLK_ENCRYPTION_MODE_MAX, }; -#define BLK_CRYPTO_MAX_KEY_SIZE 64 +/* + * Supported types of keys. Must be bitflags due to their use in + * blk_crypto_profile::key_types_supported. + */ +enum blk_crypto_key_type { + /* + * Standard keys (i.e. "software keys"). These keys are simply kept in + * raw, plaintext form in kernel memory. + */ + BLK_CRYPTO_KEY_TYPE_STANDARD = 1 << 0, + + /* + * Hardware-wrapped keys. These keys are only present in kernel memory + * in ephemerally-wrapped form, and they can only be unwrapped by + * dedicated hardware. For details, see the "Hardware-wrapped keys" + * section of Documentation/block/inline-encryption.rst. + */ + BLK_CRYPTO_KEY_TYPE_HW_WRAPPED = 1 << 1, +}; + +/* + * Currently the maximum standard key size is 64 bytes, as that is the key size + * of BLK_ENCRYPTION_MODE_AES_256_XTS which takes the longest key. + * + * The maximum hardware-wrapped key size depends on the hardware's key wrapping + * algorithm, which is a hardware implementation detail, so it isn't precisely + * specified. But currently 128 bytes is plenty in practice. Implementations + * are recommended to wrap a 32-byte key for the hardware KDF with AES-256-GCM, + * which should result in a size closer to 64 bytes than 128. + * + * Both of these values can trivially be increased if ever needed. + */ +#define BLK_CRYPTO_MAX_STANDARD_KEY_SIZE 64 +#define BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE 128 + +/* This should use max(), but max() doesn't work in a struct definition. */ +#define BLK_CRYPTO_MAX_ANY_KEY_SIZE \ + (BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE > \ + BLK_CRYPTO_MAX_STANDARD_KEY_SIZE ? \ + BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE : BLK_CRYPTO_MAX_STANDARD_KEY_SIZE) + +/* + * Size of the "software secret" which can be derived from a hardware-wrapped + * key. This is currently always 32 bytes. Note, the choice of 32 bytes + * assumes that the software secret is only used directly for algorithms that + * don't require more than a 256-bit key to get the desired security strength. + * If it were to be used e.g. 
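Sketch of how a storage driver wires up the profile API above (a hypothetical HBA driver, "myhba"; 32 keyslots and the supported sizes are made-up values). Note the modes_supported[] convention from the header: bit i set means a data unit size of (1 << i) works, so OR-ing in 512 and 4096 sets bits 9 and 12:

        #include <linux/blk-crypto-profile.h>

        static int myhba_keyslot_program(struct blk_crypto_profile *profile,
                                         const struct blk_crypto_key *key,
                                         unsigned int slot)
        {
                /* write key->raw (key->size bytes) into hardware keyslot @slot */
                return 0;
        }

        static int myhba_keyslot_evict(struct blk_crypto_profile *profile,
                                       const struct blk_crypto_key *key,
                                       unsigned int slot)
        {
                /* zero out hardware keyslot @slot */
                return 0;
        }

        static const struct blk_crypto_ll_ops myhba_crypto_ops = {
                .keyslot_program        = myhba_keyslot_program,
                .keyslot_evict          = myhba_keyslot_evict,
        };

        static int myhba_init_crypto(struct device *dev,
                                     struct blk_crypto_profile *profile)
        {
                int err = devm_blk_crypto_profile_init(dev, profile, 32);

                if (err)
                        return err;
                profile->ll_ops = myhba_crypto_ops;
                profile->dev = dev;
                profile->max_dun_bytes_supported = 8;
                profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_STANDARD;
                /* bit i of modes_supported[] <=> data unit size (1 << i) */
                profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] |=
                        512 | 4096;
                return 0;
        }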
directly as an AES-256-XTS key, then this would + * need to be increased (which is possible if hardware supports it, but care + * would need to be taken to avoid breaking users who need exactly 32 bytes). + */ +#define BLK_CRYPTO_SW_SECRET_SIZE 32 + /** * struct blk_crypto_config - an inline encryption key's crypto configuration * @crypto_mode: encryption algorithm this key is for @@ -25,20 +77,23 @@ enum blk_crypto_mode_num { * ciphertext. This is always a power of 2. It might be e.g. the * filesystem block size or the disk sector size. * @dun_bytes: the maximum number of bytes of DUN used when using this key + * @key_type: the type of this key -- either standard or hardware-wrapped */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; + enum blk_crypto_key_type key_type; }; /** * struct blk_crypto_key - an inline encryption key - * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this - * key + * @crypto_cfg: the crypto mode, data unit size, key type, and other + * characteristics of this key and how it will be used * @data_unit_size_bits: log2 of data_unit_size - * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) - * @raw: the raw bytes of this key. Only the first @size bytes are used. + * @size: size of this key in bytes. The size of a standard key is fixed for a + * given crypto mode, but the size of a hardware-wrapped key can vary. + * @raw: the bytes of this key. Only the first @size bytes are significant. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. It must not be freed until all bios using it have completed @@ -48,7 +103,7 @@ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; - u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; + u8 raw[BLK_CRYPTO_MAX_ANY_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 @@ -71,9 +126,6 @@ struct bio_crypt_ctx { #include #include -struct request; -struct request_queue; - #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) @@ -89,20 +141,28 @@ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *raw_key, size_t raw_key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q); +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key); -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg); +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) @@ -112,6 +172,9 @@ static inline bool bio_has_crypt_ctx(struct bio *bio) #endif /* 
CONFIG_BLK_INLINE_ENCRYPTION */ +static inline void bio_clone_skip_dm_default_key(struct bio *dst, + const struct bio *src); + int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context @@ -127,9 +190,42 @@ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); static inline int bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { + bio_clone_skip_dm_default_key(dst, src); if (bio_has_crypt_ctx(src)) return __bio_crypt_clone(dst, src, gfp_mask); return 0; } +#if IS_ENABLED(CONFIG_DM_DEFAULT_KEY) +static inline void bio_set_skip_dm_default_key(struct bio *bio) +{ + bio->bi_skip_dm_default_key = true; +} + +static inline bool bio_should_skip_dm_default_key(const struct bio *bio) +{ + return bio->bi_skip_dm_default_key; +} + +static inline void bio_clone_skip_dm_default_key(struct bio *dst, + const struct bio *src) +{ + dst->bi_skip_dm_default_key = src->bi_skip_dm_default_key; +} +#else /* CONFIG_DM_DEFAULT_KEY */ +static inline void bio_set_skip_dm_default_key(struct bio *bio) +{ +} + +static inline bool bio_should_skip_dm_default_key(const struct bio *bio) +{ + return false; +} + +static inline void bio_clone_skip_dm_default_key(struct bio *dst, + const struct bio *src) +{ +} +#endif /* !CONFIG_DM_DEFAULT_KEY */ + #endif /* __LINUX_BLK_CRYPTO_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 36ce3d0fb9f3..6f76f90a7054 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bio_set; struct bio; @@ -49,6 +50,11 @@ struct block_device { #ifdef CONFIG_FAIL_MAKE_REQUEST bool bd_make_it_fail; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __randomize_layout; #define bdev_whole(_bdev) \ @@ -245,6 +251,9 @@ struct bio { #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *bi_crypt_context; +#if IS_ENABLED(CONFIG_DM_DEFAULT_KEY) + bool bi_skip_dm_default_key; +#endif #endif union { @@ -267,6 +276,10 @@ struct bio { struct bio_set *bi_pool; + ANDROID_OEM_DATA(1); + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + /* * We can inline a number of vecs at the end of the bio, to avoid * double allocations for a small number of bio_vecs. 
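From the upper-layer side, the reworked block_device-based API above composes roughly as follows; this sketch assumes a standard 64-byte AES-256-XTS key, and the 8-byte DUN / 4096-byte data-unit-size choices are illustrative, not mandated by the patch:

#include <linux/blk-crypto.h>

static int example_start_using_key(struct block_device *bdev,
				   const u8 raw_key[BLK_CRYPTO_MAX_STANDARD_KEY_SIZE])
{
	struct blk_crypto_key key;
	int err;

	err = blk_crypto_init_key(&key, raw_key,
				  BLK_CRYPTO_MAX_STANDARD_KEY_SIZE,
				  BLK_CRYPTO_KEY_TYPE_STANDARD,
				  BLK_ENCRYPTION_MODE_AES_256_XTS,
				  8, 4096);
	if (err)
		return err;

	/*
	 * blk_crypto_config_supported() also accepts configurations that
	 * only the blk-crypto-fallback can handle; callers that need real
	 * inline hardware use the _natively() variant added above.
	 */
	if (!blk_crypto_config_supported(bdev, &key.crypto_cfg))
		return -EOPNOTSUPP;

	return blk_crypto_start_using_key(bdev, &key);
}

A hardware-wrapped key would differ only in passing BLK_CRYPTO_KEY_TYPE_HW_WRAPPED together with the ephemerally-wrapped blob and its actual size, after which blk_crypto_derive_sw_secret() can produce the software secret.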
This member diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 67344dfe07a7..f017ca2e803c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,8 @@ #include #include #include +#include +#include struct module; struct request_queue; @@ -37,7 +39,7 @@ struct pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; -struct blk_keyslot_manager; +struct blk_crypto_profile; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -211,7 +213,7 @@ struct request { #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; - struct blk_ksm_keyslot *crypt_keyslot; + struct blk_crypto_keyslot *crypt_keyslot; #endif unsigned short write_hint; @@ -233,6 +235,8 @@ struct request { */ rq_end_io_fn *end_io; void *end_io_data; + + ANDROID_KABI_RESERVE(1); }; static inline int blk_validate_block_size(unsigned int bsize) @@ -329,6 +333,10 @@ struct queue_limits { unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; + + ANDROID_KABI_RESERVE(1); + + ANDROID_OEM_DATA(1); }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, @@ -450,8 +458,8 @@ struct request_queue { unsigned int dma_alignment; #ifdef CONFIG_BLK_INLINE_ENCRYPTION - /* Inline crypto capabilities */ - struct blk_keyslot_manager *ksm; + struct blk_crypto_profile *crypto_profile; + struct kobject *crypto_kobject; #endif unsigned int rq_timeout; @@ -559,6 +567,13 @@ struct request_queue { #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + + ANDROID_OEM_DATA(1); }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ @@ -592,6 +607,8 @@ struct request_queue { #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ +/* Pipeline zoned writes (REQ_OP_WRITE, REQ_OP_WRITE_ZEROES). */ +#define QUEUE_FLAG_PIPELINE_ZONED_WRITES 30 #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -639,6 +656,11 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) +static inline bool blk_queue_pipeline_zoned_writes(struct request_queue *q) +{ + return test_bit(QUEUE_FLAG_PIPELINE_ZONED_WRITES, &q->queue_flags); +} + extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); @@ -708,6 +730,15 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q, return sector >> ilog2(q->limits.chunk_sectors); } +/** + * blk_queue_zone_is_seq() - Whether a logical block is in a sequential zone. + * @q: Request queue pointer. + * @sector: Offset from start of block device in 512 byte units. + * + * Return: true if and only if @q refers to a zoned block device and + * @sector refers either to a sequential write required or a sequential + * write preferred zone. + */ static inline bool blk_queue_zone_is_seq(struct request_queue *q, sector_t sector) { @@ -985,10 +1016,41 @@ static inline unsigned int blk_rq_zone_no(struct request *rq) return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); } +/** + * blk_rq_zone_is_seq() - Whether a request is for a sequential zone. 
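One worked instance of the shift in blk_queue_zone_no() above, assuming 256 MiB zones and 512-byte sectors (common values, but illustrative here):

/*
 * q->limits.chunk_sectors == 524288 (256 MiB / 512 B), and
 * ilog2(524288) == 19, so blk_queue_zone_no() reduces to sector >> 19:
 *
 *   sector       0  ->  zone 0
 *   sector  524287  ->  zone 0  (last sector of zone 0)
 *   sector  524288  ->  zone 1
 *   sector 1572864  ->  zone 3
 */
static inline unsigned int example_zone_no(sector_t sector)
{
	return sector >> 19;	/* ilog2(524288) */
}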
+ * @rq: Request pointer. + * + * Return: true if and only if blk_rq_pos(@rq) refers either to a sequential + * write required or a sequential write preferred zone. + */ static inline unsigned int blk_rq_zone_is_seq(struct request *rq) { return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); } + +/** + * blk_rq_is_seq_zone_write() - Whether @rq is a write request for a sequential zone. + * @rq: Request to examine. + * + * In this context sequential zone means either a sequential write required or + * a sequential write preferred zone. + */ +static inline bool blk_rq_is_seq_zone_write(struct request *rq) +{ + switch (req_op(rq)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + return blk_rq_zone_is_seq(rq); + default: + return false; + } +} +#else /* CONFIG_BLK_DEV_ZONED */ +static inline bool blk_rq_is_seq_zone_write(struct request *rq) +{ + return false; +} #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -1829,20 +1891,17 @@ static inline struct bio_vec *rq_integrity_vec(struct request *rq) #ifdef CONFIG_BLK_INLINE_ENCRYPTION -bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); - -void blk_ksm_unregister(struct request_queue *q); +bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm, - struct request_queue *q) +static inline bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q) { return true; } -static inline void blk_ksm_unregister(struct request_queue *q) { } - #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ @@ -1872,6 +1931,10 @@ struct block_device_operations { * driver. */ int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_OEM_DATA(1); }; #ifdef CONFIG_COMPAT diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 84efd8dd139d..9a154d1b0c29 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,6 +23,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -151,6 +152,9 @@ struct bpf_map_ops { /* bpf_iter info used to open a seq_file */ const struct bpf_iter_seq_info *iter_seq_info; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct bpf_map { @@ -257,6 +261,8 @@ struct bpf_map_dev_ops { int (*map_update_elem)(struct bpf_offloaded_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); + + ANDROID_KABI_RESERVE(1); }; struct bpf_offloaded_map { @@ -570,6 +576,7 @@ struct bpf_verifier_ops { enum bpf_access_type atype, u32 *next_btf_id); bool (*check_kfunc_call)(u32 kfunc_btf_id); + ANDROID_KABI_RESERVE(1); }; struct bpf_prog_offload_ops { @@ -585,6 +592,7 @@ struct bpf_prog_offload_ops { int (*prepare)(struct bpf_prog *prog); int (*translate)(struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); + ANDROID_KABI_RESERVE(1); }; struct bpf_prog_offload { @@ -597,6 +605,7 @@ struct bpf_prog_offload { bool opt_failed; void *jited_image; u32 jited_len; + ANDROID_KABI_RESERVE(1); }; enum bpf_cgroup_storage_type { @@ -980,6 +989,7 @@ struct bpf_prog_aux { struct work_struct work; struct rcu_head rcu; }; + ANDROID_KABI_RESERVE(1); }; struct bpf_array_aux { @@ -1018,6 +1028,7 @@ struct bpf_link_ops { void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); +
ANDROID_KABI_RESERVE(1); }; struct bpf_link_primer { @@ -1047,6 +1058,7 @@ struct bpf_struct_ops { struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; u32 value_id; + ANDROID_KABI_RESERVE(1); }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index bbe1eefa4c8a..a49d35016e6b 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -79,6 +79,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, #endif BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, void *, void *) +#ifdef CONFIG_FUSE_BPF +BPF_PROG_TYPE(BPF_PROG_TYPE_FUSE, fuse, struct fuse_bpf_args, struct fuse_bpf_args) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3d04b48e502d..501ff5602462 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -8,6 +8,7 @@ #include /* for struct btf and btf_id() */ #include /* for MAX_BPF_STACK */ #include +#include /* Maximum variable offset umax_value permitted when resolving memory accesses. * In practice this is far bigger than any realistic pointer offset; this limit @@ -448,6 +449,8 @@ struct bpf_subprog_info { bool tail_call_reachable; bool has_ld_abs; bool is_async_cb; + + ANDROID_KABI_RESERVE(1); }; /* single container for all structs @@ -508,8 +511,12 @@ struct bpf_verifier_env { /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; bpfptr_t fd_array; + /* buffer used in reg_type_str() to generate reg_type string */ char type_str_buf[TYPE_STR_BUF_LEN]; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 45cdb12243e3..a0ff347d03a1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -66,6 +67,8 @@ struct css_task_iter { struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ + + ANDROID_KABI_RESERVE(1); }; extern struct file_system_type cgroup_fs_type; @@ -433,6 +436,18 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); } +extern struct mutex cgroup_mutex; + +static inline void cgroup_lock(void) +{ + mutex_lock(&cgroup_mutex); +} + +static inline void cgroup_unlock(void) +{ + mutex_unlock(&cgroup_mutex); +} + /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for @@ -447,7 +462,6 @@ static inline void cgroup_put(struct cgroup *cgrp) * as locks used during the cgroup_subsys::attach() methods. 
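The ANDROID_KABI_RESERVE()/ANDROID_OEM_DATA() padding recurring through these structs is the Android KABI mechanism from include/linux/android_kabi.h: each reservation is a fixed-size placeholder that a later update can repurpose without shifting the frozen structure layout. A simplified model of the idea (the EXAMPLE_* names are stand-ins, not the verbatim macros):

#include <linux/types.h>

/* A shipped, ABI-frozen struct carries u64-sized padding... */
#define EXAMPLE_KABI_RESERVE(n)	u64 example_kabi_reserved##n

/* ...which a later fix may alias with a new field of at most 8 bytes. */
#define EXAMPLE_KABI_USE(n, _new)		\
	union {					\
		_new;				\
		EXAMPLE_KABI_RESERVE(n);	\
	}

struct example_ops {
	int (*existing_cb)(void);
	EXAMPLE_KABI_RESERVE(1);	/* layout fixed at ABI freeze */
};

/* After the freeze, the reserved slot absorbs the new member: */
struct example_ops_v2 {
	int (*existing_cb)(void);
	EXAMPLE_KABI_USE(1, int (*new_cb)(int arg));
};

Because sizeof(struct example_ops_v2) equals sizeof(struct example_ops), modules built against the frozen ABI keep working.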
*/ #ifdef CONFIG_PROVE_RCU -extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ @@ -708,6 +722,8 @@ struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} +static inline void cgroup_lock(void) {} +static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 5f5730c1d324..b6c42ce7f65b 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -5,6 +5,7 @@ #include #include #include +#include #define CLEANCACHE_NO_POOL -1 #define CLEANCACHE_NO_BACKEND -2 @@ -36,6 +37,7 @@ struct cleancache_ops { void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t); void (*invalidate_inode)(int, struct cleancache_filekey); void (*invalidate_fs)(int); + ANDROID_OEM_DATA(1); }; extern int cleancache_register_ops(const struct cleancache_ops *ops); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 402aef48119b..84f7d03ccd8c 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -32,6 +32,7 @@ #define CLK_OPS_PARENT_ENABLE BIT(12) /* duty cycle call may be forwarded to the parent clock */ #define CLK_DUTY_CYCLE_PARENT BIT(13) +#define CLK_DONT_HOLD_STATE BIT(14) /* Don't hold state */ struct clk; struct clk_hw; @@ -211,6 +212,13 @@ struct clk_duty { * directory is provided as an argument. Called with * prepare_lock held. Returns 0 on success, -EERROR otherwise. * + * @pre_rate_change: Optional callback for a clock to fulfill its rate + * change requirements before any rate change has occurred in + * its clock tree. Returns 0 on success, -EERROR otherwise. + * + * @post_rate_change: Optional callback for a clock to clean up any + * requirements that were needed while the clock and its tree + * was changing states. Returns 0 on success, -EERROR otherwise. 
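A provider that must sequence an external dependency around a rate switch is the intended user of the pre_rate_change/post_rate_change hooks documented above. A sketch of a hypothetical regulator-supplied clock (the example_clk type and the voltage values are made up):

#include <linux/clk-provider.h>
#include <linux/regulator/consumer.h>

struct example_clk {
	struct clk_hw hw;
	struct regulator *supply;
};

#define to_example_clk(_hw) container_of(_hw, struct example_clk, hw)

/* Raise the supply before the tree ramps up... */
static int example_pre_rate_change(struct clk_hw *hw, unsigned long rate,
				   unsigned long new_rate)
{
	struct example_clk *eclk = to_example_clk(hw);

	if (new_rate > rate)
		return regulator_set_voltage(eclk->supply, 900000, 900000);
	return 0;
}

/* ...and relax it once the whole tree has settled at a lower rate. */
static int example_post_rate_change(struct clk_hw *hw, unsigned long old_rate,
				    unsigned long rate)
{
	struct example_clk *eclk = to_example_clk(hw);

	if (rate < old_rate)
		return regulator_set_voltage(eclk->supply, 800000, 800000);
	return 0;
}

static const struct clk_ops example_clk_ops = {
	.pre_rate_change	= example_pre_rate_change,
	.post_rate_change	= example_post_rate_change,
	/* .recalc_rate, .set_rate, ... omitted */
};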
* * The clk_enable/clk_disable and clk_prepare/clk_unprepare pairs allow * implementations to split any work between atomic (enable) and sleepable @@ -258,6 +266,12 @@ struct clk_ops { int (*init)(struct clk_hw *hw); void (*terminate)(struct clk_hw *hw); void (*debug_init)(struct clk_hw *hw, struct dentry *dentry); + int (*pre_rate_change)(struct clk_hw *hw, + unsigned long rate, + unsigned long new_rate); + int (*post_rate_change)(struct clk_hw *hw, + unsigned long old_rate, + unsigned long rate); }; /** @@ -1160,6 +1174,7 @@ void devm_clk_unregister(struct device *dev, struct clk *clk); void clk_hw_unregister(struct clk_hw *hw); void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw); +void clk_sync_state(struct device *dev); /* helper functions */ const char *__clk_get_name(const struct clk *clk); diff --git a/include/linux/cma.h b/include/linux/cma.h index 53fd8c3cdbd0..585ec7aae4c8 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -22,6 +22,15 @@ struct cma; +struct cma_alloc_info { + unsigned long nr_migrated; + unsigned long nr_reclaimed; + unsigned long nr_mapped; + unsigned int nr_isolate_fail; + unsigned int nr_migrate_fail; + unsigned int nr_test_fail; +}; + extern unsigned long totalcma_pages; extern phys_addr_t cma_get_base(const struct cma *cma); extern unsigned long cma_get_size(const struct cma *cma); @@ -45,7 +54,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, const char *name, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, - bool no_warn); + gfp_t gfp_mask); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 34bce35c808d..b99b7532cc29 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -180,6 +180,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); +extern unsigned long isolate_and_split_free_page(struct page *page, + struct list_head *list); #else static inline void reset_isolation_suitable(pg_data_t *pgdat) @@ -225,6 +227,12 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, { } +static inline unsigned long isolate_and_split_free_page(struct page *page, + struct list_head *list) +{ + return 0; +} + #endif /* CONFIG_COMPACTION */ struct node; diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 3c4de9b6c6e3..aa83f46be08b 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -68,3 +68,13 @@ #define __nocfi __attribute__((__no_sanitize__("cfi"))) #define __cficanonical __attribute__((__cfi_canonical_jump_table__)) + +#if defined(CONFIG_CFI_CLANG) +/* + * With CONFIG_CFI_CLANG, the compiler replaces function address + * references with the address of the function's CFI jump table + * entry. The function_nocfi macro always returns the address of the + * actual function instead. 
+ */ +#define function_nocfi(x) __builtin_function_start(x) +#endif diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 025391be1b19..4b4fbf4cf5be 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1083,14 +1083,6 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ } #endif -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - struct cpufreq_governor *old_gov); -#else -static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - struct cpufreq_governor *old_gov) { } -#endif - extern void arch_freq_prepare_all(void); extern unsigned int arch_freq_get_on_cpu(int cpu); diff --git a/include/linux/cpufreq_times.h b/include/linux/cpufreq_times.h new file mode 100644 index 000000000000..38272a5f3163 --- /dev/null +++ b/include/linux/cpufreq_times.h @@ -0,0 +1,42 @@ +/* drivers/cpufreq/cpufreq_times.c + * + * Copyright (C) 2018 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _LINUX_CPUFREQ_TIMES_H +#define _LINUX_CPUFREQ_TIMES_H + +#include +#include + +#ifdef CONFIG_CPU_FREQ_TIMES +void cpufreq_task_times_init(struct task_struct *p); +void cpufreq_task_times_alloc(struct task_struct *p); +void cpufreq_task_times_exit(struct task_struct *p); +int proc_time_in_state_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *p); +void cpufreq_acct_update_power(struct task_struct *p, u64 cputime); +void cpufreq_times_create_policy(struct cpufreq_policy *policy); +void cpufreq_times_record_transition(struct cpufreq_policy *policy, + unsigned int new_freq); +#else +static inline void cpufreq_task_times_init(struct task_struct *p) {} +static inline void cpufreq_task_times_alloc(struct task_struct *p) {} +static inline void cpufreq_task_times_exit(struct task_struct *p) {} +static inline void cpufreq_acct_update_power(struct task_struct *p, + u64 cputime) {} +static inline void cpufreq_times_create_policy(struct cpufreq_policy *policy) {} +static inline void cpufreq_times_record_transition( + struct cpufreq_policy *policy, unsigned int new_freq) {} +#endif /* CONFIG_CPU_FREQ_TIMES */ +#endif /* _LINUX_CPUFREQ_TIMES_H */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index c88ccc48877d..fa2c263964cd 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -250,6 +250,10 @@ enum cpuhp_state { CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_DTPM_CPU_ONLINE, CPUHP_AP_ACTIVE, + CPUHP_ANDROID_RESERVED_1, + CPUHP_ANDROID_RESERVED_2, + CPUHP_ANDROID_RESERVED_3, + CPUHP_ANDROID_RESERVED_4, CPUHP_ONLINE, }; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index fce476275e16..1035cb423fc1 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -14,6 +14,7 @@ #include #include #include +#include #define CPUIDLE_STATE_MAX 10 #define CPUIDLE_NAME_LEN 16 @@ -110,6 +111,8 @@ struct cpuidle_device { cpumask_t coupled_cpus; struct cpuidle_coupled *coupled; #endif + + ANDROID_KABI_RESERVE(1); }; 
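The practical use of function_nocfi(), per the comment above, is wherever a raw machine address must escape C entirely, e.g. an entry point handed to firmware. A sketch modeled on the arm64 SMP bring-up pattern (my_resume_entry is a hypothetical entry point, typically implemented in assembly; __pa_symbol() is the arch helper for symbol physical addresses):

#include <linux/mm.h>	/* for __pa_symbol() on arm64 */

void my_resume_entry(void);	/* defined in assembly elsewhere */

static unsigned long example_entry_pa(void)
{
	/*
	 * With CONFIG_CFI_CLANG, &my_resume_entry points at the CFI jump
	 * table entry, not the function itself; function_nocfi() recovers
	 * the actual function address before it is converted to physical.
	 * Without CFI, function_nocfi() is a no-op.
	 */
	return __pa_symbol(function_nocfi(my_resume_entry));
}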
DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); @@ -135,6 +138,8 @@ struct cpuidle_driver { /* preferred governor to switch at register time */ const char *governor; + + ANDROID_KABI_RESERVE(1); }; #ifdef CONFIG_CPU_IDLE diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d2b9c41c8edf..b8ef84bc8368 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -56,8 +56,6 @@ extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void cpuset_wait_for_hotplug(void); -extern void cpuset_read_lock(void); -extern void cpuset_read_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -179,9 +177,6 @@ static inline void cpuset_update_active_cpus(void) static inline void cpuset_wait_for_hotplug(void) { } -static inline void cpuset_read_lock(void) { } -static inline void cpuset_read_unlock(void) { } - static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/include/linux/damon.h b/include/linux/damon.h index d68b67b8d458..5e1e3a128b77 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -11,9 +11,18 @@ #include #include #include +#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE +/* Max priority score for DAMON-based operation schemes */ +#define DAMOS_MAX_SCORE (99) + +/* Get a random number in [l, r) */ +static inline unsigned long damon_rand(unsigned long l, unsigned long r) +{ + return l + prandom_u32_max(r - l); +} /** * struct damon_addr_range - Represents an address region of [@start, @end). @@ -31,12 +40,22 @@ struct damon_addr_range { * @sampling_addr: Address of the sample for the next access check. * @nr_accesses: Access frequency of this region. * @list: List head for siblings. + * @age: Age of this region. + * + * @age is initially zero, increased for each aggregation interval, and reset + * to zero again if the access frequency is significantly changed. If two + * regions are merged into a new region, both @nr_accesses and @age of the new + * region are set as region size-weighted average of those of the two regions. */ struct damon_region { struct damon_addr_range ar; unsigned long sampling_addr; unsigned int nr_accesses; struct list_head list; + + unsigned int age; +/* private: Internal value for age calculation. */ + unsigned int last_nr_accesses; }; /** @@ -59,16 +78,194 @@ struct damon_target { struct list_head list; }; +/** + * enum damos_action - Represents an action of a Data Access Monitoring-based + * Operation Scheme. + * + * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. + * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. + * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. + * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_STAT: Do nothing but count the stat. + */ +enum damos_action { + DAMOS_WILLNEED, + DAMOS_COLD, + DAMOS_PAGEOUT, + DAMOS_HUGEPAGE, + DAMOS_NOHUGEPAGE, + DAMOS_STAT, /* Do nothing but only record the stat */ +}; + +/** + * struct damos_quota - Controls the aggressiveness of the given scheme. + * @ms: Maximum milliseconds that the scheme can use. 
+ * @sz: Maximum bytes of memory that the action can be applied. + * @reset_interval: Charge reset interval in milliseconds. + * + * @weight_sz: Weight of the region's size for prioritization. + * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. + * @weight_age: Weight of the region's age for prioritization. + * + * To avoid consuming too much CPU time or IO resources for applying the + * &struct damos->action to large memory, DAMON allows users to set time and/or + * size quotas. The quotas can be set by writing non-zero values to &ms and + * &sz, respectively. If the time quota is set, DAMON tries to use only up to + * &ms milliseconds within &reset_interval for applying the action. If the + * size quota is set, DAMON tries to apply the action only up to &sz bytes + * within &reset_interval. + * + * Internally, the time quota is transformed to a size quota using estimated + * throughput of the scheme's action. DAMON then compares it against &sz and + * uses smaller one as the effective quota. + * + * For selecting regions within the quota, DAMON prioritizes current scheme's + * target memory regions using the &struct damon_primitive->get_scheme_score. + * You could customize the prioritization logic by setting &weight_sz, + * &weight_nr_accesses, and &weight_age, because monitoring primitives are + * encouraged to respect those. + */ +struct damos_quota { + unsigned long ms; + unsigned long sz; + unsigned long reset_interval; + + unsigned int weight_sz; + unsigned int weight_nr_accesses; + unsigned int weight_age; + +/* private: */ + /* For throughput estimation */ + unsigned long total_charged_sz; + unsigned long total_charged_ns; + + unsigned long esz; /* Effective size quota in bytes */ + + /* For charging the quota */ + unsigned long charged_sz; + unsigned long charged_from; + struct damon_target *charge_target_from; + unsigned long charge_addr_from; + + /* For prioritization */ + unsigned long histogram[DAMOS_MAX_SCORE + 1]; + unsigned int min_score; +}; + +/** + * enum damos_wmark_metric - Represents the watermark metric. + * + * @DAMOS_WMARK_NONE: Ignore the watermarks of the given scheme. + * @DAMOS_WMARK_FREE_MEM_RATE: Free memory rate of the system in [0,1000]. + */ +enum damos_wmark_metric { + DAMOS_WMARK_NONE, + DAMOS_WMARK_FREE_MEM_RATE, +}; + +/** + * struct damos_watermarks - Controls when a given scheme should be activated. + * @metric: Metric for the watermarks. + * @interval: Watermarks check time interval in microseconds. + * @high: High watermark. + * @mid: Middle watermark. + * @low: Low watermark. + * + * If &metric is &DAMOS_WMARK_NONE, the scheme is always active. Being active + * means DAMON does monitoring and applying the action of the scheme to + * appropriate memory regions. Else, DAMON checks &metric of the system for at + * least every &interval microseconds and works as below. + * + * If &metric is higher than &high, the scheme is inactivated. If &metric is + * between &mid and &low, the scheme is activated. If &metric is lower than + * &low, the scheme is inactivated. + */ +struct damos_watermarks { + enum damos_wmark_metric metric; + unsigned long interval; + unsigned long high; + unsigned long mid; + unsigned long low; + +/* private: */ + bool activated; +}; + +/** + * struct damos_stat - Statistics on a given scheme. + * @nr_tried: Total number of regions that the scheme is tried to be applied. + * @sz_tried: Total size of regions that the scheme is tried to be applied. 
+ * @nr_applied: Total number of regions that the scheme is applied. + * @sz_applied: Total size of regions that the scheme is applied. + * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + */ +struct damos_stat { + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @min_sz_region: Minimum size of target regions. + * @max_sz_region: Maximum size of target regions. + * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. + * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. + * @min_age_region: Minimum age of target regions. + * @max_age_region: Maximum age of target regions. + * @action: &damos_action to be applied to the target regions. + * @quota: Control the aggressiveness of this scheme. + * @wmarks: Watermarks for automated (in)activation of this scheme. + * @stat: Statistics of this scheme. + * @list: List head for siblings. + * + * For each aggregation interval, DAMON finds regions which fit in the + * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, + * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to + * those. To avoid consuming too much CPU time or IO resources for the + * &action, &quota is used. + * + * To do the work only when needed, schemes can be activated for specific + * system situations using &wmarks. If all schemes that are registered to a + * &struct damon_ctx are inactive, DAMON stops monitoring altogether and just + * repeatedly checks the watermarks. + * + * After applying the &action to each region, &stat is updated to reflect the + * number of regions and the total size of regions that the &action is applied + * to. + */ +struct damos { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; + enum damos_action action; + struct damos_quota quota; + struct damos_watermarks wmarks; + struct damos_stat stat; + struct list_head list; +}; + struct damon_ctx; /** - * struct damon_primitive Monitoring primitives for given use cases. + * struct damon_primitive - Monitoring primitives for given use cases. * * @init: Initialize primitive-internal data structures. * @update: Update primitive-internal data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. + * @get_scheme_score: Get the score of a region for a scheme. + * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup: Clean up the context. * @@ -94,6 +291,12 @@ struct damon_ctx; * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. + * @get_scheme_score should return the priority score of a region for a scheme + * as an integer in [0, &DAMOS_MAX_SCORE]. + * @apply_scheme is called from @kdamond when a region for user provided + * DAMON-based operation scheme is found.
It should apply the scheme's action + * to the region and return bytes of the region that the action is successfully + * applied. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -104,12 +307,18 @@ struct damon_primitive { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); + int (*get_scheme_score)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); + unsigned long (*apply_scheme)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; -/* - * struct damon_callback Monitoring events notification callbacks. +/** + * struct damon_callback - Monitoring events notification callbacks. * * @before_start: Called before starting the monitoring. * @after_sampling: Called after each sampling. @@ -136,7 +345,7 @@ struct damon_callback { int (*before_start)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_terminate)(struct damon_ctx *context); + void (*before_terminate)(struct damon_ctx *context); }; /** @@ -182,6 +391,7 @@ struct damon_callback { * @min_nr_regions: The minimum number of adaptive monitoring regions. * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. + * @schemes: Head of schemes (&damos) list. */ struct damon_ctx { unsigned long sample_interval; @@ -194,7 +404,6 @@ struct damon_ctx { /* public: */ struct task_struct *kdamond; - bool kdamond_stop; struct mutex kdamond_lock; struct damon_primitive primitive; @@ -203,13 +412,23 @@ struct damon_ctx { unsigned long min_nr_regions; unsigned long max_nr_regions; struct list_head adaptive_targets; + struct list_head schemes; }; -#define damon_next_region(r) \ - (container_of(r->list.next, struct damon_region, list)) +static inline struct damon_region *damon_next_region(struct damon_region *r) +{ + return container_of(r->list.next, struct damon_region, list); +} -#define damon_prev_region(r) \ - (container_of(r->list.prev, struct damon_region, list)) +static inline struct damon_region *damon_prev_region(struct damon_region *r) +{ + return container_of(r->list.prev, struct damon_region, list); +} + +static inline struct damon_region *damon_last_region(struct damon_target *t) +{ + return list_last_entry(&t->regions_list, struct damon_region, list); +} #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) @@ -223,17 +442,42 @@ struct damon_ctx { #define damon_for_each_target_safe(t, next, ctx) \ list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) +#define damon_for_each_scheme(s, ctx) \ + list_for_each_entry(s, &(ctx)->schemes, list) + +#define damon_for_each_scheme_safe(s, next, ctx) \ + list_for_each_entry_safe(s, next, &(ctx)->schemes, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -inline void damon_insert_region(struct damon_region *r, + +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, struct damon_region *prev, struct damon_region *next, - struct damon_target *t); + struct damon_target *t) +{ + 
__list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); +void damon_destroy_scheme(struct damos *s); + struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +bool damon_targets_empty(struct damon_ctx *ctx); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); unsigned int damon_nr_regions(struct damon_target *t); @@ -245,6 +489,8 @@ int damon_set_targets(struct damon_ctx *ctx, int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_schemes(struct damon_ctx *ctx, + struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); @@ -253,16 +499,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ #ifdef CONFIG_DAMON_VADDR - -/* Monitoring primitives for virtual memory address spaces */ -void damon_va_init(struct damon_ctx *ctx); -void damon_va_update(struct damon_ctx *ctx); -void damon_va_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); -void damon_va_cleanup(struct damon_ctx *ctx); void damon_va_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_VADDR */ +#ifdef CONFIG_DAMON_PADDR +bool damon_pa_target_valid(void *t); +void damon_pa_set_primitives(struct damon_ctx *ctx); +#endif /* CONFIG_DAMON_PADDR */ + #endif /* _DAMON_H */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9e23d33bb6f1..fcb6a04158f6 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -14,6 +14,7 @@ #include #include #include +#include struct path; struct vfsmount; @@ -120,6 +121,9 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); } __randomize_layout; /* @@ -149,6 +153,11 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *); + void (*d_canonical_path)(const struct path *, struct path *); + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } ____cacheline_aligned; /* diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 114553b487ef..cf9983ecaa5b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -13,6 +13,7 @@ #include #include #include +#include struct dm_dev; struct dm_target; @@ -204,6 +205,9 @@ struct target_type { dm_dax_copy_iter_fn dax_copy_to_iter; dm_dax_zero_page_range_fn dax_zero_page_range; + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + /* For internal device-mapper use. 
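Pulling the DAMON scheme machinery above together, here is a sketch of building and installing one reclaim-flavored scheme. All numeric thresholds are illustrative; they loosely mimic a "page out cold memory" policy rather than reproducing any in-tree defaults:

#include <linux/damon.h>
#include <linux/mm.h>

static int example_install_scheme(struct damon_ctx *ctx)
{
	/* Spend at most 10ms of CPU per 1s charging window on the action. */
	struct damos_quota quota = {
		.ms = 10,
		.sz = 0,			/* no size cap */
		.reset_interval = 1000,		/* milliseconds */
		.weight_sz = 0,
		.weight_nr_accesses = 0,
		.weight_age = 1,		/* oldest regions first */
	};
	/* Only run while free memory is 2.0%..5.0% (scale is 0..1000). */
	struct damos_watermarks wmarks = {
		.metric = DAMOS_WMARK_FREE_MEM_RATE,
		.interval = 5 * 1000 * 1000,	/* check every 5s (usecs) */
		.high = 50,
		.mid = 40,
		.low = 20,
	};
	struct damos *scheme;

	/* Page out 4K..128M regions not accessed for >= 5 aggregations. */
	scheme = damon_new_scheme(PAGE_SIZE, 128 << 20,
				  0, 0,		/* nr_accesses exactly 0 */
				  5, UINT_MAX,	/* age bounds */
				  DAMOS_PAGEOUT, &quota, &wmarks);
	if (!scheme)
		return -ENOMEM;

	return damon_set_schemes(ctx, &scheme, 1);
}

This assumes damon_new_scheme() copies the quota and watermark values into the new scheme, so the stack lifetime here is worth verifying against its definition.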
*/ struct list_head list; }; @@ -366,6 +370,9 @@ struct dm_target { * zone append operations using regular writes. */ bool emulate_zone_append:1; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; void *dm_per_bio_data(struct bio *bio, size_t data_size); @@ -576,9 +583,9 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *t); /* - * Table keyslot manager functions + * Table blk_crypto_profile functions */ -void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm); +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile); /*----------------------------------------------------------------- * Macros. diff --git a/include/linux/device.h b/include/linux/device.h index e270cb740b9e..9bbf87def768 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -30,6 +30,7 @@ #include #include #include +#include #include struct device; @@ -575,6 +576,14 @@ struct device { #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + ANDROID_KABI_RESERVE(5); + ANDROID_KABI_RESERVE(6); + ANDROID_KABI_RESERVE(7); + ANDROID_KABI_RESERVE(8); }; /** @@ -603,6 +612,8 @@ struct device_link { struct kref kref; struct work_struct rm_work; bool supplier_preactivated; /* Owned by consumer probe. */ + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; static inline struct device *kobj_to_dev(struct kobject *kobj) diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 062777a45a74..e73ea437d141 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -112,6 +112,11 @@ struct bus_type { struct lock_class_key lock_key; bool need_parent_lock; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; extern int __must_check bus_register(struct bus_type *bus); diff --git a/include/linux/device/class.h b/include/linux/device/class.h index e61ec5502019..3d6a2b8d4ceb 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -18,6 +18,7 @@ #include #include #include +#include struct device; struct fwnode_handle; @@ -75,6 +76,11 @@ struct class { const struct dev_pm_ops *pm; struct subsys_private *p; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct class_dev_iter { diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index a498ebcf4993..c5b52e586e31 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -118,6 +118,11 @@ struct device_driver { void (*coredump) (struct device *dev); struct driver_private *p; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 3ad636a13b8e..1b935d19b992 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -22,6 +22,8 @@ #include #include #include +#include +#include struct device; struct dma_buf; @@ -229,6 +231,41 @@ struct dma_buf_ops { */ int (*begin_cpu_access)(struct dma_buf *, enum dma_data_direction); + /** + * @begin_cpu_access_partial: + * + * This is called from dma_buf_begin_cpu_access_partial() and allows the + * exporter to ensure that the memory specified in the range is + * available for cpu access - the exporter might need to allocate or + * swap-in and pin the backing storage. 
+ The exporter also needs to ensure that cpu access is + * coherent for the access direction. The direction can be used by the + * exporter to optimize the cache flushing, i.e. access with a different + * direction (read instead of write) might return stale or even bogus + * data (e.g. when the exporter needs to copy the data to temporary + * storage). + * + * This callback is optional. + * + * FIXME: This is both called through the DMA_BUF_IOCTL_SYNC command + * from userspace (where storage shouldn't be pinned to avoid handing + * de facto mlock rights to userspace) and for the kernel-internal + * users of the various kmap interfaces, where the backing storage must + * be pinned to guarantee that the atomic kmap calls can succeed. Since + * there are no in-kernel users of the kmap interfaces yet, this isn't a + * real problem. + * + * Returns: + * + * 0 on success or a negative error code on failure. This can for + * example fail when the backing storage can't be allocated. Can also + * return -ERESTARTSYS or -EINTR when the call has been interrupted and + * needs to be restarted. + */ + int (*begin_cpu_access_partial)(struct dma_buf *dmabuf, + enum dma_data_direction, + unsigned int offset, unsigned int len); + /** + * @end_cpu_access: + * @@ -246,6 +283,28 @@ */ int (*end_cpu_access)(struct dma_buf *, enum dma_data_direction); + /** + * @end_cpu_access_partial: + * + * This is called from dma_buf_end_cpu_access_partial() when the + * importer is done accessing the CPU. The exporter can use this to + * limit cache flushing to only the range specified and to unpin any + * resources pinned in @begin_cpu_access_partial. + * The result of any dma_buf kmap calls after end_cpu_access_partial is + * undefined. + * + * This callback is optional. + * + * Returns: + * + * 0 on success or a negative error code on failure. Can return + * -ERESTARTSYS or -EINTR when the call has been interrupted and needs + * to be restarted. + */ + int (*end_cpu_access_partial)(struct dma_buf *dmabuf, + enum dma_data_direction, + unsigned int offset, unsigned int len); + /** + * @mmap: + * @@ -285,6 +344,23 @@ int (*vmap)(struct dma_buf *dmabuf, struct dma_buf_map *map); void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map *map); + + /** + * @get_flags: + * + * This is called by dma_buf_get_flags and is used to get the buffer's + * flags. + * This callback is optional. + * + * Returns: + * + * 0 on success or a negative error code on failure. On success flags + * will be populated with the buffer's flags. + */ + int (*get_flags)(struct dma_buf *dmabuf, unsigned long *flags); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** @@ -442,10 +518,21 @@ struct dma_buf { * `DMA-BUF statistics`_ for the uapi this enables. */ struct dma_buf_sysfs_entry { - struct kobject kobj; + union { + struct kobject kobj; + + /** @sysfs_add_work: + * + * For deferred sysfs kobject creation using a workqueue. + */ + struct work_struct sysfs_add_work; + }; struct dma_buf *dmabuf; } *sysfs_entry; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** @@ -481,6 +568,7 @@ struct dma_buf_attach_ops { * point to the new location of the DMA-buf. */ void (*move_notify)(struct dma_buf_attachment *attach); + ANDROID_KABI_RESERVE(1); }; /** @@ -495,6 +583,8 @@ struct dma_buf_attach_ops { * @importer_ops: importer operations for this attachment, if provided * dma_buf_map/unmap_attachment() must be called with the dma_resv lock held. * @importer_priv: importer specific attachment data.
+ * @dma_map_attrs: DMA attributes to be used when the exporter maps the buffer + * through dma_buf_map_attachment. * * This structure holds the attachment information between the dma_buf buffer * and its user device(s). The list contains one attachment struct per device @@ -515,6 +605,10 @@ struct dma_buf_attachment { const struct dma_buf_attach_ops *importer_ops; void *importer_priv; void *priv; + unsigned long dma_map_attrs; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** @@ -538,6 +632,9 @@ struct dma_buf_export_info { int flags; struct dma_resv *resv; void *priv; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** @@ -592,6 +689,9 @@ dma_buf_attachment_is_dynamic(struct dma_buf_attachment *attach) return !!attach->importer_ops; } +int get_each_dmabuf(int (*callback)(const struct dma_buf *dmabuf, + void *private), void *private); +int is_dma_buf_file(struct file *file); struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev); struct dma_buf_attachment * @@ -616,11 +716,19 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, void dma_buf_move_notify(struct dma_buf *dma_buf); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); +int dma_buf_begin_cpu_access_partial(struct dma_buf *dma_buf, + enum dma_data_direction dir, + unsigned int offset, unsigned int len); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); +int dma_buf_end_cpu_access_partial(struct dma_buf *dma_buf, + enum dma_data_direction dir, + unsigned int offset, unsigned int len); int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); int dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map); void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map); +long dma_buf_set_name(struct dma_buf *dmabuf, const char *name); +int dma_buf_get_flags(struct dma_buf *dmabuf, unsigned long *flags); #endif /* __DMA_BUF_H__ */ diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 18aade195884..80f0dbac77c4 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -24,6 +24,33 @@ struct bus_dma_region { u64 offset; }; +static inline bool zone_dma32_is_empty(int node) +{ +#ifdef CONFIG_ZONE_DMA32 + pg_data_t *pgdat = NODE_DATA(node); + + return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]); +#else + return true; +#endif +} + +static inline bool zone_dma32_are_empty(void) +{ +#ifdef CONFIG_NUMA + int node; + + for_each_node(node) + if (!zone_dma32_is_empty(node)) + return false; +#else + if (!zone_dma32_is_empty(numa_node_id())) + return false; +#endif + + return true; +} + static inline dma_addr_t translate_phys_to_dma(struct device *dev, phys_addr_t paddr) { diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h index 0c05561cad6e..a1a190ebc461 100644 --- a/include/linux/dma-heap.h +++ b/include/linux/dma-heap.h @@ -17,6 +17,7 @@ struct dma_heap; /** * struct dma_heap_ops - ops to operate on a given heap * @allocate: allocate dmabuf and return struct dma_buf ptr + * @get_pool_size: if heap maintains memory pools, get pool size in bytes * * allocate returns dmabuf on success, ERR_PTR(-errno) on error. 
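On the importer side, the partial begin/end entry points declared above let the CPU touch a sub-range without paying for a whole-buffer cache flush. A sketch, assuming the exporter provides a kernel vaddr mapping and implements the partial hooks (offset/len are caller-chosen):

#include <linux/dma-buf.h>
#include <linux/dma-direction.h>
#include <linux/string.h>

static int example_cpu_peek(struct dma_buf *dmabuf, void *dst,
			    unsigned int offset, unsigned int len)
{
	struct dma_buf_map map;
	int err;

	/* Make just [offset, offset + len) coherent for CPU reads. */
	err = dma_buf_begin_cpu_access_partial(dmabuf, DMA_FROM_DEVICE,
					       offset, len);
	if (err)
		return err;

	err = dma_buf_vmap(dmabuf, &map);
	if (!err) {
		if (map.is_iomem)
			err = -EINVAL;	/* keep the sketch simple */
		else
			memcpy(dst, (char *)map.vaddr + offset, len);
		dma_buf_vunmap(dmabuf, &map);
	}

	dma_buf_end_cpu_access_partial(dmabuf, DMA_FROM_DEVICE, offset, len);
	return err;
}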
*/ @@ -25,6 +26,7 @@ struct dma_heap_ops { unsigned long len, unsigned long fd_flags, unsigned long heap_flags); + long (*get_pool_size)(struct dma_heap *heap); }; /** @@ -50,6 +52,15 @@ struct dma_heap_export_info { */ void *dma_heap_get_drvdata(struct dma_heap *heap); +/** + * dma_heap_get_dev() - get device struct for the heap + * @heap: DMA-Heap to retrieve device struct from + * + * Returns: + * The device struct for the heap. + */ +struct device *dma_heap_get_dev(struct dma_heap *heap); + /** * dma_heap_get_name() - get heap name * @heap: DMA-Heap to retrieve private data for @@ -65,4 +76,49 @@ const char *dma_heap_get_name(struct dma_heap *heap); */ struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info); +/** + * dma_heap_put - drops a reference to a dmabuf heap, potentially freeing it + * @heap: heap pointer + */ +void dma_heap_put(struct dma_heap *heap); + +/** + * dma_heap_find - Returns the registered dma_heap with the specified name + * @name: Name of the heap to find + * + * NOTE: dma_heaps returned from this function MUST be released + * using dma_heap_put() when the user is done. + */ +struct dma_heap *dma_heap_find(const char *name); + +/** + * dma_heap_buffer_alloc - Allocate dma-buf from a dma_heap + * @heap: dma_heap to allocate from + * @len: size to allocate + * @fd_flags: flags to set on returned dma-buf fd + * @heap_flags: flags to pass to the dma heap + * + * This is for internal dma-buf allocations only. + */ +struct dma_buf *dma_heap_buffer_alloc(struct dma_heap *heap, size_t len, + unsigned int fd_flags, + unsigned int heap_flags); + +/** dma_heap_buffer_free - Free dma_buf allocated by dma_heap_buffer_alloc + * @dma_buf: dma_buf to free + * + * This is really only a simple wrapper to dma_buf_put() + */ +void dma_heap_buffer_free(struct dma_buf *); + +/** + * dma_heap_bufferfd_alloc - Allocate dma-buf fd from a dma_heap + * @heap: dma_heap to allocate from + * @len: size to allocate + * @fd_flags: flags to set on returned dma-buf fd + * @heap_flags: flags to pass to the dma heap + */ +int dma_heap_bufferfd_alloc(struct dma_heap *heap, size_t len, + unsigned int fd_flags, + unsigned int heap_flags); #endif /* _DMA_HEAPS_H */ diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 24607dc3c2ac..ee6eee9af9ed 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -43,6 +43,8 @@ void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, extern bool iommu_dma_forcedac; +int iommu_dma_enable_best_fit_algo(struct device *dev); + #else /* CONFIG_IOMMU_DMA */ struct iommu_domain; @@ -89,5 +91,10 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he { } +static inline int iommu_dma_enable_best_fit_algo(struct device *dev) +{ + return -ENODEV; +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 0d5b06b3a4a6..1a269c891b60 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -8,6 +8,7 @@ #include #include +#include struct cma; @@ -70,6 +71,11 @@ struct dma_map_ops { u64 (*get_required_mask)(struct device *dev); size_t (*max_mapping_size)(struct device *dev); unsigned long (*get_merge_boundary)(struct device *dev); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #ifdef CONFIG_DMA_OPS @@ -166,6 +172,7 @@ static inline void dma_pernuma_cma_reserve(void) { } #ifdef CONFIG_DMA_DECLARE_COHERENT int
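For the kernel-internal allocation path these declarations add, the lookup, alloc, and release helpers chain as below. A sketch; "system" is the name the stock system heap registers under, and the fd_flags mirror what userspace would pass to the heap ioctl:

#include <linux/dma-buf.h>
#include <linux/dma-heap.h>
#include <linux/err.h>
#include <linux/fcntl.h>

static struct dma_buf *example_alloc_from_system_heap(size_t len)
{
	struct dma_heap *heap;
	struct dma_buf *buf;

	heap = dma_heap_find("system");
	if (!heap)
		return ERR_PTR(-ENODEV);

	buf = dma_heap_buffer_alloc(heap, len, O_RDWR | O_CLOEXEC, 0);

	/* The buffer, if any, holds its own references; drop ours. */
	dma_heap_put(heap);
	return buf;	/* release later with dma_heap_buffer_free() */
}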
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size); +void dma_release_coherent_memory(struct device *dev); int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret); int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); @@ -177,9 +184,11 @@ static inline int dma_declare_coherent_memory(struct device *dev, { return -ENOSYS; } + #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) #define dma_release_from_dev_coherent(dev, order, vaddr) (0) #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) +static inline void dma_release_coherent_memory(struct device *dev) { } #endif /* CONFIG_DMA_DECLARE_COHERENT */ #ifdef CONFIG_DMA_GLOBAL_POOL @@ -277,6 +286,14 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, #define pgprot_dmacoherent(prot) pgprot_noncached(prot) #endif +/* + * If there is no system cache pgprot, then fallback to dmacoherent + * pgprot, as the expectation is that the device is not coherent. + */ +#ifndef pgprot_syscached +#define pgprot_syscached(prot) pgprot_dmacoherent(prot) +#endif + pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs); #else static inline pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index dca2b1355bb1..b6106938a94b 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,6 +61,23 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * DMA_ATTR_SYS_CACHE_ONLY: used to indicate that the buffer should be mapped + * with the correct memory attributes so that it can be cached in the system + * or last level cache. This is useful for buffers that are being mapped for + * devices that are non-coherent, but can use the system cache. + */ +#define DMA_ATTR_SYS_CACHE_ONLY (1UL << 10) + +/* + * DMA_ATTR_SYS_CACHE_ONLY_NWA: used to indicate that the buffer should be + * mapped with the correct memory attributes so that it can be cached in the + * system or last level cache, with a no write allocate cache policy. This is + * useful for buffers that are being mapped for devices that are non-coherent, + * but can use the system cache. + */ +#define DMA_ATTR_SYS_CACHE_ONLY_NWA (1UL << 11) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. 
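A driver for a device that is non-coherent but can still allocate into the system/last-level cache would pass the new attribute through the usual dma_alloc_attrs() path; a sketch:

#include <linux/dma-mapping.h>

static void *example_alloc_syscached(struct device *dev, size_t size,
				     dma_addr_t *handle)
{
	/*
	 * DMA_ATTR_SYS_CACHE_ONLY maps the buffer with pgprot_syscached(),
	 * which this series defines to fall back to pgprot_dmacoherent()
	 * on platforms that don't override it.
	 */
	return dma_alloc_attrs(dev, size, handle, GFP_KERNEL,
			       DMA_ATTR_SYS_CACHE_ONLY);
}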
It is specific to a diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 9000f3ffce8b..ba37dfc6105a 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -12,6 +12,7 @@ #include #include #include +#include #include /** @@ -621,6 +622,11 @@ struct dma_async_tx_descriptor { struct dma_async_tx_descriptor *parent; spinlock_t lock; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #ifdef CONFIG_DMA_ENGINE diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ef9ceead3db1..bd8093bc9ea9 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -4,6 +4,7 @@ #include #include +#include #ifdef CONFIG_BLOCK @@ -50,6 +51,11 @@ struct elevator_mq_ops { struct request *(*next_request)(struct request_queue *, struct request *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define ELV_NAME_MAX (16) @@ -86,6 +92,9 @@ struct elevator_type /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */ struct list_head list; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #define ELV_HASH_BITS 6 diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 3fad741df53e..200e7f3b7b47 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -16,6 +16,7 @@ #include #include #include +#include #include struct compat_ethtool_rx_flow_spec { @@ -705,6 +706,11 @@ struct ethtool_ops { void (*get_rmon_stats)(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/linux/export.h b/include/linux/export.h index 5910ccb66ca2..770abed3a1e7 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -120,7 +120,7 @@ struct kernel_symbol { */ #define __EXPORT_SYMBOL(sym, sec, ns) -#elif defined(CONFIG_TRIM_UNUSED_KSYMS) +#elif defined(CONFIG_TRIM_UNUSED_KSYMS) && !defined(MODULE) #include diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d445150c5350..ee0d75d9a302 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -73,6 +73,42 @@ struct f2fs_device { __le32 total_segments; } __packed; +/* reason of stop_checkpoint */ +enum stop_cp_reason { + STOP_CP_REASON_SHUTDOWN, + STOP_CP_REASON_FAULT_INJECT, + STOP_CP_REASON_META_PAGE, + STOP_CP_REASON_WRITE_FAIL, + STOP_CP_REASON_CORRUPTED_SUMMARY, + STOP_CP_REASON_UPDATE_INODE, + STOP_CP_REASON_FLUSH_FAIL, + STOP_CP_REASON_MAX, +}; + +#define MAX_STOP_REASON 32 + +/* detail reason for EFSCORRUPTED */ +enum f2fs_error { + ERROR_CORRUPTED_CLUSTER, + ERROR_FAIL_DECOMPRESSION, + ERROR_INVALID_BLKADDR, + ERROR_CORRUPTED_DIRENT, + ERROR_CORRUPTED_INODE, + ERROR_INCONSISTENT_SUMMARY, + ERROR_INCONSISTENT_FOOTER, + ERROR_INCONSISTENT_SUM_TYPE, + ERROR_CORRUPTED_JOURNAL, + ERROR_INCONSISTENT_NODE_COUNT, + ERROR_INCONSISTENT_BLOCK_COUNT, + ERROR_INVALID_CURSEG, + ERROR_INCONSISTENT_SIT, + ERROR_CORRUPTED_VERITY_XATTR, + ERROR_CORRUPTED_XATTR, + ERROR_MAX, +}; + +#define MAX_F2FS_ERRORS 16 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -116,7 +152,9 @@ struct f2fs_super_block { __u8 hot_ext_count; /* # of hot file extension 
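The f2fs superblock hunk continuing below carves both new arrays out of the existing reserved region, so the on-disk layout size is unchanged. An illustrative compile-time check of that arithmetic (not part of the patch):

#include <linux/build_bug.h>

/* 306 previously reserved bytes == 32 + 16 + 258 still-reserved bytes */
static_assert(MAX_STOP_REASON + MAX_F2FS_ERRORS + 258 == 306);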
*/ __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ - __u8 reserved[306]; /* valid reserved region */ + __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ + __u8 s_errors[MAX_F2FS_ERRORS]; /* reason of image corrupts */ + __u8 reserved[258]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; diff --git a/include/linux/fb.h b/include/linux/fb.h index 02f362c661c8..3d7306c9a706 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -502,6 +502,7 @@ struct fb_info { } *apertures; bool skip_vt_switch; /* no VT switch on suspend/resume required */ + bool forced_out; /* set when being removed by another driver */ }; static inline struct apertures_struct *alloc_apertures(unsigned int max_num) { diff --git a/include/linux/fips.h b/include/linux/fips.h index c6961e932fef..83f5d6f7c62d 100644 --- a/include/linux/fips.h +++ b/include/linux/fips.h @@ -2,7 +2,15 @@ #ifndef _FIPS_H #define _FIPS_H -#ifdef CONFIG_CRYPTO_FIPS +#ifdef BUILD_FIPS140_KO +/* + * In fips140.ko, enable the behavior that the upstream fips_enabled flag + * controls, such as the XTS weak key check. + */ +#define fips_enabled 1 +#define CONFIG_CRYPTO_FIPS 1 + +#elif defined(CONFIG_CRYPTO_FIPS) extern int fips_enabled; extern struct atomic_notifier_head fips_fail_notif_chain; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 0621c5f86c39..3a2b6902bee8 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -8,6 +8,9 @@ #include #include #include +#if defined(CONFIG_ARM64) && !defined(__GENKSYMS__) +#include +#endif #ifdef CONFIG_FREEZER extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */ @@ -27,6 +30,11 @@ static inline bool frozen(struct task_struct *p) return p->flags & PF_FROZEN; } +static inline bool frozen_or_skipped(struct task_struct *p) +{ + return p->flags & (PF_FROZEN | PF_FREEZER_SKIP); +} + extern bool freezing_slow_path(struct task_struct *p); /* @@ -103,10 +111,15 @@ static inline bool cgroup_freezing(struct task_struct *task) * The caller shouldn't do anything which isn't allowed for a frozen task * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair * wrap a scheduling operation and nothing much else. + * + * The write to current->flags uses release semantics to prevent a concurrent + * freezer_should_skip() from observing this write before a write to on_rq + * during a prior call to activate_task(), which may cause it to return true + * before deactivate_task() is called. */ static inline void freezer_do_not_count(void) { - current->flags |= PF_FREEZER_SKIP; + smp_store_release(¤t->flags, current->flags | PF_FREEZER_SKIP); } /** @@ -156,7 +169,19 @@ static inline bool freezer_should_skip(struct task_struct *p) * clearing %PF_FREEZER_SKIP. */ smp_mb(); +#ifdef CONFIG_ARM64 + return (p->flags & PF_FREEZER_SKIP) && + (!p->on_rq || task_cpu_possible_mask(p) == cpu_possible_mask); +#else + /* + * On non-aarch64, avoid depending on task_cpu_possible_mask(), which is + * defined in , because including that header from + * here exposes a tricky bug in the tracepoint headers on x86, and that + * macro would end up being defined equal to cpu_possible_mask on other + * architectures anyway. 
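The ordering argument in the freezer_do_not_count() comment above is easiest to see as a two-CPU interleaving. An illustrative sketch (simplified, not from the patch): the release store pairs with the smp_mb() in freezer_should_skip(), so any observer that sees PF_FREEZER_SKIP also sees the preceding on_rq update:

    CPU0 (task entering freezer-skip)     CPU1 (freezer_should_skip())
    p->on_rq = 1;   /* activate_task() */
    smp_store_release(&p->flags,
                      p->flags | PF_FREEZER_SKIP);
                                          flags = p->flags;
                                          smp_mb();
                                          if (flags & PF_FREEZER_SKIP)
                                                  /* must see on_rq == 1 */

Without the release semantics, CPU1 could observe PF_FREEZER_SKIP while still reading a stale on_rq of zero, and the ARM64 branch below would wrongly report the task as skippable before deactivate_task() runs.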
+ */ return p->flags & PF_FREEZER_SKIP; +#endif } /* @@ -270,6 +295,7 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } +static inline bool frozen_or_skipped(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } static inline void __thaw_task(struct task_struct *t) {} diff --git a/include/linux/fs.h b/include/linux/fs.h index 1e1ac116dd13..3114de5e9628 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -419,6 +420,11 @@ struct address_space_operations { int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; extern const struct address_space_operations empty_aops; @@ -477,6 +483,11 @@ struct address_space { spinlock_t private_lock; struct list_head private_list; void *private_data; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but @@ -512,6 +523,11 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) up_write(&mapping->i_mmap_rwsem); } +static inline int i_mmap_trylock_read(struct address_space *mapping) +{ + return down_read_trylock(&mapping->i_mmap_rwsem); +} + static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); @@ -731,6 +747,9 @@ struct inode { #endif void *i_private; /* fs or device private pointer */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); } __randomize_layout; struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); @@ -1001,6 +1020,9 @@ struct file { struct address_space *f_mapping; errseq_t f_wb_err; errseq_t f_sb_err; /* for syncfs */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1061,6 +1083,9 @@ struct file_lock; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct lock_manager_operations { @@ -1072,6 +1097,9 @@ struct lock_manager_operations { int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); bool (*lm_breaker_owns_lease)(struct file_lock *); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct lock_manager { @@ -1145,6 +1173,10 @@ struct file_lock { unsigned int debug_id; } afs; } fl_u; + + struct list_head android_reserved1; /* not a macro as we might just need it as-is */ + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); } __randomize_layout; struct file_lock_context { @@ -1606,6 +1638,11 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) @@ -2046,6 +2083,11 @@ struct file_operations { struct file *file_out, loff_t pos_out, loff_t len, unsigned int 
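i_mmap_trylock_read(), added above, gives lock-averse paths (reclaim-style walkers, for instance) an opportunistic alternative to sleeping on i_mmap_rwsem. A minimal sketch; the function is hypothetical:

#include <linux/fs.h>

static bool example_walk_mappings(struct address_space *mapping)
{
	if (!i_mmap_trylock_read(mapping))
		return false;	/* contended: caller can retry later */

	/* ... walk the i_mmap interval tree under the read lock ... */

	i_mmap_unlock_read(mapping);
	return true;
}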
remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __randomize_layout; struct inode_operations { @@ -2087,6 +2129,11 @@ struct inode_operations { int (*fileattr_set)(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } ____cacheline_aligned; static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, @@ -2162,6 +2209,11 @@ struct super_operations { struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /* @@ -2478,6 +2530,11 @@ struct file_system_type { struct lock_class_key i_mutex_key; struct lock_class_key invalidate_lock_key; struct lock_class_key i_mutex_dir_key; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) @@ -2689,6 +2746,7 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); +extern struct file *filp_open_block(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, const char *, int, umode_t); static inline struct file *file_open_root_mnt(struct vfsmount *mnt, @@ -3462,6 +3520,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; } +static inline rwf_t iocb_to_rw_flags(int ifl, int iocb_mask) +{ + return ifl & iocb_mask; +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 3c7ea2cf85a5..6a95b750ab18 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -18,10 +18,21 @@ #include #include -#define FS_CRYPTO_BLOCK_SIZE 16 +/* + * The lengths of all file contents blocks must be divisible by this value. + * This is needed to ensure that all contents encryption modes will work, as + * some of the supported modes don't support arbitrarily byte-aligned messages. + * + * Since the needed alignment is 16 bytes, most filesystems will meet this + * requirement naturally, as typical block sizes are powers of 2. However, if a + * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via + * compression), then it will need to pad to this alignment before encryption. + */ +#define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_info; +struct fs_parameter; struct seq_file; struct fscrypt_str { @@ -118,9 +129,6 @@ struct fscrypt_operations { */ bool (*empty_dir)(struct inode *inode); - /* The filesystem's maximum ciphertext filename length, in bytes */ - unsigned int max_namelen; - /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations @@ -153,24 +161,28 @@ struct fscrypt_operations { int *ino_bits_ret, int *lblk_bits_ret); /* - * Return the number of block devices to which the filesystem may write - * encrypted file contents. 
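FSCRYPT_CONTENTS_ALIGNMENT, introduced above, matters mainly to filesystems whose contents blocks are not already power-of-two sized. A sketch of the padding a compressing filesystem would apply before contents encryption, per the requirement stated in the new comment (hypothetical helper):

#include <linux/align.h>
#include <linux/fscrypt.h>

static unsigned int example_padded_cluster_len(unsigned int compressed_len)
{
	/* Pad arbitrary byte lengths up to the 16-byte crypto alignment */
	return ALIGN(compressed_len, FSCRYPT_CONTENTS_ALIGNMENT);
}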
+ * Return an array of pointers to the block devices to which the + * filesystem may write encrypted file contents, NULL if the filesystem + * only has a single such block device, or an ERR_PTR() on error. + * + * On successful non-NULL return, *num_devs is set to the number of + * devices in the returned array. The caller must free the returned + * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. */ - int (*get_num_devices)(struct super_block *sb); + struct block_device **(*get_devices)(struct super_block *sb, + unsigned int *num_devs); - /* - * If ->get_num_devices() returns a value greater than 1, then this - * function is called to get the array of request_queues that the - * filesystem is using -- one per block device. (There may be duplicate - * entries in this array, as block devices can share a request_queue.) - */ - void (*get_devices)(struct super_block *sb, - struct request_queue **devs); + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + + ANDROID_OEM_DATA_ARRAY(1, 4); }; static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode) @@ -276,16 +288,26 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); +int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; +int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, + struct fscrypt_dummy_policy *dummy_policy); +bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, + const struct fscrypt_dummy_policy *p2); int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, struct fscrypt_dummy_policy *dummy_policy); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); +static inline bool +fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) +{ + return dummy_policy->policy != NULL; +} static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { @@ -296,6 +318,8 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); +int fscrypt_add_test_dummy_key(struct super_block *sb, + const struct fscrypt_dummy_policy *dummy_policy); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); @@ -308,6 +332,10 @@ void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int 
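Replacing the ->get_num_devices()/->get_devices() pair with a single allocation-returning hook simplifies single-device filesystems considerably. A sketch of the minimal implementation under the new contract (hypothetical filesystem code):

/* NULL means "only sb->s_bdev carries encrypted contents" */
static struct block_device **
example_get_devices(struct super_block *sb, unsigned int *num_devs)
{
	return NULL;
}

A multi-device filesystem would instead kmalloc_array() the list, set *num_devs, and return an ERR_PTR() on allocation failure; the caller kfree()s the array.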
lookup, struct fscrypt_name *fname); @@ -329,7 +357,7 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ -void fscrypt_decrypt_bio(struct bio *bio); +bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); @@ -470,12 +498,32 @@ static inline int fscrypt_set_context(struct inode *inode, void *fs_data) struct fscrypt_dummy_policy { }; +static inline int +fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, + struct fscrypt_dummy_policy *dummy_policy) +{ + return -EINVAL; +} + +static inline bool +fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, + const struct fscrypt_dummy_policy *p2) +{ + return true; +} + static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } +static inline bool +fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) +{ + return false; +} + static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { @@ -491,6 +539,13 @@ static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) return -EOPNOTSUPP; } +static inline int +fscrypt_add_test_dummy_key(struct super_block *sb, + const struct fscrypt_dummy_policy *dummy_policy) +{ + return 0; +} + static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; @@ -595,8 +650,9 @@ static inline int fscrypt_d_revalidate(struct dentry *dentry, } /* bio.c */ -static inline void fscrypt_decrypt_bio(struct bio *bio) +static inline bool fscrypt_decrypt_bio(struct bio *bio) { + return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, @@ -717,6 +773,10 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); +bool fscrypt_dio_supported(struct inode *inode); + +u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); + #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) @@ -745,8 +805,33 @@ static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, { return true; } + +static inline bool fscrypt_dio_supported(struct inode *inode) +{ + return !fscrypt_needs_contents_encryption(inode); +} + +static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, + u64 nr_blocks) +{ + return nr_blocks; +} #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_DM_DEFAULT_KEY) +static inline bool +fscrypt_inode_should_skip_dm_default_key(const struct inode *inode) +{ + return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); +} +#else +static inline bool +fscrypt_inode_should_skip_dm_default_key(const struct inode *inode) +{ + return false; +} +#endif + /** * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline * encryption diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index a9477c14fad5..f2e7ffcb746b 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -96,6 +96,25 @@ static inline int fsnotify_file(struct file *file, __u32 mask) if (file->f_mode & FMODE_NONOTIFY) return 0; + /* + * Open calls notify early on, so lower file system must be notified + */ + if (mask & FS_OPEN) { + if (path->dentry->d_op && + 
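The two inline-crypto helpers declared above are meant for a filesystem's direct-I/O setup path: one gates DIO entirely, the other clamps an extent so a bio never crosses a discontinuity in the crypto data unit numbers. A hedged sketch against the signatures as declared in this patch (hypothetical helper):

#include <linux/fscrypt.h>

static int example_dio_check(struct inode *inode, u64 lblk, u64 *nr_blocks)
{
	if (!fscrypt_dio_supported(inode))
		return -EOPNOTSUPP;	/* fall back to buffered I/O */

	*nr_blocks = fscrypt_limit_io_blocks(inode, lblk, *nr_blocks);
	return 0;
}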
path->dentry->d_op->d_canonical_path) { + struct path lower_path; + int ret; + + path->dentry->d_op->d_canonical_path(path, &lower_path); + ret = fsnotify_parent(lower_path.dentry, mask, + &lower_path, FSNOTIFY_EVENT_PATH); + path_put(&lower_path); + + if (ret) + return ret; + } + } + return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index b568b3c7d095..270ba08df011 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -12,8 +12,16 @@ #define _LINUX_FSVERITY_H #include +#include +#include #include +/* + * Largest digest size among all hash algorithms supported by fs-verity. + * Currently assumed to be <= size of fsverity_descriptor::root_hash. + */ +#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE + /* Verity operations for filesystems */ struct fsverity_operations { @@ -131,6 +139,9 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *arg); /* measure.c */ int fsverity_ioctl_measure(struct file *filp, void __user *arg); +int fsverity_get_digest(struct inode *inode, + u8 digest[FS_VERITY_MAX_DIGEST_SIZE], + enum hash_algo *alg); /* open.c */ @@ -170,6 +181,13 @@ static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg) return -EOPNOTSUPP; } +static inline int fsverity_get_digest(struct inode *inode, + u8 digest[FS_VERITY_MAX_DIGEST_SIZE], + enum hash_algo *alg) +{ + return -EOPNOTSUPP; +} + /* open.c */ static inline int fsverity_file_open(struct inode *inode, struct file *filp) @@ -233,4 +251,18 @@ static inline bool fsverity_active(const struct inode *inode) return fsverity_get_info(inode) != NULL; } +#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES +int __fsverity_verify_signature(const struct inode *inode, const u8 *signature, + size_t sig_size, const u8 *file_digest, + unsigned int digest_algorithm); +#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ +static inline int __fsverity_verify_signature(const struct inode *inode, + const u8 *signature, size_t sig_size, + const u8 *file_digest, + unsigned int digest_algorithm) +{ + return 0; +} +#endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ + #endif /* _LINUX_FSVERITY_H */ diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 2d68606fb725..84fa9ee51f4d 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -12,6 +12,7 @@ #include #include #include +#include struct fwnode_operations; struct device; @@ -39,6 +40,7 @@ struct fwnode_handle { struct list_head suppliers; struct list_head consumers; u8 flags; + ANDROID_KABI_RESERVE(1); }; struct fwnode_link { @@ -46,6 +48,9 @@ struct fwnode_link { struct list_head s_hook; struct fwnode_handle *consumer; struct list_head c_hook; + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); }; /** diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0b48a0cf4262..422806e0a0f5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -16,6 +16,7 @@ #include #include #include +#include extern const struct device_type disk_type; extern struct device_type part_type; @@ -112,6 +113,9 @@ struct blk_integrity { unsigned char tuple_size; unsigned char interval_exp; unsigned char tag_size; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct gendisk { @@ -162,6 +166,11 @@ struct gendisk { struct badblocks *bb; struct lockdep_map lockdep_map; u64 diskseq; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; static inline bool 
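fsverity_get_digest() above is aimed at in-kernel measurement users (IMA-style policies). A sketch of the expected call, with a hypothetical consumer:

#include <linux/fsverity.h>

static int example_measure(struct inode *inode)
{
	u8 digest[FS_VERITY_MAX_DIGEST_SIZE];
	enum hash_algo alg;
	int err;

	err = fsverity_get_digest(inode, digest, &alg);
	if (err)
		return err;	/* -EOPNOTSUPP if the inode has no verity info */

	/* feed digest/alg into a measurement or appraisal policy here */
	return 0;
}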
disk_live(struct gendisk *disk) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 55b2ec1f965a..b86ae05b4282 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -54,12 +54,30 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u -#define ___GFP_SKIP_KASAN_POISON 0x1000000u +#ifdef CONFIG_KASAN_HW_TAGS +#define ___GFP_SKIP_ZERO 0x1000000u +#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u +#define ___GFP_SKIP_KASAN_POISON 0x4000000u +#else +#define ___GFP_SKIP_ZERO 0 +#define ___GFP_SKIP_KASAN_UNPOISON 0 +#define ___GFP_SKIP_KASAN_POISON 0 +#endif +#ifdef CONFIG_CMA +#define ___GFP_CMA 0x8000000u +#else +#define ___GFP_CMA 0 +#endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x2000000u +#ifdef CONFIG_CMA +#define ___GFP_NOLOCKDEP 0x10000000u +#else +#define ___GFP_NOLOCKDEP 0x8000000u +#endif #else #define ___GFP_NOLOCKDEP 0 #endif + /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -73,6 +91,7 @@ struct vm_area_struct; #define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) #define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ +#define __GFP_CMA ((__force gfp_t)___GFP_CMA) #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /** @@ -232,24 +251,35 @@ struct vm_area_struct; * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if - * __GFP_ZERO is set. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself + * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that + * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting + * memory tags at the same time as zeroing memory has minimal additional + * performance impact. * - * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned - * on deallocation. Typically used for userspace pages. Currently only has an - * effect in HW tags mode. + * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. + * Only effective in HW_TAGS mode. + * + * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation. + * Typically used for userspace pages. Only effective in HW_TAGS mode.
*/ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) -#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) +#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO) +#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) +#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#ifdef CONFIG_CMA +#define __GFP_BITS_SHIFT (28 + IS_ENABLED(CONFIG_LOCKDEP)) +#else +#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP)) +#endif #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** @@ -331,7 +361,7 @@ struct vm_area_struct; #define GFP_DMA32 __GFP_DMA32 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | \ - __GFP_SKIP_KASAN_POISON) + __GFP_SKIP_KASAN_POISON | __GFP_SKIP_KASAN_UNPOISON) #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) @@ -473,16 +503,7 @@ static inline bool gfpflags_normal_context(const gfp_t gfp_flags) | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) -static inline enum zone_type gfp_zone(gfp_t flags) -{ - enum zone_type z; - int bit = (__force int) (flags & GFP_ZONEMASK); - - z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & - ((1 << GFP_ZONES_SHIFT) - 1); - VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); - return z; -} +enum zone_type gfp_zone(gfp_t flags); /* * There is only one page-allocator function, and two main namespaces to @@ -672,9 +693,24 @@ static inline bool pm_suspended_storage(void) #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_CONTIG_ALLOC +extern unsigned long pfn_max_align_up(unsigned long pfn); + +#define ACR_ERR_ISOLATE (1 << 0) +#define ACR_ERR_MIGRATE (1 << 1) +#define ACR_ERR_TEST (1 << 2) + +struct acr_info { + unsigned long nr_mapped; + unsigned long nr_migrated; + unsigned long nr_reclaimed; + unsigned int err; + unsigned long failed_pfn; +}; + /* The below functions must be run on a range from a single zone. 
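The flag renumbering above is easy to mis-audit, so it may help to spell out the layout it produces. With CONFIG_KASAN_HW_TAGS and CONFIG_CMA both enabled: ___GFP_SKIP_ZERO is bit 24, ___GFP_SKIP_KASAN_UNPOISON bit 25, ___GFP_SKIP_KASAN_POISON bit 26, ___GFP_CMA bit 27, and ___GFP_NOLOCKDEP bit 28, which is exactly why __GFP_BITS_SHIFT becomes 28 + IS_ENABLED(CONFIG_LOCKDEP). Illustrative compile-time checks (not in the patch; the disabled-config cases degenerate to 0 and pass trivially):

#include <linux/build_bug.h>

/* Every flag, including the conditional ones, must fit in the mask */
static_assert(___GFP_CMA < (1u << __GFP_BITS_SHIFT));
static_assert(___GFP_NOLOCKDEP < (1u << __GFP_BITS_SHIFT));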
*/ extern int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype, gfp_t gfp_mask); + unsigned migratetype, gfp_t gfp_mask, + struct acr_info *info); extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 11c26ae7b4fa..0d6e7bd661ff 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -10,6 +10,7 @@ #include #include #include +#include struct gpio_desc; struct of_phandle_args; @@ -275,6 +276,9 @@ struct gpio_irq_chip { * Store old irq_chip irq_mask callback */ void (*irq_mask)(struct irq_data *data); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** @@ -494,6 +498,9 @@ struct gpio_chip { struct device_node *np); #endif /* CONFIG_OF_GPIO */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; extern const char *gpiochip_is_requested(struct gpio_chip *gc, diff --git a/include/linux/hid.h b/include/linux/hid.h index 26742ca14609..f5ebf6cbcfad 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -26,6 +26,7 @@ #include #include #include +#include /* * We parse each description item into this structure. Short items data @@ -629,6 +630,9 @@ struct hid_device { /* device report descriptor */ struct list_head debug_list; spinlock_t debug_list_lock; wait_queue_head_t debug_wait; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #define to_hid_device(pdev) \ diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 4aa1031d3e4c..7e6cfb1a2c2e 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -8,7 +8,7 @@ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); -void kunmap_local_indexed(void *vaddr); +void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); void __kmap_local_sched_in(void); @@ -83,7 +83,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) return __kmap_local_pfn_prot(pfn, kmap_prot); } -static inline void __kunmap_local(void *vaddr) +static inline void __kunmap_local(const void *vaddr) { kunmap_local_indexed(vaddr); } @@ -115,7 +115,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn) return __kmap_local_pfn_prot(pfn, kmap_prot); } -static inline void __kunmap_atomic(void *addr) +static inline void __kunmap_atomic(const void *addr) { kunmap_local_indexed(addr); pagefault_enable(); @@ -181,7 +181,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) return kmap_local_page(pfn_to_page(pfn)); } -static inline void __kunmap_local(void *addr) +static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(addr); @@ -208,7 +208,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn) return kmap_atomic(pfn_to_page(pfn)); } -static inline void __kunmap_atomic(void *addr) +static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(addr); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b4c49f9cc379..a11de6d65954 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -166,7 +166,7 @@ static inline struct page * alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, unsigned long vaddr) { - struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); + struct page *page = 
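The ANDROID variant of alloc_contig_range() above takes an extra struct acr_info out-parameter that reports progress and, on failure, which stage and PFN failed. A sketch of a CMA-style caller (hypothetical helper, assuming CONFIG_CONTIG_ALLOC and CONFIG_CMA):

#include <linux/gfp.h>
#include <linux/printk.h>

static int example_alloc_range(unsigned long start_pfn, unsigned long end_pfn)
{
	struct acr_info info = {};
	int ret;

	ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_CMA,
				 GFP_KERNEL, &info);
	if (ret && (info.err & ACR_ERR_MIGRATE))
		pr_debug("migration stalled at pfn %lx (migrated %lu)\n",
			 info.failed_pfn, info.nr_migrated);
	return ret;
}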
alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_CMA, vma, vaddr); if (page) clear_user_highpage(page, vaddr); @@ -182,6 +182,16 @@ static inline void clear_highpage(struct page *page) kunmap_atomic(kaddr); } +static inline void clear_highpage_kasan_tagged(struct page *page) +{ + u8 tag; + + tag = page_kasan_tag(page); + page_kasan_tag_reset(page); + clear_highpage(page); + page_kasan_tag_set(page, tag); +} + #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0ee140176f10..f5d43920ecd6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -20,6 +20,7 @@ #include #include #include +#include struct hrtimer_clock_base; struct hrtimer_cpu_base; @@ -124,6 +125,8 @@ struct hrtimer { u8 is_rel; u8 is_soft; u8 is_hard; + + ANDROID_KABI_RESERVE(1); }; /** diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 00ed7c17698d..2928605cf54c 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2021 Intel Corporation + * Copyright (c) 2018 - 2022 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1929,6 +1930,131 @@ struct ieee80211_mu_edca_param_set { struct ieee80211_he_mu_edca_param_ac_rec ac_vo; } __packed; +#define IEEE80211_EHT_MCS_NSS_RX 0x0f +#define IEEE80211_EHT_MCS_NSS_TX 0xf0 + +/** + * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max + * supported NSS for per MCS. + * + * For each field below, bits 0 - 3 indicate the maximal number of spatial + * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams + * for Tx. + * + * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 0 - 7. + * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 8 - 9. + * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 10 - 11. + * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 12 - 13. + */ +struct ieee80211_eht_mcs_nss_supp_20mhz_only { + u8 rx_tx_mcs7_max_nss; + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; +}; + +/** + * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except + * 20MHz only stations). + * + * For each field below, bits 0 - 3 indicate the maximal number of spatial + * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams + * for Tx. + * + * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 0 - 9. + * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 10 - 11. 
+ * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 12 - 13. + */ +struct ieee80211_eht_mcs_nss_supp_bw { + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; +}; + +/** + * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data + * + * This structure is the "EHT Capabilities element" fixed fields as + * described in P802.11be_D2.0 section 9.4.2.313. + * + * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP* + * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP* + */ +struct ieee80211_eht_cap_elem_fixed { + u8 mac_cap_info[2]; + u8 phy_cap_info[9]; +} __packed; + +/** + * struct ieee80211_eht_cap_elem - EHT capabilities element + * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed + * @optional: optional parts + */ +struct ieee80211_eht_cap_elem { + struct ieee80211_eht_cap_elem_fixed fixed; + + /* + * Followed by: + * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets. + * EHT PPE Thresholds field: variable length. + */ + u8 optional[]; +} __packed; + +#define IEEE80211_EHT_OPER_INFO_PRESENT 0x01 +#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT 0x02 +#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION 0x04 +#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT 0x08 +#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK 0x30 + +/** + * struct ieee80211_eht_operation - eht operation element + * + * This structure is the "EHT Operation Element" fields as + * described in P802.11be_D2.0 section 9.4.2.311 + * + * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_* + * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in + * EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and + * receive. + * @optional: optional parts + */ +struct ieee80211_eht_operation { + u8 params; + __le32 basic_mcs_nss; + u8 optional[]; +} __packed; + +/** + * struct ieee80211_eht_operation_info - eht operation information + * + * @control: EHT operation information control. + * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz + * EHT BSS. + * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS. 
+ * @optional: optional parts + */ +struct ieee80211_eht_operation_info { + u8 control; + u8 ccfs0; + u8 ccfs1; + u8 optional[]; +} __packed; + /* 802.11ac VHT Capabilities */ #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 @@ -2094,6 +2220,8 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL 0x1e + #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe @@ -2302,6 +2430,29 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) return n; } +static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data; + u8 needed = sizeof(*he_cap_ie_elem); + + if (len < needed) + return false; + + needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem); + if (len < needed) + return false; + + if (he_cap_ie_elem->phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { + if (len < needed + 1) + return false; + needed += ieee80211_he_ppe_size(data[needed], + he_cap_ie_elem->phy_cap_info); + } + + return len >= needed; +} + /* HE Operation defines */ #define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000007 #define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000008 @@ -2563,6 +2714,207 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) #define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) +/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */ +#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01 +#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL 0x02 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 0x04 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2 0x08 +#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT 0x10 +#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC 0x20 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK 0xc0 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895 0 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991 1 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454 2 + +#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK 0x01 + +/* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3 */ +#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ 0x02 +#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI 0x08 +#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO 0x10 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER 0x20 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE 0x40 + +/* EHT beamformee number of spatial streams <= 80MHz is split */ +#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK 0x80 +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK 0x03 + +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK 0x1c +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK 0xe0 + +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK 0x07 +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK 0x38 + +/* EHT number of sounding dimensions for 320MHz is split */ +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK 0x01 +#define 
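ieee80211_he_capa_size_ok() below is the new bounds helper for received HE capability elements; the intended pattern is to validate before dereferencing any variable-length tail. A minimal sketch (hypothetical parser):

#include <linux/ieee80211.h>

static const struct ieee80211_he_cap_elem *
example_parse_he_capa(const u8 *data, u8 len)
{
	/* Bounds-checks fixed fields, MCS/NSS set and PPE thresholds */
	if (!ieee80211_he_capa_size_ok(data, len))
		return NULL;	/* truncated or malformed element */

	return (const struct ieee80211_he_cap_elem *)data;
}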
IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK 0x02 +#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK 0x04 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK 0x08 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK 0x10 +#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK 0x20 +#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK 0x40 +#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK 0x80 + +#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO 0x01 +#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI 0x08 +#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK 0xf0 + +#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK 0x01 +#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT 0x08 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK 0x30 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US 0 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US 1 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US 2 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US 3 + +/* Maximum number of supported EHT LTF is split */ +#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 + +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 +#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 + +#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW 0x01 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ 0x02 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ 0x08 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ 0x10 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ 0x20 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ 0x40 +#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT 0x80 + +#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA 0x01 +#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA 0x02 + +/* + * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311 + */ +#define IEEE80211_EHT_OPER_CHAN_WIDTH 0x7 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ 0 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ 1 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ 2 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 + +/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ +static inline u8 +ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, + const struct ieee80211_eht_cap_elem_fixed *eht_cap, + bool from_ap) +{ + u8 count = 0; + + /* on 2.4 GHz, if it supports 40 MHz, the result is 3 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) + return 3; + + /* on 2.4 GHz, these three bits are reserved, so should be 0 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) + count += 3; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 3; + + if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) + count += 3; + + if (count) + return count; + + return from_ap ? 
3 : 4; +} + +/* 802.11be EHT PPE Thresholds */ +#define IEEE80211_EHT_PPE_THRES_NSS_POS 0 +#define IEEE80211_EHT_PPE_THRES_NSS_MASK 0xf +#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK 0x1f0 +#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE 3 +#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE 9 + +/* + * Calculate 802.11be EHT capabilities IE EHT field size + */ +static inline u8 +ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u32 n; + + if (!(phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT)) + return 0; + + n = hweight16(ppe_thres_hdr & + IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK); + + /* + * Each pair is 6 bits, and we need to add the 9 "header" bits to the + * total size. + */ + n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 + + IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE; + return DIV_ROUND_UP(n, 8); +} + +static inline bool +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, + bool from_ap) +{ + const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; + u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); + + if (len < needed || !he_capa) + return false; + + needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, + (const void *)data, + from_ap); + if (len < needed) + return false; + + if (elem->phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { + u16 ppe_thres_hdr; + + if (len < needed + sizeof(ppe_thres_hdr)) + return false; + + ppe_thres_hdr = get_unaligned_le16(data + needed); + needed += ieee80211_eht_ppe_size(ppe_thres_hdr, + elem->phy_cap_info); + } + + return len >= needed; +} + +static inline bool +ieee80211_eht_oper_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_eht_operation *elem = (const void *)data; + u8 needed = sizeof(*elem); + + if (len < needed) + return false; + + if (elem->params & IEEE80211_EHT_OPER_INFO_PRESENT) { + needed += 3; + + if (elem->params & + IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT) + needed += 2; + } + + return len >= needed; +} #define LISTEN_INT_USF GENMASK(15, 14) #define LISTEN_INT_UI GENMASK(13, 0) @@ -3018,6 +3370,9 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_SHORT_SSID_LIST = 58, WLAN_EID_EXT_HE_6GHZ_CAPA = 59, WLAN_EID_EXT_UL_MU_POWER_CAPA = 60, + WLAN_EID_EXT_EHT_OPERATION = 106, + WLAN_EID_EXT_EHT_MULTI_LINK = 107, + WLAN_EID_EXT_EHT_CAPABILITY = 108, }; /* Action category code */ @@ -3982,4 +4337,229 @@ enum ieee80211_range_params_max_total_ltf { IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_UNSPECIFIED, }; +/* multi-link device */ +#define IEEE80211_MLD_MAX_NUM_LINKS 15 + +#define IEEE80211_ML_CONTROL_TYPE 0x0007 +#define IEEE80211_ML_CONTROL_TYPE_BASIC 0 +#define IEEE80211_ML_CONTROL_TYPE_PREQ 1 +#define IEEE80211_ML_CONTROL_TYPE_RECONF 2 +#define IEEE80211_ML_CONTROL_TYPE_TDLS 3 +#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS 4 +#define IEEE80211_ML_CONTROL_PRESENCE_MASK 0xfff0 + +struct ieee80211_multi_link_elem { + __le16 control; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_BASIC_PRES_LINK_ID 0x0010 +#define IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT 0x0020 +#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY 0x0040 +#define IEEE80211_MLC_BASIC_PRES_EML_CAPA 0x0080 +#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP 0x0100 +#define IEEE80211_MLC_BASIC_PRES_MLD_ID 0x0200 + +#define IEEE80211_MED_SYNC_DELAY_DURATION 0x00ff +#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH 0x0f00 +#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS 0xf000 + 
+#define IEEE80211_EML_CAP_EMLSR_SUPP 0x0001 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY 0x000e +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US 0 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US 1 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US 2 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US 3 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US 4 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY 0x0070 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US 0 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US 1 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US 2 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US 3 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US 4 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US 5 +#define IEEE80211_EML_CAP_EMLMR_SUPPORT 0x0080 +#define IEEE80211_EML_CAP_EMLMR_DELAY 0x0700 +#define IEEE80211_EML_CAP_EMLMR_DELAY_0US 0 +#define IEEE80211_EML_CAP_EMLMR_DELAY_32US 1 +#define IEEE80211_EML_CAP_EMLMR_DELAY_64US 2 +#define IEEE80211_EML_CAP_EMLMR_DELAY_128US 3 +#define IEEE80211_EML_CAP_EMLMR_DELAY_256US 4 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT 0x7800 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0 0 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US 1 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US 2 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US 3 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU 4 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU 5 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU 6 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU 7 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU 8 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU 9 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU 10 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU 11 + +#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS 0x000f +#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT 0x0010 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP 0x0060 +#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND 0x0f80 +#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT 0x1000 + +struct ieee80211_mle_basic_common_info { + u8 len; + u8 mld_mac_addr[ETH_ALEN]; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_PREQ_PRES_MLD_ID 0x0010 + +struct ieee80211_mle_preq_common_info { + u8 len; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR 0x0010 + +/* no fixed fields in RECONF */ + +struct ieee80211_mle_tdls_common_info { + u8 len; + u8 ap_mld_mac_addr[ETH_ALEN]; +} __packed; + +#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR 0x0010 + +/* no fixed fields in PRIO_ACCESS */ + +/** + * ieee80211_mle_common_size - check multi-link element common size + * @data: multi-link element, must already be checked for size using + * ieee80211_mle_size_ok() + */ +static inline u8 ieee80211_mle_common_size(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + u8 common = 0; + + switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { + case IEEE80211_ML_CONTROL_TYPE_BASIC: + common += sizeof(struct ieee80211_mle_basic_common_info); + break; + case IEEE80211_ML_CONTROL_TYPE_PREQ: + common += sizeof(struct ieee80211_mle_preq_common_info); + break; + case IEEE80211_ML_CONTROL_TYPE_RECONF: + if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) + common += ETH_ALEN; + return common; + case IEEE80211_ML_CONTROL_TYPE_TDLS: + common += sizeof(struct ieee80211_mle_tdls_common_info); + break; + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: + if (control & 
IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) + common += ETH_ALEN; + return common; + default: + WARN_ON(1); + return 0; + } + + return common + mle->variable[0]; +} + +/** + * ieee80211_mle_size_ok - validate multi-link element size + * @data: pointer to the element data + * @len: length of the containing element + */ +static inline bool ieee80211_mle_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u8 fixed = sizeof(*mle); + u8 common = 0; + bool check_common_len = false; + u16 control; + + if (len < fixed) + return false; + + control = le16_to_cpu(mle->control); + + switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { + case IEEE80211_ML_CONTROL_TYPE_BASIC: + common += sizeof(struct ieee80211_mle_basic_common_info); + check_common_len = true; + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) + common += 1; + break; + case IEEE80211_ML_CONTROL_TYPE_PREQ: + common += sizeof(struct ieee80211_mle_preq_common_info); + if (control & IEEE80211_MLC_PREQ_PRES_MLD_ID) + common += 1; + check_common_len = true; + break; + case IEEE80211_ML_CONTROL_TYPE_RECONF: + if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) + common += ETH_ALEN; + break; + case IEEE80211_ML_CONTROL_TYPE_TDLS: + common += sizeof(struct ieee80211_mle_tdls_common_info); + check_common_len = true; + break; + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: + if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) + common += ETH_ALEN; + break; + default: + /* we don't know this type */ + return true; + } + + if (len < fixed + common) + return false; + + if (!check_common_len) + return true; + + /* if present, common length is the first octet there */ + return mle->variable[0] >= common; +} + +enum ieee80211_mle_subelems { + IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, +}; + +#define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f +#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE 0x0010 +#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 +#define IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT 0x0040 +#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT 0x0080 +#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT 0x0100 +#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT 0x0200 +#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE 0x0400 +#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT 0x0800 + +struct ieee80211_mle_per_sta_profile { + __le16 control; + u8 sta_info_len; + u8 variable[]; +} __packed; + +#define for_each_mle_subelement(_elem, _data, _len) \ + if (ieee80211_mle_size_ok(_data, _len)) \ + for_each_element(_elem, \ + _data + ieee80211_mle_common_size(_data),\ + _len - ieee80211_mle_common_size(_data)) + #endif /* LINUX_IEEE80211_H */ diff --git a/include/linux/input.h b/include/linux/input.h index 0354b298d874..dbf4ba68e477 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -7,6 +7,7 @@ #include #include +#include #include /* Implementation details, userspace should not care about these */ #define ABS_MT_FIRST ABS_MT_TOUCH_MAJOR @@ -209,6 +210,11 @@ struct input_dev { ktime_t timestamp[INPUT_CLK_MAX]; bool inhibited; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + 
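The multi-link helpers above are meant to be used together: validate the element once, then iterate its subelements. A sketch of the calling pattern (hypothetical parser):

#include <linux/ieee80211.h>

static void example_parse_mle(const u8 *data, u8 len)
{
	const struct element *sub;

	if (!ieee80211_mle_size_ok(data, len))
		return;	/* truncated or inconsistent common info */

	for_each_mle_subelement(sub, data, len) {
		if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE)
			continue;
		/* sub->data begins a struct ieee80211_mle_per_sta_profile */
	}
}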
ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define to_input_dev(d) container_of(d, struct input_dev, dev) @@ -328,6 +334,8 @@ struct input_handler { struct list_head h_list; struct list_head node; + + ANDROID_KABI_RESERVE(1); }; /** @@ -354,6 +362,8 @@ struct input_handle { struct list_head d_node; struct list_head h_node; + + ANDROID_KABI_RESERVE(1); }; struct input_dev __must_check *input_allocate_device(void); @@ -560,6 +570,9 @@ struct ff_device { int max_effects; struct ff_effect *effects; + + ANDROID_KABI_RESERVE(1); + struct file *effect_owners[]; }; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 1f22a30c0963..48a6365d63b5 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -529,6 +529,12 @@ enum }; #define SOFTIRQ_STOP_IDLE_MASK (~(1 << RCU_SOFTIRQ)) +/* Softirq's where the handling might be long: */ +#define LONG_SOFTIRQ_MASK ((1 << NET_TX_SOFTIRQ) | \ + (1 << NET_RX_SOFTIRQ) | \ + (1 << BLOCK_SOFTIRQ) | \ + (1 << IRQ_POLL_SOFTIRQ) | \ + (1 << TASKLET_SOFTIRQ)) /* map softirq index to softirq name. update 'softirq_to_name' in * kernel/softirq.c when adding a new softirq. @@ -555,6 +561,7 @@ extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); DECLARE_PER_CPU(struct task_struct *, ksoftirqd); +DECLARE_PER_CPU(__u32, active_softirqs); static inline struct task_struct *this_cpu_ksoftirqd(void) { diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ca98aeadcc80..1ccc25a2e978 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -150,6 +150,9 @@ struct io_pgtable_cfg { * * @map: Map a physically contiguous memory region. * @map_pages: Map a physically contiguous range of pages of the same size. + * @map_sg: Map a scatter-gather list of physically contiguous memory + * chunks. The mapped pointer argument is used to store how + * many bytes are mapped. * @unmap: Unmap a physically contiguous memory region. * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. 
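LONG_SOFTIRQ_MASK and the per-CPU active_softirqs word added above let a scheduler-side consumer ask whether a CPU is currently inside a potentially long-running softirq; the variable is declared here and maintained in kernel/softirq.c (not shown). A hedged sketch of such a query (hypothetical ANDROID hook):

#include <linux/interrupt.h>
#include <linux/percpu.h>

static bool example_long_softirq_pending(int cpu)
{
	/* True while a long-class softirq is executing on @cpu */
	return !!(per_cpu(active_softirqs, cpu) & LONG_SOFTIRQ_MASK);
}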
@@ -163,6 +166,9 @@ struct io_pgtable_ops { int (*map_pages)(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped); + int (*map_sg)(struct io_pgtable_ops *ops, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot, + gfp_t gfp, size_t *mapped); size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather); size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova, diff --git a/include/linux/io.h b/include/linux/io.h index 9595151d800d..84eac81e8834 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -21,6 +21,8 @@ void __ioread32_copy(void *to, const void __iomem *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU +void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot); +void iounmap_phys_range_hook(phys_addr_t phys_addr, size_t size); int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot); #else diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 829f2325ecba..9693aa884cc8 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,6 +9,7 @@ #include #include #include +#include struct address_space; struct fiemap_extent_info; @@ -89,6 +90,8 @@ struct iomap { void *inline_data; void *private; /* filesystem private */ const struct iomap_page_ops *page_ops; + + ANDROID_KABI_RESERVE(1); }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) @@ -160,6 +163,9 @@ struct iomap_ops { */ int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d2f3435e7d17..1016c15f2447 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -31,6 +31,18 @@ * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) +/* + * Allow caching in a transparent outer level of cache, also known as + * the last-level or system cache, with a read/write allocation policy. + * Does not depend on IOMMU_CACHE. Incompatible with IOMMU_SYS_CACHE_NWA. + */ +#define IOMMU_SYS_CACHE (1 << 6) +/* + * Allow caching in a transparent outer level of cache, also known as + * the last-level or system cache, with a read allocation policy. + * Does not depend on IOMMU_CACHE. Incompatible with IOMMU_SYS_CACHE. + */ +#define IOMMU_SYS_CACHE_NWA (1 << 7) struct iommu_ops; struct iommu_group; @@ -200,6 +212,8 @@ struct iommu_iotlb_gather { * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain. + * @map_sg: map a scatter-gather list of physically contiguous chunks to + * an iommu domain. 
* @unmap: unmap a physically contiguous memory region from an iommu domain * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain @@ -253,6 +267,9 @@ struct iommu_ops { int (*map_pages)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped); + int (*map_sg)(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot, + gfp_t gfp, size_t *mapped); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, diff --git a/include/linux/ioport.h b/include/linux/ioport.h index ec5f71f7135b..2c4484d38b4e 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * Resources are tree-like, allowing * nesting etc.. @@ -25,6 +26,11 @@ struct resource { unsigned long flags; unsigned long desc; struct resource *parent, *sibling, *child; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /* diff --git a/include/linux/iova.h b/include/linux/iova.h index 6b6cc104e300..052bf06c947b 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -96,6 +96,7 @@ struct iova_domain { atomic_t fq_timer_on; /* 1 when timer is active, 0 when not */ struct hlist_node cpuhp_dead; + bool best_fit; }; static inline unsigned long iova_size(struct iova *iova) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index d1f386430795..ab1fff41ed12 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -3,6 +3,7 @@ #define _IPV6_H #include +#include #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) #define ipv6_authlen(p) (((p)->hdrlen+2) << 2) @@ -43,6 +44,7 @@ struct ipv6_devconf { __s32 accept_ra_rt_info_max_plen; #endif #endif + __s32 accept_ra_rt_table; __s32 proxy_ndp; __s32 accept_source_route; __s32 accept_ra_from_local; @@ -81,6 +83,11 @@ struct ipv6_devconf { __u8 ioam6_enabled; struct ctl_table_header *sysctl_header; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct ipv6_params { diff --git a/include/linux/irq.h b/include/linux/irq.h index c8293c817646..a693de87d171 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -73,6 +73,7 @@ enum irqchip_irq_state; * IRQ_DISABLE_UNLAZY - Disable lazy irq disable * IRQ_HIDDEN - Don't show up in /proc/interrupts * IRQ_NO_DEBUG - Exclude from note_interrupt() debugging + * IRQ_RAW - Skip tick management and irqtime accounting */ enum { IRQ_TYPE_NONE = 0x00000000, @@ -101,6 +102,7 @@ enum { IRQ_DISABLE_UNLAZY = (1 << 19), IRQ_HIDDEN = (1 << 20), IRQ_NO_DEBUG = (1 << 21), + IRQ_RAW = (1 << 22), }; #define IRQF_MODIFY_MASK \ diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 81cbf85f73de..33d591555139 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -127,6 +127,8 @@ #define GICR_PIDR2 GICD_PIDR2 #define GICR_CTLR_ENABLE_LPIS (1UL << 0) +#define GICR_CTLR_CES (1UL << 1) +#define GICR_CTLR_IR (1UL << 2) #define GICR_CTLR_RWP (1UL << 3) #define GICR_TYPER_CPU_NUMBER(r) (((r) >> 8) & 0xffff) @@ -637,6 +639,20 @@ int its_init(struct fwnode_handle *handle, struct rdists *rdists, struct irq_domain *domain); int mbi_init(struct fwnode_handle 
*fwnode, struct irq_domain *parent); +struct gic_chip_data { + struct fwnode_handle *fwnode; + void __iomem *dist_base; + struct redist_region *redist_regions; + struct rdists rdists; + struct irq_domain *domain; + u64 redist_stride; + u32 nr_redist_regions; + u64 flags; + bool has_rss; + unsigned int ppi_nr; + struct partition_desc **ppi_descs; +}; + static inline bool gic_enable_sre(void) { u32 val; @@ -652,6 +668,8 @@ static inline bool gic_enable_sre(void) return !!(val & ICC_SRE_EL1_SRE); } +void gic_resume(void); + #endif #endif diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9ee238ad29ce..6121f4307cd3 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -35,6 +35,7 @@ #include #include #include +#include struct device_node; struct fwnode_handle; @@ -171,6 +172,11 @@ struct irq_domain { struct irq_domain *parent; #endif + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + /* reverse map data. The linear map gets appended to the irq_domain */ irq_hw_number_t hwirq_max; unsigned int revmap_size; diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 019e55c13248..fca348538477 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -113,7 +113,7 @@ struct static_key { #endif /* CONFIG_JUMP_LABEL */ #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_JUMP_LABEL +#if defined(CONFIG_JUMP_LABEL) && !defined(BUILD_FIPS140_KO) #include #ifndef __ASSEMBLY__ @@ -200,7 +200,28 @@ enum jump_label_type { struct module; -#ifdef CONFIG_JUMP_LABEL +#ifdef BUILD_FIPS140_KO + +static inline int static_key_count(struct static_key *key) +{ + return atomic_read(&key->enabled); +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(static_key_count(key) > 0)) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) +{ + if (likely(static_key_count(key) > 0)) + return true; + return false; +} + +#elif defined(CONFIG_JUMP_LABEL) #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL @@ -420,7 +441,7 @@ extern bool ____wrong_branch_error(void); static_key_count((struct static_key *)x) > 0; \ }) -#ifdef CONFIG_JUMP_LABEL +#if defined(CONFIG_JUMP_LABEL) && !defined(BUILD_FIPS140_KO) /* * Combine the right initial value (type) with the right branch order diff --git a/include/linux/kasan-enabled.h b/include/linux/kasan-enabled.h new file mode 100644 index 000000000000..6f612d69ea0c --- /dev/null +++ b/include/linux/kasan-enabled.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KASAN_ENABLED_H +#define _LINUX_KASAN_ENABLED_H + +#include + +#ifdef CONFIG_KASAN_HW_TAGS + +DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); + +static __always_inline bool kasan_enabled(void) +{ + return static_branch_likely(&kasan_flag_enabled); +} + +static inline bool kasan_hw_tags_enabled(void) +{ + return kasan_enabled(); +} + +#else /* CONFIG_KASAN_HW_TAGS */ + +static inline bool kasan_enabled(void) +{ + return IS_ENABLED(CONFIG_KASAN); +} + +static inline bool kasan_hw_tags_enabled(void) +{ + return false; +} + +#endif /* CONFIG_KASAN_HW_TAGS */ + +#endif /* LINUX_KASAN_ENABLED_H */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f407e937241a..45003ed10e23 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -3,6 +3,7 @@ #define _LINUX_KASAN_H #include +#include #include #include #include @@ -17,13 +18,15 @@ struct task_struct; #include #include -/* kasan_data 
struct is used in KUnit tests for KASAN expected failures */ -struct kunit_kasan_expectation { - bool report_found; -}; - #endif +typedef unsigned int __bitwise kasan_vmalloc_flags_t; + +#define KASAN_VMALLOC_NONE ((__force kasan_vmalloc_flags_t)0x00u) +#define KASAN_VMALLOC_INIT ((__force kasan_vmalloc_flags_t)0x01u) +#define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) +#define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include @@ -80,67 +83,21 @@ static inline void kasan_disable_current(void) {} #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -#ifdef CONFIG_KASAN_HW_TAGS - -DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); - -static __always_inline bool kasan_enabled(void) -{ - return static_branch_likely(&kasan_flag_enabled); -} - static inline bool kasan_has_integrated_init(void) { - return kasan_enabled(); + return kasan_hw_tags_enabled(); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); -void kasan_free_pages(struct page *page, unsigned int order); - -#else /* CONFIG_KASAN_HW_TAGS */ - -static inline bool kasan_enabled(void) -{ - return IS_ENABLED(CONFIG_KASAN); -} - -static inline bool kasan_has_integrated_init(void) -{ - return false; -} - -static __always_inline void kasan_alloc_pages(struct page *page, - unsigned int order, gfp_t flags) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - -static __always_inline void kasan_free_pages(struct page *page, - unsigned int order) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - -#endif /* CONFIG_KASAN_HW_TAGS */ - #ifdef CONFIG_KASAN struct kasan_cache { +#ifdef CONFIG_KASAN_GENERIC int alloc_meta_offset; int free_meta_offset; +#endif bool is_kmalloc; }; -slab_flags_t __kasan_never_merge(void); -static __always_inline slab_flags_t kasan_never_merge(void) -{ - if (kasan_enabled()) - return __kasan_never_merge(); - return 0; -} - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -156,21 +113,13 @@ static __always_inline void kasan_poison_pages(struct page *page, __kasan_poison_pages(page, order, init); } -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); -static __always_inline void kasan_unpoison_pages(struct page *page, +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); +static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) - __kasan_unpoison_pages(page, order, init); -} - -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags); -static __always_inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, slab_flags_t *flags) -{ - if (kasan_enabled()) - __kasan_cache_create(cache, size, flags); + return __kasan_unpoison_pages(page, order, init); + return false; } void __kasan_cache_create_kmalloc(struct kmem_cache *cache); @@ -180,14 +129,6 @@ static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) __kasan_cache_create_kmalloc(cache); } -size_t __kasan_metadata_size(struct kmem_cache *cache); -static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) -{ - if (kasan_enabled()) - return __kasan_metadata_size(cache); - return 0; -} - void __kasan_poison_slab(struct page *page); static __always_inline void kasan_poison_slab(struct page *page) { @@ 
-297,26 +238,17 @@ static __always_inline bool kasan_check_byte(const void *addr) return true; } - -bool kasan_save_enable_multi_shot(void); -void kasan_restore_multi_shot(bool enabled); - #else /* CONFIG_KASAN */ -static inline slab_flags_t kasan_never_merge(void) -{ - return 0; -} static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_unpoison_pages(struct page *page, unsigned int order, - bool init) {} -static inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, - slab_flags_t *flags) {} +static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, + bool init) +{ + return false; +} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} -static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -367,15 +299,37 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +size_t kasan_metadata_size(struct kmem_cache *cache); +slab_flags_t kasan_never_merge(void); +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags); + void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); +void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return 0; +} +/* And thus nothing prevents cache merging. */ +static inline slab_flags_t kasan_never_merge(void) +{ + return 0; +} +/* And no cache-related metadata initialization is required. 
*/ +static inline void kasan_cache_create(struct kmem_cache *cache, + unsigned int *size, + slab_flags_t *flags) {} + static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} +static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ @@ -427,29 +381,72 @@ static inline void kasan_init_hw_tags(void) { } #ifdef CONFIG_KASAN_VMALLOC +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + +void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); -void kasan_poison_vmalloc(const void *start, unsigned long size); -void kasan_unpoison_vmalloc(const void *start, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -#else /* CONFIG_KASAN_VMALLOC */ +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } - -static inline void kasan_poison_vmalloc(const void *start, unsigned long size) -{ } -static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) {} + unsigned long free_region_end) { } + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags); +static __always_inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size, + kasan_vmalloc_flags_t flags) +{ + if (kasan_enabled()) + return __kasan_unpoison_vmalloc(start, size, flags); + return (void *)start; +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size); +static __always_inline void kasan_poison_vmalloc(const void *start, + unsigned long size) +{ + if (kasan_enabled()) + __kasan_poison_vmalloc(start, size); +} + +#else /* CONFIG_KASAN_VMALLOC */ + +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) { } +static inline int kasan_populate_vmalloc(unsigned long start, + unsigned long size) +{ + return 0; +} +static inline void kasan_release_vmalloc(unsigned long start, + unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) { } + +static inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size, + kasan_vmalloc_flags_t flags) +{ + return (void *)start; +} +static inline void kasan_poison_vmalloc(const void *start, unsigned long size) +{ } #endif /* CONFIG_KASAN_VMALLOC */ @@ -457,17 +454,17 @@ static inline void kasan_release_vmalloc(unsigned long start, !defined(CONFIG_KASAN_VMALLOC) /* - * These functions provide a special case to support backing module - * allocations with real shadow memory. With KASAN vmalloc, the special - * case is unnecessary, as the work is handled in the generic case. + * These functions allocate and free shadow memory for kernel modules. + * They are only required when KASAN_VMALLOC is not supported, as otherwise + * shadow memory is allocated by the generic vmalloc handlers. 
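 *
 * A module_alloc()-style caller is then expected to pair them roughly as
 * follows (a sketch only; the real callers live in arch code and allocate
 * from the module area with __vmalloc_node_range()):
 *
 *	void *p = vmalloc(size);
 *
 *	if (p && kasan_alloc_module_shadow(p, size, GFP_KERNEL)) {
 *		vfree(p);
 *		return NULL;
 *	}
 *	return p;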
*/ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); -void kasan_free_shadow(const struct vm_struct *vm); +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask); +void kasan_free_module_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; } +static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 1093abf7c28c..867b43e628cc 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -16,6 +16,7 @@ #include #include #include +#include struct file; struct dentry; @@ -161,6 +162,8 @@ struct kernfs_node { unsigned short flags; umode_t mode; struct kernfs_iattrs *iattr; + + ANDROID_KABI_RESERVE(1); }; /* @@ -180,6 +183,11 @@ struct kernfs_syscall_ops { const char *new_name); int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, struct kernfs_root *root); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct kernfs_root { @@ -197,6 +205,8 @@ struct kernfs_root { struct list_head supers; wait_queue_head_t deactivate_waitq; + + ANDROID_KABI_RESERVE(1); }; struct kernfs_open_file { @@ -217,6 +227,8 @@ struct kernfs_open_file { bool mmapped:1; bool released:1; const struct vm_operations_struct *vm_ops; + + ANDROID_KABI_RESERVE(1); }; struct kernfs_ops { @@ -273,6 +285,9 @@ struct kernfs_ops { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /* diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 7d985a1dfe4a..106f8d097002 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -10,6 +10,7 @@ #include #include +#include #ifdef CONFIG_KEYS @@ -156,6 +157,9 @@ struct key_type { int (*asym_verify_signature)(struct kernel_pkey_params *params, const void *in, const void *in2); + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h deleted file mode 100644 index a27605e2f826..000000000000 --- a/include/linux/keyslot-manager.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright 2019 Google LLC - */ - -#ifndef __LINUX_KEYSLOT_MANAGER_H -#define __LINUX_KEYSLOT_MANAGER_H - -#include -#include - -struct blk_keyslot_manager; - -/** - * struct blk_ksm_ll_ops - functions to manage keyslots in hardware - * @keyslot_program: Program the specified key into the specified slot in the - * inline encryption hardware. - * @keyslot_evict: Evict key from the specified keyslot in the hardware. - * The key is provided so that e.g. dm layers can evict - * keys from the devices that they map over. - * Returns 0 on success, -errno otherwise. - * - * This structure should be provided by storage device drivers when they set up - * a keyslot manager - this structure holds the function ptrs that the keyslot - * manager will use to manipulate keyslots in the hardware. 
- */ -struct blk_ksm_ll_ops { - int (*keyslot_program)(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot); - int (*keyslot_evict)(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot); -}; - -struct blk_keyslot_manager { - /* - * The struct blk_ksm_ll_ops that this keyslot manager will use - * to perform operations like programming and evicting keys on the - * device - */ - struct blk_ksm_ll_ops ksm_ll_ops; - - /* - * The maximum number of bytes supported for specifying the data unit - * number. - */ - unsigned int max_dun_bytes_supported; - - /* - * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents - * whether a crypto mode and data unit size are supported. The i'th - * bit of crypto_mode_supported[crypto_mode] is set iff a data unit - * size of (1 << i) is supported. We only support data unit sizes - * that are powers of 2. - */ - unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX]; - - /* Device for runtime power management (NULL if none) */ - struct device *dev; - - /* Here onwards are *private* fields for internal keyslot manager use */ - - unsigned int num_slots; - - /* Protects programming and evicting keys from the device */ - struct rw_semaphore lock; - - /* List of idle slots, with least recently used slot at front */ - wait_queue_head_t idle_slots_wait_queue; - struct list_head idle_slots; - spinlock_t idle_slots_lock; - - /* - * Hash table which maps struct *blk_crypto_key to keyslots, so that we - * can find a key's keyslot in O(1) time rather than O(num_slots). - * Protected by 'lock'. - */ - struct hlist_head *slot_hashtable; - unsigned int log_slot_ht_size; - - /* Per-keyslot data */ - struct blk_ksm_keyslot *slots; -}; - -int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots); - -int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, - unsigned int num_slots); - -blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - struct blk_ksm_keyslot **slot_ptr); - -unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot); - -void blk_ksm_put_slot(struct blk_ksm_keyslot *slot); - -bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, - const struct blk_crypto_config *cfg); - -int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key); - -void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); - -void blk_ksm_destroy(struct blk_keyslot_manager *ksm); - -void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, - const struct blk_keyslot_manager *child); - -void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm); - -bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, - struct blk_keyslot_manager *ksm_subset); - -void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, - struct blk_keyslot_manager *reference_ksm); - -#endif /* __LINUX_KEYSLOT_MANAGER_H */ diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ea30529fba08..e6d8e1b8ab0a 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -27,6 +27,7 @@ #include #include #include +#include #define UEVENT_HELPER_PATH_LEN 256 #define UEVENT_NUM_ENVP 64 /* number of env pointers */ @@ -77,6 +78,11 @@ struct kobject { unsigned int state_add_uevent_sent:1; unsigned int state_remove_uevent_sent:1; unsigned int uevent_suppress:1; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + 
ANDROID_KABI_RESERVE(4); }; extern __printf(2, 3) @@ -143,6 +149,11 @@ struct kobj_type { const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); const void *(*namespace)(struct kobject *kobj); void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct kobj_uevent_env { @@ -194,6 +205,11 @@ struct kset { spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __randomize_layout; extern void kset_init(struct kset *kset); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e2423ffaf59..3430c0e891fd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -659,7 +660,7 @@ struct kvm { struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + struct xarray vcpu_array; /* Used to wait for completion of MMU notifiers. */ spinlock_t mn_invalidate_lock; @@ -800,19 +801,17 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ smp_rmb(); - return kvm->vcpus[i]; + return xa_load(&kvm->vcpu_array, i); } -#define kvm_for_each_vcpu(idx, vcpup, kvm) \ - for (idx = 0; \ - idx < atomic_read(&kvm->online_vcpus) && \ - (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \ - idx++) +#define kvm_for_each_vcpu(idx, vcpup, kvm) \ + xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ + (atomic_read(&kvm->online_vcpus) - 1)) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; if (id < 0) return NULL; @@ -832,7 +831,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) if (WARN_ON_ONCE(!memslot->npages)) { \ } else -void kvm_vcpu_destroy(struct kvm_vcpu *vcpu); +void kvm_destroy_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); @@ -1264,6 +1263,16 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void kvm_unregister_perf_callbacks(void); +#else +static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_unregister_perf_callbacks(void) {} +#endif /* CONFIG_GUEST_PERF_EVENTS */ + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); @@ -1272,7 +1281,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); bool kvm_is_reserved_pfn(kvm_pfn_t pfn); bool kvm_is_zone_device_pfn(kvm_pfn_t pfn); -bool kvm_is_transparent_hugepage(kvm_pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; @@ -1939,6 +1947,19 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ +/* + * If more than one page is being (un)accounted, @virt must be the address of + * the first page of a block of pages what were allocated together (i.e + * accounted together). 
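 *
 * For example, an arch MMU allocating a single page-table page would
 * account and later unaccount it like this (a sketch, not a call site in
 * this patch):
 *
 *	void *virt = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
 *
 *	if (virt)
 *		kvm_account_pgtable_pages(virt, 1);
 *
 * and, just before freeing:
 *
 *	kvm_account_pgtable_pages(virt, -1);
 *	free_page((unsigned long)virt);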
+ * + * kvm_account_pgtable_pages() is thread-safe because mod_lruvec_page_state() + * is thread-safe. + */ +static inline void kvm_account_pgtable_pages(void *virt, int nr) +{ + mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr); +} + /* * This defines how many reserved entries we want to keep before we * kick the vcpu to the userspace to avoid dirty ring full. This diff --git a/include/linux/mem_relinquish.h b/include/linux/mem_relinquish.h new file mode 100644 index 000000000000..b906c4196d11 --- /dev/null +++ b/include/linux/mem_relinquish.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Google LLC + * Author: Keir Fraser + */ + +#ifndef __MEM_RELINQUISH_H__ +#define __MEM_RELINQUISH_H__ + +#ifdef CONFIG_MEMORY_RELINQUISH + +#include + +#else /* !CONFIG_MEMORY_RELINQUISH */ + +static inline bool kvm_has_memrelinquish_services(void) { return false; } +static inline void page_relinquish(struct page *page) { } + +#endif /* CONFIG_MEMORY_RELINQUISH */ + +#endif /* __MEM_RELINQUISH_H__ */ diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5df38332e413..d9b25e57f056 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -372,7 +372,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 -#define MEMBLOCK_ALLOC_KASAN 1 +#define MEMBLOCK_ALLOC_NOLEAKTRACE 1 /* We are using top down, so it is safe to use 0 here */ #define MEMBLOCK_LOW_LIMIT 0 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4f189b17dafc..8fdf6e22a29b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@ #include #include #include +#include struct mem_cgroup; struct obj_cgroup; @@ -348,6 +349,15 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_LRU_GEN + /* per-memcg mm_struct list */ + struct lru_gen_mm_list mm_list; +#endif + + /* for dynamic low */ + ANDROID_VENDOR_DATA(1); + ANDROID_OEM_DATA_ARRAY(1, 2); + struct mem_cgroup_per_node *nodeinfo[]; }; @@ -442,6 +452,7 @@ static inline struct obj_cgroup *__page_objcg(struct page *page) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem page a caller should hold an rcu read lock to protect memcg * associated with a kmem page from being released. @@ -497,6 +508,7 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem page a caller should hold an rcu read lock to protect memcg * associated with a kmem page from being released. 
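The next hunk adds mem_cgroup_trylock_pages(), the new entry in the list above. It is a conditional form of lock_page_memcg() for walkers that cannot block: it only pins page_memcg() while no cgroup move is in flight, so callers must be prepared to bail out. Expected usage, as a sketch:

	if (!mem_cgroup_trylock_pages(memcg))
		return;		/* a memcg move is in progress; skip this walk */

	/* page_memcg() of pages in @memcg is stable here */
	mem_cgroup_unlock_pages();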
@@ -953,6 +965,23 @@ void unlock_page_memcg(struct page *page); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); +/* try to stablize page_memcg() for all the pages in a memcg */ +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + rcu_read_lock(); + + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) + return true; + + rcu_read_unlock(); + return false; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) @@ -1369,6 +1398,18 @@ static inline void unlock_page_memcg(struct page *page) { } +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + /* to match page_memcg_rcu() */ + rcu_read_lock(); + return true; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + static inline void mem_cgroup_handle_over_high(void) { } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e5a867c950b2..e348e0feafcb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -302,6 +302,7 @@ extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct memory_group *group); extern int remove_memory(u64 start, u64 size); +extern int remove_memory_subsection(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); @@ -329,6 +330,7 @@ extern void clear_zone_contiguous(struct zone *zone); extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory_subsection(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, mhp_t mhp_flags); extern int add_memory_driver_managed(int nid, u64 start, u64 size, diff --git a/include/linux/mm.h b/include/linux/mm.h index e4e1817bb3b8..4935f58c249e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -233,6 +234,9 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) +/* to align the pointer to the (prev) page boundary */ +#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) + /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) @@ -252,6 +256,7 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); +void vm_area_free_no_check(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU @@ -446,6 +451,7 @@ extern pgprot_t protection_map[16]; * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * @FAULT_FLAG_SPECULATIVE: The fault is handled without holding the mmap lock. 
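 *
 * An arch fault handler is expected to try the speculative path first and
 * fall back to the classic, mmap_lock-protected path when it fails (a
 * sketch; the exact sequence is per-arch and not part of this header):
 *
 *	seq = mmap_seq_read_start(mm);
 *	vma = get_vma(mm, addr);
 *	if (vma && vma_can_speculate(vma, flags))
 *		fault = do_handle_mm_fault(vma, addr,
 *					   flags | FAULT_FLAG_SPECULATIVE,
 *					   seq, regs);
 *	if (vma)
 *		put_vma(vma);
 *	if (!vma || fault == VM_FAULT_RETRY)
 *		retry via handle_mm_fault() under mmap_read_lock()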
* * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -477,6 +483,7 @@ enum fault_flag { FAULT_FLAG_REMOTE = 1 << 7, FAULT_FLAG_INSTRUCTION = 1 << 8, FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + FAULT_FLAG_SPECULATIVE = 1 << 10, }; /* @@ -516,7 +523,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ - { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ + { FAULT_FLAG_SPECULATIVE, "SPECULATIVE" } /* * vm_fault is filled by the pagefault handler and passed to the vma's @@ -537,6 +545,10 @@ struct vm_fault { }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + unsigned long seq; + pmd_t orig_pmd; +#endif pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching @@ -544,9 +556,11 @@ struct vm_fault { */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ +#ifndef CONFIG_SPECULATIVE_PAGE_FAULT pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ +#endif }; struct page *cow_page; /* Page handler may use for COW fault */ @@ -654,6 +668,18 @@ struct vm_operations_struct { */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); + /* + * speculative indicates that the vm_operations support + * speculative page faults. This allows ->fault and ->map_pages + * to be called with FAULT_FLAG_SPECULATIVE set; such calls will + * run within an rcu read locked section and with mmap lock not held. + */ + bool speculative; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) @@ -663,6 +689,10 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + /* Start from 0 to use atomic_inc_unless_negative() in get_vma() */ + atomic_set(&vma->file_ref_count, 0); +#endif INIT_LIST_HEAD(&vma->anon_vma_chain); } @@ -706,6 +736,20 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline bool vma_can_speculate(struct vm_area_struct *vma, + unsigned int flags) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_ops->speculative) + return false; + if (!(flags & FAULT_FLAG_WRITE)) + return true; + if (!(vma->vm_flags & VM_SHARED)) + return true; + return false; +} + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow @@ -1093,6 +1137,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. 
For non-existent @@ -1773,9 +1819,15 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU -extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - struct pt_regs *regs); +extern vm_fault_t do_handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long seq, struct pt_regs *regs); +static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + struct pt_regs *regs) +{ + return do_handle_mm_fault(vma, address, flags, 0, regs); +} extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); @@ -2482,6 +2534,7 @@ extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); +extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); @@ -2555,7 +2608,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -2684,11 +2737,20 @@ extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); #define expand_upwards(vma, address) (0) #endif +extern struct vm_area_struct *find_vma_from_tree(struct mm_struct *mm, + unsigned long addr); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +extern struct vm_area_struct * __find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); +static inline +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + mmap_assert_locked(mm); + return __find_vma(mm, addr); +} + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
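Note that find_vma() now asserts that mmap_lock is held; lookups that cannot take the lock go through the refcounted get_vma()/put_vma() pair declared below. A minimal sketch of the lockless pattern:

	struct vm_area_struct *vma = get_vma(mm, addr);	/* takes a reference */

	if (vma) {
		/* vma fields are stable until the reference is dropped */
		put_vma(vma);
	}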
@@ -3262,6 +3324,7 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, #endif extern int sysctl_nr_trim_pages; +extern int reclaim_shmem_address_space(struct address_space *mapping); #ifdef CONFIG_PRINTK void mem_dump_obj(void *object); @@ -3301,5 +3364,56 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, + struct anon_vma_name *anon_name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, struct anon_vma_name *anon_name) { + return 0; +} +#endif + +#ifdef CONFIG_MMU +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + +bool __pte_map_lock(struct vm_fault *vmf); + +static inline bool pte_map_lock(struct vm_fault *vmf) +{ + VM_BUG_ON(vmf->pte); + return __pte_map_lock(vmf); +} + +static inline bool pte_spinlock(struct vm_fault *vmf) +{ + VM_BUG_ON(!vmf->pte); + return __pte_map_lock(vmf); +} + +struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr); +void put_vma(struct vm_area_struct *vma); + +#else /* !CONFIG_SPECULATIVE_PAGE_FAULT */ + +#define pte_map_lock(___vmf) \ +({ \ + ___vmf->pte = pte_offset_map_lock(___vmf->vma->vm_mm, ___vmf->pmd,\ + ___vmf->address, &___vmf->ptl); \ + true; \ +}) + +#define pte_spinlock(___vmf) \ +({ \ + ___vmf->ptl = pte_lockptr(___vmf->vma->vm_mm, ___vmf->pmd); \ + spin_lock(___vmf->ptl); \ + true; \ +}) + +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ +#endif /* CONFIG_MMU */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 355ea1ee32bd..56ebc9116b95 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include /** * page_is_file_lru - should the page be on a file LRU or anon LRU? 
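pte_map_lock() and pte_spinlock() above can only fail on the speculative path, when the VMA or page tables changed underneath the walker; callers turn that into VM_FAULT_RETRY so the fault is redone under mmap_lock. A sketch of the expected shape of a handle_pte_fault()-style caller (example_pte_fault() is illustrative, not part of this patch):

static vm_fault_t example_pte_fault(struct vm_fault *vmf)
{
	if (!pte_map_lock(vmf))
		return VM_FAULT_RETRY;	/* only possible with FAULT_FLAG_SPECULATIVE */

	/* vmf->pte is mapped and vmf->ptl is held from here on */
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}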
@@ -24,15 +25,25 @@ static inline int page_is_file_lru(struct page *page) return !PageSwapBacked(page); } -static __always_inline void update_lru_size(struct lruvec *lruvec, +static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, - int nr_pages) + long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); + lockdep_assert_held(&lruvec->lru_lock); + WARN_ON_ONCE(nr_pages != (int)nr_pages); + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); +} + +static __always_inline void update_lru_size(struct lruvec *lruvec, + enum lru_list lru, enum zone_type zid, + long nr_pages) +{ + __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif @@ -79,11 +90,224 @@ static __always_inline enum lru_list page_lru(struct page *page) return lru; } +#ifdef CONFIG_LRU_GEN + +#ifdef CONFIG_LRU_GEN_ENABLED +static inline bool lru_gen_enabled(void) +{ + DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); + + return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); +} +#else +static inline bool lru_gen_enabled(void) +{ + DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); + + return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); +} +#endif + +static inline bool lru_gen_in_fault(void) +{ + return current->in_lru_fault; +} + +static inline int lru_gen_from_seq(unsigned long seq) +{ + return seq % MAX_NR_GENS; +} + +static inline int lru_hist_from_seq(unsigned long seq) +{ + return seq % NR_HIST_GENS; +} + +static inline int lru_tier_from_refs(int refs) +{ + VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); + + /* see the comment in page_lru_refs() */ + return order_base_2(refs + 1); +} + +static inline int page_lru_refs(struct page *page) +{ + unsigned long flags = READ_ONCE(page->flags); + bool workingset = flags & BIT(PG_workingset); + + /* + * Return the number of accesses beyond PG_referenced, i.e., N-1 if the + * total number of accesses is N>1, since N=0,1 both map to the first + * tier. lru_tier_from_refs() will account for this off-by-one. Also see + * the comment on MAX_NR_TIERS. 
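 *
 * Worked examples: refs == 0 (at most one access) gives tier
 * order_base_2(1) == 0; refs == 1 gives tier 1; refs == 2 and refs == 3
 * both give tier 2. Tiers are therefore logarithmic buckets of the
 * estimated access count.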
+ */ + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; +} + +static inline int page_lru_gen(struct page *page) +{ + unsigned long flags = READ_ONCE(page->flags); + + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +{ + unsigned long max_seq = lruvec->lrugen.max_seq; + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + + /* see the comment on MIN_NR_GENS */ + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); +} + +static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page, + int old_gen, int new_gen) +{ + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + int delta = thp_nr_pages(page); + enum lru_list lru = type * LRU_INACTIVE_FILE; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); + + if (old_gen >= 0) + WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], + lrugen->nr_pages[old_gen][type][zone] - delta); + if (new_gen >= 0) + WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], + lrugen->nr_pages[new_gen][type][zone] + delta); + + /* addition */ + if (old_gen < 0) { + if (lru_gen_is_active(lruvec, new_gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, delta); + return; + } + + /* deletion */ + if (new_gen < 0) { + if (lru_gen_is_active(lruvec, old_gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, -delta); + return; + } + + /* promotion */ + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { + __update_lru_size(lruvec, lru, zone, -delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); + } + + /* demotion requires isolation, e.g., lru_deactivate_fn() */ + VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); +} + +static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming) +{ + unsigned long seq; + unsigned long flags; + int gen = page_lru_gen(page); + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_PAGE(gen != -1, page); + + if (PageUnevictable(page) || !lrugen->enabled) + return false; + /* + * There are three common cases for this page: + * 1. If it's hot, e.g., freshly faulted in or previously hot and + * migrated, add it to the youngest generation. + * 2. If it's cold but can't be evicted immediately, i.e., an anon page + * not in swapcache or a dirty page pending writeback, add it to the + * second oldest generation. + * 3. Everything else (clean, cold) is added to the oldest generation. 
+ */ + if (PageActive(page)) + seq = lrugen->max_seq; + else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) || + (PageReclaim(page) && + (PageDirty(page) || PageWriteback(page)))) + seq = lrugen->min_seq[type] + 1; + else + seq = lrugen->min_seq[type]; + + gen = lru_gen_from_seq(seq); + flags = (gen + 1UL) << LRU_GEN_PGOFF; + /* see the comment on MIN_NR_GENS about PG_active */ + set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags); + + lru_gen_update_size(lruvec, page, -1, gen); + /* for rotate_reclaimable_page() */ + if (reclaiming) + list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); + else + list_add(&page->lru, &lrugen->lists[gen][type][zone]); + + return true; +} + +static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming) +{ + unsigned long flags; + int gen = page_lru_gen(page); + + if (gen < 0) + return false; + + VM_WARN_ON_ONCE_PAGE(PageActive(page), page); + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + + /* for migrate_page_states() */ + flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; + flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags); + gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + lru_gen_update_size(lruvec, page, gen, -1); + list_del(&page->lru); + + return true; +} + +#else /* !CONFIG_LRU_GEN */ + +static inline bool lru_gen_enabled(void) +{ + return false; +} + +static inline bool lru_gen_in_fault(void) +{ + return false; +} + +static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming) +{ + return false; +} + +static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming) +{ + return false; +} + +#endif /* CONFIG_LRU_GEN */ + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec) { enum lru_list lru = page_lru(page); + if (lru_gen_add_page(lruvec, page, false)) + return; + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } @@ -93,6 +317,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, { enum lru_list lru = page_lru(page); + if (lru_gen_add_page(lruvec, page, true)) + return; + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add_tail(&page->lru, &lruvec->lists[lru]); } @@ -100,8 +327,100 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec) { + if (lru_gen_del_page(lruvec, page, false)) + return; + list_del(&page->lru); update_lru_size(lruvec, page_lru(page), page_zonenum(page), -thp_nr_pages(page)); } + +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. 
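 *
 * In other words, a caller that needs the name after dropping the lock
 * follows this pattern (sketch):
 *
 *	mmap_read_lock(mm);
 *	anon_name = anon_vma_name(vma);
 *	anon_vma_name_get(anon_name);
 *	mmap_read_unlock(mm);
 *
 * anon_name->name stays valid here, until the matching:
 *
 *	anon_vma_name_put(anon_name);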
+ */ +extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name_alloc(const char *name); +extern void anon_vma_name_free(struct kref *kref); + +/* mmap_lock should be read-locked */ +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_get(&anon_name->kref); +} + +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_put(&anon_name->kref, anon_vma_name_free); +} + +static inline +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) +{ + /* Prevent anon_name refcount saturation early on */ + if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { + anon_vma_name_get(anon_name); + return anon_name; + + } + return anon_vma_name_alloc(anon_name->name); +} + +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + struct anon_vma_name *anon_name = anon_vma_name(orig_vma); + + if (anon_name) + new_vma->anon_name = anon_vma_name_reuse(anon_name); +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) +{ + /* + * Not using anon_vma_name because it generates a warning if mmap_lock + * is not held, which might be the case here. + */ + if (!vma->vm_file) + anon_vma_name_put(vma->anon_name); +} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + if (anon_name1 == anon_name2) + return true; + + return anon_name1 && anon_name2 && + !strcmp(anon_name1->name, anon_name2->name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + return NULL; +} + +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_anon_vma_name(struct vm_area_struct *vma) {} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + return true; +} + +#endif /* CONFIG_ANON_VMA_NAME */ + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..d290cba826fc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -15,6 +16,7 @@ #include #include #include +#include #include @@ -310,21 +312,38 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). + * + * Note that speculative page faults make an on-stack copy of the VMA, + * so the structure size matters. + * (TODO - it would be preferable to copy only the required vma attributes + * rather than the entire vma). */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ - unsigned long vm_start; /* Our start address within vm_mm. 
*/ - unsigned long vm_end; /* The first byte after our end address - within vm_mm. */ + union { + struct { + /* VMA covers [vm_start; vm_end) addresses within mm */ + unsigned long vm_start, vm_end; - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next, *vm_prev; + /* linked list of VMAs per task, sorted by address */ + struct vm_area_struct *vm_next, *vm_prev; + }; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + struct rcu_head vm_rcu; /* Used for deferred freeing. */ +#endif + }; struct rb_node vm_rb; @@ -350,11 +369,22 @@ struct vm_area_struct { /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* + * Serialized by mmap_sem. Never use directly because it is + * valid only when vm_file is NULL. Use anon_vma_name instead. + */ + struct anon_vma_name *anon_name; + }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -385,6 +415,19 @@ struct vm_area_struct { struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + /* + * The name does not reflect the usage and is not renamed to keep + * the ABI intact. + * This is used to refcount VMA in get_vma/put_vma. + */ + atomic_t file_ref_count; +#endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __randomize_layout; struct core_thread { @@ -399,6 +442,7 @@ struct core_state { }; struct kioctx_table; +struct percpu_rw_semaphore; struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ @@ -471,6 +515,10 @@ struct mm_struct { * cacheline. */ struct rw_semaphore mmap_lock; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + unsigned long mmap_seq; +#endif + struct list_head mmlist; /* List of maybe swapped mm's. These * are globally strung together off @@ -543,7 +591,10 @@ struct mm_struct { struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_subscriptions *notifier_subscriptions; -#endif +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + struct percpu_rw_semaphore *mmu_notifier_lock; +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ +#endif /* CONFIG_MMU_NOTIFIER */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif @@ -580,6 +631,24 @@ struct mm_struct { #ifdef CONFIG_IOMMU_SUPPORT u32 pasid; #endif +#ifdef CONFIG_LRU_GEN + struct { + /* this mm_struct is on lru_gen_mm_list */ + struct list_head list; + /* + * Set when switching to this mm_struct, as a hint of + * whether it has been used since the last time per-node + * page table walkers cleared the corresponding bits. 
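 *
 * Expected wiring for the mm-list hooks declared below (an assumption
 * based on the matching mm/ and scheduler changes, which are not part of
 * this header):
 *
 *	lru_gen_init_mm(mm);	from mm_init()
 *	lru_gen_add_mm(mm);	when the mm gains its first user
 *	lru_gen_use_mm(mm);	from context_switch(), records the hint above
 *	lru_gen_del_mm(mm);	from __mmput(), before the mm is freed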
+ */ + unsigned long bitmap; +#ifdef CONFIG_MEMCG + /* points to the memcg of "owner" above */ + struct mem_cgroup *memcg; +#endif + } lru_gen; +#endif /* CONFIG_LRU_GEN */ + + ANDROID_KABI_RESERVE(1); } __randomize_layout; /* @@ -606,6 +675,66 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return (struct cpumask *)&mm->cpu_bitmap; } +#ifdef CONFIG_LRU_GEN + +struct lru_gen_mm_list { + /* mm_struct list for page table walkers */ + struct list_head fifo; + /* protects the list above */ + spinlock_t lock; +}; + +void lru_gen_add_mm(struct mm_struct *mm); +void lru_gen_del_mm(struct mm_struct *mm); +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm); +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ + INIT_LIST_HEAD(&mm->lru_gen.list); + mm->lru_gen.bitmap = 0; +#ifdef CONFIG_MEMCG + mm->lru_gen.memcg = NULL; +#endif +} + +static inline void lru_gen_use_mm(struct mm_struct *mm) +{ + /* + * When the bitmap is set, page reclaim knows this mm_struct has been + * used since the last time it cleared the bitmap. So it might be worth + * walking the page tables of this mm_struct to clear the accessed bit. + */ + WRITE_ONCE(mm->lru_gen.bitmap, -1); +} + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_add_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_del_mm(struct mm_struct *mm) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_migrate_mm(struct mm_struct *mm) +{ +} +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_use_mm(struct mm_struct *mm) +{ +} + +#endif /* CONFIG_LRU_GEN */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 96e113e23d04..747805ce07b8 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -7,9 +7,18 @@ #include #include #include +#include -#define MMAP_LOCK_INITIALIZER(name) \ - .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +#define MMAP_LOCK_SEQ_INITIALIZER(name) \ + .mmap_seq = 0, +#else +#define MMAP_LOCK_SEQ_INITIALIZER(name) +#endif + +#define MMAP_LOCK_INITIALIZER(name) \ + .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), \ + MMAP_LOCK_SEQ_INITIALIZER(name) DECLARE_TRACEPOINT(mmap_lock_start_locking); DECLARE_TRACEPOINT(mmap_lock_acquire_returned); @@ -63,13 +72,66 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + mm->mmap_seq = 0; +#endif } +static inline void __mmap_seq_write_lock(struct mm_struct *mm) +{ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + VM_BUG_ON_MM(mm->mmap_seq & 1, mm); + mm->mmap_seq++; + smp_wmb(); +#endif +} + +static inline void __mmap_seq_write_unlock(struct mm_struct *mm) +{ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + smp_wmb(); + mm->mmap_seq++; + VM_BUG_ON_MM(mm->mmap_seq & 1, mm); +#endif +} + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +static inline unsigned long mmap_seq_read_start(struct mm_struct *mm) +{ + unsigned long seq; + + seq = READ_ONCE(mm->mmap_seq); + smp_rmb(); + return seq; +} + +static inline bool __mmap_seq_read_check(struct mm_struct *mm, + unsigned long seq) +{ + smp_rmb(); + return seq == READ_ONCE(mm->mmap_seq); +} + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT_STATS +static inline bool 
mmap_seq_read_check(struct mm_struct *mm, unsigned long seq, + enum vm_event_item fail_event) +{ + if (__mmap_seq_read_check(mm, seq)) + return true; + count_vm_event(fail_event); + return false; +} +#else +#define mmap_seq_read_check(mm, seq, fail) __mmap_seq_read_check(mm, seq) +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT_STATS */ +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, true, true); + __mmap_seq_write_lock(mm); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) @@ -77,37 +139,44 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); __mmap_lock_trace_acquire_returned(mm, true, true); + __mmap_seq_write_lock(mm); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { - int ret; + int error; __mmap_lock_trace_start_locking(mm, true); - ret = down_write_killable(&mm->mmap_lock); - __mmap_lock_trace_acquire_returned(mm, true, ret == 0); - return ret; + error = down_write_killable(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, true, !error); + if (likely(!error)) + __mmap_seq_write_lock(mm); + return error; } static inline bool mmap_write_trylock(struct mm_struct *mm) { - bool ret; + bool ok; __mmap_lock_trace_start_locking(mm, true); - ret = down_write_trylock(&mm->mmap_lock) != 0; - __mmap_lock_trace_acquire_returned(mm, true, ret); - return ret; + ok = down_write_trylock(&mm->mmap_lock) != 0; + __mmap_lock_trace_acquire_returned(mm, true, ok); + if (likely(ok)) + __mmap_seq_write_lock(mm); + return ok; } static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); + __mmap_seq_write_unlock(mm); up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { __mmap_lock_trace_acquire_returned(mm, false, true); + __mmap_seq_write_unlock(mm); downgrade_write(&mm->mmap_lock); } @@ -120,22 +189,22 @@ static inline void mmap_read_lock(struct mm_struct *mm) static inline int mmap_read_lock_killable(struct mm_struct *mm) { - int ret; + int error; __mmap_lock_trace_start_locking(mm, false); - ret = down_read_killable(&mm->mmap_lock); - __mmap_lock_trace_acquire_returned(mm, false, ret == 0); - return ret; + error = down_read_killable(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, !error); + return error; } static inline bool mmap_read_trylock(struct mm_struct *mm) { - bool ret; + bool ok; __mmap_lock_trace_start_locking(mm, false); - ret = down_read_trylock(&mm->mmap_lock) != 0; - __mmap_lock_trace_acquire_returned(mm, false, ret); - return ret; + ok = down_read_trylock(&mm->mmap_lock) != 0; + __mmap_lock_trace_acquire_returned(mm, false, ok); + return ok; } static inline void mmap_read_unlock(struct mm_struct *mm) @@ -162,9 +231,9 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); } -static inline int mmap_lock_is_contended(struct mm_struct *mm) +static inline bool mmap_lock_is_contended(struct mm_struct *mm) { - return rwsem_is_contended(&mm->mmap_lock); + return rwsem_is_contended(&mm->mmap_lock) != 0; } #endif /* _LINUX_MMAP_LOCK_H */ diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 0557b0e5c42c..50937e87fdcb 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -9,6 +9,7 
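/*
 * A minimal sketch, not part of this patch, of the read side these
 * sequence helpers enable: sample mmap_seq without taking mmap_lock, do
 * the lockless work, then validate. An odd count means a writer holds
 * mmap_lock right now; a changed count means the walk raced with one.
 * sketch_speculative_walk() and its body are hypothetical.
 */
static bool sketch_speculative_walk(struct mm_struct *mm, unsigned long addr)
{
	unsigned long seq = mmap_seq_read_start(mm);

	if (seq & 1)		/* mmap_write_lock() bumped it to odd */
		return false;

	/* ... lockless lookup and use of the VMA covering @addr ... */

	/* false means: discard the result and retry under mmap_lock */
	return __mmap_seq_read_check(mm, seq);
}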
@@ #include #include +#include struct mmc_cid { unsigned int manfid; @@ -258,6 +259,8 @@ struct mmc_part { #define MMC_BLK_DATA_AREA_BOOT (1<<1) #define MMC_BLK_DATA_AREA_GP (1<<2) #define MMC_BLK_DATA_AREA_RPMB (1<<3) + + ANDROID_KABI_RESERVE(1); }; /* @@ -339,6 +342,9 @@ struct mmc_card { unsigned int nr_parts; struct workqueue_struct *complete_wq; /* Private workqueue */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; static inline bool mmc_large_sector(struct mmc_card *card) diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0c0c9a0fdf57..d034df740796 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -15,7 +15,10 @@ #include #include #include -#include +#include +#include + +#include struct mmc_ios { unsigned int clock; /* clock rate */ @@ -93,6 +96,25 @@ struct mmc_clk_phase_map { struct mmc_host; +enum mmc_err_stat { + MMC_ERR_CMD_TIMEOUT, + MMC_ERR_CMD_CRC, + MMC_ERR_DAT_TIMEOUT, + MMC_ERR_DAT_CRC, + MMC_ERR_AUTO_CMD, + MMC_ERR_ADMA, + MMC_ERR_TUNING, + MMC_ERR_CMDQ_RED, + MMC_ERR_CMDQ_GCE, + MMC_ERR_CMDQ_ICCE, + MMC_ERR_REQ_TIMEOUT, + MMC_ERR_CMDQ_REQ_TIMEOUT, + MMC_ERR_ICE_CFG, + MMC_ERR_CTRL_TIMEOUT, + MMC_ERR_UNEXPECTED_IRQ, + MMC_ERR_MAX, +}; + struct mmc_host_ops { /* * It is optional for the host to implement pre_req and post_req in @@ -162,6 +184,9 @@ struct mmc_host_ops { /* Prepare HS400 target operating frequency depending host driver */ int (*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios); + /* Execute HS400 tuning depending host driver */ + int (*execute_hs400_tuning)(struct mmc_host *host, struct mmc_card *card); + /* Prepare switch to DDR during the HS400 init sequence */ int (*hs400_prepare_ddr)(struct mmc_host *host); @@ -190,6 +215,9 @@ struct mmc_host_ops { /* Initialize an SD express card, mandatory for MMC_CAP2_SD_EXP. */ int (*init_sd_express)(struct mmc_host *host, struct mmc_ios *ios); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct mmc_cqe_ops { @@ -234,6 +262,9 @@ struct mmc_cqe_ops { * will have zero data bytes transferred. */ void (*cqe_recovery_finish)(struct mmc_host *host); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct mmc_async_req { @@ -261,6 +292,7 @@ struct mmc_slot { int cd_irq; bool cd_wake_enabled; void *handler_priv; + ANDROID_OEM_DATA(1); }; /** @@ -489,15 +521,22 @@ struct mmc_host { int cqe_qdepth; bool cqe_enabled; bool cqe_on; + bool cqe_recovery_reset_always; /* Inline encryption support */ #ifdef CONFIG_MMC_CRYPTO - struct blk_keyslot_manager ksm; + struct blk_crypto_profile crypto_profile; #endif /* Host Software Queue support */ bool hsq_enabled; + u32 err_stats[MMC_ERR_MAX]; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_OEM_DATA(1); + unsigned long private[] ____cacheline_aligned; }; @@ -632,7 +671,14 @@ static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data) return data->flags & MMC_DATA_WRITE ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; } +static inline void mmc_debugfs_err_stats_inc(struct mmc_host *host, + enum mmc_err_stat stat) +{ + host->err_stats[stat] += 1; +} + int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode); +int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd); #endif /* LINUX_MMC_HOST_H */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 545578fb814b..f490e2c24e53 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -99,6 +99,12 @@ static inline bool mmc_op_multi(u32 opcode) opcode == MMC_READ_MULTIPLE_BLOCK; } +static inline bool mmc_op_tuning(u32 opcode) +{ + return opcode == MMC_SEND_TUNING_BLOCK || + opcode == MMC_SEND_TUNING_BLOCK_HS200; +} + /* * MMC_SWITCH argument format: * @@ -449,4 +455,13 @@ static inline bool mmc_ready_for_data(u32 status) #define mmc_driver_type_mask(n) (1 << (n)) +struct mmc_card; + +extern int mmc_select_bus_width(struct mmc_card *card); +extern int mmc_select_hs(struct mmc_card *card); +extern int mmc_select_hs_ddr(struct mmc_card *card); +extern int mmc_select_hs400(struct mmc_card *card); +extern int mmc_hs200_tuning(struct mmc_card *card); +extern int mmc_select_timing(struct mmc_card *card); + #endif /* LINUX_MMC_MMC_H */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 45fc2c81e370..253bb3025e5d 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -6,8 +6,11 @@ #include #include #include +#include +#include #include #include +#include struct mmu_notifier_subscriptions; struct mmu_notifier; @@ -221,6 +224,11 @@ struct mmu_notifier_ops { */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); void (*free_notifier)(struct mmu_notifier *subscription); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /* @@ -240,6 +248,9 @@ struct mmu_notifier { struct mm_struct *mm; struct rcu_head rcu; unsigned int users; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** @@ -499,15 +510,35 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, __mmu_notifier_invalidate_range(mm, start, end); } -static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) +static inline bool mmu_notifier_subscriptions_init(struct mm_struct *mm) { +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + mm->mmu_notifier_lock = kzalloc(sizeof(struct percpu_rw_semaphore), GFP_KERNEL); + if (!mm->mmu_notifier_lock) + return false; + if (percpu_init_rwsem(mm->mmu_notifier_lock)) { + kfree(mm->mmu_notifier_lock); + return false; + } +#endif + mm->notifier_subscriptions = NULL; + return true; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_subscriptions_destroy(mm); + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + if (!in_atomic()) { + percpu_free_rwsem(mm->mmu_notifier_lock); + kfree(mm->mmu_notifier_lock); + } else { + percpu_rwsem_async_destroy(mm->mmu_notifier_lock); + } +#endif } @@ -724,8 +755,9 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } -static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) +static inline bool mmu_notifier_subscriptions_init(struct mm_struct *mm) { + return true; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) @@ -749,4 +781,29 @@ static inline void mmu_notifier_synchronize(void) #endif /* CONFIG_MMU_NOTIFIER */ +#if 
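/*
 * An illustrative sketch, not part of this patch: how a host driver's
 * interrupt path might feed the new err_stats[] counters. The MY_INT_*
 * bits and the function are hypothetical; mmc_op_tuning() is useful here
 * because CRC errors are expected while a tuning command runs and are
 * normally not worth recording.
 */
#define MY_INT_CMD_TIMEOUT	BIT(0)	/* hypothetical controller status bits */
#define MY_INT_DAT_CRC		BIT(1)

static void sketch_record_mmc_errors(struct mmc_host *host, u32 opcode,
				     u32 intmask)
{
	if (intmask & MY_INT_CMD_TIMEOUT)
		mmc_debugfs_err_stats_inc(host, MMC_ERR_CMD_TIMEOUT);

	if ((intmask & MY_INT_DAT_CRC) && !mmc_op_tuning(opcode))
		mmc_debugfs_err_stats_inc(host, MMC_ERR_DAT_CRC);
}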
defined(CONFIG_MMU_NOTIFIER) && defined(CONFIG_SPECULATIVE_PAGE_FAULT) + +static inline bool mmu_notifier_trylock(struct mm_struct *mm) +{ + return percpu_down_read_trylock(mm->mmu_notifier_lock); +} + +static inline void mmu_notifier_unlock(struct mm_struct *mm) +{ + percpu_up_read(mm->mmu_notifier_lock); +} + +#else + +static inline bool mmu_notifier_trylock(struct mm_struct *mm) +{ + return true; +} + +static inline void mmu_notifier_unlock(struct mm_struct *mm) +{ +} + +#endif + #endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6ba100216530..4563cf0626a1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -21,6 +21,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -39,12 +40,12 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 +#define MAX_KSWAPD_THREADS 16 + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, - MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ - MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way @@ -61,6 +62,8 @@ enum migratetype { */ MIGRATE_CMA, #endif + MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ + MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif @@ -72,10 +75,16 @@ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) -# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) +# define is_migrate_cma_page(_page) ({ \ + int mt = get_pageblock_migratetype(_page); \ + bool ret = (mt == MIGRATE_ISOLATE || mt == MIGRATE_CMA) ? true : false; \ + ret; \ +}) +# define get_cma_migrate_type() MIGRATE_CMA #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false +# define get_cma_migrate_type() MIGRATE_MOVABLE #endif static inline bool is_migrate_movable(int mt) @@ -154,9 +163,7 @@ enum zone_stat_item { NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ NR_BOUNCE, -#if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ -#endif NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; @@ -207,6 +214,7 @@ enum node_stat_item { NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif @@ -294,6 +302,207 @@ enum lruvec_flags { */ }; +#endif /* !__GENERATING_BOUNDS_H */ + +/* + * Evictable pages are divided into multiple generations. The youngest and the + * oldest generation numbers, max_seq and min_seq, are monotonically increasing. + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the + * corresponding generation. The gen counter in page->flags stores gen+1 while + * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * + * A page is added to the youngest generation on faulting. The aging needs to + * check the accessed bit at least twice before handing this page over to the + * eviction. The first check takes care of the accessed bit set on the initial + * fault; the second check makes sure this page hasn't been used since then. + * This process, AKA second chance, requires a minimum of two generations, + * hence MIN_NR_GENS. 
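/*
 * A minimal sketch, not part of this patch, of how the new
 * mmu_notifier_trylock()/mmu_notifier_unlock() pair above is meant to be
 * used on a path that must not sleep, such as a speculative fault: if the
 * percpu rwsem cannot be taken immediately, give up and fall back to the
 * classic, mmap_lock-protected path. sketch_notify_range() is hypothetical.
 */
static bool sketch_notify_range(struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	if (!mmu_notifier_trylock(mm))
		return false;	/* caller retries the non-speculative way */

	/* ... mmu notifier callbacks for [start, end) are safe here ... */

	mmu_notifier_unlock(mm);
	return true;
}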
And to maintain ABI compatibility with the active/inactive + * LRU, e.g., /proc/vmstat, these two generations are considered active; the + * rest of generations, if they exist, are considered inactive. See + * lru_gen_is_active(). + * + * PG_active is always cleared while a page is on one of lrugen->lists[] so that + * the aging needs not to worry about it. And it's set again when a page + * considered active is isolated for non-reclaiming purposes, e.g., migration. + * See lru_gen_add_page() and lru_gen_del_page(). + * + * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the + * number of categories of the active/inactive LRU when keeping track of + * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits + * in page->flags. + */ +#define MIN_NR_GENS 2U +#define MAX_NR_GENS 4U + +/* + * Each generation is divided into multiple tiers. A page accessed N times + * through file descriptors is in tier order_base_2(N). A page in the first tier + * (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A page in any other tier (N>1) is marked by + * PG_referenced and PG_workingset. This implies a minimum of two tiers is + * supported without using additional bits in page->flags. + * + * In contrast to moving across generations which requires the LRU lock, moving + * across tiers only involves atomic operations on page->flags and therefore + * has a negligible cost in the buffered access path. In the eviction path, + * comparisons of refaulted/(evicted+protected) from the first tier and the + * rest infer whether pages accessed multiple times through file descriptors + * are statistically hot and thus worth protecting. + * + * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the + * number of categories of the active/inactive LRU when keeping track of + * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in + * page->flags. + */ +#define MAX_NR_TIERS 4U + +#ifndef __GENERATING_BOUNDS_H + +struct lruvec; +struct page_vma_mapped_walk; + +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) + +#ifdef CONFIG_LRU_GEN + +enum { + LRU_GEN_ANON, + LRU_GEN_FILE, +}; + +enum { + LRU_GEN_CORE, + LRU_GEN_MM_WALK, + LRU_GEN_NONLEAF_YOUNG, + NR_LRU_GEN_CAPS +}; + +#define MIN_LRU_BATCH BITS_PER_LONG +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) + +/* whether to keep historical stats from evicted generations */ +#ifdef CONFIG_LRU_GEN_STATS +#define NR_HIST_GENS MAX_NR_GENS +#else +#define NR_HIST_GENS 1U +#endif + +/* + * The youngest generation number is stored in max_seq for both anon and file + * types as they are aged on an equal footing. The oldest generation numbers are + * stored in min_seq[] separately for anon and file types as clean file pages + * can be evicted regardless of swap constraints. + * + * Normally anon and file min_seq are in sync. But if swapping is constrained, + * e.g., out of swap space, file min_seq is allowed to advance and leave anon + * min_seq behind. + * + * The number of pages in each generation is eventually consistent and therefore + * can be transiently negative when reset_batch_size() is pending. 
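/*
 * A worked sketch, not part of this patch, of the generation arithmetic
 * described above. The implementation has equivalents of these helpers;
 * the bodies here are simplified. A generation number seq indexes one of
 * MAX_NR_GENS lists via modulo, and the MIN_NR_GENS youngest generations
 * are the ones reported as "active".
 */
static inline int sketch_lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;		/* list index for @seq */
}

static inline bool sketch_lru_gen_is_active(unsigned long max_seq,
					    unsigned long seq)
{
	/* e.g. with max_seq == 5: generations 5 and 4 are active */
	return seq + MIN_NR_GENS > max_seq;
}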
+ */ +struct lru_gen_struct { + /* the aging increments the youngest generation number */ + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ + unsigned long min_seq[ANON_AND_FILE]; + /* the birth time of each generation in jiffies */ + unsigned long timestamps[MAX_NR_GENS]; + /* the multi-gen LRU lists, lazily sorted on eviction */ + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the exponential moving average of refaulted */ + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; + /* the exponential moving average of evicted+protected */ + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; + /* the first tier doesn't need protection, hence the minus one */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can be modified without holding the LRU lock */ + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multi-gen LRU is enabled */ + bool enabled; +}; + +enum { + MM_LEAF_TOTAL, /* total leaf entries */ + MM_LEAF_OLD, /* old leaf entries */ + MM_LEAF_YOUNG, /* young leaf entries */ + MM_NONLEAF_TOTAL, /* total non-leaf entries */ + MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ + MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ + NR_MM_STATS +}; + +/* double-buffering Bloom filters */ +#define NR_BLOOM_FILTERS 2 + +struct lru_gen_mm_state { + /* set to max_seq after each iteration */ + unsigned long seq; + /* where the current iteration continues (inclusive) */ + struct list_head *head; + /* where the last iteration ended (exclusive) */ + struct list_head *tail; + /* to wait for the last page table walker to finish */ + struct wait_queue_head wait; + /* Bloom filters flip after each iteration */ + unsigned long *filters[NR_BLOOM_FILTERS]; + /* the mm stats for debugging */ + unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; + /* the number of concurrent page table walkers */ + int nr_walkers; +}; + +struct lru_gen_mm_walk { + /* the lruvec under reclaim */ + struct lruvec *lruvec; + /* unstable max_seq from lru_gen_struct */ + unsigned long max_seq; + /* the next address within an mm to scan */ + unsigned long next_addr; + /* to batch promoted pages */ + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* to batch the mm stats */ + int mm_stats[NR_MM_STATS]; + /* total batched items */ + int batched; + bool can_swap; + bool force_scan; +}; + +void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg); +void lru_gen_exit_memcg(struct mem_cgroup *memcg); +#endif + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_init_lruvec(struct lruvec *lruvec) +{ +} + +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) +{ +} +#endif + +#endif /* CONFIG_LRU_GEN */ + struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ @@ -311,6 +520,12 @@ struct lruvec { unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; +#ifdef CONFIG_LRU_GEN + /* evictable 
pages divided into generations */ + struct lru_gen_struct lrugen; + /* to concurrently iterate lru_gen_mm_list */ + struct lru_gen_mm_state mm_state; +#endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -655,6 +870,11 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } ____cacheline_internodealigned_in_smp; enum pgdat_flags { @@ -843,11 +1063,13 @@ typedef struct pglist_data { wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ + struct task_struct *mkswapd[MAX_KSWAPD_THREADS]; int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + ANDROID_OEM_DATA(1); #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; @@ -895,6 +1117,11 @@ typedef struct pglist_data { unsigned long flags; +#ifdef CONFIG_LRU_GEN + /* kswap mm walk data */ + struct lru_gen_mm_walk mm_walk; +#endif + ZONE_PADDING(_pad2_) /* Per-node vmstats */ @@ -1096,6 +1323,7 @@ static inline struct pglist_data *NODE_DATA(int nid) extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); +extern int isolate_anon_lru_page(struct page *page); /** * for_each_online_pgdat - helper macro to iterate over all online nodes diff --git a/include/linux/module.h b/include/linux/module.h index c9f1200b2312..0f004d598db9 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -290,7 +291,7 @@ extern typeof(name) __mod_##type##__##name##_device_table \ * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) -#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, #ns) +#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, __stringify(ns)) struct notifier_block; @@ -380,6 +381,7 @@ struct module { struct module_attribute *modinfo_attrs; const char *version; const char *srcversion; + const char *scmversion; struct kobject *holders_dir; /* Exported symbols */ @@ -404,10 +406,12 @@ struct module { const s32 *gpl_crcs; bool using_gplonly_symbols; -#ifdef CONFIG_MODULE_SIG - /* Signature was verified. */ + /* + * Signature was verified. Unconditionally compiled in Android to + * preserve ABI compatibility between kernels without module + * signing enabled and signed modules. 
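/*
 * An illustrative aside, not part of this patch, on the MODULE_IMPORT_NS()
 * change in module.h above: stringifying with __stringify(ns) instead of
 * #ns lets the namespace argument itself be a macro, expanded before it
 * is turned into a string. MY_DRIVER_NS is hypothetical.
 */
#define MY_DRIVER_NS	VENDOR_BT_NAMESPACE	/* expanded first ... */
MODULE_IMPORT_NS(MY_DRIVER_NS);	/* ... so this imports "VENDOR_BT_NAMESPACE" */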
+ */ bool sig_ok; -#endif bool async_probe_requested; @@ -538,6 +542,10 @@ struct module { struct error_injection_entry *ei_funcs; unsigned int num_ei_funcs; #endif + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} diff --git a/include/linux/mount.h b/include/linux/mount.h index 5d92a7e1a742..9fc79930799e 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -16,6 +16,7 @@ #include #include #include +#include struct super_block; struct vfsmount; @@ -73,6 +74,11 @@ struct vfsmount { struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; struct user_namespace *mnt_userns; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __randomize_layout; static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 8f226d460f51..35d7b14239e2 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ @@ -73,6 +74,7 @@ struct mutex { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif + ANDROID_OEM_DATA_ARRAY(1, 2); }; #ifdef CONFIG_DEBUG_MUTEXES diff --git a/include/linux/net.h b/include/linux/net.h index ba736b457a06..b08d7c61e51f 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -200,6 +201,11 @@ struct proto_ops { int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); int (*set_rcvlowat)(struct sock *sk, int val); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define DECLARE_SOCKADDR(type, dst, src) \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a75d644a120..26373522a139 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,6 +48,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -290,6 +291,9 @@ struct header_ops { const unsigned char *haddr); bool (*validate)(const char *ll_header, unsigned int len); __be16 (*parse_protocol)(const struct sk_buff *skb); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /* These flag bits are private to the generic network queueing @@ -348,6 +352,11 @@ struct napi_struct { struct hlist_node napi_hash_node; unsigned int napi_id; struct task_struct *thread; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; enum { @@ -621,6 +630,11 @@ struct netdev_queue { #ifdef CONFIG_BQL struct dql dql; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } ____cacheline_aligned_in_smp; extern int sysctl_fb_tunnels_only_for_init_net; @@ -758,6 +772,11 @@ struct netdev_rx_queue { #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } ____cacheline_aligned_in_smp; /* @@ -1009,6 +1028,11 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + 
ANDROID_KABI_RESERVE(4); }; #endif @@ -1584,6 +1608,15 @@ struct net_device_ops { struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + ANDROID_KABI_RESERVE(5); + ANDROID_KABI_RESERVE(6); + ANDROID_KABI_RESERVE(7); + ANDROID_KABI_RESERVE(8); }; /** @@ -2287,6 +2320,15 @@ struct net_device { /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + ANDROID_KABI_RESERVE(5); + ANDROID_KABI_RESERVE(6); + ANDROID_KABI_RESERVE(7); + ANDROID_KABI_RESERVE(8); }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2660,6 +2702,11 @@ struct packet_type { struct net *af_packet_net; void *af_packet_priv; struct list_head list; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct offload_callbacks { diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 3fda1a508733..b79cb3de242c 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -14,6 +14,7 @@ #include #include #include +#include #include static inline int NF_DROP_GETERR(int verdict) @@ -177,6 +178,8 @@ struct nf_sockopt_ops { int (*get)(struct sock *sk, int optval, void __user *user, int *len); /* Use the module struct to lock set/get code in place */ struct module *owner; + + ANDROID_KABI_RESERVE(1); }; /* Function to register/unregister hook points. */ @@ -379,6 +382,8 @@ struct nf_nat_hook { unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir); + + ANDROID_KABI_RESERVE(1); }; extern struct nf_nat_hook __rcu *nf_nat_hook; @@ -463,6 +468,8 @@ struct nf_ct_hook { void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); + + ANDROID_KABI_RESERVE(1); }; extern struct nf_ct_hook __rcu *nf_ct_hook; @@ -478,6 +485,8 @@ struct nfnl_ct_hook { u32 portid, u32 report); void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); + + ANDROID_KABI_RESERVE(1); }; extern struct nfnl_ct_hook __rcu *nfnl_ct_hook; diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 72f5ebc5c97a..99de4d5fcc1e 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -188,6 +189,8 @@ struct ip_set_type_variant { bool (*same_set)(const struct ip_set *a, const struct ip_set *b); /* Region-locking is used */ bool region_lock; + + ANDROID_KABI_RESERVE(1); }; struct ip_set_region { @@ -234,6 +237,8 @@ struct ip_set_type { /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; + + ANDROID_KABI_RESERVE(1); }; /* register and unregister set type */ @@ -276,6 +281,8 @@ struct ip_set { size_t offset[IPSET_EXT_ID_MAX]; /* The type specific data */ void *data; + + ANDROID_KABI_RESERVE(1); }; static inline void diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 241e005f290a..e9dd51997ca5 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ 
-28,6 +29,8 @@ struct nfnl_callback { const struct nla_policy *policy; enum nfnl_callback_type type; __u16 attr_count; + + ANDROID_KABI_RESERVE(1); }; enum nfnl_abort_action { @@ -47,6 +50,8 @@ struct nfnetlink_subsystem { enum nfnl_abort_action action); void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); + + ANDROID_KABI_RESERVE(1); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); diff --git a/include/linux/netfilter/xt_quota2.h b/include/linux/netfilter/xt_quota2.h new file mode 100644 index 000000000000..a3918718cbf8 --- /dev/null +++ b/include/linux/netfilter/xt_quota2.h @@ -0,0 +1,26 @@ +#ifndef _XT_QUOTA_H +#define _XT_QUOTA_H +#include + +enum xt_quota_flags { + XT_QUOTA_INVERT = 1 << 0, + XT_QUOTA_GROW = 1 << 1, + XT_QUOTA_PACKET = 1 << 2, + XT_QUOTA_NO_CHANGE = 1 << 3, + XT_QUOTA_MASK = 0x0F, +}; + +struct xt_quota_counter; + +struct xt_quota_mtinfo2 { + char name[15]; + u_int8_t flags; + + /* Comparison-invariant */ + aligned_u64 quota; + + /* Used internally by the kernel */ + struct xt_quota_counter *master __attribute__((aligned(8))); +}; + +#endif /* _XT_QUOTA_H */ diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 48314ade1506..c3d17fe68426 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -7,6 +7,7 @@ #ifndef __LINUX_IP6_NETFILTER_H #define __LINUX_IP6_NETFILTER_H +#include #include #include @@ -65,6 +66,8 @@ struct nf_ipv6_ops { const struct nf_bridge_frag_data *data, struct sk_buff *)); #endif + + ANDROID_KABI_RESERVE(1); }; #ifdef CONFIG_NETFILTER diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 0f233b76c9ce..292ec0ce0d63 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -485,6 +485,7 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) +#define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 6508b97dbf1d..4d486ef8e219 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -58,6 +58,27 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); extern uint32_t of_get_flat_dt_phandle(unsigned long node); +/* + * early_init_dt_scan_chosen - scan the device tree for ramdisk and bootargs + * + * The boot arguments will be placed into the memory pointed to by @data. + * That memory should be COMMAND_LINE_SIZE big and initialized to be a valid + * (possibly empty) string. 
Logic for what will be in @data after this + * function finishes: + * + * - CONFIG_CMDLINE_FORCE=true + * CONFIG_CMDLINE + * - CONFIG_CMDLINE_EXTEND=true, @data is non-empty string + * @data + dt bootargs (even if dt bootargs are empty) + * - CONFIG_CMDLINE_EXTEND=true, @data is empty string + * CONFIG_CMDLINE + dt bootargs (even if dt bootargs are empty) + * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=non-empty: + * dt bootargs + * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is non-empty string + * @data is left unchanged + * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is empty string + * CONFIG_CMDLINE (or "" if that's not defined) + */ extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data); extern int early_init_dt_scan_memory(unsigned long node, const char *uname, diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index ef1e3e736e14..7d79818dc065 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -55,7 +55,8 @@ #define SECTIONS_WIDTH 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ + <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT #elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" @@ -89,8 +90,8 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ - <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 @@ -100,10 +101,15 @@ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS #error "Not enough bits in page flags" #endif +/* see the comment on MAX_NR_TIERS */ +#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \ + ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ + NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) + #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index fbfd3fad48f2..2e75a29f8f16 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -140,6 +140,9 @@ enum pageflags { #endif #ifdef CONFIG_KASAN_HW_TAGS PG_skip_kasan_poison, +#endif +#ifdef CONFIG_64BIT + PG_oem_reserved, #endif __NR_PAGEFLAGS, @@ -791,7 +794,7 @@ PAGE_TYPE_OPS(Guard, guard) extern bool is_free_buddy_page(struct page *page); -__PAGEFLAG(Isolated, isolated, PF_ANY); +PAGEFLAG(Isolated, isolated, PF_ANY); /* * If network-based swap is enabled, sl*b must keep track of whether pages @@ -845,7 +848,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_unevictable | __PG_MLOCKED) + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. @@ -856,7 +859,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page. 
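/*
 * A worked example, not part of this patch, for the
 * early_init_dt_scan_chosen() rules documented in of_fdt.h above: with
 * CONFIG_CMDLINE_EXTEND=y, CONFIG_CMDLINE="console=ttyS0", an empty @data
 * and devicetree bootargs "root=/dev/ram", @data ends up as
 * "console=ttyS0 root=/dev/ram". With CMDLINE_FROM_BOOTLOADER and the
 * same non-empty dt bootargs, @data would be just "root=/dev/ram".
 */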
*/ #define PAGE_FLAGS_CHECK_AT_PREP \ - (PAGEFLAGS_MASK & ~__PG_HWPOISON) + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 572458016331..ccd3ed46434f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -44,7 +44,8 @@ int move_freepages_block(struct zone *zone, struct page *page, */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype, int flags); + unsigned migratetype, int flags, + unsigned long *failed_pfn); /* * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. @@ -58,7 +59,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Test all pages in [start_pfn, end_pfn) are isolated or not. */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - int isol_flags); + int isol_flags, unsigned long *failed_pfn); struct page *alloc_migrate_target(struct page *page, unsigned long private); diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 679591301994..6c9016875af4 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -5,6 +5,7 @@ #include #include #include +#include struct page_counter { atomic_long_t usage; @@ -26,6 +27,7 @@ struct page_counter { /* legacy */ unsigned long watermark; unsigned long failcnt; + ANDROID_VENDOR_DATA(1); /* * 'parent' is placed here to be far from 'usage' to reduce diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index fabb2e1e087f..12733d8b382c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,6 +19,10 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, +#if defined(CONFIG_PAGE_PINNER) + /* page migration failed */ + PAGE_EXT_PINNER_MIGRATION_FAILED, +#endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, @@ -54,8 +58,9 @@ static inline void page_ext_init(void) { } #endif - struct page_ext *lookup_page_ext(const struct page *page); +extern struct page_ext *page_ext_get(struct page *page); +extern void page_ext_put(struct page_ext *page_ext); static inline struct page_ext *page_ext_next(struct page_ext *curr) { @@ -71,11 +76,6 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -static inline struct page_ext *lookup_page_ext(const struct page *page) -{ - return NULL; -} - static inline void page_ext_init(void) { } @@ -87,5 +87,14 @@ static inline void page_ext_init_flatmem_late(void) static inline void page_ext_init_flatmem(void) { } + +static inline struct page_ext *page_ext_get(struct page *page) +{ + return NULL; +} + +static inline void page_ext_put(struct page_ext *page_ext) +{ +} #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */ diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index d8a6aecf99cb..2a21c793d815 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -47,62 +47,77 @@ extern struct page_ext_operations page_idle_ops; static inline bool page_is_young(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline void 
set_page_young(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); } static inline bool test_and_clear_page_young(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline bool page_is_idle(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); + bool page_idle; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); + + return page_idle; } static inline void set_page_idle(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } static inline void clear_page_idle(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } #endif /* CONFIG_64BIT */ diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 719bfe5108c5..aa8d21119e5e 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -8,6 +8,8 @@ extern struct static_key_false page_owner_inited; extern struct page_ext_operations page_owner_ops; +extern depot_stack_handle_t get_page_owner_handle(struct page_ext *page_ext, + unsigned long pfn); extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); diff --git a/include/linux/page_pinner.h b/include/linux/page_pinner.h new file mode 100644 index 000000000000..cb29f5c3d68e --- /dev/null +++ b/include/linux/page_pinner.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_PAGE_PINNER_H +#define __LINUX_PAGE_PINNER_H + +#include + +#ifdef CONFIG_PAGE_PINNER +extern struct static_key_false page_pinner_inited; +extern struct static_key_true failure_tracking; +extern struct page_ext_operations page_pinner_ops; + +extern void __free_page_pinner(struct page *page, unsigned int order); +void __page_pinner_failure_detect(struct page *page); +void __page_pinner_put_page(struct page *page); + +static inline void free_page_pinner(struct page *page, unsigned int order) +{ + if (static_branch_unlikely(&page_pinner_inited)) + __free_page_pinner(page, order); +} + +static inline void page_pinner_put_page(struct page *page) +{ + if (!static_branch_unlikely(&page_pinner_inited)) + return; + + if (!static_branch_unlikely(&failure_tracking)) + return; + + __page_pinner_put_page(page); +} + +static inline void page_pinner_failure_detect(struct page *page) +{ + if (!static_branch_unlikely(&page_pinner_inited)) + return; + + if (!static_branch_unlikely(&failure_tracking)) + return; + + __page_pinner_failure_detect(page); +} +#else +static inline void free_page_pinner(struct page *page, unsigned int order) +{ +} +static inline void 
page_pinner_put_page(struct page *page) +{ +} +static inline void page_pinner_failure_detect(struct page *page) +{ +} +#endif /* CONFIG_PAGE_PINNER */ +#endif /* __LINUX_PAGE_PINNER_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2f7dd14083d9..c44c5016e8be 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -15,6 +15,7 @@ #include #include /* for in_interrupt() */ #include +#include struct pagevec; @@ -310,10 +311,7 @@ static inline struct page *page_cache_alloc(struct address_space *x) return __page_cache_alloc(mapping_gfp_mask(x)); } -static inline gfp_t readahead_gfp_mask(struct address_space *x) -{ - return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; -} +gfp_t readahead_gfp_mask(struct address_space *x); typedef int filler_t(void *, struct page *); @@ -618,7 +616,7 @@ static inline int trylock_page(struct page *page) /* * lock_page may only be called if we have the page's inode pinned. */ -static inline void lock_page(struct page *page) +static inline __sched void lock_page(struct page *page) { might_sleep(); if (!trylock_page(page)) @@ -630,7 +628,7 @@ static inline void lock_page(struct page *page) * signals. It returns 0 if it locked the page and -EINTR if it was * killed while waiting. */ -static inline int lock_page_killable(struct page *page) +static inline __sched int lock_page_killable(struct page *page) { might_sleep(); if (!trylock_page(page)) @@ -646,7 +644,7 @@ static inline int lock_page_killable(struct page *page) * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page * was already locked and the callback defined in 'wait' was queued. */ -static inline int lock_page_async(struct page *page, +static inline __sched int lock_page_async(struct page *page, struct wait_page_queue *wait) { if (!trylock_page(page)) @@ -661,7 +659,7 @@ static inline int lock_page_async(struct page *page, * Return value and mmap_lock implications depend on flags; see * __lock_page_or_retry(). */ -static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, +static inline __sched int lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { might_sleep(); @@ -682,13 +680,13 @@ extern int wait_on_page_bit_killable(struct page *page, int bit_nr); * ie with increased "page->count" so that the page won't * go away during the wait.. */ -static inline void wait_on_page_locked(struct page *page) +static inline __sched void wait_on_page_locked(struct page *page) { if (PageLocked(page)) wait_on_page_bit(compound_head(page), PG_locked); } -static inline int wait_on_page_locked_killable(struct page *page) +static inline __sched int wait_on_page_locked_killable(struct page *page) { if (!PageLocked(page)) return 0; @@ -736,6 +734,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); * Fault in userspace address range. 
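/*
 * A minimal sketch, not part of this patch, of the retry pattern the
 * fault_in_*() helpers declared just below support: when a usercopy
 * fails, fault the destination in as writable and try again.
 * fault_in_subpage_writeable() exists for architectures where writes can
 * fault at sub-page granularity (arm64 MTE is the motivating case).
 * sketch_copy_out() is hypothetical, and real callers bound the retries.
 */
static int sketch_copy_out(char __user *ubuf, const void *kbuf, size_t len)
{
	while (copy_to_user(ubuf, kbuf, len)) {
		/* non-zero: part of the range could not be faulted in */
		if (fault_in_subpage_writeable(ubuf, len))
			return -EFAULT;
	}
	return 0;
}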
*/ size_t fault_in_writeable(char __user *uaddr, size_t size); +size_t fault_in_subpage_writeable(char __user *uaddr, size_t size); size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); size_t fault_in_readable(const char __user *uaddr, size_t size); diff --git a/include/linux/pci.h b/include/linux/pci.h index 9d6e75222868..b0d642b0472c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -41,6 +41,7 @@ #include #include +#include #define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \ PCI_STATUS_SIG_SYSTEM_ERROR | \ @@ -510,6 +511,11 @@ struct pci_dev { /* These methods index pci_reset_fn_methods[] */ u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; static inline struct pci_dev *pci_physfn(struct pci_dev *dev) @@ -573,6 +579,10 @@ struct pci_host_bridge { resource_size_t start, resource_size_t size, resource_size_t align); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + unsigned long private[] ____cacheline_aligned; }; @@ -657,6 +667,11 @@ struct pci_bus { struct bin_attribute *legacy_mem; /* Legacy mem */ unsigned int is_added:1; unsigned int unsafe_warn:1; /* warned about RW1C config write */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define to_pci_bus(n) container_of(n, struct pci_bus, dev) @@ -755,6 +770,8 @@ struct pci_ops { void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where); int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); + + ANDROID_KABI_RESERVE(1); }; /* @@ -830,6 +847,8 @@ struct pci_error_handlers { /* Device driver may resume normal operations */ void (*resume)(struct pci_dev *dev); + + ANDROID_KABI_RESERVE(1); }; @@ -901,6 +920,11 @@ struct pci_driver { const struct attribute_group **dev_groups; struct device_driver driver; struct pci_dynids dynids; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define to_pci_driver(drv) container_of(drv, struct pci_driver, driver) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5fda40f97fe9..bf1668fc9c5e 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -13,7 +13,14 @@ struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *read_count; struct rcuwait writer; - wait_queue_head_t waiters; + /* + * destroy_list_entry is used during object destruction when waiters + * can't be used, therefore reusing the same space. + */ + union { + wait_queue_head_t waiters; + struct list_head destroy_list_entry; + }; atomic_t block; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -127,8 +134,12 @@ extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); +/* Can't be called in atomic context. */ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); +/* Invokes percpu_free_rwsem and frees the semaphore from a worker thread. 
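/*
 * An illustrative sketch, not part of this patch, mirroring the
 * mmu_notifier_subscriptions_destroy() hunk earlier: pick the teardown
 * path by context. percpu_free_rwsem() can block, so atomic context must
 * use the async variant, which defers the teardown (and, per the
 * mmu_notifier hunk, the kfree() of the allocation) to a worker.
 */
static void sketch_destroy_sem(struct percpu_rw_semaphore *sem)
{
	if (in_atomic()) {
		percpu_rwsem_async_destroy(sem);	/* freed by the worker */
	} else {
		percpu_free_rwsem(sem);
		kfree(sem);
	}
}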
*/ +extern void percpu_rwsem_async_destroy(struct percpu_rw_semaphore *sem); + #define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 014eb0a963fc..26b223d988c5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -26,11 +26,13 @@ # include #endif +#define PERF_GUEST_ACTIVE 0x01 +#define PERF_GUEST_USER 0x02 + struct perf_guest_info_callbacks { - int (*is_in_guest)(void); - int (*is_user_mode)(void); - unsigned long (*get_guest_ip)(void); - void (*handle_intel_pt_intr)(void); + unsigned int (*state)(void); + unsigned long (*get_ip)(void); + unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -861,7 +863,7 @@ struct perf_event_context { #define PERF_NR_CONTEXTS 4 /** - * struct perf_event_cpu_context - per cpu event context structure + * struct perf_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; @@ -1241,6 +1243,7 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); +#ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) { @@ -1253,8 +1256,37 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) */ return rcu_dereference(perf_guest_cbs); } -extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); -extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +static inline unsigned int perf_guest_state(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + return guest_cbs ? guest_cbs->state() : 0; +} +static inline unsigned long perf_guest_get_ip(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + /* + * Arbitrarily return '0' in the unlikely scenario that the callbacks + * are unregistered between checking guest state and getting the IP. + */ + return guest_cbs ? 
guest_cbs->get_ip() : 0; +} +static inline unsigned int perf_guest_handle_intel_pt_intr(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->handle_intel_pt_intr) + return guest_cbs->handle_intel_pt_intr(); + return 0; +} +extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +#else +static inline unsigned int perf_guest_state(void) { return 0; } +static inline unsigned long perf_guest_get_ip(void) { return 0; } +static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } +#endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); @@ -1497,11 +1529,6 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } -static inline int perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } -static inline int perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } - static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d468efcf48f4..e6889556e0bf 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, BUILD_BUG(); return 0; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH @@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef arch_has_hw_pte_young +/* + * Return whether the accessed bit is supported on the local CPU. + * + * This stub assumes accessing through an old PTE triggers a page fault. + * Architectures that automatically set the access bit should overwrite it. 
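/*
 * An illustrative sketch, not part of this patch, of what a hypervisor
 * registers under the reworked perf guest callbacks above: ->state() and
 * ->get_ip() are mandatory (the wrappers call them without a NULL check
 * on the member), while ->handle_intel_pt_intr() stays optional. All
 * sketch_* names and the in_guest/guest_user tests are hypothetical.
 */
static unsigned int sketch_guest_state(void)
{
	bool in_guest = false;		/* e.g. a per-CPU "current vCPU" check */
	bool guest_user = false;	/* guest was executing in user mode */

	if (!in_guest)
		return 0;
	return PERF_GUEST_ACTIVE | (guest_user ? PERF_GUEST_USER : 0);
}

static unsigned long sketch_guest_get_ip(void)
{
	return 0;		/* would return the guest instruction pointer */
}

static struct perf_guest_info_callbacks sketch_guest_cbs = {
	.state	= sketch_guest_state,
	.get_ip	= sketch_guest_get_ip,
};

/* paired with: perf_register_guest_info_callbacks(&sketch_guest_cbs); */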
+ */ +static inline bool arch_has_hw_pte_young(void) +{ + return false; +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, diff --git a/include/linux/phy.h b/include/linux/phy.h index 946ccec17858..47f1380b57e1 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -676,6 +677,11 @@ struct phy_device { /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; static inline struct phy_device *to_phy_device(const struct device *dev) @@ -903,6 +909,9 @@ struct phy_driver { int (*get_sqi)(struct phy_device *dev); /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 7c96f169d274..dcc1744503e4 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -11,6 +11,7 @@ #define _PLATFORM_DEVICE_H_ #include +#include #define PLATFORM_DEVID_NONE (-1) #define PLATFORM_DEVID_AUTO (-2) @@ -38,6 +39,9 @@ struct platform_device { /* arch specific additions */ struct pdev_archdata archdata; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #define platform_get_device_id(pdev) ((pdev)->id_entry) @@ -100,6 +104,8 @@ struct platform_device_info { u64 dma_mask; const struct property_entry *properties; + + ANDROID_KABI_RESERVE(1); }; extern struct platform_device *platform_device_register_full( const struct platform_device_info *pdevinfo); @@ -210,6 +216,8 @@ struct platform_driver { struct device_driver driver; const struct platform_device_id *id_table; bool prevent_deferred_probe; + + ANDROID_KABI_RESERVE(1); }; #define to_platform_driver(drv) (container_of((drv), struct platform_driver, \ diff --git a/include/linux/pm.h b/include/linux/pm.h index 1d8209c09686..81b017b94c45 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -15,6 +15,7 @@ #include #include #include +#include /* * Callbacks for platform drivers to implement. @@ -298,6 +299,8 @@ struct dev_pm_ops { int (*runtime_suspend)(struct device *dev); int (*runtime_resume)(struct device *dev); int (*runtime_idle)(struct device *dev); + + ANDROID_KABI_RESERVE(1); }; #ifdef CONFIG_PM_SLEEP @@ -622,6 +625,9 @@ struct dev_pm_info { struct pm_subsys_data *subsys_data; /* Owned by the subsystem. 
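The arch_has_hw_pte_young() stub added to pgtable.h above defaults to false, i.e. the core code must assume an old PTE can only be detected via a page fault. A port whose MMU sets the accessed bit in hardware overrides the stub with the usual pgtable.h self-#define pattern; a sketch, not tied to any real architecture:

/* in a port's <asm/pgtable.h>, picked up before the generic stub */
#define arch_has_hw_pte_young arch_has_hw_pte_young
static inline bool arch_has_hw_pte_young(void)
{
	/* the MMU sets the accessed bit itself; no fault is required */
	return true;
}

Together with the CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG widening of pmdp_test_and_clear_young() above, this appears to be plumbing for the CONFIG_LRU_GEN changes that show up later in this diff, which scan young PTEs without relying on minor faults.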
*/ void (*set_latency_tolerance)(struct device *, s32); struct dev_pm_qos *qos; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; extern int dev_pm_get_subsys_data(struct device *dev); @@ -648,6 +654,8 @@ struct dev_pm_domain { int (*activate)(struct device *dev); void (*sync)(struct device *dev); void (*dismiss)(struct device *dev); + + ANDROID_KABI_RESERVE(1); }; /* diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 4a69d4af3ff8..200841b958c7 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -93,6 +93,7 @@ struct freq_qos_request { enum freq_qos_req_type type; struct plist_node pnode; struct freq_constraints *qos; + ANDROID_OEM_DATA_ARRAY(1, 2); }; diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 9ca1f120a211..ee457436060d 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * All voltages, currents, charges, energies, time and temperatures in uV, @@ -49,6 +50,12 @@ enum { POWER_SUPPLY_CHARGE_TYPE_ADAPTIVE, /* dynamically adjusted speed */ POWER_SUPPLY_CHARGE_TYPE_CUSTOM, /* use CHARGE_CONTROL_* props */ POWER_SUPPLY_CHARGE_TYPE_LONGLIFE, /* slow speed, longer life */ + + /* + * force to 50 to minimize the chances of userspace binary + * incompatibility on newer upstream kernels + */ + POWER_SUPPLY_CHARGE_TYPE_TAPER_EXT = 50, /* charging in CV phase */ }; enum { @@ -227,6 +234,8 @@ struct power_supply_config { char **supplied_to; size_t num_supplicants; + + ANDROID_KABI_RESERVE(1); }; /* Description of power supply */ @@ -268,6 +277,8 @@ struct power_supply_desc { bool no_thermal; /* For APM emulation, think legacy userspace. */ int use_for_apm; + + ANDROID_KABI_RESERVE(1); }; struct power_supply { @@ -309,6 +320,8 @@ struct power_supply { struct led_trigger *charging_blink_full_solid_trig; char *charging_blink_full_solid_trig_name; #endif + + ANDROID_KABI_RESERVE(1); }; /* @@ -377,6 +390,8 @@ struct power_supply_battery_info { int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX]; struct power_supply_resistance_temp_table *resist_table; int resist_table_size; + + ANDROID_KABI_RESERVE(1); }; extern struct atomic_notifier_head power_supply_notifier; @@ -393,12 +408,22 @@ static inline struct power_supply *power_supply_get_by_name(const char *name) #ifdef CONFIG_OF extern struct power_supply *power_supply_get_by_phandle(struct device_node *np, const char *property); +extern int power_supply_get_by_phandle_array(struct device_node *np, + const char *property, + struct power_supply **psy, + ssize_t size); extern struct power_supply *devm_power_supply_get_by_phandle( struct device *dev, const char *property); #else /* !CONFIG_OF */ static inline struct power_supply * power_supply_get_by_phandle(struct device_node *np, const char *property) { return NULL; } +static inline int +power_supply_get_by_phandle_array(struct device_node *np, + const char *property, + struct power_supply **psy, + int size) +{ return 0; } static inline struct power_supply * devm_power_supply_get_by_phandle(struct device *dev, const char *property) { return NULL; } diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 6f190002a202..661234fb6462 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -166,6 +166,7 @@ struct psi_group { struct timer_list poll_timer; wait_queue_head_t poll_wait; atomic_t poll_wakeup; + atomic_t poll_scheduled; /* Protects data used by the monitor */ struct mutex trigger_lock; diff --git a/include/linux/pwm.h 
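By this point the pattern is hard to miss: nearly every exported struct in these headers grows ANDROID_KABI_RESERVE(n) tail padding so that a later LTS or vendor fix can add a member without shifting the offsets of a frozen kernel ABI. A sketch of how a reserve gets consumed, assuming the usual <linux/android_kabi.h> semantics (each reserve is a u64 placeholder and ANDROID_KABI_USE() overlays a new member on it via a union); struct demo_ops is hypothetical:

#include <linux/android_kabi.h>

struct device;	/* kernel forward declaration */

struct demo_ops {
	int (*probe)(struct device *dev);

	ANDROID_KABI_RESERVE(1);	/* frozen u64 of padding */
	ANDROID_KABI_RESERVE(2);
};

/*
 * A later ABI-preserving update would edit the struct in place, e.g.:
 *
 *	ANDROID_KABI_USE(1, int (*shutdown)(struct device *dev));
 *
 * sizeof(struct demo_ops) and every member offset stay unchanged, so
 * out-of-tree modules built against the old layout keep working.
 */

By the same logic, the ANDROID_VENDOR_DATA()/ANDROID_OEM_DATA_ARRAY() fields seen in pm_qos.h and elsewhere reserve scratch space up front for vendor hooks rather than for future upstream fixes.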
b/include/linux/pwm.h index 725c9b784e60..7261b89d3cf7 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -5,6 +5,7 @@ #include #include #include +#include struct pwm_capture; struct seq_file; @@ -90,6 +91,8 @@ struct pwm_device { struct pwm_args args; struct pwm_state state; struct pwm_state last; + + ANDROID_KABI_RESERVE(1); }; /** @@ -284,6 +287,8 @@ struct pwm_ops { enum pwm_polarity polarity); int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm); void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm); + + ANDROID_KABI_RESERVE(1); }; /** @@ -310,6 +315,8 @@ struct pwm_chip { /* only used internally by the PWM framework */ struct list_head list; struct pwm_device *pwms; + + ANDROID_KABI_RESERVE(1); }; /** diff --git a/include/linux/quota.h b/include/linux/quota.h index 18ebd39c9487..6d0e5eecb34f 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -316,6 +316,9 @@ struct quota_format_ops { int (*commit_dqblk)(struct dquot *dquot); /* Write structure for one user */ int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ int (*get_next_id)(struct super_block *sb, struct kqid *qid); /* Get next ID with existing structure in the quota file */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /* Operations working with dquots */ @@ -335,6 +338,9 @@ struct dquot_operations { int (*get_inode_usage) (struct inode *, qsize_t *); /* Get next ID with active quota structure */ int (*get_next_id) (struct super_block *sb, struct kqid *qid); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct path; @@ -438,6 +444,9 @@ struct quotactl_ops { int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_state)(struct super_block *, struct qc_state *); int (*rm_xquota)(struct super_block *, unsigned int); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct quota_format_type { diff --git a/include/linux/regmap.h b/include/linux/regmap.h index e3c9a25a853a..a543ce9a1611 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -19,6 +19,7 @@ #include #include #include +#include struct module; struct clk; @@ -409,6 +410,8 @@ struct regmap_config { unsigned int hwlock_mode; bool can_sleep; + + ANDROID_KABI_RESERVE(1); }; /** @@ -446,6 +449,8 @@ struct regmap_range_cfg { /* Data window (per each page) */ unsigned int window_start; unsigned int window_len; + + ANDROID_KABI_RESERVE(1); }; struct regmap_async; @@ -524,6 +529,8 @@ struct regmap_bus { size_t max_raw_read; size_t max_raw_write; bool free_on_exit; + + ANDROID_KABI_RESERVE(1); }; /* diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index bd7a73db2e66..2a6ca85363a8 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -17,6 +17,7 @@ #include #include #include +#include struct gpio_desc; struct regmap; @@ -235,6 +236,8 @@ struct regulator_ops { int (*resume)(struct regulator_dev *rdev); int (*set_pull_down) (struct regulator_dev *); + + ANDROID_KABI_RESERVE(1); }; /* @@ -424,6 +427,8 @@ struct regulator_desc { unsigned int poll_enabled_time; unsigned int (*of_map_mode)(unsigned int mode); + + ANDROID_KABI_RESERVE(1); }; /** @@ -642,6 +647,8 @@ struct regulator_dev { int cached_err; bool use_cached_err; spinlock_t err_lock; + + ANDROID_KABI_RESERVE(1); }; struct regulator_dev * diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 83c09ac36b13..49a8e24bee33 100644 --- a/include/linux/remoteproc.h +++ 
b/include/linux/remoteproc.h @@ -656,6 +656,7 @@ struct rproc *devm_rproc_alloc(struct device *dev, const char *name, int devm_rproc_add(struct device *dev, struct rproc *rproc); void rproc_add_carveout(struct rproc *rproc, struct rproc_mem_entry *mem); +void rproc_del_carveout(struct rproc *rproc, struct rproc_mem_entry *mem); struct rproc_mem_entry * rproc_mem_entry_init(struct device *dev, @@ -663,6 +664,7 @@ rproc_mem_entry_init(struct device *dev, int (*alloc)(struct rproc *, struct rproc_mem_entry *), int (*release)(struct rproc *, struct rproc_mem_entry *), const char *name, ...); +void rproc_mem_entry_free(struct rproc_mem_entry *mem); struct rproc_mem_entry * rproc_of_resm_mem_entry_init(struct device *dev, u32 of_resm_idx, size_t len, @@ -673,6 +675,7 @@ void rproc_shutdown(struct rproc *rproc); int rproc_detach(struct rproc *rproc); int rproc_set_firmware(struct rproc *rproc, const char *fw_name); void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type); +void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem); void rproc_coredump_using_sections(struct rproc *rproc); int rproc_coredump_add_segment(struct rproc *rproc, dma_addr_t da, size_t size); int rproc_coredump_add_custom_segment(struct rproc *rproc, diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3c7d295746f6..b93a349d9158 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -3,8 +3,11 @@ #define _LINUX_RING_BUFFER_H #include -#include #include +#include +#include + +#include struct trace_buffer; struct ring_buffer_iter; @@ -18,6 +21,8 @@ struct ring_buffer_event { u32 array[]; }; +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) + /** * enum ring_buffer_type - internal ring buffer types * @@ -59,11 +64,49 @@ enum ring_buffer_type { RINGBUF_TYPE_TIME_STAMP, }; +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline int test_time_stamp(u64 delta) +{ + if (delta & TS_DELTA_TEST) + return 1; + return 0; +} + unsigned ring_buffer_event_length(struct ring_buffer_event *event); void *ring_buffer_event_data(struct ring_buffer_event *event); u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *event); +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + +#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) + +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + +struct buffer_data_page { + u64 time_stamp; /* page time stamp */ + local_t commit; /* write committed index */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ +}; + /* * ring_buffer_discard_commit will remove an event that has not * been committed yet. 
If this is used, then ring_buffer_unlock_commit @@ -98,6 +141,14 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) +struct ring_buffer_ext_cb { + int (*update_footers)(int cpu); + int (*swap_reader)(int cpu); +}; + +struct trace_buffer * +ring_buffer_alloc_ext(unsigned long size, struct ring_buffer_ext_cb *cb); + int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full); __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full); @@ -212,4 +263,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); #define trace_rb_cpu_prepare NULL #endif +size_t trace_buffer_pack_size(struct trace_buffer *trace_buffer); +int trace_buffer_pack(struct trace_buffer *trace_buffer, struct trace_buffer_pack *pack); +int ring_buffer_poke(struct trace_buffer *buffer, int cpu); + #endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/linux/ring_buffer_ext.h b/include/linux/ring_buffer_ext.h new file mode 100644 index 000000000000..7d86b8c5ba03 --- /dev/null +++ b/include/linux/ring_buffer_ext.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RING_BUFFER_EXT_H +#define _LINUX_RING_BUFFER_EXT_H +#include +#include + +struct rb_ext_stats { + u64 entries; + unsigned long pages_touched; + unsigned long overrun; +}; + +#define RB_PAGE_FT_HEAD (1 << 0) +#define RB_PAGE_FT_READER (1 << 1) +#define RB_PAGE_FT_COMMIT (1 << 2) + +/* + * The pages where the events are stored are the only shared elements between + * the reader and the external writer. They are convenient to enable + * communication from the writer to the reader. The data will be used by the + * reader to update its view on the ring buffer. + */ +struct rb_ext_page_footer { + atomic_t writer_status; + atomic_t reader_status; + struct rb_ext_stats stats; +}; + +static inline struct rb_ext_page_footer *rb_ext_page_get_footer(void *page) +{ + struct rb_ext_page_footer *footer; + unsigned long page_va = (unsigned long)page; + + page_va = ALIGN_DOWN(page_va, PAGE_SIZE); + + return (struct rb_ext_page_footer *)(page_va + PAGE_SIZE - + sizeof(*footer)); +} + +#define BUF_EXT_PAGE_SIZE (BUF_PAGE_SIZE - sizeof(struct rb_ext_page_footer)) + +/* + * An external writer can't rely on the internal struct ring_buffer_per_cpu. + * Instead, allow to pack the relevant information into struct + * ring_buffer_pack which can be sent to the writer. The latter can then create + * its own view on the ring buffer. 
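Because the footer sits at a fixed offset from the end of each PAGE_SIZE-aligned buffer page, rb_ext_page_get_footer() can recover it from any pointer into the page, which is what the ALIGN_DOWN() above is doing. A small reader-side sketch (demo_* is hypothetical; the argument is assumed to point into a ring-buffer data page):

#include <linux/printk.h>
#include <linux/ring_buffer_ext.h>

static void demo_dump_page_stats(void *rb_data_page)
{
	struct rb_ext_page_footer *ftr = rb_ext_page_get_footer(rb_data_page);

	pr_info("rb ext page: entries=%llu touched=%lu overrun=%lu\n",
		ftr->stats.entries, ftr->stats.pages_touched,
		ftr->stats.overrun);
}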
+ */ +struct ring_buffer_pack { + int cpu; + unsigned long reader_page_va; + unsigned long nr_pages; + unsigned long page_va[]; +}; + +struct trace_buffer_pack { + int nr_cpus; + unsigned long total_pages; + char __data[]; /* contains ring_buffer_pack */ +}; + +static inline +struct ring_buffer_pack *__next_ring_buffer_pack(struct ring_buffer_pack *rb_pack) +{ + size_t len; + + len = offsetof(struct ring_buffer_pack, page_va) + + sizeof(unsigned long) * rb_pack->nr_pages; + + return (struct ring_buffer_pack *)((void *)rb_pack + len); +} + +/* + * Accessor for ring_buffer_pack's within trace_buffer_pack + */ +#define for_each_ring_buffer_pack(rb_pack, cpu, trace_pack) \ + for (rb_pack = (struct ring_buffer_pack *)&trace_pack->__data[0], cpu = 0; \ + cpu < trace_pack->nr_cpus; \ + cpu++, rb_pack = __next_ring_buffer_pack(rb_pack)) +#endif diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c29d9c13378b..87e222a4cffb 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -129,6 +129,11 @@ static inline void anon_vma_lock_read(struct anon_vma *anon_vma) down_read(&anon_vma->root->rwsem); } +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); @@ -248,17 +253,14 @@ void page_mlock(struct page *page); void remove_migration_ptes(struct page *old, struct page *new, bool locked); -/* - * Called by memory-failure.c to kill processes. - */ -struct anon_vma *page_lock_anon_vma_read(struct page *page); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() + * try_lock: bail out if the rmap lock is contended + * contended: indicate the rmap traversal bailed out due to lock contention * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition * anon_lock: for getting anon_lock by optimized way rather than default @@ -266,6 +268,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; + bool try_lock; + bool contended; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. @@ -273,13 +277,21 @@ struct rmap_walk_control { bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); - struct anon_vma *(*anon_lock)(struct page *page); + struct anon_vma *(*anon_lock)(struct page *page, + struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; void rmap_walk(struct page *page, struct rmap_walk_control *rwc); void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); +/* + * Called by memory-failure.c to kill processes. + */ +struct anon_vma *page_lock_anon_vma_read(struct page *page, + struct rmap_walk_control *rwc); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); + #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index a8dcf8a9ae88..e301b2269400 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -2,6 +2,7 @@ /* * Remote processor messaging * + * Copyright (c) 2020 The Linux Foundation. * Copyright (C) 2011 Texas Instruments, Inc. 
* Copyright (C) 2011 Google, Inc. * All rights reserved. @@ -61,7 +62,20 @@ struct rpmsg_device { const struct rpmsg_device_ops *ops; }; +/** + * rpmsg rx callback return definitions + * @RPMSG_HANDLED: rpmsg user is done processing data, framework can free the + * resources related to the buffer + * @RPMSG_DEFER: rpmsg user is not done processing data, framework will hold + * onto resources related to the buffer until rpmsg_rx_done is + * called. User should check their endpoint to see if rx_done + * is a supported operation. + */ +#define RPMSG_HANDLED 0 +#define RPMSG_DEFER 1 + typedef int (*rpmsg_rx_cb_t)(struct rpmsg_device *, void *, int, void *, u32); +typedef int (*rpmsg_rx_sig_t)(struct rpmsg_device *, void *, u32, u32); /** * struct rpmsg_endpoint - binds a local rpmsg address to its user @@ -69,6 +83,8 @@ typedef int (*rpmsg_rx_cb_t)(struct rpmsg_device *, void *, int, void *, u32); * @refcount: when this drops to zero, the ept is deallocated * @cb: rx callback handler * @cb_lock: must be taken before accessing/changing @cb + * @sig_cb: rx serial signal handler + * @rx_done: if set, rpmsg endpoint supports rpmsg_rx_done * @addr: local rpmsg address * @priv: private data for the driver's use * @@ -91,6 +107,8 @@ struct rpmsg_endpoint { struct kref refcount; rpmsg_rx_cb_t cb; struct mutex cb_lock; + rpmsg_rx_sig_t sig_cb; + bool rx_done; u32 addr; void *priv; @@ -104,6 +122,7 @@ struct rpmsg_endpoint { * @probe: invoked when a matching rpmsg channel (i.e. device) is found * @remove: invoked when the rpmsg channel is removed * @callback: invoked when an inbound message is received on the channel + * @signals: invoked when a serial signal change is received on the channel */ struct rpmsg_driver { struct device_driver drv; @@ -111,6 +130,8 @@ struct rpmsg_driver { int (*probe)(struct rpmsg_device *dev); void (*remove)(struct rpmsg_device *dev); int (*callback)(struct rpmsg_device *, void *, int, void *, u32); + int (*signals)(struct rpmsg_device *rpdev, + void *priv, u32 old, u32 new); }; static inline u16 rpmsg16_to_cpu(struct rpmsg_device *rpdev, __rpmsg16 val) @@ -186,6 +207,11 @@ int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst, __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp, poll_table *wait); +int rpmsg_get_signals(struct rpmsg_endpoint *ept); +int rpmsg_set_signals(struct rpmsg_endpoint *ept, u32 set, u32 clear); + +int rpmsg_rx_done(struct rpmsg_endpoint *ept, void *data); + #else static inline int rpmsg_register_device(struct rpmsg_device *rpdev) @@ -296,6 +322,31 @@ static inline __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, return 0; } +static inline int rpmsg_get_signals(struct rpmsg_endpoint *ept) +{ + /* This shouldn't be possible */ + WARN_ON(1); + + return -ENXIO; +} + +static inline int rpmsg_set_signals(struct rpmsg_endpoint *ept, + u32 set, u32 clear) +{ + /* This shouldn't be possible */ + WARN_ON(1); + + return -ENXIO; +} + +static inline int rpmsg_rx_done(struct rpmsg_endpoint *ept, void *data) +{ + /* This shouldn't be possible */ + WARN_ON(1); + + return -ENXIO; +} + #endif /* IS_ENABLED(CONFIG_RPMSG) */ /* use a macro to avoid include chaining to get THIS_MODULE */ diff --git a/include/linux/rtc.h b/include/linux/rtc.h index bd611e26291d..3c384fe9be22 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -16,6 +16,7 @@ #include #include #include +#include #include extern int rtc_month_days(unsigned int month, unsigned int year); @@ -66,6 +67,8 @@ struct rtc_class_ops { int (*alarm_irq_enable)(struct device 
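The RPMSG_HANDLED/RPMSG_DEFER return codes and rpmsg_rx_done() added above let a callback keep an rx buffer alive past its return. A sketch of the intended flow, hedged since only the header side is visible here; all demo_* names, and the assumption that priv carries the driver context, are hypothetical:

#include <linux/rpmsg.h>
#include <linux/workqueue.h>

struct demo_ctx {
	struct rpmsg_endpoint *ept;
	struct work_struct work;
	void *pending;
	int pending_len;
};

static void demo_work_fn(struct work_struct *w)
{
	struct demo_ctx *ctx = container_of(w, struct demo_ctx, work);

	/* ...consume ctx->pending... then let the framework free it */
	rpmsg_rx_done(ctx->ept, ctx->pending);
}

static int demo_rpmsg_cb(struct rpmsg_device *rpdev, void *data, int len,
			 void *priv, u32 src)
{
	struct demo_ctx *ctx = priv;

	if (!rpdev->ept->rx_done)
		return RPMSG_HANDLED;	/* deferral unsupported: done now */

	ctx->pending = data;
	ctx->pending_len = len;
	schedule_work(&ctx->work);
	return RPMSG_DEFER;	/* framework holds the buffer for us */
}

Per the kernel-doc above, a user must check the endpoint's rx_done flag before relying on deferral.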
*, unsigned int enabled); int (*read_offset)(struct device *, long *offset); int (*set_offset)(struct device *, long offset); + + ANDROID_KABI_RESERVE(1); }; struct rtc_device; @@ -159,6 +162,8 @@ struct rtc_device { unsigned int uie_task_active:1; unsigned int uie_timer_active:1; #endif + + ANDROID_KABI_RESERVE(1); }; #define to_rtc_device(d) container_of(d, struct rtc_device, dev) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 352c6127cb90..be9220a6ff62 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -32,6 +32,7 @@ #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include #endif +#include /* * For an uncontended rwsem, count and owner are the only fields a task @@ -64,6 +65,22 @@ struct rw_semaphore { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif + ANDROID_VENDOR_DATA(1); + ANDROID_OEM_DATA_ARRAY(1, 2); +}; + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; + unsigned long timeout; + unsigned long last_rowner; + bool handoff_set; }; /* In all implementations count != 0 means locked */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e418935f8db6..fa048003a245 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,7 +34,9 @@ #include #include #include +#include #include +#include /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -297,6 +299,8 @@ extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); +extern struct task_struct *pick_migrate_task(struct rq *rq); +extern int select_fallback_rq(int cpu, struct task_struct *p); /** * struct prev_cputime - snapshot of system and user cputime @@ -560,6 +564,11 @@ struct sched_entity { */ struct sched_avg avg; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct sched_rt_entity { @@ -578,6 +587,11 @@ struct sched_rt_entity { /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __randomize_layout; struct sched_dl_entity { @@ -911,6 +925,10 @@ struct task_struct { #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif +#ifdef CONFIG_LRU_GEN + /* whether the LRU algorithm may apply to this access */ + unsigned in_lru_fault:1; +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif @@ -999,6 +1017,10 @@ struct task_struct { u64 stimescaled; #endif u64 gtime; +#ifdef CONFIG_CPU_FREQ_TIMES + u64 *time_in_state; + unsigned int max_state; +#endif struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; @@ -1111,6 +1133,7 @@ struct task_struct { raw_spinlock_t pi_lock; struct wake_q_node wake_q; + int wake_q_count; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ @@ -1345,9 +1368,7 @@ struct task_struct { #endif #endif -#if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; -#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ @@ -1474,6 +1495,8 @@ struct task_struct { struct callback_head mce_kill_me; int mce_count; #endif + ANDROID_VENDOR_DATA_ARRAY(1, 64); + ANDROID_OEM_DATA_ARRAY(1, 6); #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; @@ -1489,6 +1512,15 @@ 
struct task_struct { struct callback_head l1d_flush_kill; #endif + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + ANDROID_KABI_RESERVE(5); + ANDROID_KABI_RESERVE(6); + ANDROID_KABI_RESERVE(7); + ANDROID_KABI_RESERVE(8); + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. @@ -1798,6 +1830,16 @@ current_restore_flags(unsigned long orig_flags, unsigned long flags) extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus); + +#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION +extern bool cpupri_check_rt(void); +#else +static inline bool cpupri_check_rt(void) +{ + return false; +} +#endif + #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 5f0e8403e8ce..093aea9b0d8a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -12,6 +12,7 @@ #include #include #include +#include /* * Types defining task->signal and task->sighand and APIs using them: @@ -235,6 +236,11 @@ struct signal_struct { * and may have inconsistent * permissions. */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); } __randomize_layout; /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 304f431178fd..d5ebdb6b79de 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -75,6 +75,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_SMP +extern unsigned int sysctl_sched_pelt_multiplier; + +int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +#endif + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; int sched_energy_aware_handler(struct ctl_table *table, int write, diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 63a04a65e310..b508766ec45b 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -3,6 +3,8 @@ #define _LINUX_SCHED_TOPOLOGY_H #include +#include +#include #include @@ -75,6 +77,8 @@ struct sched_domain_shared { atomic_t nr_busy_cpus; int has_idle_cores; int nr_idle_scan; + + ANDROID_VENDOR_DATA(1); }; struct sched_domain { @@ -144,6 +148,10 @@ struct sched_domain { struct sched_domain_shared *shared; unsigned int span_weight; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + /* * Span of all CPUs in this domain. * diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 00ed419dd464..04e7268eb41b 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -7,6 +7,7 @@ #include #include #include +#include /* * Some day this will be a full-fledged user tracking system.. 
@@ -33,6 +34,9 @@ struct user_struct { /* Miscellaneous per-user rate limit */ struct ratelimit_state ratelimit; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; extern int uids_sysfs_init(void); diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 06cd8fb2f409..9a1394f24da0 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -38,6 +38,7 @@ struct wake_q_head { struct wake_q_node *first; struct wake_q_node **lastp; + int count; }; #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) @@ -52,6 +53,7 @@ static inline void wake_q_init(struct wake_q_head *head) { head->first = WAKE_Q_TAIL; head->lastp = &head->first; + head->count = 0; } static inline bool wake_q_empty(struct wake_q_head *head) diff --git a/include/linux/sched/xacct.h b/include/linux/sched/xacct.h index c078f0a94cec..9544c9d9d534 100644 --- a/include/linux/sched/xacct.h +++ b/include/linux/sched/xacct.h @@ -28,6 +28,11 @@ static inline void inc_syscw(struct task_struct *tsk) { tsk->ioac.syscw++; } + +static inline void inc_syscfs(struct task_struct *tsk) +{ + tsk->ioac.syscfs++; +} #else static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { @@ -44,6 +49,10 @@ static inline void inc_syscr(struct task_struct *tsk) static inline void inc_syscw(struct task_struct *tsk) { } + +static inline void inc_syscfs(struct task_struct *tsk) +{ +} #endif #endif /* _LINUX_SCHED_XACCT_H */ diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index d22f62203ee3..28b40b844307 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -12,6 +12,7 @@ #include #include #include +#include #define SCMI_MAX_STR_SIZE 16 #define SCMI_MAX_NUM_RATES 16 @@ -82,6 +83,8 @@ struct scmi_clk_proto_ops { u64 rate); int (*enable)(const struct scmi_protocol_handle *ph, u32 clk_id); int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id); + + ANDROID_KABI_RESERVE(1); }; /** @@ -129,6 +132,8 @@ struct scmi_perf_proto_ops { bool (*fast_switch_possible)(const struct scmi_protocol_handle *ph, struct device *dev); bool (*power_scale_mw_get)(const struct scmi_protocol_handle *ph); + + ANDROID_KABI_RESERVE(1); }; /** @@ -154,6 +159,8 @@ struct scmi_power_proto_ops { u32 state); int (*state_get)(const struct scmi_protocol_handle *ph, u32 domain, u32 *state); + + ANDROID_KABI_RESERVE(1); }; /** @@ -330,6 +337,8 @@ struct scmi_sensor_info { unsigned int resolution; int exponent; struct scmi_range_attrs scalar_attrs; + + ANDROID_KABI_RESERVE(1); }; /* @@ -465,6 +474,8 @@ struct scmi_sensor_proto_ops { u32 sensor_id, u32 *sensor_config); int (*config_set)(const struct scmi_protocol_handle *ph, u32 sensor_id, u32 sensor_config); + + ANDROID_KABI_RESERVE(1); }; /** @@ -485,6 +496,8 @@ struct scmi_reset_proto_ops { int (*reset)(const struct scmi_protocol_handle *ph, u32 domain); int (*assert)(const struct scmi_protocol_handle *ph, u32 domain); int (*deassert)(const struct scmi_protocol_handle *ph, u32 domain); + + ANDROID_KABI_RESERVE(1); }; /** @@ -624,6 +637,8 @@ struct scmi_handle { void (*devm_protocol_put)(struct scmi_device *sdev, u8 proto); const struct scmi_notify_ops *notify_ops; + + ANDROID_KABI_RESERVE(1); }; enum scmi_std_protocol { @@ -652,6 +667,8 @@ struct scmi_device { const char *name; struct device dev; struct scmi_handle *handle; + + ANDROID_KABI_RESERVE(1); }; #define to_scmi_dev(d) container_of(d, struct scmi_device, dev) diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index d5b6b1550d59..3bfdfb5de85c 100644 
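The task_io_accounting change a little above extends the rchar/wchar/syscr/syscw family with an fsync counter, and inc_syscfs() mirrors the existing inc_syscr()/inc_syscw() helpers. The call site is not part of these header hunks; a hypothetical sketch of where it plausibly lands, in the vfs fsync path:

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/sched/xacct.h>

/* hypothetical wrapper; the real patch presumably touches do_fsync() */
static int demo_fsync(struct file *file, int datasync)
{
	int ret = vfs_fsync(file, datasync);

	inc_syscfs(current);
	return ret;
}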
--- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #ifdef CONFIG_SERIAL_CORE_CONSOLE @@ -80,6 +81,9 @@ struct uart_ops { void (*poll_put_char)(struct uart_port *, unsigned char); int (*poll_get_char)(struct uart_port *); #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #define NO_POLL_CHAR 0x00ff0000 @@ -258,6 +262,9 @@ struct uart_port { struct gpio_desc *rs485_term_gpio; /* enable RS485 bus termination */ struct serial_iso7816 iso7816; void *private_data; /* generic platform data pointer */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; static inline int serial_port_in(struct uart_port *up, int offset) @@ -337,6 +344,8 @@ struct uart_driver { */ struct uart_state *state; struct tty_driver *tty_driver; + + ANDROID_KABI_RESERVE(1); }; void uart_write_wakeup(struct uart_port *port); diff --git a/include/linux/serio.h b/include/linux/serio.h index 6c27d413da92..25d638f584a6 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -13,6 +13,7 @@ #include #include #include +#include #include extern struct bus_type serio_bus; @@ -61,6 +62,8 @@ struct serio { * may get indigestion when exposed to concurrent access (i8042). */ struct mutex *ps2_cmd_mutex; + + ANDROID_KABI_RESERVE(1); }; #define to_serio_port(d) container_of(d, struct serio, dev) @@ -79,6 +82,8 @@ struct serio_driver { void (*cleanup)(struct serio *); struct device_driver driver; + + ANDROID_KABI_RESERVE(1); }; #define to_serio_driver(d) container_of(d, struct serio_driver, driver) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 9814fff58a69..45944bae30fc 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -2,6 +2,8 @@ #ifndef _LINUX_SHRINKER_H #define _LINUX_SHRINKER_H +#include + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. @@ -31,6 +33,7 @@ struct shrink_control { /* current memcg being shrunk (for memcg aware shrinkers) */ struct mem_cgroup *memcg; + ANDROID_OEM_DATA_ARRAY(1, 3); }; #define SHRINK_STOP (~0UL) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 19e595cab23a..76490611e9ef 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -41,6 +41,8 @@ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif +#include +#include /* The interface for checksum offload between the stack and networking drivers * is as follows... @@ -572,6 +574,8 @@ struct skb_shared_info { * remains valid until skb destructor */ void * destructor_arg; + ANDROID_OEM_DATA_ARRAY(1, 3); + /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; }; @@ -966,6 +970,9 @@ struct sk_buff { __u32 headers_end[0]; /* public: */ + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + /* These elements must be at the end, see alloc_skb() for details. 
*/ sk_buff_data_t tail; sk_buff_data_t end; diff --git a/include/linux/slab.h b/include/linux/slab.h index 083f3ce550bc..e82594244dcd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -106,7 +106,7 @@ # define SLAB_ACCOUNT 0 #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC #define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else #define SLAB_KASAN 0 diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 6b0b686f6f90..04c72d9f0835 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -200,6 +201,9 @@ struct spi_device { /* the statistics */ struct spi_statistics statistics; + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + /* * likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: @@ -287,6 +291,8 @@ struct spi_driver { int (*remove)(struct spi_device *spi); void (*shutdown)(struct spi_device *spi); struct device_driver driver; + + ANDROID_KABI_RESERVE(1); }; static inline struct spi_driver *to_spi_driver(struct device_driver *drv) @@ -672,6 +678,9 @@ struct spi_controller { /* Interrupt enable state during PTP system timestamping */ unsigned long irq_flags; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) @@ -967,6 +976,8 @@ struct spi_transfer { #define SPI_TRANS_FAIL_NO_START BIT(0) u16 error; + + ANDROID_KABI_RESERVE(1); }; /** @@ -1033,6 +1044,8 @@ struct spi_message { /* list of spi_res reources when the spi message is processed */ struct list_head resources; + + ANDROID_KABI_RESERVE(1); }; static inline void spi_message_init_no_memset(struct spi_message *m) @@ -1457,6 +1470,8 @@ struct spi_board_info { */ u32 mode; + ANDROID_KABI_RESERVE(1); + /* ... may need additional spi_device chip config data here. 
* avoid stuff protocol drivers can set; but include stuff * needed to behave without being bound to a driver: diff --git a/include/linux/spmi.h b/include/linux/spmi.h index 729bcbf9f5ad..743f08cc9bd9 100644 --- a/include/linux/spmi.h +++ b/include/linux/spmi.h @@ -7,6 +7,7 @@ #include #include #include +#include /* Maximum slave identifier */ #define SPMI_MAX_SLAVE_ID 16 @@ -85,6 +86,7 @@ struct spmi_controller { u8 sid, u16 addr, u8 *buf, size_t len); int (*write_cmd)(struct spmi_controller *ctrl, u8 opcode, u8 sid, u16 addr, const u8 *buf, size_t len); + ANDROID_KABI_RESERVE(1); }; static inline struct spmi_controller *to_spmi_controller(struct device *d) @@ -139,6 +141,7 @@ struct spmi_driver { int (*probe)(struct spmi_device *sdev); void (*remove)(struct spmi_device *sdev); void (*shutdown)(struct spmi_device *sdev); + ANDROID_KABI_RESERVE(1); }; static inline struct spmi_driver *to_spmi_driver(struct device_driver *d) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 22919a94ca19..8ab37500fcba 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -11,14 +11,22 @@ #ifndef _LINUX_STACKDEPOT_H #define _LINUX_STACKDEPOT_H +#include + typedef u32 depot_stack_handle_t; +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, bool can_alloc); + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +void stack_depot_print(depot_stack_handle_t stack); + #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); #else diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index bef158815e83..97455880ac41 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -8,22 +8,6 @@ struct task_struct; struct pt_regs; -#ifdef CONFIG_STACKTRACE -void stack_trace_print(const unsigned long *trace, unsigned int nr_entries, - int spaces); -int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, - unsigned int nr_entries, int spaces); -unsigned int stack_trace_save(unsigned long *store, unsigned int size, - unsigned int skipnr); -unsigned int stack_trace_save_tsk(struct task_struct *task, - unsigned long *store, unsigned int size, - unsigned int skipnr); -unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, - unsigned int size, unsigned int skipnr); -unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); -unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); - -/* Internal interfaces. 
Do not use in generic code */ #ifdef CONFIG_ARCH_STACKWALK /** @@ -76,8 +60,25 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, const struct pt_regs *regs); +#endif /* CONFIG_ARCH_STACKWALK */ -#else /* CONFIG_ARCH_STACKWALK */ +#ifdef CONFIG_STACKTRACE +void stack_trace_print(const unsigned long *trace, unsigned int nr_entries, + int spaces); +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, + unsigned int nr_entries, int spaces); +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr); +unsigned int stack_trace_save_tsk(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr); +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr); +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); + +#ifndef CONFIG_ARCH_STACKWALK +/* Internal interfaces. Do not use in generic code */ struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 4bcd65679cee..27321d5d01df 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #ifdef CONFIG_VT @@ -185,6 +186,8 @@ struct platform_suspend_ops { bool (*suspend_again)(void); void (*end)(void); void (*recover)(void); + + ANDROID_KABI_RESERVE(1); }; struct platform_s2idle_ops { @@ -195,6 +198,8 @@ struct platform_s2idle_ops { void (*restore_early)(void); void (*restore)(void); void (*end)(void); + + ANDROID_KABI_RESERVE(1); }; #ifdef CONFIG_SUSPEND @@ -426,6 +431,8 @@ struct platform_hibernation_ops { int (*pre_restore)(void); void (*restore_cleanup)(void); void (*recover)(void); + + ANDROID_KABI_RESERVE(1); }; #ifdef CONFIG_HIBERNATION @@ -508,6 +515,7 @@ extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); extern void pm_print_active_wakeup_sources(void); +extern void pm_get_active_wakeup_sources(char *pending_sources, size_t max); extern void lock_system_sleep(void); extern void unlock_system_sleep(void); diff --git a/include/linux/swap.h b/include/linux/swap.h index 4efd267e2937..811218cb9693 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -34,6 +34,8 @@ struct pagevec; SWAP_FLAG_DISCARD_PAGES) #define SWAP_BATCH 64 +int kswapd (void *p); + static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; @@ -137,6 +139,10 @@ union swap_header { */ struct reclaim_state { unsigned long reclaimed_slab; +#ifdef CONFIG_LRU_GEN + /* per-thread mm walk data */ + struct lru_gen_mm_walk *mm_walk; +#endif }; #ifdef __KERNEL__ @@ -348,19 +354,9 @@ extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void mark_page_accessed(struct page *); -extern atomic_t lru_disable_count; - -static inline bool lru_cache_disabled(void) -{ - return atomic_read(&lru_disable_count); -} - -static inline void lru_cache_enable(void) -{ - atomic_dec(&lru_disable_count); -} - +extern bool lru_cache_disabled(void); extern void lru_cache_disable(void); +extern void lru_cache_enable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); 
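Back in the stackdepot.h hunk a little above, __stack_depot_save() exposes a can_alloc flag for callers that cannot tolerate a depot slab allocation, and stack_depot_print() prints straight from a handle. A self-contained sketch combining it with the stack_trace_save() API from stacktrace.h (assumes CONFIG_STACKDEPOT and CONFIG_STACKTRACE):

#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

static depot_stack_handle_t demo_record_stack(gfp_t gfp)
{
	unsigned long entries[16];
	unsigned int nr;

	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 1 /* skipnr */);
	/* deduplicated in the depot; a 32-bit handle comes back */
	return stack_depot_save(entries, nr, gfp);
}

static void demo_report_stack(depot_stack_handle_t handle)
{
	if (handle)	/* 0 means the save failed or was never taken */
		stack_depot_print(handle);
}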
extern void lru_add_drain_cpu_zone(struct zone *zone); diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h index 6f6acce064de..bb26108ca23c 100644 --- a/include/linux/task_io_accounting.h +++ b/include/linux/task_io_accounting.h @@ -19,6 +19,8 @@ struct task_io_accounting { u64 syscr; /* # of write syscalls */ u64 syscw; + /* # of fsync syscalls */ + u64 syscfs; #endif /* CONFIG_TASK_XACCT */ #ifdef CONFIG_TASK_IO_ACCOUNTING diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index bb5498bcdd96..733ab62ae141 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -97,6 +97,7 @@ static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, dst->wchar += src->wchar; dst->syscr += src->syscr; dst->syscw += src->syscw; + dst->syscfs += src->syscfs; } #else static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a7ebadf83c68..c54be4467977 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -412,6 +413,8 @@ struct tcp_sock { */ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; + + ANDROID_KABI_RESERVE(1); }; enum tsq_enum { diff --git a/include/linux/thermal.h b/include/linux/thermal.h index b94314ed0c96..2bf6c4c46e3e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #define THERMAL_MAX_TRIPS 12 @@ -78,6 +79,8 @@ struct thermal_zone_device_ops { enum thermal_trend *); void (*hot)(struct thermal_zone_device *); void (*critical)(struct thermal_zone_device *); + + ANDROID_KABI_RESERVE(1); }; struct thermal_cooling_device_ops { @@ -87,6 +90,8 @@ struct thermal_cooling_device_ops { int (*get_requested_power)(struct thermal_cooling_device *, u32 *); int (*state2power)(struct thermal_cooling_device *, unsigned long, u32 *); int (*power2state)(struct thermal_cooling_device *, u32, unsigned long *); + + ANDROID_KABI_RESERVE(1); }; struct thermal_cooling_device { @@ -102,6 +107,8 @@ struct thermal_cooling_device { struct mutex lock; /* protect thermal_instances list */ struct list_head thermal_instances; struct list_head node; + + ANDROID_KABI_RESERVE(1); }; /** @@ -175,6 +182,8 @@ struct thermal_zone_device { struct list_head node; struct delayed_work poll_queue; enum thermal_notify_event notify_event; + + ANDROID_KABI_RESERVE(1); }; /** @@ -195,6 +204,8 @@ struct thermal_governor { void (*unbind_from_tz)(struct thermal_zone_device *tz); int (*throttle)(struct thermal_zone_device *tz, int trip); struct list_head governor_list; + + ANDROID_KABI_RESERVE(1); }; /* Structure that holds binding parameters for a zone */ @@ -230,6 +241,8 @@ struct thermal_bind_params { unsigned long *binding_limits; int (*match) (struct thermal_zone_device *tz, struct thermal_cooling_device *cdev); + + ANDROID_KABI_RESERVE(1); }; /* Structure to define Thermal Zone parameters */ @@ -283,6 +296,8 @@ struct thermal_zone_params { * Used by thermal zone drivers (default 0). */ int offset; + + ANDROID_KABI_RESERVE(1); }; /** @@ -300,6 +315,12 @@ struct thermal_zone_params { * temperature. * @set_trip_temp: a pointer to a function that sets the trip temperature on * hardware. + * @change_mode: a pointer to a function that notifies the thermal zone + * mode change. 
+ * @hot: a pointer to a function that notifies the thermal zone + * hot trip violation. + * @critical: a pointer to a function that notifies the thermal zone + * critical trip violation. */ struct thermal_zone_of_device_ops { int (*get_temp)(void *, int *); @@ -307,6 +328,11 @@ struct thermal_zone_of_device_ops { int (*set_trips)(void *, int, int); int (*set_emul_temp)(void *, int); int (*set_trip_temp)(void *, int, int); + int (*change_mode) (void *, enum thermal_device_mode); + void (*hot)(void *sensor_data); + void (*critical)(void *sensor_data); + + ANDROID_KABI_RESERVE(1); }; /* Function declarations */ diff --git a/include/linux/time64.h b/include/linux/time64.h index 81b9686a2079..ee021da0966e 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -142,7 +142,7 @@ static inline s64 timespec64_to_ns(const struct timespec64 *ts) * * Returns the timespec64 representation of the nsec parameter. */ -extern struct timespec64 ns_to_timespec64(const s64 nsec); +extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 78a98bdff76d..309e10ea8cf9 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -239,17 +239,23 @@ struct ktime_timestamps { * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time + * @boot: Boot time * @raw: Monotonic raw system time * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events + * @mono_shift: The monotonic clock slope shift + * @mono_mult: The monotonic clock slope mult */ struct system_time_snapshot { u64 cycles; ktime_t real; + ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; + u32 mono_shift; + u32 mono_mult; }; /** diff --git a/include/linux/timer.h b/include/linux/timer.h index fda13c9d1256..6d765a94e1d5 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -7,6 +7,7 @@ #include #include #include +#include struct timer_list { /* @@ -21,6 +22,9 @@ struct timer_list { #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #ifdef CONFIG_LOCKDEP diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 28031b15f878..0d3fd60c70bd 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -242,7 +242,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ - static inline void trace_##name(proto) \ + static inline void __nocfi trace_##name(proto) \ { \ if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ @@ -306,7 +306,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) .unregfunc = _unreg, \ .funcs = NULL }; \ __TRACEPOINT_ENTRY(_name); \ - int __traceiter_##_name(void *__data, proto) \ + int __nocfi __traceiter_##_name(void *__data, proto) \ { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ diff --git a/include/linux/tty.h b/include/linux/tty.h index 168e57e40bbb..b41d8dfc7005 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -15,6 +15,7 @@ #include #include #include +#include /* @@ -202,6 +203,9 @@ struct tty_struct { /* If the tty has a pending do_SAK, 
queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index c20431d8def8..8ae79ad51278 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -238,6 +238,7 @@ #include #include #include +#include struct tty_struct; struct tty_driver; @@ -289,6 +290,9 @@ struct tty_operations { void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif int (*proc_show)(struct seq_file *, void *); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); } __randomize_layout; struct tty_driver { @@ -323,6 +327,9 @@ struct tty_driver { const struct tty_operations *ops; struct list_head tty_drivers; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); } __randomize_layout; extern struct list_head tty_drivers; diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index b1d812e902aa..c0aa3a6c5f1c 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -131,6 +131,7 @@ struct tty_struct; #include #include #include +#include /* * the semaphore definition @@ -213,6 +214,9 @@ struct tty_ldisc_ops { const char *fp, int count); struct module *owner; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct tty_ldisc { diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 6e86e9e118b6..8737f899b85c 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -6,6 +6,7 @@ #include #include #include +#include /* * Port level information. Each device keeps its own port level information @@ -39,6 +40,8 @@ struct tty_port_operations { int (*activate)(struct tty_port *port, struct tty_struct *tty); /* Called on the final put of a port */ void (*destruct)(struct tty_port *port); + + ANDROID_KABI_RESERVE(1); }; struct tty_port_client_operations { @@ -72,6 +75,8 @@ struct tty_port { set to size of fifo */ struct kref kref; /* Ref counter */ void *client_data; + + ANDROID_KABI_RESERVE(1); }; /* tty_port::iflags bits -- use atomic bit ops */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ac0394087f7d..425c0d6156ef 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -271,6 +271,28 @@ static inline bool pagefault_disabled(void) */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) +#ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS + +/** + * probe_subpage_writeable: probe the user range for write faults at sub-page + * granularity (e.g. arm64 MTE) + * @uaddr: start of address range + * @size: size of address range + * + * Returns 0 on success, the number of bytes not probed on fault. + * + * It is expected that the caller checked for the write permission of each + * page in the range either by put_user() or GUP. The architecture port can + * implement a more efficient get_user() probing if the same sub-page faults + * are triggered by either a read or a write. 
+ */ +static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) +{ + return 0; +} + +#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ + #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 47c5962b876b..7f4e6c71c5d5 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -16,6 +16,7 @@ #include #include #include +#include struct module; struct uio_map; @@ -77,6 +78,8 @@ struct uio_device { struct mutex info_lock; struct kobject *map_dir; struct kobject *portio_dir; + + ANDROID_KABI_RESERVE(1); }; /** @@ -109,6 +112,7 @@ struct uio_info { int (*open)(struct uio_info *info, struct inode *inode); int (*release)(struct uio_info *info, struct inode *inode); int (*irqcontrol)(struct uio_info *info, s32 irq_on); + ANDROID_KABI_RESERVE(1); }; extern int __must_check diff --git a/include/linux/usb.h b/include/linux/usb.h index 978a6011ea1b..60922458f6ac 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -22,6 +22,7 @@ #include /* for current && schedule_timeout */ #include /* for struct mutex */ #include /* for runtime PM */ +#include struct usb_device; struct usb_driver; @@ -257,6 +258,11 @@ struct usb_interface { struct device dev; /* interface specific device info */ struct device *usb_dev; struct work_struct reset_ws; /* for resets in atomic context */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define to_usb_interface(d) container_of(d, struct usb_interface, dev) @@ -402,6 +408,11 @@ struct usb_host_bos { struct usb_ssp_cap_descriptor *ssp_cap; struct usb_ss_container_id_descriptor *ss_id; struct usb_ptm_cap_descriptor *ptm_cap; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; int __usb_get_extra_descriptor(char *buffer, unsigned size, @@ -461,10 +472,13 @@ struct usb_bus { unsigned resuming_ports; /* bit array: resuming root-hub ports */ -#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) struct mon_bus *mon_bus; /* non-null when associated */ int monitored; /* non-zero when monitored */ -#endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct usb_dev_state; @@ -706,6 +720,11 @@ struct usb_device { u16 hub_delay; unsigned use_generic_driver:1; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define to_usb_device(d) container_of(d, struct usb_device, dev) @@ -1219,6 +1238,11 @@ struct usb_driver { unsigned int supports_autosuspend:1; unsigned int disable_hub_initiated_lpm:1; unsigned int soft_unbind:1; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define to_usb_driver(d) container_of(d, struct usb_driver, drvwrap.driver) @@ -1602,6 +1626,12 @@ struct urb { int error_count; /* (return) number of ISO errors */ void *context; /* (in) context for completion */ usb_complete_t complete; /* (in) completion routine */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + struct usb_iso_packet_descriptor iso_frame_desc[]; /* (in) ISO ONLY */ }; diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 9d2762279286..043a7bfdfb41 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -592,6 +592,7 @@ struct 
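The default probe_subpage_writeable() above returns 0 (nothing left unprobed) because without CONFIG_ARCH_HAS_SUBPAGE_FAULTS there is no sub-page granularity to check; an arm64 MTE implementation would override it. A sketch of the caller contract described in the comment:

#include <linux/uaccess.h>

/*
 * Hypothetical fast-path guard: page-level write access has already
 * been proven via put_user() or GUP, per the kernel-doc above.
 */
static bool demo_user_range_writeable(char __user *uaddr, size_t size)
{
	return probe_subpage_writeable(uaddr, size) == 0;
}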
usb_function_instance { struct config_group group; struct list_head cfs_list; struct usb_function_driver *fd; + struct usb_function *f; int (*set_inst_name)(struct usb_function_instance *inst, const char *name); void (*free_func_inst)(struct usb_function_instance *inst); diff --git a/include/linux/usb/f_accessory.h b/include/linux/usb/f_accessory.h new file mode 100644 index 000000000000..ebe3c4d59309 --- /dev/null +++ b/include/linux/usb/f_accessory.h @@ -0,0 +1,23 @@ +/* + * Gadget Function Driver for Android USB accessories + * + * Copyright (C) 2011 Google, Inc. + * Author: Mike Lockwood + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __LINUX_USB_F_ACCESSORY_H +#define __LINUX_USB_F_ACCESSORY_H + +#include + +#endif /* __LINUX_USB_F_ACCESSORY_H */ diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 10fe57cf40be..a0fd828b2080 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -122,6 +122,8 @@ struct usb_request { int status; unsigned actual; + + ANDROID_KABI_RESERVE(1); }; /*-------------------------------------------------------------------------*/ @@ -152,6 +154,8 @@ struct usb_ep_ops { int (*fifo_status) (struct usb_ep *ep); void (*fifo_flush) (struct usb_ep *ep); + + ANDROID_KABI_RESERVE(1); }; /** @@ -240,6 +244,8 @@ struct usb_ep { u8 address; const struct usb_endpoint_descriptor *desc; const struct usb_ss_ep_comp_descriptor *comp_desc; + + ANDROID_KABI_RESERVE(1); }; /*-------------------------------------------------------------------------*/ @@ -330,6 +336,11 @@ struct usb_gadget_ops { struct usb_endpoint_descriptor *, struct usb_ss_ep_comp_descriptor *); int (*check_config)(struct usb_gadget *gadget); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /** diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index cfa04bbe55a9..5a2336b49867 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -25,6 +25,7 @@ #include #include #include +#include #define MAX_TOPO_LEVEL 6 @@ -228,6 +229,11 @@ struct usb_hcd { * (ohci 32, uhci 1024, ehci 256/512/1024). */ + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + /* The HC driver's private data is stored at the end of * this structure. 
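 * The ANDROID_KABI_RESERVE() slots added above are padding for the
 * frozen GKI ABI. As a sketch (assuming the companion ANDROID_KABI_USE()
 * macro from the same android_kabi.h convention), a post-freeze change
 * consumes a slot in place instead of growing the structure:
 *
 *	// at KMI freeze time:
 *	ANDROID_KABI_RESERVE(1);
 *
 *	// later, without changing the struct size or member offsets:
 *	ANDROID_KABI_USE(1, unsigned long example_new_field);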
*/ @@ -421,6 +427,11 @@ struct hc_driver { #define EHSET_TEST_SINGLE_STEP_SET_FEATURE 0x06 int (*submit_single_step_set_feature)(struct usb_hcd *, struct urb *, int); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; static inline int hcd_giveback_urb_in_bh(struct usb_hcd *hcd) @@ -582,6 +593,11 @@ struct usb_tt { spinlock_t lock; struct list_head clear_list; /* of usb_tt_clear */ struct work_struct clear_work; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct usb_tt_clear { @@ -707,8 +723,6 @@ static inline void usb_hcd_resume_root_hub(struct usb_hcd *hcd) /*-------------------------------------------------------------------------*/ -#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) - struct usb_mon_operations { void (*urb_submit)(struct usb_bus *bus, struct urb *urb); void (*urb_submit_error)(struct usb_bus *bus, struct urb *urb, int err); @@ -741,16 +755,6 @@ static inline void usbmon_urb_complete(struct usb_bus *bus, struct urb *urb, int usb_mon_register(const struct usb_mon_operations *ops); void usb_mon_deregister(void); -#else - -static inline void usbmon_urb_submit(struct usb_bus *bus, struct urb *urb) {} -static inline void usbmon_urb_submit_error(struct usb_bus *bus, struct urb *urb, - int error) {} -static inline void usbmon_urb_complete(struct usb_bus *bus, struct urb *urb, - int status) {} - -#endif /* CONFIG_USB_MON || CONFIG_USB_MON_MODULE */ - /*-------------------------------------------------------------------------*/ /* random stuff */ diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h index 6475f880be37..9a1b417dfd58 100644 --- a/include/linux/usb/otg.h +++ b/include/linux/usb/otg.h @@ -12,6 +12,7 @@ #include #include +#include struct usb_otg { u8 default_a; @@ -40,6 +41,7 @@ struct usb_otg { /* start or continue HNP role switch */ int (*start_hnp)(struct usb_otg *otg); + ANDROID_KABI_RESERVE(1); }; /** diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index e4de6bc1f69b..fd0f13ba9e65 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -10,9 +10,11 @@ #ifndef __LINUX_USB_PHY_H #define __LINUX_USB_PHY_H +#include #include #include #include +#include #include enum usb_phy_interface { @@ -155,6 +157,9 @@ struct usb_phy { * manually detect the charger type. */ enum usb_charger_type (*charger_detect)(struct usb_phy *x); + + ANDROID_VENDOR_DATA(1); + ANDROID_KABI_RESERVE(1); }; /* for board-specific init logic */ diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index bffc8d3e14ad..cfe939bbb4a6 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -114,6 +114,14 @@ enum tcpm_transmit_type { * Optional; The USB Communications Capable bit indicates if port * partner is capable of communication over the USB data lines * (e.g. D+/- or SS Tx/Rx). Called to notify the status of the bit. + * @check_contaminant: + * Optional; The callback is called when CC pins report open status + * at the end of the toggling period. Chip level drivers are + * expected to check for contaminant and re-enable toggling if + * needed. When a non-zero value is returned, check_contaminant is + * expected to restart toggling after checking the connector for + * contaminant. This forces the TCPM state machine to transition to + * the TOGGLING state without calling the start_toggling callback.
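 * A sketch of a chip-level driver honoring this contract (the chip
 * type and its sensing/toggling helpers are hypothetical):
 *
 *	static int example_check_contaminant(struct tcpc_dev *dev)
 *	{
 *		struct example_chip *chip =
 *			container_of(dev, struct example_chip, tcpc);
 *
 *		// contaminant still present: leave toggling disabled
 *		if (example_chip_senses_contaminant(chip))
 *			return 0;
 *
 *		// clean: restart toggling in hardware, then return
 *		// non-zero so TCPM enters TOGGLING without invoking
 *		// start_toggling() a second time
 *		example_chip_restart_toggling(chip);
 *		return 1;
 *	}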
*/ struct tcpc_dev { struct fwnode_handle *fwnode; @@ -148,6 +156,7 @@ struct tcpc_dev { bool pps_active, u32 requested_vbus_voltage); bool (*is_vbus_vsafe0v)(struct tcpc_dev *dev); void (*set_partner_usb_comm_capable)(struct tcpc_dev *dev, bool enable); + int (*check_contaminant)(struct tcpc_dev *dev); }; struct tcpm_port; @@ -155,6 +164,10 @@ struct tcpm_port; struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc); void tcpm_unregister_port(struct tcpm_port *port); +int tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, + unsigned int nr_pdo, + unsigned int operating_snk_mw); + void tcpm_vbus_change(struct tcpm_port *port); void tcpm_cc_change(struct tcpm_port *port); void tcpm_sink_frs(struct tcpm_port *port); @@ -165,5 +178,6 @@ void tcpm_pd_transmit_complete(struct tcpm_port *port, enum tcpm_transmit_status status); void tcpm_pd_hard_reset(struct tcpm_port *port); void tcpm_tcpc_reset(struct tcpm_port *port); +bool tcpm_is_debouncing(struct tcpm_port *tcpm); #endif /* __LINUX_USB_TCPM_H */ diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index c1e5910809ad..9afa52b14054 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -4,6 +4,7 @@ #define __LINUX_USB_TYPEC_H #include +#include /* USB Type-C Specification releases */ #define USB_TYPEC_REV_1_0 0x100 /* 1.0 */ @@ -221,6 +222,7 @@ struct typec_operations { int (*vconn_set)(struct typec_port *port, enum typec_role role); int (*port_type_set)(struct typec_port *port, enum typec_port_type type); + ANDROID_KABI_RESERVE(1); }; enum usb_pd_svdm_ver { @@ -258,6 +260,7 @@ struct typec_capability { void *driver_data; const struct typec_operations *ops; + ANDROID_KABI_RESERVE(1); }; /* Specific to try_role(). Indicates the user want's to clear the preference. 
*/ diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index 65933cbe9129..60531284df7c 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -30,6 +30,7 @@ struct typec_altmode { char *desc; const struct typec_altmode_ops *ops; + ANDROID_KABI_RESERVE(1); }; #define to_typec_altmode(d) container_of(d, struct typec_altmode, dev) @@ -63,6 +64,7 @@ struct typec_altmode_ops { int (*notify)(struct typec_altmode *altmode, unsigned long conf, void *data); int (*activate)(struct typec_altmode *altmode, int activate); + ANDROID_KABI_RESERVE(1); }; int typec_altmode_enter(struct typec_altmode *altmode, u32 *vdo); diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 8336e86ce606..a25e3843b599 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -23,6 +23,8 @@ #ifndef __LINUX_USB_USBNET_H #define __LINUX_USB_USBNET_H +#include + /* interface from usbnet core to each USB networking link we handle */ struct usbnet { /* housekeeping */ @@ -84,6 +86,11 @@ struct usbnet { # define EVENT_LINK_CHANGE 11 # define EVENT_SET_RX_MODE 12 # define EVENT_NO_IP_ALIGN 13 + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; static inline struct usb_driver *driver_of(struct usb_interface *intf) @@ -173,6 +180,9 @@ struct driver_info { int out; /* tx endpoint */ unsigned long data; /* Misc driver specific data */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /* Minidrivers are just drivers using the "usbnet" core as a powerful diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 33a4240e6a6f..bd1c6f3542be 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -10,6 +10,7 @@ #include #include #include +#include #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 @@ -99,6 +100,9 @@ struct user_namespace { #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); } __randomize_layout; struct ucounts { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 1af8d65d4c8f..7af6719b645f 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -89,6 +89,8 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *vq); dma_addr_t virtqueue_get_avail_addr(struct virtqueue *vq); dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq); +void virtqueue_disable_dma_api_for_buffers(struct virtqueue *vq); + /** * virtio_device - representation of a device using virtio * @index: unique position on the virtio bus diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index b341dd62aa4d..dafdc7f48c01 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -23,6 +23,8 @@ struct virtio_shm_region { * any of @get/@set, @get_status/@set_status, or @get_features/ * @finalize_features are NOT safe to be called from an atomic * context.
+ * @enable_cbs: enable the callbacks + * vdev: the virtio_device * @get: read the value of a configuration field * vdev: the virtio_device * offset: the offset of the configuration field @@ -76,6 +78,7 @@ struct virtio_shm_region { */ typedef void vq_callback_t(struct virtqueue *); struct virtio_config_ops { + void (*enable_cbs)(struct virtio_device *vdev); void (*get)(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len); void (*set)(struct virtio_device *vdev, unsigned offset, @@ -230,6 +233,9 @@ void virtio_device_ready(struct virtio_device *dev) { unsigned status = dev->config->get_status(dev); + if (dev->config->enable_cbs) + dev->config->enable_cbs(dev); + BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK); } diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 35d7eedb5e8e..de584a776908 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -9,7 +9,8 @@ #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL -#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE virtio_transport_max_vsock_pkt_buf_size +extern uint virtio_transport_max_vsock_pkt_buf_size; enum { VSOCK_VQ_RX = 0, /* for host to guest data */ diff --git a/include/linux/vm_event.h b/include/linux/vm_event.h new file mode 100644 index 000000000000..689a21387dad --- /dev/null +++ b/include/linux/vm_event.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VM_EVENT_H +#define _LINUX_VM_EVENT_H + +#include +#include +#include +#include +#include + +#ifdef CONFIG_VM_EVENT_COUNTERS +/* + * Light weight per cpu counter implementation. + * + * Counters should only be incremented and no critical kernel component + * should rely on the counter values. + * + * Counters are handled completely inline. On many platforms the code + * generated will simply be the increment of a global address. + */ + +struct vm_event_state { + unsigned long event[NR_VM_EVENT_ITEMS]; +}; + +DECLARE_PER_CPU(struct vm_event_state, vm_event_states); + +/* + * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the + * local_irq_disable overhead. 
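 * For example, with the helpers declared just below (PGFAULT is one of
 * the enum vm_event_item values):
 *
 *	count_vm_event(PGFAULT);	// preemption-safe, this_cpu_inc()
 *	__count_vm_events(PGFAULT, 1);	// caller already disabled preemption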
+ */ +static inline void __count_vm_event(enum vm_event_item item) +{ + raw_cpu_inc(vm_event_states.event[item]); +} + +static inline void count_vm_event(enum vm_event_item item) +{ + this_cpu_inc(vm_event_states.event[item]); +} + +static inline void __count_vm_events(enum vm_event_item item, long delta) +{ + raw_cpu_add(vm_event_states.event[item], delta); +} + +static inline void count_vm_events(enum vm_event_item item, long delta) +{ + this_cpu_add(vm_event_states.event[item], delta); +} + +extern void all_vm_events(unsigned long *); + +extern void vm_events_fold_cpu(int cpu); + +#else + +/* Disable counters */ +static inline void count_vm_event(enum vm_event_item item) +{ +} +static inline void count_vm_events(enum vm_event_item item, long delta) +{ +} +static inline void __count_vm_event(enum vm_event_item item) +{ +} +static inline void __count_vm_events(enum vm_event_item item, long delta) +{ +} +static inline void all_vm_events(unsigned long *ret) +{ +} +static inline void vm_events_fold_cpu(int cpu) +{ +} + +#endif /* CONFIG_VM_EVENT_COUNTERS */ + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT_STATS +#define count_vm_spf_event(x) count_vm_event(x) +#else +#define count_vm_spf_event(x) do {} while (0) +#endif + +#ifdef CONFIG_NUMA_BALANCING +#define count_vm_numa_event(x) count_vm_event(x) +#define count_vm_numa_events(x, y) count_vm_events(x, y) +#else +#define count_vm_numa_event(x) do {} while (0) +#define count_vm_numa_events(x, y) do { (void)(y); } while (0) +#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_DEBUG_TLBFLUSH +#define count_vm_tlb_event(x) count_vm_event(x) +#define count_vm_tlb_events(x, y) count_vm_events(x, y) +#else +#define count_vm_tlb_event(x) do {} while (0) +#define count_vm_tlb_events(x, y) do { (void)(y); } while (0) +#endif + +#ifdef CONFIG_DEBUG_VM_VMACACHE +#define count_vm_vmacache_event(x) count_vm_event(x) +#else +#define count_vm_vmacache_event(x) do {} while (0) +#endif + +#define __count_zid_vm_events(item, zid, delta) \ + __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) + +#endif /* _LINUX_VM_EVENT_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a185cc75ff52..0e1302d5a9a4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -130,6 +130,32 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, +#endif +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + SPF_ATTEMPT, + SPF_ABORT, +#endif +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT_STATS + SPF_ABORT_ODD, + SPF_ABORT_UNMAPPED, + SPF_ABORT_NO_SPECULATE, + SPF_ABORT_VMA_COPY, + SPF_ABORT_ACCESS_ERROR, + SPF_ABORT_PUD, + SPF_ABORT_PMD, + SPF_ABORT_ANON_VMA, + SPF_ABORT_PTE_MAP_LOCK_SEQ1, + SPF_ABORT_PTE_MAP_LOCK_PMD, + SPF_ABORT_PTE_MAP_LOCK_PTL, + SPF_ABORT_PTE_MAP_LOCK_SEQ2, + SPF_ABORT_USERFAULTFD, + SPF_ABORT_FAULT, + SPF_ABORT_SWAP, + SPF_ATTEMPT_ANON, + SPF_ATTEMPT_FILE, + SPF_ATTEMPT_NUMA, + SPF_ATTEMPT_PTE, + SPF_ATTEMPT_WP, #endif NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 5535be1012a2..1d8c46f42f2c 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -35,17 +35,6 @@ struct notifier_block; /* in notifier.h */ #define VM_DEFER_KMEMLEAK 0 #endif -/* - * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. - * - * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after - * shadow memory has been mapped. 
It's used to handle allocation errors so that - * we don't try to poison shadow on free if it was never allocated. - * - * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to - * determine which allocations need the module shadow freed. - */ - /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -126,6 +115,13 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size) } #endif +#ifndef arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return prot; +} +#endif + /* * Highlevel APIs for driver use */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d6a6cf53b127..b6f22074a71a 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -40,100 +41,6 @@ enum writeback_stat_item { NR_VM_WRITEBACK_STAT_ITEMS, }; -#ifdef CONFIG_VM_EVENT_COUNTERS -/* - * Light weight per cpu counter implementation. - * - * Counters should only be incremented and no critical kernel component - * should rely on the counter values. - * - * Counters are handled completely inline. On many platforms the code - * generated will simply be the increment of a global address. - */ - -struct vm_event_state { - unsigned long event[NR_VM_EVENT_ITEMS]; -}; - -DECLARE_PER_CPU(struct vm_event_state, vm_event_states); - -/* - * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the - * local_irq_disable overhead. - */ -static inline void __count_vm_event(enum vm_event_item item) -{ - raw_cpu_inc(vm_event_states.event[item]); -} - -static inline void count_vm_event(enum vm_event_item item) -{ - this_cpu_inc(vm_event_states.event[item]); -} - -static inline void __count_vm_events(enum vm_event_item item, long delta) -{ - raw_cpu_add(vm_event_states.event[item], delta); -} - -static inline void count_vm_events(enum vm_event_item item, long delta) -{ - this_cpu_add(vm_event_states.event[item], delta); -} - -extern void all_vm_events(unsigned long *); - -extern void vm_events_fold_cpu(int cpu); - -#else - -/* Disable counters */ -static inline void count_vm_event(enum vm_event_item item) -{ -} -static inline void count_vm_events(enum vm_event_item item, long delta) -{ -} -static inline void __count_vm_event(enum vm_event_item item) -{ -} -static inline void __count_vm_events(enum vm_event_item item, long delta) -{ -} -static inline void all_vm_events(unsigned long *ret) -{ -} -static inline void vm_events_fold_cpu(int cpu) -{ -} - -#endif /* CONFIG_VM_EVENT_COUNTERS */ - -#ifdef CONFIG_NUMA_BALANCING -#define count_vm_numa_event(x) count_vm_event(x) -#define count_vm_numa_events(x, y) count_vm_events(x, y) -#else -#define count_vm_numa_event(x) do {} while (0) -#define count_vm_numa_events(x, y) do { (void)(y); } while (0) -#endif /* CONFIG_NUMA_BALANCING */ - -#ifdef CONFIG_DEBUG_TLBFLUSH -#define count_vm_tlb_event(x) count_vm_event(x) -#define count_vm_tlb_events(x, y) count_vm_events(x, y) -#else -#define count_vm_tlb_event(x) do {} while (0) -#define count_vm_tlb_events(x, y) do { (void)(y); } while (0) -#endif - -#ifdef CONFIG_DEBUG_VM_VMACACHE -#define count_vm_vmacache_event(x) count_vm_event(x) -#else -#define count_vm_vmacache_event(x) do {} while (0) -#endif - -#define __count_zid_vm_events(item, zid, delta) \ - __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) - /* * Zone and node-based page accounting with per cpu differentials. 
*/ diff --git a/include/linux/wait.h b/include/linux/wait.h index 21044562aab7..f93e0da760ef 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -229,6 +229,7 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE) +#define wake_up_sync(x) __wake_up_sync((x), TASK_NORMAL) /* * Wakeup macros to be used to report events to the targets. diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h new file mode 100644 index 000000000000..54f5caaa5cde --- /dev/null +++ b/include/linux/wakeup_reason.h @@ -0,0 +1,37 @@ +/* + * include/linux/wakeup_reason.h + * + * Logs the reason which caused the kernel to resume + * from the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_WAKEUP_REASON_H +#define _LINUX_WAKEUP_REASON_H + +#define MAX_SUSPEND_ABORT_LEN 256 + +#ifdef CONFIG_SUSPEND +void log_irq_wakeup_reason(int irq); +void log_threaded_irq_wakeup_reason(int irq, int parent_irq); +void log_suspend_abort_reason(const char *fmt, ...); +void log_abnormal_wakeup_reason(const char *fmt, ...); +void clear_wakeup_reasons(void); +#else +static inline void log_irq_wakeup_reason(int irq) { } +static inline void log_threaded_irq_wakeup_reason(int irq, int parent_irq) { } +static inline void log_suspend_abort_reason(const char *fmt, ...) { } +static inline void log_abnormal_wakeup_reason(const char *fmt, ...) { } +static inline void clear_wakeup_reasons(void) { } +#endif + +#endif /* _LINUX_WAKEUP_REASON_H */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 74d3c1efd9bb..229c1673084f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -14,6 +14,7 @@ #include #include #include +#include struct workqueue_struct; @@ -101,6 +102,8 @@ struct work_struct { #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) @@ -114,6 +117,9 @@ struct delayed_work { /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct rcu_work { diff --git a/include/linux/xz.h b/include/linux/xz.h index 9884c8440188..7285ca5d56e9 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -233,6 +233,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s); */ XZ_EXTERN void xz_dec_end(struct xz_dec *s); +/* + * Decompressor for MicroLZMA, an LZMA variant with a very minimal header. + * See xz_dec_microlzma_alloc() below for details. + * + * These functions aren't used or available in preboot code and thus aren't + * marked with XZ_EXTERN. This avoids warnings about static functions that + * are never defined. 
+ */ +/** + * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state + */ +struct xz_dec_microlzma; + +/** + * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder + * @mode XZ_SINGLE or XZ_PREALLOC + * @dict_size LZMA dictionary size. This must be at least 4 KiB and + * at most 3 GiB. + * + * In contrast to xz_dec_init(), this function only allocates the memory + * and remembers the dictionary size. xz_dec_microlzma_reset() must be used + * before calling xz_dec_microlzma_run(). + * + * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE. + * With XZ_PREALLOC also a dictionary buffer of dict_size bytes is allocated. + * + * On success, xz_dec_microlzma_alloc() returns a pointer to + * struct xz_dec_microlzma. If memory allocation fails or + * dict_size is invalid, NULL is returned. + * + * The compressed format supported by this decoder is a raw LZMA stream + * whose first byte (always 0x00) has been replaced with bitwise-negation + * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is + * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. + * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream + * marker must not be used. The unused values are reserved for future use. + * This MicroLZMA header format was created for use in EROFS but may be used + * by others too. + */ +extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size); + +/** + * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state + * @s Decoder state allocated using xz_dec_microlzma_alloc() + * @comp_size Compressed size of the input stream + * @uncomp_size Uncompressed size of the input stream. A value smaller + * than the real uncompressed size of the input stream can + * be specified if uncomp_size_is_exact is set to false. + * uncomp_size can never be set to a value larger than the + * expected real uncompressed size because it would eventually + * result in XZ_DATA_ERROR. + * @uncomp_size_is_exact This is an int instead of bool to avoid + * requiring stdbool.h. This should normally be set to true. + * When this is set to false, error detection is weaker. + */ +extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, + uint32_t comp_size, uint32_t uncomp_size, + int uncomp_size_is_exact); + +/** + * xz_dec_microlzma_run() - Run the MicroLZMA decoder + * @s Decoder state initialized using xz_dec_microlzma_reset() + * @b: Input and output buffers + * + * This works similarly to xz_dec_run() with a few important differences. + * Only the differences are documented here. + * + * The only possible return values are XZ_OK, XZ_STREAM_END, and + * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress + * is possible due to lack of input data or output space, this function will + * keep returning XZ_OK. Thus, the calling code must be written so that it + * will eventually provide input and output space matching (or exceeding) + * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset(). + * If the caller cannot do this (for example, if the input file is truncated + * or otherwise corrupt), the caller must detect this error by itself to + * avoid an infinite loop. + * + * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned. + * This can happen also when incorrect dictionary, uncompressed, or + * compressed sizes have been specified. 
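 * Tying the calls above together, a one-shot decode might look like this
 * sketch (XZ_SINGLE mode; the 64 KiB dictionary size is arbitrary):
 *
 *	static int example_microlzma_decode(const u8 *in, size_t in_size,
 *					    u8 *out, size_t out_size)
 *	{
 *		struct xz_dec_microlzma *s;
 *		struct xz_buf b = {
 *			.in = in, .in_pos = 0, .in_size = in_size,
 *			.out = out, .out_pos = 0, .out_size = out_size,
 *		};
 *		enum xz_ret ret;
 *
 *		s = xz_dec_microlzma_alloc(XZ_SINGLE, 1 << 16);
 *		if (!s)
 *			return -ENOMEM;
 *
 *		xz_dec_microlzma_reset(s, in_size, out_size, true);
 *		ret = xz_dec_microlzma_run(s, &b);
 *		xz_dec_microlzma_end(s);
 *
 *		return ret == XZ_STREAM_END ? 0 : -EINVAL;
 *	}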
+ * + * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over + * uncompressed data. This way the caller doesn't need to provide a temporary + * output buffer for the bytes that will be ignored. + * + * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK + * is also possible and thus XZ_SINGLE is actually a limited multi-call mode. + * After XZ_OK the bytes decoded so far may be read from the output buffer. + * It is possible to continue decoding but the variables b->out and b->out_pos + * MUST NOT be changed by the caller. Increasing the value of b->out_size is + * allowed to make more output space available; one doesn't need to provide + * space for the whole uncompressed data on the first call. The input buffer + * may be changed normally like with XZ_PREALLOC. This way input data can be + * provided from non-contiguous memory. + */ +extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, + struct xz_buf *b); + +/** + * xz_dec_microlzma_end() - Free the memory allocated for the decoder state + * @s: Decoder state allocated using xz_dec_microlzma_alloc(). + * If s is NULL, this function does nothing. + */ +extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); + /* * Standalone build (userspace build or in-kernel build for boot time use) * needs a CRC32 implementation. For normal in-kernel use, kernel's own diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index 5e25a098e8ce..40cd99549113 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -541,6 +541,10 @@ int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat, s64 v4l2_get_link_freq(struct v4l2_ctrl_handler *handler, unsigned int mul, unsigned int div); +void v4l2_simplify_fraction(u32 *numerator, u32 *denominator, + unsigned int n_terms, unsigned int threshold); +u32 v4l2_fraction_to_interval(u32 numerator, u32 denominator); + static inline u64 v4l2_buffer_get_timestamp(const struct v4l2_buffer *buf) { /* diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index ebd9cef13309..4af99b8e9243 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -11,6 +11,7 @@ #include #include #include +#include #include /* @@ -85,6 +86,9 @@ union v4l2_ctrl_ptr { struct v4l2_area *p_area; void *p; const void *p_const; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** @@ -117,6 +121,8 @@ struct v4l2_ctrl_ops { int (*g_volatile_ctrl)(struct v4l2_ctrl *ctrl); int (*try_ctrl)(struct v4l2_ctrl *ctrl); int (*s_ctrl)(struct v4l2_ctrl *ctrl); + + ANDROID_KABI_RESERVE(1); }; /** @@ -138,6 +144,8 @@ struct v4l2_ctrl_type_ops { void (*log)(const struct v4l2_ctrl *ctrl); int (*validate)(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr); + + ANDROID_KABI_RESERVE(1); }; /** @@ -303,6 +311,8 @@ struct v4l2_ctrl { union v4l2_ctrl_ptr p_def; union v4l2_ctrl_ptr p_new; union v4l2_ctrl_ptr p_cur; + + ANDROID_KABI_RESERVE(1); }; /** @@ -355,6 +365,8 @@ struct v4l2_ctrl_ref { u32 p_req_dyn_alloc_elems; u32 p_req_elems; union v4l2_ctrl_ptr p_req; + + ANDROID_KABI_RESERVE(1); }; /** @@ -405,6 +417,8 @@ struct v4l2_ctrl_handler { struct list_head requests; struct list_head requests_queued; struct media_request_object req_obj; + + ANDROID_KABI_RESERVE(1); }; /** @@ -456,6 +470,8 @@ struct v4l2_ctrl_config { const char * const *qmenu; const s64 *qmenu_int; unsigned int is_private:1; + + ANDROID_KABI_RESERVE(1); }; /** diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h 
index 6a4afd4a7df2..72e6c26cf024 100644 --- a/include/media/v4l2-dev.h +++ b/include/media/v4l2-dev.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -211,6 +212,8 @@ struct v4l2_file_operations { int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct file *); int (*release) (struct file *); + + ANDROID_KABI_RESERVE(1); }; /* @@ -307,6 +310,9 @@ struct video_device DECLARE_BITMAP(valid_ioctls, BASE_VIDIOC_PRIVATE); struct mutex *lock; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /** diff --git a/include/media/v4l2-uvc.h b/include/media/v4l2-uvc.h new file mode 100644 index 000000000000..f83e31661333 --- /dev/null +++ b/include/media/v4l2-uvc.h @@ -0,0 +1,359 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * v4l2 uvc internal API header + * + * Some commonly needed functions for uvc drivers + */ + +#ifndef __LINUX_V4L2_UVC_H +#define __LINUX_V4L2_UVC_H + +/* ------------------------------------------------------------------------ + * GUIDs + */ +#define UVC_GUID_UVC_CAMERA \ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01} +#define UVC_GUID_UVC_OUTPUT \ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02} +#define UVC_GUID_UVC_MEDIA_TRANSPORT_INPUT \ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03} +#define UVC_GUID_UVC_PROCESSING \ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01} +#define UVC_GUID_UVC_SELECTOR \ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02} +#define UVC_GUID_EXT_GPIO_CONTROLLER \ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03} + +#define UVC_GUID_FORMAT_MJPEG \ + { 'M', 'J', 'P', 'G', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_YUY2 \ + { 'Y', 'U', 'Y', '2', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_YUY2_ISIGHT \ + { 'Y', 'U', 'Y', '2', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0x00, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_NV12 \ + { 'N', 'V', '1', '2', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_YV12 \ + { 'Y', 'V', '1', '2', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_I420 \ + { 'I', '4', '2', '0', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_UYVY \ + { 'U', 'Y', 'V', 'Y', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y800 \ + { 'Y', '8', '0', '0', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y8 \ + { 'Y', '8', ' ', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y10 \ + { 'Y', '1', '0', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y12 \ + { 'Y', '1', '2', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y16 \ + { 'Y', '1', '6', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_BY8 \ + { 'B', 'Y', '8', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 
0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_BA81 \ + { 'B', 'A', '8', '1', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_GBRG \ + { 'G', 'B', 'R', 'G', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_GRBG \ + { 'G', 'R', 'B', 'G', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_RGGB \ + { 'R', 'G', 'G', 'B', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_BG16 \ + { 'B', 'G', '1', '6', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_GB16 \ + { 'G', 'B', '1', '6', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_RG16 \ + { 'R', 'G', '1', '6', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_GR16 \ + { 'G', 'R', '1', '6', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_RGBP \ + { 'R', 'G', 'B', 'P', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_BGR3 \ + { 0x7d, 0xeb, 0x36, 0xe4, 0x4f, 0x52, 0xce, 0x11, \ + 0x9f, 0x53, 0x00, 0x20, 0xaf, 0x0b, 0xa7, 0x70} +#define UVC_GUID_FORMAT_M420 \ + { 'M', '4', '2', '0', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} + +#define UVC_GUID_FORMAT_H264 \ + { 'H', '2', '6', '4', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_H265 \ + { 'H', '2', '6', '5', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y8I \ + { 'Y', '8', 'I', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y12I \ + { 'Y', '1', '2', 'I', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Z16 \ + { 'Z', '1', '6', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_RW10 \ + { 'R', 'W', '1', '0', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_INVZ \ + { 'I', 'N', 'V', 'Z', 0x90, 0x2d, 0x58, 0x4a, \ + 0x92, 0x0b, 0x77, 0x3f, 0x1f, 0x2c, 0x55, 0x6b} +#define UVC_GUID_FORMAT_INZI \ + { 'I', 'N', 'Z', 'I', 0x66, 0x1a, 0x42, 0xa2, \ + 0x90, 0x65, 0xd0, 0x18, 0x14, 0xa8, 0xef, 0x8a} +#define UVC_GUID_FORMAT_INVI \ + { 'I', 'N', 'V', 'I', 0xdb, 0x57, 0x49, 0x5e, \ + 0x8e, 0x3f, 0xf4, 0x79, 0x53, 0x2b, 0x94, 0x6f} +#define UVC_GUID_FORMAT_CNF4 \ + { 'C', ' ', ' ', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} + +#define UVC_GUID_FORMAT_D3DFMT_L8 \ + {0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_KSMEDIA_L8_IR \ + {0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} + +#define UVC_GUID_FORMAT_HEVC \ + { 'H', 'E', 'V', 'C', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} + +/* ------------------------------------------------------------------------ + * Video formats + */ + +struct uvc_format_desc { + char *name; + u8 guid[16]; + u32 fcc; +}; + +static struct uvc_format_desc uvc_fmts[] = { + { + .name = "YUV 4:2:2 (YUYV)", + .guid = UVC_GUID_FORMAT_YUY2, + .fcc = V4L2_PIX_FMT_YUYV, + }, + { + .name = 
"YUV 4:2:2 (YUYV)", + .guid = UVC_GUID_FORMAT_YUY2_ISIGHT, + .fcc = V4L2_PIX_FMT_YUYV, + }, + { + .name = "YUV 4:2:0 (NV12)", + .guid = UVC_GUID_FORMAT_NV12, + .fcc = V4L2_PIX_FMT_NV12, + }, + { + .name = "MJPEG", + .guid = UVC_GUID_FORMAT_MJPEG, + .fcc = V4L2_PIX_FMT_MJPEG, + }, + { + .name = "YVU 4:2:0 (YV12)", + .guid = UVC_GUID_FORMAT_YV12, + .fcc = V4L2_PIX_FMT_YVU420, + }, + { + .name = "YUV 4:2:0 (I420)", + .guid = UVC_GUID_FORMAT_I420, + .fcc = V4L2_PIX_FMT_YUV420, + }, + { + .name = "YUV 4:2:0 (M420)", + .guid = UVC_GUID_FORMAT_M420, + .fcc = V4L2_PIX_FMT_M420, + }, + { + .name = "YUV 4:2:2 (UYVY)", + .guid = UVC_GUID_FORMAT_UYVY, + .fcc = V4L2_PIX_FMT_UYVY, + }, + { + .name = "Greyscale 8-bit (Y800)", + .guid = UVC_GUID_FORMAT_Y800, + .fcc = V4L2_PIX_FMT_GREY, + }, + { + .name = "Greyscale 8-bit (Y8 )", + .guid = UVC_GUID_FORMAT_Y8, + .fcc = V4L2_PIX_FMT_GREY, + }, + { + .name = "Greyscale 8-bit (D3DFMT_L8)", + .guid = UVC_GUID_FORMAT_D3DFMT_L8, + .fcc = V4L2_PIX_FMT_GREY, + }, + { + .name = "IR 8-bit (L8_IR)", + .guid = UVC_GUID_FORMAT_KSMEDIA_L8_IR, + .fcc = V4L2_PIX_FMT_GREY, + }, + { + .name = "Greyscale 10-bit (Y10 )", + .guid = UVC_GUID_FORMAT_Y10, + .fcc = V4L2_PIX_FMT_Y10, + }, + { + .name = "Greyscale 12-bit (Y12 )", + .guid = UVC_GUID_FORMAT_Y12, + .fcc = V4L2_PIX_FMT_Y12, + }, + { + .name = "Greyscale 16-bit (Y16 )", + .guid = UVC_GUID_FORMAT_Y16, + .fcc = V4L2_PIX_FMT_Y16, + }, + { + .name = "BGGR Bayer (BY8 )", + .guid = UVC_GUID_FORMAT_BY8, + .fcc = V4L2_PIX_FMT_SBGGR8, + }, + { + .name = "BGGR Bayer (BA81)", + .guid = UVC_GUID_FORMAT_BA81, + .fcc = V4L2_PIX_FMT_SBGGR8, + }, + { + .name = "GBRG Bayer (GBRG)", + .guid = UVC_GUID_FORMAT_GBRG, + .fcc = V4L2_PIX_FMT_SGBRG8, + }, + { + .name = "GRBG Bayer (GRBG)", + .guid = UVC_GUID_FORMAT_GRBG, + .fcc = V4L2_PIX_FMT_SGRBG8, + }, + { + .name = "RGGB Bayer (RGGB)", + .guid = UVC_GUID_FORMAT_RGGB, + .fcc = V4L2_PIX_FMT_SRGGB8, + }, + { + .name = "RGB565", + .guid = UVC_GUID_FORMAT_RGBP, + .fcc = V4L2_PIX_FMT_RGB565, + }, + { + .name = "BGR 8:8:8 (BGR3)", + .guid = UVC_GUID_FORMAT_BGR3, + .fcc = V4L2_PIX_FMT_BGR24, + }, + { + .name = "H.264", + .guid = UVC_GUID_FORMAT_H264, + .fcc = V4L2_PIX_FMT_H264, + }, + { + .name = "H.265", + .guid = UVC_GUID_FORMAT_H265, + .fcc = V4L2_PIX_FMT_HEVC, + }, + { + .name = "Greyscale 8 L/R (Y8I)", + .guid = UVC_GUID_FORMAT_Y8I, + .fcc = V4L2_PIX_FMT_Y8I, + }, + { + .name = "Greyscale 12 L/R (Y12I)", + .guid = UVC_GUID_FORMAT_Y12I, + .fcc = V4L2_PIX_FMT_Y12I, + }, + { + .name = "Depth data 16-bit (Z16)", + .guid = UVC_GUID_FORMAT_Z16, + .fcc = V4L2_PIX_FMT_Z16, + }, + { + .name = "Bayer 10-bit (SRGGB10P)", + .guid = UVC_GUID_FORMAT_RW10, + .fcc = V4L2_PIX_FMT_SRGGB10P, + }, + { + .name = "Bayer 16-bit (SBGGR16)", + .guid = UVC_GUID_FORMAT_BG16, + .fcc = V4L2_PIX_FMT_SBGGR16, + }, + { + .name = "Bayer 16-bit (SGBRG16)", + .guid = UVC_GUID_FORMAT_GB16, + .fcc = V4L2_PIX_FMT_SGBRG16, + }, + { + .name = "Bayer 16-bit (SRGGB16)", + .guid = UVC_GUID_FORMAT_RG16, + .fcc = V4L2_PIX_FMT_SRGGB16, + }, + { + .name = "Bayer 16-bit (SGRBG16)", + .guid = UVC_GUID_FORMAT_GR16, + .fcc = V4L2_PIX_FMT_SGRBG16, + }, + { + .name = "Depth data 16-bit (Z16)", + .guid = UVC_GUID_FORMAT_INVZ, + .fcc = V4L2_PIX_FMT_Z16, + }, + { + .name = "Greyscale 10-bit (Y10 )", + .guid = UVC_GUID_FORMAT_INVI, + .fcc = V4L2_PIX_FMT_Y10, + }, + { + .name = "IR:Depth 26-bit (INZI)", + .guid = UVC_GUID_FORMAT_INZI, + .fcc = V4L2_PIX_FMT_INZI, + }, + { + .name = "4-bit Depth Confidence (Packed)", + .guid = UVC_GUID_FORMAT_CNF4, + 
.fcc = V4L2_PIX_FMT_CNF4, + }, + { + .name = "HEVC", + .guid = UVC_GUID_FORMAT_HEVC, + .fcc = V4L2_PIX_FMT_HEVC, + }, +}; + +static inline struct uvc_format_desc *uvc_format_by_guid(const u8 guid[16]) +{ + unsigned int len = ARRAY_SIZE(uvc_fmts); + unsigned int i; + + for (i = 0; i < len; ++i) { + if (memcmp(guid, uvc_fmts[i].guid, 16) == 0) + return &uvc_fmts[i]; + } + + return NULL; +} + +#endif /* __LINUX_V4L2_UVC_H */ diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h index 4b72d0e16ee3..1bdaf65b6629 100644 --- a/include/media/videobuf2-core.h +++ b/include/media/videobuf2-core.h @@ -20,7 +20,7 @@ #include #include -#define VB2_MAX_FRAME (32) +#define VB2_MAX_FRAME (64) #define VB2_MAX_PLANES (8) /** diff --git a/include/media/videobuf2-v4l2.h b/include/media/videobuf2-v4l2.h index b66585e304e2..0055b0791068 100644 --- a/include/media/videobuf2-v4l2.h +++ b/include/media/videobuf2-v4l2.h @@ -13,6 +13,7 @@ #define _MEDIA_VIDEOBUF2_V4L2_H #include +#include #include #if VB2_MAX_FRAME != VIDEO_MAX_FRAME @@ -51,6 +52,8 @@ struct vb2_v4l2_buffer { __s32 request_fd; bool is_held; struct vb2_plane planes[VB2_MAX_PLANES]; + + ANDROID_KABI_RESERVE(1); }; /* VB2 V4L2 flags as set in vb2_queue.subsystem_flags */ diff --git a/include/net/TEST_MAPPING b/include/net/TEST_MAPPING new file mode 100644 index 000000000000..1eb8d436031a --- /dev/null +++ b/include/net/TEST_MAPPING @@ -0,0 +1,12 @@ +{ + "presubmit": [ + { + "name": "CtsNetTestCases", + "options": [ + { + "exclude-annotation": "com.android.testutils.SkipPresubmit" + } + ] + } + ] +} diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 53627afab104..1f14107e739c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -269,6 +269,18 @@ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao); +/* Determines into what table to put autoconf PIO/RIO/default routes + * learned on this device. + * + * - If 0, use the same table for every device. This puts routes into + * one of RT_TABLE_{PREFIX,INFO,DFLT} depending on the type of route + * (but note that these three are currently all equal to + * RT6_TABLE_MAIN). + * - If > 0, use the specified table. + * - If < 0, put routes into table dev->ifindex + (-rt_table). 
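 * Concretely (values hypothetical): with the per-device setting at 0 the
 * default table is used unchanged; at 100 every device uses table 100;
 * at -1000 a device with ifindex 3 gets table 1003:
 *
 *	u32 table = addrconf_rt_table(dev, RT6_TABLE_MAIN);
 *	// rt_table == -1000 && dev->ifindex == 3  =>  table == 1003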
+ */ +u32 addrconf_rt_table(const struct net_device *dev, u32 default_table); + /* * anycast prototypes (anycast.c) */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f742e50207fb..ab207677e0a8 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -205,8 +205,7 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); -void vsock_for_each_connected_socket(struct vsock_transport *transport, - void (*fn)(struct sock *sk)); +void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 3da5cfcf84c1..ea63120f291b 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -609,6 +610,11 @@ struct hci_dev { int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); void (*cmd_timeout)(struct hci_dev *hdev); bool (*prevent_wake)(struct hci_dev *hdev); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define HCI_PHY_HANDLE(handle) (handle & 0xff) @@ -706,6 +712,11 @@ struct hci_conn { void (*connect_cfm_cb) (struct hci_conn *conn, u8 status); void (*security_cfm_cb) (struct hci_conn *conn, u8 status); void (*disconn_cfm_cb) (struct hci_conn *conn, u8 reason); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct hci_chan { @@ -716,6 +727,8 @@ struct hci_chan { unsigned int sent; __u8 state; bool amp; + + ANDROID_KABI_RESERVE(1); }; struct hci_conn_params { @@ -742,6 +755,8 @@ struct hci_conn_params { struct hci_conn *conn; bool explicit_connect; u32 current_flags; + + ANDROID_KABI_RESERVE(1); }; extern struct list_head hci_dev_list; @@ -1494,6 +1509,8 @@ struct hci_cb { __u8 encrypt); void (*key_change_cfm) (struct hci_conn *conn, __u8 status); void (*role_switch_cfm) (struct hci_conn *conn, __u8 status, __u8 role); + + ANDROID_KABI_RESERVE(1); }; static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status) @@ -1754,6 +1771,8 @@ struct hci_mgmt_chan { size_t handler_count; const struct hci_mgmt_handler *handlers; void (*hdev_init) (struct sock *sk, struct hci_dev *hdev); + + ANDROID_KABI_RESERVE(1); }; int hci_mgmt_chan_register(struct hci_mgmt_chan *c); diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 2f766e3437ce..2145350388fd 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -29,6 +29,7 @@ #include #include +#include /* L2CAP defaults */ #define L2CAP_DEFAULT_MTU 672 @@ -645,6 +646,9 @@ struct l2cap_chan { void *data; const struct l2cap_ops *ops; struct mutex lock; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct l2cap_ops { @@ -669,6 +673,9 @@ struct l2cap_ops { unsigned long len, int nb); int (*filter) (struct l2cap_chan * chan, struct sk_buff *skb); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct l2cap_conn { @@ -704,6 +711,9 @@ struct l2cap_conn { struct mutex chan_lock; struct kref ref; struct list_head users; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct l2cap_user { diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h 
index 99d26879b02a..d8f4e835ede8 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -22,6 +22,7 @@ */ #include +#include #ifndef __RFCOMM_H #define __RFCOMM_H @@ -164,6 +165,8 @@ struct rfcomm_session { uint mtu; struct list_head dlcs; + + ANDROID_KABI_RESERVE(1); }; struct rfcomm_dlc { @@ -197,6 +200,9 @@ struct rfcomm_dlc { void (*data_ready)(struct rfcomm_dlc *d, struct sk_buff *skb); void (*state_change)(struct rfcomm_dlc *d, int err); void (*modem_status)(struct rfcomm_dlc *d, u8 v24_sig); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; /* DLC and session flags */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 27336fc70467..110c64243236 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -23,6 +23,7 @@ #include #include #include +#include #include /** @@ -109,7 +110,12 @@ struct wiphy; * on this channel. * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted * on this channel. - * + * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, + * this flag indicates that a 320 MHz channel cannot use this + * channel as the control or any of the secondary channels. + * This may be due to the driver or due to regulatory bandwidth + * restrictions. + * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -131,6 +137,8 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_4MHZ = 1<<16, IEEE80211_CHAN_8MHZ = 1<<17, IEEE80211_CHAN_16MHZ = 1<<18, + IEEE80211_CHAN_NO_320MHZ = 1<<19, + IEEE80211_CHAN_NO_EHT = 1<<20, }; #define IEEE80211_CHAN_NO_HT40 \ @@ -360,6 +368,49 @@ struct ieee80211_sta_he_cap { u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN]; }; +/** + * struct ieee80211_eht_mcs_nss_supp - EHT max supported NSS per MCS + * + * See P802.11be_D1.3 Table 9-401k - "Subfields of the Supported EHT-MCS + * and NSS Set field" + * + * @only_20mhz: MCS/NSS support for 20 MHz-only STA. + * @bw: MCS/NSS support for 80, 160 and 320 MHz + * @bw._80: MCS/NSS support for BW <= 80 MHz + * @bw._160: MCS/NSS support for BW = 160 MHz + * @bw._320: MCS/NSS support for BW = 320 MHz + */ +struct ieee80211_eht_mcs_nss_supp { + union { + struct ieee80211_eht_mcs_nss_supp_20mhz_only only_20mhz; + struct { + struct ieee80211_eht_mcs_nss_supp_bw _80; + struct ieee80211_eht_mcs_nss_supp_bw _160; + struct ieee80211_eht_mcs_nss_supp_bw _320; + } __packed bw; + } __packed; +} __packed; + +#define IEEE80211_EHT_PPE_THRES_MAX_LEN 32 + +/** + * struct ieee80211_sta_eht_cap - STA's EHT capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11be EHT capabilities for a STA. + * + * @has_eht: true iff EHT data is valid. + * @eht_cap_elem: Fixed portion of the eht capabilities element. + * @eht_mcs_nss_supp: The supported NSS/MCS combinations. + * @eht_ppe_thres: Holds the PPE Thresholds data. + */ +struct ieee80211_sta_eht_cap { + bool has_eht; + struct ieee80211_eht_cap_elem_fixed eht_cap_elem; + struct ieee80211_eht_mcs_nss_supp eht_mcs_nss_supp; + u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN]; +}; + /** * struct ieee80211_sband_iftype_data - sband data per interface type * @@ -371,6 +422,7 @@ struct ieee80211_sta_he_cap { * @he_cap: holds the HE capabilities * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a * 6 GHz band channel (and 0 may be valid value). 
+ * @eht_cap: STA's EHT capabilities * @vendor_elems: vendor element(s) to advertise * @vendor_elems.data: vendor element(s) data * @vendor_elems.len: vendor element(s) length @@ -379,6 +431,7 @@ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; struct { const u8 *data; unsigned int len; @@ -445,7 +498,7 @@ struct ieee80211_edmg { * This structure describes most essential parameters needed * to describe 802.11ah S1G capabilities for a STA. * - * @s1g_supported: is STA an S1G STA + * @s1g: is STA an S1G STA * @cap: S1G capabilities information * @nss_mcs: Supported NSS MCS set */ @@ -561,6 +614,26 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, return data->he_6ghz_capa.capa; } +/** + * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype + * @sband: the sband to search for the iftype on + * @iftype: enum nl80211_iftype + * + * Return: pointer to the struct ieee80211_sta_eht_cap, or NULL if none found + */ +static inline const struct ieee80211_sta_eht_cap * +ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (data && data->eht_cap.has_eht) + return &data->eht_cap; + + return NULL; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * @@ -739,6 +812,22 @@ struct cfg80211_tid_config { struct cfg80211_tid_cfg tid_conf[]; }; +/** + * struct cfg80211_fils_aad - FILS AAD data + * @macaddr: STA MAC address + * @kek: FILS KEK + * @kek_len: FILS KEK length + * @snonce: STA Nonce + * @anonce: AP Nonce + */ +struct cfg80211_fils_aad { + const u8 *macaddr; + const u8 *kek; + u8 kek_len; + const u8 *snonce; + const u8 *anonce; +}; + /** * cfg80211_get_chandef_type - return old channel type from chandef * @chandef: the channel definition @@ -851,19 +940,18 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, enum nl80211_iftype iftype); /** - * ieee80211_chandef_rate_flags - returns rate flags for a channel + * ieee80211_chanwidth_rate_flags - return rate flags for channel width + * @width: the channel width of the channel * * In some channel types, not all rates may be used - for example CCK * rates may not be used in 5/10 MHz channels. * - * @chandef: channel definition for the channel - * - * Returns: rate flags which apply for this channel + * Returns: rate flags which apply for this channel width */ static inline enum ieee80211_rate_flags -ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) +ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width) { - switch (chandef->width) { + switch (width) { case NL80211_CHAN_WIDTH_5: return IEEE80211_RATE_SUPPORTS_5MHZ; case NL80211_CHAN_WIDTH_10: @@ -874,6 +962,20 @@ ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) return 0; } +/** + * ieee80211_chandef_rate_flags - returns rate flags for a channel + * @chandef: channel definition for the channel + * + * See ieee80211_chanwidth_rate_flags().
+ * + * Returns: rate flags which apply for this channel + */ +static inline enum ieee80211_rate_flags +ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) +{ + return ieee80211_chanwidth_rate_flags(chandef->width); +} + /** * ieee80211_chandef_max_power - maximum transmission power for the chandef * @@ -975,6 +1077,7 @@ struct survey_info { }; #define CFG80211_MAX_WEP_KEYS 4 +#define CFG80211_MAX_NUM_AKM_SUITES 10 /** * struct cfg80211_crypto_settings - Crypto settings @@ -1026,7 +1129,7 @@ struct cfg80211_crypto_settings { int n_ciphers_pairwise; u32 ciphers_pairwise[NL80211_MAX_NR_CIPHER_SUITES]; int n_akm_suites; - u32 akm_suites[NL80211_MAX_NR_AKM_SUITES]; + u32 akm_suites[CFG80211_MAX_NUM_AKM_SUITES]; bool control_port; __be16 control_port_ethertype; bool control_port_no_encrypt; @@ -1038,10 +1141,43 @@ struct cfg80211_crypto_settings { const u8 *sae_pwd; u8 sae_pwd_len; enum nl80211_sae_pwe_mechanism sae_pwe; + + ANDROID_KABI_RESERVE(1); +}; + +/** + * struct cfg80211_mbssid_config - AP settings for multi bssid + * + * @tx_wdev: pointer to the transmitted interface in the MBSSID set + * @index: index of this AP in the multi bssid group. + * @ema: set to true if the beacons should be sent out in EMA mode. + */ +struct cfg80211_mbssid_config { + struct wireless_dev *tx_wdev; + u8 index; + bool ema; +}; + +/** + * struct cfg80211_mbssid_elems - Multiple BSSID elements + * + * @cnt: Number of elements in array %elems. + * + * @elem: Array of multiple BSSID element(s) to be added into Beacon frames. + * @elem.data: Data for multiple BSSID elements. + * @elem.len: Length of data. + */ +struct cfg80211_mbssid_elems { + u8 cnt; + struct { + const u8 *data; + size_t len; + } elem[]; }; /** * struct cfg80211_beacon_data - beacon data + * @link_id: the link ID for the AP MLD link sending this beacon * @head: head portion of beacon (before TIM IE) * or %NULL if not changed * @tail: tail portion of beacon (after TIM IE) @@ -1058,6 +1194,7 @@ struct cfg80211_crypto_settings { * @assocresp_ies_len: length of assocresp_ies in octets * @probe_resp_len: length of probe response template (@probe_resp) * @probe_resp: probe response template (AP mode only) + * @mbssid_ies: multiple BSSID elements * @ftm_responder: enable FTM responder functionality; -1 for no change * (which also implies no change in LCI/civic location data) * @lci: Measurement Report element content, starting with Measurement Token @@ -1066,8 +1203,13 @@ struct cfg80211_crypto_settings { * Token (measurement type 11) * @lci_len: LCI data length * @civicloc_len: Civic location data length + * @he_bss_color: BSS Color settings + * @he_bss_color_valid: indicates whether bss color + * attribute is present in beacon data or not. 
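 * As an illustration of the flexible-array layout of
 * struct cfg80211_mbssid_elems documented above (the element buffers
 * and their lengths are hypothetical, caller-owned data):
 *
 *	struct cfg80211_mbssid_elems *elems;
 *
 *	elems = kzalloc(struct_size(elems, elem, 2), GFP_KERNEL);
 *	if (elems) {
 *		elems->cnt = 2;
 *		elems->elem[0].data = mbssid_ie_one;
 *		elems->elem[0].len = mbssid_ie_one_len;
 *		elems->elem[1].data = mbssid_ie_two;
 *		elems->elem[1].len = mbssid_ie_two_len;
 *	}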
*/ struct cfg80211_beacon_data { + unsigned int link_id; + const u8 *head, *tail; const u8 *beacon_ies; const u8 *proberesp_ies; @@ -1075,6 +1217,7 @@ struct cfg80211_beacon_data { const u8 *probe_resp; const u8 *lci; const u8 *civicloc; + struct cfg80211_mbssid_elems *mbssid_ies; s8 ftm_responder; size_t head_len, tail_len; @@ -1084,6 +1227,10 @@ struct cfg80211_beacon_data { size_t probe_resp_len; size_t lci_len; size_t civicloc_len; + struct cfg80211_he_bss_color he_bss_color; + bool he_bss_color_valid; + + ANDROID_KABI_RESERVE(1); }; struct mac_address { @@ -1139,17 +1286,6 @@ struct cfg80211_unsol_bcast_probe_resp { const u8 *tmpl; }; -/** - * enum cfg80211_ap_settings_flags - AP settings flags - * - * Used by cfg80211_ap_settings - * - * @AP_SETTINGS_EXTERNAL_AUTH_SUPPORT: AP supports external authentication - */ -enum cfg80211_ap_settings_flags { - AP_SETTINGS_EXTERNAL_AUTH_SUPPORT = BIT(0), -}; - /** * struct cfg80211_ap_settings - AP configuration * @@ -1178,6 +1314,8 @@ enum cfg80211_ap_settings_flags { * @ht_cap: HT capabilities (or %NULL if HT isn't enabled) * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled) * @he_cap: HE capabilities (or %NULL if HE isn't enabled) + * @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled) + * @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled) * @ht_required: stations must support HT * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time @@ -1185,10 +1323,10 @@ enum cfg80211_ap_settings_flags { * @sae_h2e_required: stations must support direct H2E technique in SAE * @flags: flags, as defined in enum cfg80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings - * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters + * @mbssid_config: AP settings for multiple bssid */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1214,13 +1352,17 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct ieee80211_he_operation *he_oper; + const struct ieee80211_eht_cap_elem *eht_cap; + const struct ieee80211_eht_operation *eht_oper; bool ht_required, vht_required, he_required, sae_h2e_required; bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; - struct cfg80211_he_bss_color he_bss_color; struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; + struct cfg80211_mbssid_config mbssid_config; + + ANDROID_KABI_RESERVE(1); }; /** @@ -1250,6 +1392,8 @@ struct cfg80211_csa_settings { bool radar_required; bool block_tx; u8 count; + + ANDROID_KABI_RESERVE(1); }; /** @@ -1258,8 +1402,8 @@ struct cfg80211_csa_settings { * Used for bss color change * * @beacon_color_change: beacon data while performing the color countdown - * @counter_offsets_beacon: offsets of the counters within the beacon (tail) - * @counter_offsets_presp: offsets of the counters within the probe response + * @counter_offset_beacon: offsets of the counters within the beacon (tail) + * @counter_offset_presp: offsets of the counters within the probe response * @beacon_next: beacon data to be used after the color change * @count: number of beacons until the color change * @color: the color used after the change @@ -1302,6 +1446,7 @@ struct iface_combination_params { * @STATION_PARAM_APPLY_UAPSD: apply new 
uAPSD parameters (uapsd_queues, max_sp) * @STATION_PARAM_APPLY_CAPABILITY: apply new capability * @STATION_PARAM_APPLY_PLINK_STATE: apply new plink state + * @STATION_PARAM_APPLY_STA_TXPOWER: apply tx power for STA * * Not all station parameters have in-band "no change" signalling, * for those that don't, these flags are used. */ @@ -1310,7 +1455,6 @@ enum station_parameters_apply_mask { STATION_PARAM_APPLY_UAPSD = BIT(0), STATION_PARAM_APPLY_CAPABILITY = BIT(1), STATION_PARAM_APPLY_PLINK_STATE = BIT(2), - STATION_PARAM_APPLY_STA_TXPOWER = BIT(3), }; /** @@ -1333,15 +1477,67 @@ struct sta_txpwr { enum nl80211_tx_power_setting type; }; +/** + * struct link_station_parameters - link station parameters + * + * Used to change and create a new link station. + * + * @mld_mac: MAC address of the station + * @link_id: the link id (-1 for non-MLD station) + * @link_mac: MAC address of the link + * @supported_rates: supported rates in IEEE 802.11 format + * (or NULL for no change) + * @supported_rates_len: number of supported rates + * @ht_capa: HT capabilities of station + * @vht_capa: VHT capabilities of station + * @opmode_notif: operating mode field from Operating Mode Notification + * @opmode_notif_used: information if operating mode field is used + * @he_capa: HE capabilities of station + * @he_capa_len: the length of the HE capabilities + * @txpwr: transmit power for an associated station + * @txpwr_set: txpwr field is set + * @he_6ghz_capa: HE 6 GHz Band capabilities of station + * @eht_capa: EHT capabilities of station + * @eht_capa_len: the length of the EHT capabilities + */ +struct link_station_parameters { + const u8 *mld_mac; + int link_id; + const u8 *link_mac; + const u8 *supported_rates; + u8 supported_rates_len; + const struct ieee80211_ht_cap *ht_capa; + const struct ieee80211_vht_cap *vht_capa; + u8 opmode_notif; + bool opmode_notif_used; + const struct ieee80211_he_cap_elem *he_capa; + u8 he_capa_len; + struct sta_txpwr txpwr; + bool txpwr_set; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + const struct ieee80211_eht_cap_elem *eht_capa; + u8 eht_capa_len; +}; + +/** + * struct link_station_del_parameters - link station deletion parameters + * + * Used to delete a link station entry (or all stations). + * + * @mld_mac: MAC address of the station + * @link_id: the link id + */ +struct link_station_del_parameters { + const u8 *mld_mac; + u32 link_id; +}; + /** * struct station_parameters - station parameters * * Used to change and create a new station. * * @vlan: vlan interface station should belong to - * @supported_rates: supported rates in IEEE 802.11 format - * (or NULL for no change) - * @supported_rates_len: number of supported rates * @sta_flags_mask: station flags that changed * (bitmask of BIT(%NL80211_STA_FLAG_...)) * @sta_flags_set: station flags values @@ -1352,8 +1548,6 @@ struct sta_txpwr { * @peer_aid: mesh peer AID or zero for no change * @plink_action: plink action to take * @plink_state: set the peer link state for a station - * @ht_capa: HT capabilities of station - * @vht_capa: VHT capabilities of station * @uapsd_queues: bitmap of queues configured for uapsd. same format * as the AC bitmap in the QoS info field * @max_sp: max Service Period.
same format as the MAX_SP in the @@ -1370,17 +1564,11 @@ struct sta_txpwr { * @supported_channels_len: number of supported channels * @supported_oper_classes: supported oper classes in IEEE 802.11 format * @supported_oper_classes_len: number of supported operating classes - * @opmode_notif: operating mode field from Operating Mode Notification - * @opmode_notif_used: information if operating mode field is used * @support_p2p_ps: information if station supports P2P PS mechanism - * @he_capa: HE capabilities of station - * @he_capa_len: the length of the HE capabilities * @airtime_weight: airtime scheduler weight for this station - * @txpwr: transmit power for an associated station - * @he_6ghz_capa: HE 6 GHz Band capabilities of station + * @link_sta_params: link related params. */ struct station_parameters { - const u8 *supported_rates; struct net_device *vlan; u32 sta_flags_mask, sta_flags_set; u32 sta_modify_mask; @@ -1388,11 +1576,8 @@ struct station_parameters { u16 aid; u16 vlan_id; u16 peer_aid; - u8 supported_rates_len; u8 plink_action; u8 plink_state; - const struct ieee80211_ht_cap *ht_capa; - const struct ieee80211_vht_cap *vht_capa; u8 uapsd_queues; u8 max_sp; enum nl80211_mesh_power_mode local_pm; @@ -1403,14 +1588,11 @@ struct station_parameters { u8 supported_channels_len; const u8 *supported_oper_classes; u8 supported_oper_classes_len; - u8 opmode_notif; - bool opmode_notif_used; int support_p2p_ps; - const struct ieee80211_he_cap_elem *he_capa; - u8 he_capa_len; u16 airtime_weight; - struct sta_txpwr txpwr; - const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + struct link_station_parameters link_sta_params; + + ANDROID_KABI_RESERVE(1); }; /** @@ -1488,6 +1670,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_HE_MCS: HE MCS information * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS + * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), @@ -1497,6 +1680,7 @@ enum rate_info_flags { RATE_INFO_FLAGS_HE_MCS = BIT(4), RATE_INFO_FLAGS_EDMG = BIT(5), RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6), + RATE_INFO_FLAGS_EHT_MCS = BIT(7), }; /** @@ -1511,6 +1695,8 @@ enum rate_info_flags { * @RATE_INFO_BW_80: 80 MHz bandwidth * @RATE_INFO_BW_160: 160 MHz bandwidth * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation + * @RATE_INFO_BW_320: 320 MHz bandwidth + * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation */ enum rate_info_bw { RATE_INFO_BW_20 = 0, @@ -1520,6 +1706,8 @@ enum rate_info_bw { RATE_INFO_BW_80, RATE_INFO_BW_160, RATE_INFO_BW_HE_RU, + RATE_INFO_BW_320, + RATE_INFO_BW_EHT_RU, }; /** @@ -1537,6 +1725,9 @@ enum rate_info_bw { * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc, * only valid if bw is %RATE_INFO_BW_HE_RU) * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4) + * @eht_gi: EHT guard interval (from &enum nl80211_eht_gi) + * @eht_ru_alloc: EHT RU allocation (from &enum nl80211_eht_ru_alloc, + * only valid if bw is %RATE_INFO_BW_EHT_RU) */ struct rate_info { u8 flags; @@ -1548,6 +1739,8 @@ struct rate_info { u8 he_dcm; u8 he_ru_alloc; u8 n_bonded_ch; + u8 eht_gi; + u8 eht_ru_alloc; }; /** @@ -1759,6 +1952,8 @@ struct station_info { u32 airtime_link_metric; u8 connected_to_as; + + ANDROID_KABI_RESERVE(1); }; /** @@ -2019,6 +2214,9 @@ struct bss_parameters { * @plink_timeout: If no tx activity is seen from a STA we've established * peering with for longer than this time (in seconds), 
then remove it * from the STA's list of peers. Default is 30 minutes. + * @dot11MeshConnectedToAuthServer: if set to true then this mesh STA + * will advertise that it is connected to an authentication server + * in the mesh formation field. * @dot11MeshConnectedToMeshGate: if set to true, advertise that this STA is * connected to a mesh gate in mesh formation info. If false, the * value in mesh formation is determined by the presence of root paths @@ -2061,6 +2259,8 @@ struct mesh_config { u16 dot11MeshAwakeWindowDuration; u32 plink_timeout; bool dot11MeshNolearn; + + ANDROID_KABI_RESERVE(1); }; /** @@ -2110,6 +2310,8 @@ struct mesh_setup { struct cfg80211_bitrate_mask beacon_rate; bool userspace_handles_dfs; bool control_port_over_nl80211; + + ANDROID_KABI_RESERVE(1); }; /** @@ -2131,6 +2333,7 @@ struct ocb_setup { * @cwmax: Maximum contention window [a value of the form 2^n-1 in the range * 1..32767] * @aifs: Arbitration interframe space [0..255] + * @link_id: link_id or -1 for non-MLD */ struct ieee80211_txq_params { enum nl80211_ac ac; @@ -2138,6 +2341,7 @@ struct ieee80211_txq_params { u16 cwmin; u16 cwmax; u8 aifs; + int link_id; }; /** @@ -2191,12 +2395,12 @@ struct cfg80211_scan_info { /** * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only * - * @short_bssid: short ssid to scan for + * @short_ssid: short ssid to scan for * @bssid: bssid to scan for * @channel_idx: idx of the channel in the channel array in the scan request * which the above info is relevant to * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU - * @short_ssid_valid: short_ssid is valid and can be used + * @short_ssid_valid: @short_ssid is valid and can be used * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait * 20 TUs before starting to send probe requests. */ @@ -2271,6 +2475,8 @@ struct cfg80211_scan_request { u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; + ANDROID_KABI_RESERVE(1); + /* keep last */ struct ieee80211_channel *channels[]; }; @@ -2417,6 +2623,8 @@ struct cfg80211_sched_scan_request { bool nl_owner_dead; struct list_head list; + ANDROID_KABI_RESERVE(1); + /* keep last */ struct ieee80211_channel *channels[]; }; @@ -2540,6 +2748,8 @@ struct cfg80211_bss { u8 bssid_index; u8 max_bssid_indicator; + ANDROID_KABI_RESERVE(1); + u8 priv[] __aligned(sizeof(void *)); }; @@ -2588,6 +2798,12 @@ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) * Authentication algorithm number, i.e., starting at the Authentication * transaction sequence number field.
* @auth_data_len: Length of auth_data buffer in octets + * @link_id: if >= 0, indicates authentication should be done as an MLD, + * the interface address is included as the MLD address and the + * necessary link (with the given link_id) will be created (and + * given an MLD address) by the driver + * @ap_mld_addr: AP MLD address in case of authentication request with + * an AP MLD, valid iff @link_id >= 0 */ struct cfg80211_auth_request { struct cfg80211_bss *bss; @@ -2595,9 +2811,25 @@ struct cfg80211_auth_request { size_t ie_len; enum nl80211_auth_type auth_type; const u8 *key; - u8 key_len, key_idx; + u8 key_len; + s8 key_idx; const u8 *auth_data; size_t auth_data_len; + s8 link_id; + const u8 *ap_mld_addr; +}; + +/** + * struct cfg80211_assoc_link - per-link information for MLO association + * @bss: the BSS pointer, see also &struct cfg80211_assoc_request::bss; + * if this is %NULL for a link, that link is not requested + * @elems: extra elements for the per-STA profile for this link + * @elems_len: length of the elements + */ +struct cfg80211_assoc_link { + struct cfg80211_bss *bss; + const u8 *elems; + size_t elems_len; }; /** @@ -2611,6 +2843,10 @@ struct cfg80211_auth_request { * userspace if this flag is set. Only applicable for cfg80211_connect() * request (connect callback). * @ASSOC_REQ_DISABLE_HE: Disable HE + * @ASSOC_REQ_DISABLE_EHT: Disable EHT + * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links. + * Drivers shall disable MLO features for the current association if this + * flag is not set. */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), @@ -2618,6 +2854,8 @@ enum cfg80211_assoc_req_flags { ASSOC_REQ_USE_RRM = BIT(2), CONNECT_REQ_EXTERNAL_AUTH_SUPPORT = BIT(3), ASSOC_REQ_DISABLE_HE = BIT(4), + ASSOC_REQ_DISABLE_EHT = BIT(5), + CONNECT_REQ_MLO_SUPPORT = BIT(6), }; /** @@ -2629,6 +2867,8 @@ enum cfg80211_assoc_req_flags { * given a reference that it must give back to cfg80211_send_rx_assoc() * or to cfg80211_assoc_timeout(). To ensure proper refcounting, new * association requests while already associating must be rejected. + * This also applies to the @links.bss parameter, which is used instead + * of this one (it is %NULL) for MLO associations. * @ie: Extra IEs to add to (Re)Association Request frame or %NULL * @ie_len: Length of ie buffer in octets * @use_mfp: Use management frame protection (IEEE 802.11w) in this association @@ -2653,6 +2893,11 @@ enum cfg80211_assoc_req_flags { * with 16 octets of STA Nonce followed by 16 octets of AP Nonce. * @s1g_capa: S1G capability override * @s1g_capa_mask: S1G capability override mask + * @links: per-link information for MLO connections + * @link_id: >= 0 for MLO connections, where links are given, and indicates + * the link on which the association request should be sent + * @ap_mld_addr: AP MLD address in case of MLO association request, + * valid iff @link_id >= 0 */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -2668,6 +2913,11 @@ struct cfg80211_assoc_request { size_t fils_kek_len; const u8 *fils_nonces; struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask; + struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS]; + const u8 *ap_mld_addr; + s8 link_id; + + ANDROID_KABI_RESERVE(1); }; /** @@ -2676,7 +2926,7 @@ struct cfg80211_assoc_request { * This structure provides information needed to complete IEEE 802.11 * deauthentication. 
* - * @bssid: the BSSID of the BSS to deauthenticate from + * @bssid: the BSSID or AP MLD address to deauthenticate from * @ie: Extra IEs to add to Deauthentication frame or %NULL * @ie_len: Length of ie buffer in octets * @reason_code: The reason code for the deauthentication @@ -2697,7 +2947,7 @@ struct cfg80211_deauth_request { * This structure provides information needed to complete IEEE 802.11 * disassociation. * - * @bss: the BSS to disassociate from + * @ap_addr: the BSSID or AP MLD address to disassociate from * @ie: Extra IEs to add to Disassociation frame or %NULL * @ie_len: Length of ie buffer in octets * @reason_code: The reason code for the disassociation @@ -2705,7 +2955,7 @@ struct cfg80211_deauth_request { * Disassociation frame is to be transmitted. */ struct cfg80211_disassoc_request { - struct cfg80211_bss *bss; + const u8 *ap_addr; const u8 *ie; size_t ie_len; u16 reason_code; @@ -2766,6 +3016,8 @@ struct cfg80211_ibss_params { struct ieee80211_ht_cap ht_capa_mask; struct key_params *wep_keys; int wep_tx_key; + + ANDROID_KABI_RESERVE(1); }; /** @@ -2880,6 +3132,8 @@ struct cfg80211_connect_params { size_t fils_erp_rrk_len; bool want_1x; struct ieee80211_edmg edmg; + + ANDROID_KABI_RESERVE(1); }; /** @@ -3150,7 +3404,7 @@ struct cfg80211_wowlan_wakeup { * @kck: key confirmation key (@kck_len bytes) * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes) * @kek_len: length of kek - * @kck_len length of kck + * @kck_len: length of kck * @akm: akm (oui, id) */ struct cfg80211_gtk_rekey_data { @@ -3188,6 +3442,9 @@ struct cfg80211_update_ft_ies_params { * @dont_wait_for_ack: tells the low level not to wait for an ack * @n_csa_offsets: length of csa_offsets array * @csa_offsets: array of all the csa offsets in the frame + * @link_id: for MLO, the link ID to transmit on, -1 if not given; note + * that the link ID isn't validated (much), it's in range but the + * link might not exist (or be used by the receiver STA) */ struct cfg80211_mgmt_tx_params { struct ieee80211_channel *chan; @@ -3199,6 +3456,7 @@ struct cfg80211_mgmt_tx_params { bool dont_wait_for_ack; int n_csa_offsets; const u16 *csa_offsets; + int link_id; }; /** @@ -3337,6 +3595,8 @@ struct cfg80211_nan_func { u8 num_rx_filters; u8 instance_id; u64 cookie; + + ANDROID_KABI_RESERVE(1); }; /** @@ -3497,6 +3757,8 @@ struct cfg80211_pmsr_ftm_result { dist_avg_valid:1, dist_variance_valid:1, dist_spread_valid:1; + + ANDROID_KABI_RESERVE(1); }; /** @@ -3512,6 +3774,7 @@ struct cfg80211_pmsr_ftm_result { * @type: type of the measurement reported, note that we only support reporting * one type at a time, but you can report multiple results separately and * they're all aggregated for userspace. + * @ftm: FTM result */ struct cfg80211_pmsr_result { u64 host_time, ap_tsf; @@ -3650,7 +3913,7 @@ struct cfg80211_update_owe_info { * for the entire device * @interface_stypes: bitmap of management frame subtypes registered * for the given interface - * @global_mcast_rx: mcast RX is needed globally for these subtypes + * @global_mcast_stypes: mcast RX is needed globally for these subtypes * @interface_mcast_stypes: mcast RX is needed on this interface * for these subtypes */ @@ -3695,23 +3958,39 @@ struct mgmt_frame_regs { * keep the struct wireless_dev's iftype updated. * This additionally holds the RTNL to be able to do netdev changes. * + * @add_intf_link: Add a new MLO link to the given interface. Note that + * the wdev->link[] data structure has been updated, so the new link + * address is available. 
+ * @del_intf_link: Remove an MLO link from the given interface. + * * @add_key: add a key with the given parameters. @mac_addr will be %NULL - * when adding a group key. + * when adding a group key. @link_id will be -1 for non-MLO connection. + * For MLO connection, @link_id will be >= 0 for group key and -1 for + * pairwise key, @mac_addr will be peer's MLD address for MLO pairwise key. * * @get_key: get information about the key with the given parameters. * @mac_addr will be %NULL when requesting information for a group * key. All pointers given to the @callback function need not be valid * after it returns. This function should return an error if it is * not possible to retrieve the key, -ENOENT if it doesn't exist. + * @link_id will be -1 for non-MLO connection. For MLO connection, + * @link_id will be >= 0 for group key and -1 for pairwise key, @mac_addr + * will be peer's MLD address for MLO pairwise key. * * @del_key: remove a key given the @mac_addr (%NULL for a group key) - * and @key_index, return -ENOENT if the key doesn't exist. + * and @key_index, return -ENOENT if the key doesn't exist. @link_id will + * be -1 for non-MLO connection. For MLO connection, @link_id will be >= 0 + * for group key and -1 for pairwise key, @mac_addr will be peer's MLD + * address for MLO pairwise key. * - * @set_default_key: set the default key on an interface + * @set_default_key: set the default key on an interface. @link_id will be >= 0 + * for MLO connection and -1 for non-MLO connection. * - * @set_default_mgmt_key: set the default management frame key on an interface + * @set_default_mgmt_key: set the default management frame key on an interface. + * @link_id will be >= 0 for MLO connection and -1 for non-MLO connection. * - * @set_default_beacon_key: set the default Beacon frame key on an interface + * @set_default_beacon_key: set the default Beacon frame key on an interface. + * @link_id will be >= 0 for MLO connection and -1 for non-MLO connection. * * @set_rekey_data: give the data necessary for GTK rekeying to the driver * @@ -4018,6 +4297,22 @@ struct mgmt_frame_regs { * @set_sar_specs: Update the SAR (TX power) settings. * * @color_change: Initiate a color change. + * + * @set_fils_aad: Set FILS AAD data to the AP driver so that the driver can use + * those to decrypt (Re)Association Request and encrypt (Re)Association + * Response frame. + * + * @set_radar_background: Configure dedicated offchannel chain available for + * radar/CAC detection on some hw. This chain can't be used to transmit + * or receive frames and it is bound to a running wdev. + * Background radar/CAC detection allows avoiding the CAC downtime + * by switching to a different channel during CAC detection on the selected + * radar channel. + * The caller is expected to set chandef pointer to NULL in order to + * disable background CAC/radar detection. + * @add_link_station: Add a link to a station. + * @mod_link_station: Modify a link of a station. + * @del_link_station: Remove a link of a station.
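+ *
+ * As a rough sketch of the key-op link handling above (the drv_*
+ * helpers are hypothetical, not part of this API), a driver's @del_key
+ * could dispatch on @link_id like this:
+ *
+ *	static int drv_del_key(struct wiphy *wiphy, struct net_device *netdev,
+ *			       int link_id, u8 key_index, bool pairwise,
+ *			       const u8 *mac_addr)
+ *	{
+ *		if (link_id < 0)
+ *			return drv_del_sta_key(netdev, key_index, pairwise,
+ *					       mac_addr);
+ *		return drv_del_link_group_key(netdev, link_id, key_index);
+ *	}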
*/ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4036,30 +4331,40 @@ struct cfg80211_ops { enum nl80211_iftype type, struct vif_params *params); + int (*add_intf_link)(struct wiphy *wiphy, + struct wireless_dev *wdev, + unsigned int link_id); + void (*del_intf_link)(struct wiphy *wiphy, + struct wireless_dev *wdev, + unsigned int link_id); + int (*add_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params); int (*get_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - void *cookie, + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)); int (*del_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr); int (*set_default_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast); int (*set_default_mgmt_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index); int (*set_default_beacon_key)(struct wiphy *wiphy, struct net_device *netdev, + int link_id, u8 key_index); int (*start_ap)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *settings); int (*change_beacon)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_beacon_data *info); - int (*stop_ap)(struct wiphy *wiphy, struct net_device *dev); + int (*stop_ap)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id); int (*add_station)(struct wiphy *wiphy, struct net_device *dev, @@ -4167,6 +4472,7 @@ struct cfg80211_ops { int (*set_bitrate_mask)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask); @@ -4242,6 +4548,7 @@ struct cfg80211_ops { int (*get_channel)(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef); int (*start_p2p_device)(struct wiphy *wiphy, @@ -4278,6 +4585,7 @@ struct cfg80211_ops { struct cfg80211_qos_map *qos_map); int (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, struct cfg80211_chan_def *chandef); int (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev, @@ -4324,7 +4632,7 @@ struct cfg80211_ops { struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, const __be16 proto, - const bool noencrypt, + const bool noencrypt, int link_id, u64 *cookie); int (*get_ftm_responder_stats)(struct wiphy *wiphy, @@ -4348,6 +4656,21 @@ struct cfg80211_ops { int (*color_change)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_color_change_settings *params); + int (*set_fils_aad)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_fils_aad *fils_aad); + int (*set_radar_background)(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef); + int (*add_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_parameters *params); + int (*mod_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_parameters *params); + int (*del_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_del_parameters *params); + + 
ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /* @@ -4399,10 +4722,14 @@ struct cfg80211_ops { * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation * before connection. * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys + * @WIPHY_FLAG_SUPPORTS_MLO: This is a temporary flag gating the MLO APIs, + * in order to not have them reachable in normal drivers, until we have + * complete feature/interface combinations/etc. advertisement. No driver + * should set this flag for now. */ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), - /* use hole at 1 */ + WIPHY_FLAG_SUPPORTS_MLO = BIT(1), WIPHY_FLAG_SPLIT_SCAN_6GHZ = BIT(2), WIPHY_FLAG_NETNS_OK = BIT(3), WIPHY_FLAG_PS_ON_BY_DEFAULT = BIT(4), @@ -4711,6 +5038,8 @@ struct wiphy_vendor_command { unsigned long *storage); const struct nla_policy *policy; unsigned int maxattr; + + ANDROID_KABI_RESERVE(1); }; /** @@ -4723,19 +5052,32 @@ struct wiphy_vendor_command { * 802.11-2012 8.4.2.29 for the defined fields. * @extended_capabilities_mask: mask of the valid values * @extended_capabilities_len: length of the extended capabilities + * @eml_capabilities: EML capabilities (for MLO) + * @mld_capa_and_ops: MLD capabilities and operations (for MLO) */ struct wiphy_iftype_ext_capab { enum nl80211_iftype iftype; const u8 *extended_capabilities; const u8 *extended_capabilities_mask; u8 extended_capabilities_len; + u16 eml_capabilities; + u16 mld_capa_and_ops; }; +/** + * cfg80211_get_iftype_ext_capa - lookup interface type extended capability + * @wiphy: the wiphy to look up from + * @type: the interface type to look up + */ +const struct wiphy_iftype_ext_capab * +cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type); + /** * struct cfg80211_pmsr_capabilities - cfg80211 peer measurement capabilities * @max_peers: maximum number of peers in a single measurement * @report_ap_tsf: can report assoc AP's TSF for radio resource measurement * @randomize_mac_addr: can randomize MAC address for measurement + * @ftm: FTM measurement data * @ftm.supported: FTM measurement is supported * @ftm.asap: ASAP-mode is supported * @ftm.non_asap: non-ASAP-mode is supported @@ -4981,6 +5323,20 @@ struct wiphy_iftype_akm_suites { * %NL80211_TID_CONFIG_ATTR_RETRY_LONG attributes * @sar_capa: SAR control capabilities * @rfkill: a pointer to the rfkill structure + * + * @mbssid_max_interfaces: maximum number of interfaces supported by the driver + * in a multiple BSSID set. This field must be set to a non-zero value + * by the driver to advertise MBSSID support. + * @ema_max_profile_periodicity: maximum profile periodicity supported by + * the driver. Setting this field to a non-zero value indicates that the + * driver supports enhanced multi-BSSID advertisements (EMA AP). + * @max_num_akm_suites: maximum number of AKM suites allowed for + * configuration through %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_START_AP. Set to NL80211_MAX_NR_AKM_SUITES if not set by + * driver. If set by driver minimum allowed value is + * NL80211_MAX_NR_AKM_SUITES in order to avoid compatibility issues with + * legacy userspace and maximum allowed value is + * CFG80211_MAX_NUM_AKM_SUITES. 
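+ *
+ * An illustrative setup sketch for these fields (the values are examples
+ * only, chosen within the documented limits):
+ *
+ *	wiphy->mbssid_max_interfaces = 8;
+ *	wiphy->ema_max_profile_periodicity = 3;
+ *	wiphy->max_num_akm_suites = CFG80211_MAX_NUM_AKM_SUITES;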
*/ struct wiphy { struct mutex mtx; @@ -5125,6 +5481,12 @@ struct wiphy { struct rfkill *rfkill; + u8 mbssid_max_interfaces; + u8 ema_max_profile_periodicity; + u16 max_num_akm_suites; + + ANDROID_KABI_RESERVE(1); + char priv[] __aligned(NETDEV_ALIGN); }; @@ -5349,16 +5711,9 @@ static inline void wiphy_unlock(struct wiphy *wiphy) * @netdev: (private) Used to reference back to the netdev, may be %NULL * @identifier: (private) Identifier used in nl80211 to identify this * wireless device if it has no netdev - * @current_bss: (private) Used by the internal configuration code - * @chandef: (private) Used by the internal configuration code to track - * the user-set channel definition. - * @preset_chandef: (private) Used by the internal configuration code to - * track the channel to be used for AP later + * @u: union containing data specific to @iftype + * @connected: indicates if connected or not (STA mode) * @bssid: (private) Used by the internal configuration code - * @ssid: (private) Used by the internal configuration code - * @ssid_len: (private) Used by the internal configuration code - * @mesh_id_len: (private) Used by the internal configuration code - * @mesh_id_up_len: (private) Used by the internal configuration code * @wext: (private) Used by the internal wireless extensions compat code * @wext.ibss: (private) IBSS data part of wext handling * @wext.connect: (private) connection handling data @@ -5398,8 +5753,6 @@ static inline void wiphy_unlock(struct wiphy *wiphy) * @conn_owner_nlportid: (private) connection owner socket port ID * @disconnect_wk: (private) auto-disconnect work * @disconnect_bssid: (private) the BSSID to use for auto-disconnect - * @ibss_fixed: (private) IBSS is using fixed BSSID - * @ibss_dfs_possible: (private) IBSS may change to a DFS channel * @event_list: (private) list for internal event processing * @event_lock: (private) lock for event list * @owner_nlportid: (private) owner socket port ID @@ -5410,6 +5763,9 @@ static inline void wiphy_unlock(struct wiphy *wiphy) * @pmsr_free_wk: (private) peer measurements cleanup work * @unprot_beacon_reported: (private) timestamp of last * unprotected beacon report + * @links: array of %IEEE80211_MLD_MAX_NUM_LINKS elements containing @addr + * @ap and @client for each link + * @valid_links: bitmap describing what elements of @links are valid */ struct wireless_dev { struct wiphy *wiphy; @@ -5431,8 +5787,6 @@ struct wireless_dev { u8 address[ETH_ALEN] __aligned(sizeof(u16)); /* currently used for IBSS and SME - might be rearranged later */ - u8 ssid[IEEE80211_MAX_SSID_LEN]; - u8 ssid_len, mesh_id_len, mesh_id_up_len; struct cfg80211_conn *conn; struct cfg80211_cached_keys *connect_keys; enum ieee80211_bss_type conn_bss_type; @@ -5444,23 +5798,17 @@ struct wireless_dev { struct list_head event_list; spinlock_t event_lock; - struct cfg80211_internal_bss *current_bss; /* associated / joined */ - struct cfg80211_chan_def preset_chandef; - struct cfg80211_chan_def chandef; - - bool ibss_fixed; - bool ibss_dfs_possible; + u8 connected:1; bool ps; int ps_timeout; - int beacon_interval; - u32 ap_unexpected_nlportid; u32 owner_nlportid; bool nl_owner_dead; + /* FIXME: need to rework radar detection for MLO */ bool cac_started; unsigned long cac_start_time; unsigned int cac_time_ms; @@ -5488,6 +5836,53 @@ struct wireless_dev { struct work_struct pmsr_free_wk; unsigned long unprot_beacon_reported; + + union { + struct { + u8 connected_addr[ETH_ALEN] __aligned(2); + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } client; + struct { 
+ int beacon_interval; + struct cfg80211_chan_def preset_chandef; + struct cfg80211_chan_def chandef; + u8 id[IEEE80211_MAX_SSID_LEN]; + u8 id_len, id_up_len; + } mesh; + struct { + struct cfg80211_chan_def preset_chandef; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } ap; + struct { + struct cfg80211_internal_bss *current_bss; + struct cfg80211_chan_def chandef; + int beacon_interval; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } ibss; + struct { + struct cfg80211_chan_def chandef; + } ocb; + } u; + + struct { + u8 addr[ETH_ALEN] __aligned(2); + union { + struct { + unsigned int beacon_interval; + struct cfg80211_chan_def chandef; + } ap; + struct { + struct cfg80211_internal_bss *current_bss; + } client; + }; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; + u16 valid_links; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; static inline u8 *wdev_address(struct wireless_dev *wdev) @@ -5516,6 +5911,32 @@ static inline void *wdev_priv(struct wireless_dev *wdev) return wiphy_priv(wdev->wiphy); } +/** + * wdev_chandef - return chandef pointer from wireless_dev + * @wdev: the wdev + * @link_id: the link ID for MLO + * + * Return: The chandef depending on the mode, or %NULL. + */ +struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev, + unsigned int link_id); + +static inline void WARN_INVALID_LINK_ID(struct wireless_dev *wdev, + unsigned int link_id) +{ + WARN_ON(link_id && !wdev->valid_links); + WARN_ON(wdev->valid_links && + !(wdev->valid_links & BIT(link_id))); +} + +#define for_each_valid_link(link_info, link_id) \ + for (link_id = 0; \ + link_id < ((link_info)->valid_links ? \ + ARRAY_SIZE((link_info)->links) : 1); \ + link_id++) \ + if (!(link_info)->valid_links || \ + ((link_info)->valid_links & BIT(link_id))) + /** * DOC: Utility functions * @@ -5792,6 +6213,7 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); * @addr: the device MAC address * @iftype: the virtual interface type * @data_offset: offset of payload after the 802.11 header + * @is_amsdu: true if the 802.11 header is A-MSDU * Return: 0 on success. Non-zero on error. */ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, @@ -6531,16 +6953,36 @@ void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); /** - * cfg80211_rx_assoc_resp - notification of processed association response - * @dev: network device + * struct cfg80211_rx_assoc_resp - association response data * @bss: the BSS that association was requested with, ownership of the pointer - * moves to cfg80211 in this call + * moves to cfg80211 in the call to cfg80211_rx_assoc_resp() * @buf: (Re)Association Response frame (header + body) * @len: length of the frame data * @uapsd_queues: bitmap of queues configured for uapsd. 
Same format * as the AC bitmap in the QoS info field * @req_ies: information elements from the (Re)Association Request frame * @req_ies_len: length of req_ies data + * @ap_mld_addr: AP MLD address (in case of MLO) + * @links: per-link information indexed by link ID, use links[0] for + * non-MLO connections + */ +struct cfg80211_rx_assoc_resp { + const u8 *buf; + size_t len; + const u8 *req_ies; + size_t req_ies_len; + int uapsd_queues; + const u8 *ap_mld_addr; + struct { + const u8 *addr; + struct cfg80211_bss *bss; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; +}; + +/** + * cfg80211_rx_assoc_resp - notification of processed association response + * @dev: network device + * @data: association response data, &struct cfg80211_rx_assoc_resp * * After being asked to associate via cfg80211_ops::assoc() the driver must * call either this function or cfg80211_auth_timeout(). @@ -6548,30 +6990,32 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * This function may sleep. The caller must hold the corresponding wdev's mutex. */ void cfg80211_rx_assoc_resp(struct net_device *dev, - struct cfg80211_bss *bss, - const u8 *buf, size_t len, - int uapsd_queues, - const u8 *req_ies, size_t req_ies_len); + struct cfg80211_rx_assoc_resp *data); /** - * cfg80211_assoc_timeout - notification of timed out association + * struct cfg80211_assoc_failure - association failure data + * @ap_mld_addr: AP MLD address, or %NULL + * @bss: list of BSSes, must use entry 0 for non-MLO connections + * (@ap_mld_addr is %NULL) + * @timeout: indicates the association failed due to timeout, otherwise + * the association was abandoned for a reason reported through some + * other API (e.g. deauth RX) + */ +struct cfg80211_assoc_failure { + const u8 *ap_mld_addr; + struct cfg80211_bss *bss[IEEE80211_MLD_MAX_NUM_LINKS]; + bool timeout; +}; + +/** + * cfg80211_assoc_failure - notification of association failure * @dev: network device - * @bss: The BSS entry with which association timed out. + * @data: data describing the association failure * * This function may sleep. The caller must hold the corresponding wdev's mutex. */ -void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss); - -/** - * cfg80211_abandon_assoc - notify cfg80211 of abandoned association attempt - * @dev: network device - * @bss: The BSS entry with which association was abandoned. - * - * Call this whenever - for reasons reported through other API, like deauth RX, - * an association attempt was abandoned. - * This function may sleep. The caller must hold the corresponding wdev's mutex. - */ -void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss); +void cfg80211_assoc_failure(struct net_device *dev, + struct cfg80211_assoc_failure *data); /** * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame @@ -6648,6 +7092,7 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, * @macaddr: the MAC address of the new candidate * @ie: information elements advertised by the peer candidate * @ie_len: length of the information elements buffer + * @sig_dbm: signal level in dBm * @gfp: allocation flags * * This function notifies cfg80211 that the mesh peer candidate has been @@ -7012,13 +7457,6 @@ struct cfg80211_fils_resp_params { * indicate that this is a failure, but without a status code. * @timeout_reason is used to report the reason for the timeout in that * case. 
- * @bssid: The BSSID of the AP (may be %NULL) - * @bss: Entry of bss to which STA got connected to, can be obtained through - * cfg80211_get_bss() (may be %NULL). But it is recommended to store the - * bss from the connect_request and hold a reference to it and return - * through this param to avoid a warning if the bss is expired during the - * connection, esp. for those drivers implementing connect op. - * Only one parameter among @bssid and @bss needs to be specified. * @req_ie: Association request IEs (may be %NULL) * @req_ie_len: Association request IEs length * @resp_ie: Association response IEs (may be %NULL) @@ -7030,17 +7468,41 @@ struct cfg80211_fils_resp_params { * not known. This value is used only if @status < 0 to indicate that the * failure is due to a timeout and not due to explicit rejection by the AP. * This value is ignored in other cases (@status >= 0). + * @valid_links: For MLO connection, BIT mask of the valid link ids. Otherwise + * zero. + * @ap_mld_addr: For MLO connection, MLD address of the AP. Otherwise %NULL. + * @links : For MLO connection, contains link info for the valid links indicated + * using @valid_links. For non-MLO connection, links[0] contains the + * connected AP info. + * @links.addr: For MLO connection, MAC address of the STA link. Otherwise + * %NULL. + * @links.bssid: For MLO connection, MAC address of the AP link. For non-MLO + * connection, links[0].bssid points to the BSSID of the AP (may be %NULL). + * @links.bss: For MLO connection, entry of bss to which STA link is connected. + * For non-MLO connection, links[0].bss points to entry of bss to which STA + * is connected. It can be obtained through cfg80211_get_bss() (may be + * %NULL). It is recommended to store the bss from the connect_request and + * hold a reference to it and return through this param to avoid a warning + * if the bss is expired during the connection, esp. for those drivers + * implementing connect op. Only one parameter among @bssid and @bss needs + * to be specified. */ struct cfg80211_connect_resp_params { int status; - const u8 *bssid; - struct cfg80211_bss *bss; const u8 *req_ie; size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; struct cfg80211_fils_resp_params fils; enum nl80211_timeout_reason timeout_reason; + + const u8 *ap_mld_addr; + u16 valid_links; + struct { + const u8 *addr; + const u8 *bssid; + struct cfg80211_bss *bss; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** @@ -7110,8 +7572,8 @@ cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, memset(&params, 0, sizeof(params)); params.status = status; - params.bssid = bssid; - params.bss = bss; + params.links[0].bssid = bssid; + params.links[0].bss = bss; params.req_ie = req_ie; params.req_ie_len = req_ie_len; params.resp_ie = resp_ie; @@ -7182,24 +7644,40 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, /** * struct cfg80211_roam_info - driver initiated roaming information * - * @channel: the channel of the new AP - * @bss: entry of bss to which STA got roamed (may be %NULL if %bssid is set) - * @bssid: the BSSID of the new AP (may be %NULL if %bss is set) * @req_ie: association request IEs (may be %NULL) * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length * @fils: FILS related roaming information. + * @valid_links: For MLO roaming, BIT mask of the new valid links is set. + * Otherwise zero. + * @ap_mld_addr: For MLO roaming, MLD address of the new AP. Otherwise %NULL.
+ * @links : For MLO roaming, contains new link info for the valid links set in + * @valid_links. For non-MLO roaming, links[0] contains the new AP info. + * @links.addr: For MLO roaming, MAC address of the STA link. Otherwise %NULL. + * @links.bssid: For MLO roaming, MAC address of the new AP link. For non-MLO + * roaming, links[0].bssid points to the BSSID of the new AP. May be + * %NULL if %links.bss is set. + * @links.channel: the channel of the new AP. + * @links.bss: For MLO roaming, entry of new bss to which STA link got + * roamed. For non-MLO roaming, links[0].bss points to entry of bss to + * which STA got roamed (may be %NULL if %links.bssid is set) */ struct cfg80211_roam_info { - struct ieee80211_channel *channel; - struct cfg80211_bss *bss; - const u8 *bssid; const u8 *req_ie; size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; struct cfg80211_fils_resp_params fils; + + const u8 *ap_mld_addr; + u16 valid_links; + struct { + const u8 *addr; + const u8 *bssid; + struct ieee80211_channel *channel; + struct cfg80211_bss *bss; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** @@ -7365,6 +7843,48 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, enum nl80211_connect_failed_reason reason, gfp_t gfp); +/** + * struct cfg80211_rx_info - received management frame info + * + * @freq: Frequency on which the frame was received in kHz + * @sig_dbm: signal strength in dBm, or 0 if unknown + * @have_link_id: indicates the frame was received on a link of + * an MLD, i.e. the @link_id field is valid + * @link_id: the ID of the link the frame was received on + * @buf: Management frame (header + body) + * @len: length of the frame data + * @flags: flags, as defined in enum nl80211_rxmgmt_flags + * @rx_tstamp: Hardware timestamp of frame RX in nanoseconds + * @ack_tstamp: Hardware timestamp of ack TX in nanoseconds + */ +struct cfg80211_rx_info { + int freq; + int sig_dbm; + bool have_link_id; + u8 link_id; + const u8 *buf; + size_t len; + u32 flags; + u64 rx_tstamp; + u64 ack_tstamp; +}; + +/** + * cfg80211_rx_mgmt_ext - management frame notification with extended info + * @wdev: wireless device receiving the frame + * @info: RX info as defined in struct cfg80211_rx_info + * + * This function is called whenever an Action frame is received for a station + * mode interface, but is not processed in kernel. + * + * Return: %true if a user space application has registered for this frame. + * For action frames, that makes it responsible for rejecting unrecognized + * action frames; %false otherwise, in which case for action frames the + * driver is responsible for rejecting the frame. + */ +bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev, + struct cfg80211_rx_info *info); + /** * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame * @wdev: wireless device receiving the frame @@ -7382,8 +7902,20 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * action frames; %false otherwise, in which case for action frames the * driver is responsible for rejecting the frame. 
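+ *
+ * To also report the MLD link the frame arrived on, a driver would fill
+ * struct cfg80211_rx_info and call cfg80211_rx_mgmt_ext() directly; a
+ * sketch (frequency, signal level and link ID are example values):
+ *
+ *	struct cfg80211_rx_info info = {
+ *		.freq = MHZ_TO_KHZ(2437),
+ *		.sig_dbm = -55,
+ *		.have_link_id = true,
+ *		.link_id = 1,
+ *		.buf = buf,
+ *		.len = len,
+ *	};
+ *
+ *	bool handled = cfg80211_rx_mgmt_ext(wdev, &info);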
*/ -bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags); +static inline bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, + int sig_dbm, const u8 *buf, size_t len, + u32 flags) +{ + struct cfg80211_rx_info info = { + .freq = freq, + .sig_dbm = sig_dbm, + .buf = buf, + .len = len, + .flags = flags + }; + + return cfg80211_rx_mgmt_ext(wdev, &info); +} /** * cfg80211_rx_mgmt - notification of received, unprocessed management frame @@ -7406,10 +7938,49 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, const u8 *buf, size_t len, u32 flags) { - return cfg80211_rx_mgmt_khz(wdev, MHZ_TO_KHZ(freq), sig_dbm, buf, len, - flags); + struct cfg80211_rx_info info = { + .freq = MHZ_TO_KHZ(freq), + .sig_dbm = sig_dbm, + .buf = buf, + .len = len, + .flags = flags + }; + + return cfg80211_rx_mgmt_ext(wdev, &info); } +/** + * struct cfg80211_tx_status - TX status for management frame information + * + * @cookie: Cookie returned by cfg80211_ops::mgmt_tx() + * @tx_tstamp: hardware TX timestamp in nanoseconds + * @ack_tstamp: hardware ack RX timestamp in nanoseconds + * @buf: Management frame (header + body) + * @len: length of the frame data + * @ack: Whether frame was acknowledged + */ +struct cfg80211_tx_status { + u64 cookie; + u64 tx_tstamp; + u64 ack_tstamp; + const u8 *buf; + size_t len; + bool ack; +}; + +/** + * cfg80211_mgmt_tx_status_ext - TX status notification with extended info + * @wdev: wireless device receiving the frame + * @status: TX status data + * @gfp: context flags + * + * This function is called whenever a management frame was requested to be + * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the + * transmission attempt with extended info. + */ +void cfg80211_mgmt_tx_status_ext(struct wireless_dev *wdev, + struct cfg80211_tx_status *status, gfp_t gfp); + /** * cfg80211_mgmt_tx_status - notification of TX status for management frame * @wdev: wireless device receiving the frame @@ -7423,8 +7994,19 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the * transmission attempt. */ -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp); +static inline void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, + u64 cookie, const u8 *buf, + size_t len, bool ack, gfp_t gfp) +{ + struct cfg80211_tx_status status = { + .cookie = cookie, + .buf = buf, + .len = len, + .ack = ack + }; + + cfg80211_mgmt_tx_status_ext(wdev, &status, gfp); +} /** * cfg80211_control_port_tx_status - notification of TX status for control @@ -7517,15 +8099,33 @@ void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer, void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp); /** - * cfg80211_radar_event - radar detection event + * __cfg80211_radar_event - radar detection event * @wiphy: the wiphy * @chandef: chandef for the current channel + * @offchan: the radar has been detected on the offchannel chain * @gfp: context flags * * This function is called when a radar is detected on the current channel.
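+ *
+ * Drivers normally use the cfg80211_radar_event() and
+ * cfg80211_background_radar_event() wrappers below; e.g. a detection on
+ * the dedicated offchannel chain would be reported as (sketch, chandef
+ * being the monitored channel definition):
+ *
+ *	cfg80211_background_radar_event(wiphy, &chandef, GFP_KERNEL);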
*/ -void cfg80211_radar_event(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, gfp_t gfp); +void __cfg80211_radar_event(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + bool offchan, gfp_t gfp); + +static inline void +cfg80211_radar_event(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + gfp_t gfp) +{ + __cfg80211_radar_event(wiphy, chandef, false, gfp); +} + +static inline void +cfg80211_background_radar_event(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + gfp_t gfp) +{ + __cfg80211_radar_event(wiphy, chandef, true, gfp); +} /** * cfg80211_sta_opmode_change_notify - STA's ht/vht operation mode change event @@ -7556,6 +8156,14 @@ void cfg80211_cac_event(struct net_device *netdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event, gfp_t gfp); +/** + * cfg80211_background_cac_abort - Channel Availability Check offchan abort event + * @wiphy: the wiphy + * + * This function is called by the driver when a Channel Availability Check + * (CAC) is aborted by an offchannel dedicated chain. + */ +void cfg80211_background_cac_abort(struct wiphy *wiphy); /** * cfg80211_gtk_rekey_notify - notify userspace about driver rekeying @@ -7692,17 +8300,20 @@ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, * cfg80211_ch_switch_notify - update wdev channel and notify userspace * @dev: the device which switched channels * @chandef: the new channel definition + * @link_id: the link ID for MLO, must be 0 for non-MLO * * Caller must acquire wdev_lock, therefore must only be called from sleepable * driver context! */ void cfg80211_ch_switch_notify(struct net_device *dev, - struct cfg80211_chan_def *chandef); + struct cfg80211_chan_def *chandef, + unsigned int link_id); /** * cfg80211_ch_switch_started_notify - notify channel switch start * @dev: the device on which the channel switch started * @chandef: the future channel definition + * @link_id: the link ID for MLO, must be 0 for non-MLO * @count: the number of TBTTs until the channel switch happens * @quiet: whether or not immediate quiet was requested by the AP * @@ -7712,7 +8323,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, */ void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - u8 count, bool quiet); + unsigned int link_id, u8 count, + bool quiet); /** * ieee80211_operating_class_to_band - convert operating class to band @@ -8173,6 +8785,18 @@ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif); +/** + * cfg80211_assoc_comeback - notification of association that was + * temporarily rejected with a comeback + * @netdev: network device + * @ap_addr: AP (MLD) address that rejected the association + * @timeout: timeout interval value in TUs. + * + * This function may sleep. The caller must hold the corresponding wdev's mutex. + */ +void cfg80211_assoc_comeback(struct net_device *netdev, + const u8 *ap_addr, u32 timeout); + /* Logging, debugging and troubleshooting/diagnostic helpers.
*/ /* wiphy_printk helpers, similar to dev_printk */ diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 7cb3fa8310ed..a48feee47b2b 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -3,6 +3,7 @@ #define __NET_GENERIC_NETLINK_H #include +#include <linux/android_kabi.h> #include #include @@ -68,6 +69,8 @@ struct genl_family { const struct genl_small_ops *small_ops; const struct genl_multicast_group *mcgrps; struct module *module; + + ANDROID_KABI_RESERVE(1); }; /** @@ -159,6 +162,8 @@ struct genl_ops { u8 internal_flags; u8 flags; u8 validate; + + ANDROID_KABI_RESERVE(1); }; /** diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 695ed45841f0..9b9de29a8fcd 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -17,6 +17,7 @@ #include #include #include +#include <linux/android_kabi.h> #include #include @@ -51,6 +52,8 @@ struct inet_connection_sock_af_ops { char __user *optval, int __user *optlen); void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); void (*mtu_reduced)(struct sock *sk); + + ANDROID_KABI_RESERVE(1); }; /** inet_connection_sock - INET connection oriented sock @@ -134,6 +137,8 @@ struct inet_connection_sock { u32 icsk_probes_tstamp; u32 icsk_user_timeout; + ANDROID_KABI_RESERVE(1); + u64 icsk_ca_priv[104 / sizeof(u64)]; #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) }; diff --git a/include/net/ip.h b/include/net/ip.h index 6ae923c55cf4..2bf6bbc3c5b5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -345,6 +345,13 @@ static inline bool inet_is_local_reserved_port(struct net *net, unsigned short p return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } +static inline bool inet_is_local_unbindable_port(struct net *net, unsigned short port) +{ + if (!net->ipv4.sysctl_local_unbindable_ports) + return false; + return test_bit(port, net->ipv4.sysctl_local_unbindable_ports); +} + static inline bool sysctl_dev_name_is_allowed(const char *name) { return strcmp(name, "default") != 0 && strcmp(name, "all") != 0; @@ -361,6 +368,11 @@ static inline bool inet_is_local_reserved_port(struct net *net, unsigned short p return false; } +static inline bool inet_is_local_unbindable_port(struct net *net, unsigned short port) +{ + return false; +} + static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < PROT_SOCK; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index bbb27639f293..1b4d27e40d81 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -13,6 +13,7 @@ #include #include #include +#include <linux/android_kabi.h> #include #include #include @@ -67,6 +68,8 @@ struct fib6_config { struct nlattr *fc_encap; u16 fc_encap_type; bool fc_is_fdb; + + ANDROID_KABI_RESERVE(1); }; struct fib6_node { @@ -83,6 +86,8 @@ struct fib6_node { int fn_sernum; struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; + + ANDROID_KABI_RESERVE(1); }; struct fib6_gc_args { @@ -202,6 +207,9 @@ struct fib6_info { struct rcu_head rcu; struct nexthop *nh; + + ANDROID_KABI_RESERVE(1); + struct fib6_nh fib6_nh[]; }; @@ -221,6 +229,8 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; + + ANDROID_KABI_RESERVE(1); }; struct fib6_result { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 618d1f427cb2..5b7d50b3d068 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -18,6 +18,7 @@ #include #include #include +#include <linux/android_kabi.h> #include #include #include @@ -702,6 +703,8 @@ struct
ieee80211_bss_conf { u32 unsol_bcast_probe_resp_interval; bool s1g; struct cfg80211_bitrate_mask beacon_tx_rate; + + ANDROID_KABI_RESERVE(1); }; /** @@ -1102,6 +1105,9 @@ struct ieee80211_tx_info { void *rate_driver_data[ IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE / sizeof(void *)]; }; + + ANDROID_KABI_RESERVE(1); + void *driver_data[ IEEE80211_TX_INFO_DRIVER_DATA_SIZE / sizeof(void *)]; }; @@ -1591,6 +1597,8 @@ struct ieee80211_conf { struct cfg80211_chan_def chandef; bool radar_enabled; enum ieee80211_smps_mode smps_mode; + + ANDROID_KABI_RESERVE(1); }; /** @@ -1746,6 +1754,8 @@ struct ieee80211_vif { bool color_change_active; u8 color_change_color; + ANDROID_KABI_RESERVE(1); + /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; @@ -2035,6 +2045,45 @@ struct ieee80211_sta_txpwr { enum nl80211_tx_power_setting type; }; +#define MAX_STA_LINKS 15 + +/** + * struct ieee80211_link_sta - station link-specific info + * All link-specific info for a STA link, for a non-MLD STA (single entry) + * or an MLD STA (multiple entries), is stored here. + * + * @addr: MAC address of the link STA. For a non-MLO STA this is the same as + * the addr in ieee80211_sta. For an MLO link STA this addr can be the same + * as or different from the addr in ieee80211_sta (representing the MLD STA + * addr) + * @supp_rates: Bitmap of supported rates + * @ht_cap: HT capabilities of this STA; restricted to our own capabilities + * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities + * @he_cap: HE capabilities of this STA + * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities + * @eht_cap: EHT capabilities of this STA + * @bandwidth: current bandwidth the station can receive with + * @rx_nss: in HT/VHT, the maximum number of spatial streams the + * station can receive at the moment, changed by operating mode + * notifications and capabilities. The value is only valid after + * the station moves to associated state. + * @txpwr: the station tx power configuration + * + */ +struct ieee80211_link_sta { + u8 addr[ETH_ALEN]; + + u32 supp_rates[NUM_NL80211_BANDS]; + struct ieee80211_sta_ht_cap ht_cap; + struct ieee80211_sta_vht_cap vht_cap; + struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; + + u8 rx_nss; + enum ieee80211_sta_rx_bandwidth bandwidth; + struct ieee80211_sta_txpwr txpwr; +}; + /** * struct ieee80211_sta - station table entry * @@ -2044,14 +2093,11 @@ struct ieee80211_sta_txpwr { * either be protected by rcu_read_lock() explicitly or implicitly, * or you must take good care to not use such a pointer after a * call to your sta_remove callback that removed it. + * This also represents the MLD STA in case of MLO association + * and holds pointers to the various link STAs * * @addr: MAC address * @aid: AID we assigned to the station if we're an AP - * @supp_rates: Bitmap of supported rates (per band) - * @ht_cap: HT capabilities of this STA; restricted to our own capabilities - * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities - * @he_cap: HE capabilities of this STA - * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -2063,11 +2109,6 @@ struct ieee80211_sta_txpwr { * if wme is supported. The bits order is like in * IEEE80211_WMM_IE_STA_QOSINFO_AC_*. * @max_sp: max Service Period. Only valid if wme is supported.
- * @bandwidth: current bandwidth the station can receive with - * @rx_nss: in HT/VHT, the maximum number of spatial streams the - * station can receive at the moment, changed by operating mode - * notifications and capabilities. The value is only valid after - * the station moves to associated state. * @smps_mode: current SMPS mode (off, static or dynamic) * @rates: rate control selection table * @tdls: indicates whether the STA is a TDLS peer * @@ -2080,24 +2121,28 @@ struct ieee80211_sta_txpwr { * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control. * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID - * @txpwr: the station tx power configuration * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that * the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames + * @multi_link_sta: Identifies if this sta is an MLD STA + * @deflink: This holds the default link STA information; for a non-MLO STA all + * link-specific STA information is accessed through @deflink or through + * link[0], which points to the address of @deflink. For an MLO link STA + * the first added link STA will point to @deflink. + * @link: reference to link STA entries. For a non-MLO STA, all links other + * than link[0] are NULL by default and link information is accessed via + * @deflink or link[0]. For an MLO STA, the first added link STA points + * to @deflink, while the remaining link STAs are allocated and assigned + * to link[link_id], where link_id is the ID assigned by the AP. */ struct ieee80211_sta { - u32 supp_rates[NUM_NL80211_BANDS]; u8 addr[ETH_ALEN]; u16 aid; - struct ieee80211_sta_ht_cap ht_cap; - struct ieee80211_sta_vht_cap vht_cap; - struct ieee80211_sta_he_cap he_cap; - struct ieee80211_he_6ghz_capa he_6ghz_capa; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; - u8 rx_nss; - enum ieee80211_sta_rx_bandwidth bandwidth; enum ieee80211_smps_mode smps_mode; struct ieee80211_sta_rates __rcu *rates; bool tdls; @@ -2124,10 +2169,15 @@ struct ieee80211_sta { bool support_p2p_ps; u16 max_rc_amsdu_len; u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; - struct ieee80211_sta_txpwr txpwr; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; + bool multi_link_sta; + struct ieee80211_link_sta deflink; + struct ieee80211_link_sta *link[MAX_STA_LINKS]; + + ANDROID_KABI_RESERVE(1); + /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; @@ -2633,6 +2683,8 @@ struct ieee80211_hw { u8 tx_sk_pacing_shift; u8 weight_multiplier; u32 max_mtu; + + ANDROID_KABI_RESERVE(1); }; static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw, @@ -3933,6 +3985,14 @@ struct ieee80211_prep_tx_info { * twt structure. * @twt_teardown_request: Update the hw with TWT teardown request received * from the peer. + * @set_radar_background: Configure dedicated offchannel chain available for + * radar/CAC detection on some hw. This chain can't be used to transmit + * or receive frames and it is bound to a running wdev. + * Background radar/CAC detection allows avoiding the CAC downtime + * by switching to a different channel during CAC detection on the selected + * radar channel. + * The caller is expected to set chandef pointer to NULL in order to + * disable background CAC/radar detection.
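+ *
+ * An illustrative @set_radar_background hook (the drv_* helpers and
+ * struct drv_priv are hypothetical; a NULL @chandef disables the
+ * background chain, per the description above):
+ *
+ *	static int drv_set_radar_background(struct ieee80211_hw *hw,
+ *					    struct cfg80211_chan_def *chandef)
+ *	{
+ *		struct drv_priv *priv = hw->priv;
+ *
+ *		if (!chandef)
+ *			return drv_stop_background_chain(priv);
+ *		return drv_start_background_cac(priv, chandef);
+ *	}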
*/ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4261,6 +4321,13 @@ struct ieee80211_ops { struct ieee80211_twt_setup *twt); void (*twt_teardown_request)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u8 flowid); + int (*set_radar_background)(struct ieee80211_hw *hw, + struct cfg80211_chan_def *chandef); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /** @@ -4908,12 +4975,14 @@ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets * to countdown counters. This array can contain zero values which * should be ignored. + * @mbssid_off: position of the multiple bssid element */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; + u16 mbssid_off; }; /** @@ -6276,13 +6345,18 @@ struct rate_control_ops { struct dentry *dir); u32 (*get_expected_throughput)(void *priv_sta); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; static inline int rate_supported(struct ieee80211_sta *sta, enum nl80211_band band, int index) { - return (sta == NULL || sta->supp_rates[band] & BIT(index)); + return (sta == NULL || sta->deflink.supp_rates[band] & BIT(index)); } static inline s8 diff --git a/include/net/neighbour.h b/include/net/neighbour.h index d5767e25509c..7750ef0fe0dd 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -28,6 +28,7 @@ #include #include #include +#include #include /* @@ -83,6 +84,8 @@ struct neigh_parms { int reachable_time; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); + + ANDROID_KABI_RESERVE(1); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) @@ -157,6 +160,9 @@ struct neighbour { struct list_head gc_list; struct rcu_head rcu; struct net_device *dev; + + ANDROID_KABI_RESERVE(1); + u8 primary_key[0]; } __randomize_layout; @@ -226,6 +232,8 @@ struct neigh_table { struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct pneigh_entry **phash_buckets; + + ANDROID_KABI_RESERVE(1); }; enum { diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 34c266502a50..8e2a91eae32f 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -15,6 +15,8 @@ #include #include +#include +#include #include #include @@ -121,6 +123,10 @@ struct nf_conn { /* Storage reserved for other modules, must be the last member */ union nf_conntrack_proto proto; + + ANDROID_OEM_DATA(1); + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; static inline struct nf_conn * diff --git a/include/net/netns/can.h b/include/net/netns/can.h index 52fbd8291a96..d075f610e647 100644 --- a/include/net/netns/can.h +++ b/include/net/netns/can.h @@ -7,6 +7,7 @@ #define __NETNS_CAN_H__ #include +#include struct can_dev_rcv_lists; struct can_pkg_stats; @@ -35,6 +36,8 @@ struct netns_can { /* CAN GW per-net gateway jobs */ struct hlist_head cgw_list; + + ANDROID_KABI_RESERVE(1); }; #endif /* __NETNS_CAN_H__ */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d60a10cfc382..153e071e2b6e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -10,6 +10,7 @@ #include #include #include +#include struct ctl_table_header; struct ipv4_devconf; @@ -197,6 +198,7 @@ struct netns_ipv4 { #ifdef CONFIG_SYSCTL unsigned 
long *sysctl_local_reserved_ports; + unsigned long *sysctl_local_unbindable_ports; int sysctl_ip_prot_sock; #endif @@ -222,5 +224,7 @@ struct netns_ipv4 { atomic_t rt_genid; siphash_key_t ip_id_key; + + ANDROID_KABI_RESERVE(1); }; #endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index ff82983b7ab4..b0916c85be38 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -7,6 +7,7 @@ #ifndef __NETNS_IPV6_H__ #define __NETNS_IPV6_H__ +#include #include #include @@ -55,6 +56,7 @@ struct netns_sysctl_ipv6 { u64 ioam6_id_wide; bool skip_notify_on_dev_down; u8 fib_notify_on_flag_change; + ANDROID_KABI_RESERVE(1); }; struct netns_ipv6 { @@ -114,6 +116,8 @@ struct netns_ipv6 { u32 seq; } ip6addrlbl_table; struct ioam6_pernet_data *ioam6_data; + + ANDROID_KABI_RESERVE(1); }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index b593f95e9991..bf983f025ad2 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -3,6 +3,7 @@ #define __NETNS_NETFILTER_H #include +#include struct proc_dir_entry; struct nf_logger; @@ -33,5 +34,7 @@ struct netns_nf { #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) unsigned int defrag_ipv6_users; #endif + + ANDROID_KABI_RESERVE(1); }; #endif diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 8c77832d0240..26930d6a2452 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -3,9 +3,11 @@ #define _NETNS_NFTABLES_H_ #include +#include struct netns_nftables { u8 gencursor; + ANDROID_KABI_RESERVE(1); }; #endif diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index bd7c3be4af5d..c8dcbf385f50 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -7,6 +7,7 @@ #include #include #include +#include #include struct ctl_table_header; @@ -82,6 +83,8 @@ struct netns_xfrm { spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; + + ANDROID_KABI_RESERVE(1); }; #endif diff --git a/include/net/page_pool.h b/include/net/page_pool.h index a4082406a003..6c48d6c6ef53 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -33,6 +33,7 @@ #include /* Needed by ptr_ring */ #include #include +#include #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA * map/unmap @@ -132,6 +133,8 @@ struct page_pool { refcount_t user_cnt; u64 destroy_cnt; + + ANDROID_KABI_RESERVE(1); }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6906da5c733e..ce4e891ffc64 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -119,6 +120,8 @@ struct Qdisc { struct rcu_head rcu; + ANDROID_KABI_RESERVE(1); + /* private data */ long privdata[] ____cacheline_aligned; }; @@ -263,6 +266,8 @@ struct Qdisc_class_ops { struct sk_buff *skb, struct tcmsg*); int (*dump_stats)(struct Qdisc *, unsigned long, struct gnet_dump *); + + ANDROID_KABI_RESERVE(1); }; /* Qdisc_class_ops flag values */ @@ -308,6 +313,8 @@ struct Qdisc_ops { u32 (*egress_block_get)(struct Qdisc *sch); struct module *owner; + + ANDROID_KABI_RESERVE(1); }; diff --git a/include/net/sock.h b/include/net/sock.h index 3e9db5146765..96657cd1b415 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -69,6 +69,8 @@ #include #include #include +#include +#include /* * This structure really needs to be cleaned up. 
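/*
 * Sketch of the padding convention used throughout this series (the
 * struct and field names here are hypothetical). ANDROID_KABI_RESERVE(n)
 * from <linux/android_kabi.h> reserves a u64 slot; after the KMI
 * freezes, a new member can take over a slot via ANDROID_KABI_USE()
 * without changing the struct layout. ANDROID_OEM_DATA(n) from
 * <linux/android_vendor.h> similarly adds a u64 slot that OEM modules
 * may use for their own per-object state.
 */
struct example_ops {
	int (*existing_cb)(struct sock *sk);

	ANDROID_OEM_DATA(1);
	ANDROID_KABI_RESERVE(1);
	ANDROID_KABI_RESERVE(2);
};

/* Later, an ABI-compatible addition consumes reserve slot 1: */
struct example_ops_v2 {
	int (*existing_cb)(struct sock *sk);

	ANDROID_OEM_DATA(1);
	ANDROID_KABI_USE(1, int (*added_cb)(struct sock *sk, int flags));
	ANDROID_KABI_RESERVE(2);
};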
@@ -535,6 +537,16 @@ struct sock { struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; + + ANDROID_OEM_DATA(1); + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + ANDROID_KABI_RESERVE(5); + ANDROID_KABI_RESERVE(6); + ANDROID_KABI_RESERVE(7); + ANDROID_KABI_RESERVE(8); }; enum sk_pacing { diff --git a/include/net/tls.h b/include/net/tls.h index bf3d63a52788..06ff28896b06 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,9 @@ struct tls_rec { char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[MAX_IV_SIZE]; struct aead_request aead_req; + + ANDROID_KABI_RESERVE(1); + u8 aead_req_ctx[]; }; @@ -141,6 +145,8 @@ struct tls_sw_context_tx { #define BIT_TX_SCHEDULED 0 #define BIT_TX_CLOSING 1 unsigned long tx_bitmask; + + ANDROID_KABI_RESERVE(1); }; struct tls_sw_context_rx { @@ -158,6 +164,8 @@ struct tls_sw_context_rx { /* protect crypto_wait with decrypt_pending*/ spinlock_t decrypt_compl_lock; bool async_notify; + + ANDROID_KABI_RESERVE(1); }; struct tls_record_info { @@ -296,6 +304,12 @@ struct tlsdev_ops { int (*tls_dev_resync)(struct net_device *netdev, struct sock *sk, u32 seq, u8 *rcd_sn, enum tls_offload_ctx_dir direction); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + }; enum tls_offload_sync_type { diff --git a/include/net/virt_wifi.h b/include/net/virt_wifi.h new file mode 100644 index 000000000000..343e73968d41 --- /dev/null +++ b/include/net/virt_wifi.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* include/net/virt_wifi.h + * + * Define the extension interface for the network data simulation + * + * Copyright (C) 2019 Google, Inc. 
+ * + * Author: lesl@google.com + */ +#ifndef __VIRT_WIFI_H +#define __VIRT_WIFI_H + +struct virt_wifi_network_simulation { + void (*notify_device_open)(struct net_device *dev); + void (*notify_device_stop)(struct net_device *dev); + void (*notify_scan_trigger)(struct wiphy *wiphy, + struct cfg80211_scan_request *request); + int (*generate_virt_scan_result)(struct wiphy *wiphy); +}; + +int virt_wifi_register_network_simulation( + struct virt_wifi_network_simulation *ops); +int virt_wifi_unregister_network_simulation(void); +#endif + diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 73030094c6e6..f2fe2b5cd113 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1580,12 +1580,10 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); -int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err); +int xfrm_output_resume(struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); -#if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); -#endif void xfrm_local_error(struct sk_buff *skb, int mtu); int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index 5cf84228b51d..d361b648d474 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -385,7 +385,7 @@ extern int iscsi_eh_recover_target(struct scsi_cmnd *sc); extern int iscsi_eh_session_reset(struct scsi_cmnd *sc); extern int iscsi_eh_device_reset(struct scsi_cmnd *sc); extern int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc); -extern enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc); +extern enum scsi_timeout_action iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc); /* * iSCSI host helpers. diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 3e46859774c8..dc67020aa557 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -93,6 +93,7 @@ static inline int scsi_status_is_check_condition(int status) * Internal return values. */ enum scsi_disposition { + NEEDS_DELAYED_RETRY = 0x2000, NEEDS_RETRY = 0x2001, SUCCESS = 0x2002, FAILED = 0x2003, diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 685249233f2f..c0bda647cc7b 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -12,6 +12,7 @@ #include #include #include +#include struct Scsi_Host; struct scsi_driver; @@ -58,8 +59,9 @@ struct scsi_pointer { #define SCMD_TAGGED (1 << 0) #define SCMD_INITIALIZED (1 << 1) #define SCMD_LAST (1 << 2) +#define SCMD_FAIL_IF_RECOVERING (1 << 4) /* flags preserved across unprep / reprep */ -#define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED) +#define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED | SCMD_FAIL_IF_RECOVERING) /* for scmd->state */ #define SCMD_STATE_COMPLETE 0 @@ -140,6 +142,11 @@ struct scsi_cmnd { unsigned long state; /* Command completion state */ unsigned int extra_len; /* length of alignment and padding */ + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /* Variant of blk_mq_rq_from_pdu() that verifies the type of its argument. 
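/*
 * Sketch of a module hooking the virt_wifi simulation interface
 * declared above; all example_* symbols are hypothetical.
 */
static void example_notify_open(struct net_device *dev) { }
static void example_notify_stop(struct net_device *dev) { }
static void example_scan_trigger(struct wiphy *wiphy,
				 struct cfg80211_scan_request *request) { }
static int example_scan_result(struct wiphy *wiphy) { return 0; }

static struct virt_wifi_network_simulation example_sim = {
	.notify_device_open = example_notify_open,
	.notify_device_stop = example_notify_stop,
	.notify_scan_trigger = example_scan_trigger,
	.generate_virt_scan_result = example_scan_result,
};

/* from the module's init/exit paths:
 *	virt_wifi_register_network_simulation(&example_sim);
 *	virt_wifi_unregister_network_simulation();
 */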
*/ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 3b3dbc37653d..6b81c68c2667 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -9,6 +9,7 @@ #include #include #include +#include struct bsg_device; struct device; @@ -244,6 +245,12 @@ struct scsi_device { struct mutex state_mutex; enum scsi_device_state sdev_state; struct task_struct *quiesced_by; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); + unsigned long sdev_data[]; } __attribute__((aligned(sizeof(unsigned long)))); diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 1a02e58eb4e4..fcd431829ab2 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -10,6 +10,7 @@ #include #include #include +#include struct block_device; struct completion; @@ -28,6 +29,18 @@ struct scsi_transport_template; #define MODE_INITIATOR 0x01 #define MODE_TARGET 0x02 +/** + * enum scsi_timeout_action - How to handle a command that timed out. + * @SCSI_EH_DONE: The command has already been completed. + * @SCSI_EH_RESET_TIMER: Reset the timer and continue waiting for completion. + * @SCSI_EH_NOT_HANDLED: The command has not yet finished. Abort the command. + */ +enum scsi_timeout_action { + SCSI_EH_DONE, + SCSI_EH_RESET_TIMER, + SCSI_EH_NOT_HANDLED, +}; + struct scsi_host_template { /* * Put fields referenced in IO submission path together in @@ -332,7 +345,7 @@ struct scsi_host_template { * * Status: OPTIONAL */ - enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *); + enum scsi_timeout_action (*eh_timed_out)(struct scsi_cmnd *); /* * Optional routine that allows the transport to decide if a cmd * is retryable. Return true if the transport is in a state the @@ -502,6 +515,11 @@ struct scsi_host_template { /* Delay for runtime autosuspend */ int rpm_autosuspend_delay; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /* @@ -709,6 +727,8 @@ struct Scsi_Host { */ struct device *dma_dev; + ANDROID_KABI_RESERVE(1); + /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h index f017843a8124..c03e35fc382c 100644 --- a/include/scsi/scsi_proto.h +++ b/include/scsi/scsi_proto.h @@ -307,7 +307,9 @@ enum zbc_zone_type { ZBC_ZONE_TYPE_CONV = 0x1, ZBC_ZONE_TYPE_SEQWRITE_REQ = 0x2, ZBC_ZONE_TYPE_SEQWRITE_PREF = 0x3, - /* 0x4 to 0xf are reserved */ + ZBC_ZONE_TYPE_SEQ_OR_BEFORE_REQ = 0x4, + ZBC_ZONE_TYPE_GAP = 0x5, + /* 0x6 to 0xf are reserved */ }; /* Zone conditions of REPORT ZONES zone descriptors */ @@ -323,6 +325,11 @@ enum zbc_zone_cond { ZBC_ZONE_COND_OFFLINE = 0xf, }; +enum zbc_zone_alignment_method { + ZBC_CONSTANT_ZONE_LENGTH = 0x1, + ZBC_CONSTANT_ZONE_START_OFFSET = 0x8, +}; + /* Version descriptor values for INQUIRY */ enum scsi_version_descriptor { SCSI_VERSION_DESCRIPTOR_FCP4 = 0x0a40, diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h index e80a7c542c88..3dcda19d3520 100644 --- a/include/scsi/scsi_transport_fc.h +++ b/include/scsi/scsi_transport_fc.h @@ -862,7 +862,7 @@ struct fc_vport *fc_vport_create(struct Scsi_Host *shost, int channel, int fc_vport_terminate(struct fc_vport *vport); int fc_block_rport(struct fc_rport *rport); int fc_block_scsi_eh(struct scsi_cmnd *cmnd); -enum blk_eh_timer_return fc_eh_timed_out(struct scsi_cmnd *scmd); +enum scsi_timeout_action 
fc_eh_timed_out(struct scsi_cmnd *scmd); bool fc_eh_should_retry_cmd(struct scsi_cmnd *scmd); static inline struct Scsi_Host *fc_bsg_to_shost(struct bsg_job *job) diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index d22df12584f9..dfc78aa112ad 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -118,7 +118,7 @@ extern int srp_reconnect_rport(struct srp_rport *rport); extern void srp_start_tl_fail_timers(struct srp_rport *rport); extern void srp_remove_host(struct Scsi_Host *); extern void srp_stop_rport_timers(struct srp_rport *rport); -enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd); +enum scsi_timeout_action srp_timed_out(struct scsi_cmnd *scmd); /** * srp_chkready() - evaluate the transport layer state before I/O diff --git a/include/sound/compress_driver.h b/include/sound/compress_driver.h index d91289c6f00e..77d027f73f4a 100644 --- a/include/sound/compress_driver.h +++ b/include/sound/compress_driver.h @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -54,6 +55,8 @@ struct snd_compr_runtime { dma_addr_t dma_addr; size_t dma_bytes; struct snd_dma_buffer *dma_buffer_p; + + ANDROID_KABI_RESERVE(1); }; /** @@ -84,6 +87,8 @@ struct snd_compr_stream { bool pause_in_draining; void *private_data; struct snd_dma_buffer dma_buffer; + + ANDROID_KABI_RESERVE(1); }; /** @@ -132,6 +137,8 @@ struct snd_compr_ops { struct snd_compr_caps *caps); int (*get_codec_caps) (struct snd_compr_stream *stream, struct snd_compr_codec_caps *codec); + + ANDROID_KABI_RESERVE(1); }; /** @@ -162,6 +169,7 @@ struct snd_compr { struct snd_info_entry *proc_root; struct snd_info_entry *proc_info_entry; #endif + ANDROID_KABI_RESERVE(1); }; /* compress device register APIs */ diff --git a/include/sound/core.h b/include/sound/core.h index 39cee40ac22e..3c196f5bc875 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -14,6 +14,8 @@ #include /* pm_message_t */ #include #include +#include +#include /* number of supported soundcards */ #ifdef CONFIG_SND_DYNAMIC_MINORS @@ -61,6 +63,8 @@ struct snd_device_ops { int (*dev_free)(struct snd_device *dev); int (*dev_register)(struct snd_device *dev); int (*dev_disconnect)(struct snd_device *dev); + + ANDROID_KABI_RESERVE(1); }; struct snd_device { @@ -70,6 +74,8 @@ struct snd_device { enum snd_device_type type; /* device type */ void *device_data; /* device structure */ const struct snd_device_ops *ops; /* operations */ + + ANDROID_KABI_RESERVE(1); }; #define snd_device(n) list_entry(n, struct snd_device, list) @@ -103,6 +109,11 @@ struct snd_card { size_t user_ctl_alloc_size; // current memory allocation by user controls. struct list_head controls; /* all controls for this card */ struct list_head ctl_files; /* active control files */ +#ifdef CONFIG_SND_CTL_FAST_LOOKUP + struct xarray ctl_numids; /* hash table for numids */ + struct xarray ctl_hash; /* hash table for ctl id matching */ + bool ctl_hash_collision; /* ctl_hash collision seen? 
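/*
 * Sketch of an LLD eh_timed_out handler using the scsi_timeout_action
 * return type introduced in the scsi_host.h hunk above (replacing
 * blk_eh_timer_return); the retry policy shown is purely illustrative.
 */
static enum scsi_timeout_action example_eh_timed_out(struct scsi_cmnd *scmd)
{
	if (scmd->retries < scmd->allowed)
		return SCSI_EH_RESET_TIMER;	/* give the command more time */

	return SCSI_EH_NOT_HANDLED;		/* let the EH abort it */
}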
*/ +#endif struct snd_info_entry *proc_root; /* root for soundcard specific files */ struct proc_dir_entry *proc_root_link; /* number link to real id */ @@ -139,6 +150,9 @@ struct snd_card { struct snd_mixer_oss *mixer_oss; int mixer_oss_change_count; #endif + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; #define dev_to_snd_card(p) container_of(p, struct snd_card, card_dev) @@ -214,6 +228,8 @@ struct snd_minor { void *private_data; /* private data for f_ops->open */ struct device *dev; /* device for sysfs */ struct snd_card *card_ptr; /* assigned card instance */ + + ANDROID_KABI_RESERVE(1); }; /* return a device pointer linked to each sound device as a parent */ diff --git a/include/sound/hwdep.h b/include/sound/hwdep.h index 8d6cdb254039..d99a53025ff0 100644 --- a/include/sound/hwdep.h +++ b/include/sound/hwdep.h @@ -9,6 +9,7 @@ #include #include +#include struct snd_hwdep; @@ -34,6 +35,8 @@ struct snd_hwdep_ops { struct snd_hwdep_dsp_status *status); int (*dsp_load)(struct snd_hwdep *hw, struct snd_hwdep_dsp_image *image); + + ANDROID_KABI_RESERVE(1); }; struct snd_hwdep { @@ -59,6 +62,8 @@ struct snd_hwdep { int used; /* reference counter */ unsigned int dsp_loaded; /* bit fields of loaded dsp indices */ unsigned int exclusive:1; /* exclusive access mode */ + + ANDROID_KABI_RESERVE(1); }; extern int snd_hwdep_new(struct snd_card *card, char *id, int device, diff --git a/include/sound/info.h b/include/sound/info.h index 7c13bf52cc81..25af5d8d1527 100644 --- a/include/sound/info.h +++ b/include/sound/info.h @@ -9,6 +9,7 @@ #include #include +#include #include /* buffer for information */ @@ -55,6 +56,8 @@ struct snd_info_entry_ops { int (*mmap)(struct snd_info_entry *entry, void *file_private_data, struct inode *inode, struct file *file, struct vm_area_struct *vma); + + ANDROID_KABI_RESERVE(1); }; struct snd_info_entry { @@ -74,6 +77,8 @@ struct snd_info_entry { struct mutex access; struct list_head children; struct list_head list; + + ANDROID_KABI_RESERVE(1); }; #if defined(CONFIG_SND_OSSEMUL) && defined(CONFIG_SND_PROC_FS) diff --git a/include/sound/jack.h b/include/sound/jack.h index 1ed90e2109e9..4c91b37bf240 100644 --- a/include/sound/jack.h +++ b/include/sound/jack.h @@ -9,6 +9,7 @@ */ #include +#include struct input_dev; @@ -71,6 +72,8 @@ struct snd_jack { int hw_status_cache; void *private_data; void (*private_free)(struct snd_jack *); + + ANDROID_KABI_RESERVE(1); }; #ifdef CONFIG_SND_JACK diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 181df0452ae2..4412052ca01c 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -16,6 +16,7 @@ #include #include #include +#include #define snd_pcm_substream_chip(substream) ((substream)->private_data) #define snd_pcm_chip(pcm) ((pcm)->private_data) @@ -77,6 +78,7 @@ struct snd_pcm_ops { unsigned long offset); int (*mmap)(struct snd_pcm_substream *substream, struct vm_area_struct *vma); int (*ack)(struct snd_pcm_substream *substream); + ANDROID_KABI_RESERVE(1); }; /* @@ -430,6 +432,8 @@ struct snd_pcm_runtime { /* -- OSS things -- */ struct snd_pcm_oss_runtime oss; #endif + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); }; struct snd_pcm_group { /* keep linked substreams */ @@ -482,6 +486,8 @@ struct snd_pcm_substream { /* misc flags */ unsigned int hw_opened: 1; unsigned int managed_buffer_alloc:1; + ANDROID_VENDOR_DATA(1); + ANDROID_KABI_RESERVE(1); }; #define SUBSTREAM_BUSY(substream) ((substream)->ref_count > 0) @@ -506,6 +512,7 @@ struct snd_pcm_str { #endif struct snd_kcontrol *chmap_kctl; /* 
channel-mapping controls */ struct device dev; + ANDROID_KABI_RESERVE(1); }; struct snd_pcm { @@ -528,6 +535,7 @@ struct snd_pcm { #if IS_ENABLED(CONFIG_SND_PCM_OSS) struct snd_pcm_oss oss; #endif + ANDROID_KABI_RESERVE(1); }; /* diff --git a/include/sound/soc.h b/include/sound/soc.h index 8e6dd8a257c5..0fa0a3458232 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -239,6 +240,14 @@ .get = xhandler_get, .put = xhandler_put, \ .private_value = SOC_DOUBLE_R_VALUE(reg_left, reg_right, xshift, \ xmax, xinvert) } +#define SOC_SINGLE_MULTI_EXT(xname, xreg, xshift, xmax, xinvert, xcount,\ + xhandler_get, xhandler_put) \ +{ .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \ + .info = snd_soc_info_multi_ext, \ + .get = xhandler_get, .put = xhandler_put, \ + .private_value = (unsigned long)&(struct soc_multi_mixer_control) \ + {.reg = xreg, .shift = xshift, .rshift = xshift, .max = xmax, \ + .count = xcount, .platform_max = xmax, .invert = xinvert} } #define SOC_SINGLE_EXT_TLV(xname, xreg, xshift, xmax, xinvert,\ xhandler_get, xhandler_put, tlv_array) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \ @@ -575,6 +584,8 @@ int snd_soc_get_strobe(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol); int snd_soc_put_strobe(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol); +int snd_soc_info_multi_ext(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo); /* SoC PCM stream information */ struct snd_soc_pcm_stream { @@ -586,6 +597,7 @@ struct snd_soc_pcm_stream { unsigned int channels_min; /* min channels */ unsigned int channels_max; /* max channels */ unsigned int sig_bits; /* number of bits of content */ + ANDROID_VENDOR_DATA(1); }; /* SoC audio ops */ @@ -721,6 +733,8 @@ struct snd_soc_dai_link { #ifdef CONFIG_SND_SOC_TOPOLOGY struct snd_soc_dobj dobj; /* For topology */ #endif + ANDROID_VENDOR_DATA(1); + ANDROID_KABI_RESERVE(1); }; static inline struct snd_soc_dai_link_component* @@ -990,6 +1004,11 @@ struct snd_soc_card { unsigned int component_chaining:1; void *drvdata; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define for_each_card_prelinks(card, i, link) \ for ((i) = 0; \ @@ -1074,6 +1093,9 @@ struct snd_soc_pcm_runtime { unsigned int fe_compr:1; /* for Dynamic PCM */ int num_components; + + ANDROID_KABI_RESERVE(1); + struct snd_soc_component *components[]; /* CPU/Codec/Platform */ }; /* see soc_new_pcm_runtime() */ @@ -1113,6 +1135,8 @@ struct soc_mixer_control { #ifdef CONFIG_SND_SOC_TOPOLOGY struct snd_soc_dobj dobj; #endif + + ANDROID_KABI_RESERVE(1); }; struct soc_bytes { @@ -1139,6 +1163,11 @@ struct soc_mreg_control { unsigned int regbase, regcount, nbits, invert; }; +struct soc_multi_mixer_control { + int min, max, platform_max, count; + unsigned int reg, rreg, shift, rshift, invert; +}; + /* enumerated kcontrol */ struct soc_enum { int reg; @@ -1152,6 +1181,8 @@ struct soc_enum { #ifdef CONFIG_SND_SOC_TOPOLOGY struct snd_soc_dobj dobj; #endif + + ANDROID_KABI_RESERVE(1); }; static inline bool snd_soc_volsw_is_stereo(struct soc_mixer_control *mc) diff --git a/include/sound/timer.h b/include/sound/timer.h index 760e132cc0cd..b7695e4dcca0 100644 --- a/include/sound/timer.h +++ b/include/sound/timer.h @@ -10,6 +10,7 @@ #include #include +#include #define snd_timer_chip(timer) ((timer)->private_data) @@ -52,6 +53,8 @@ struct snd_timer_hardware { int (*stop) (struct snd_timer * 
timer); int (*set_period) (struct snd_timer * timer, unsigned long period_num, unsigned long period_den); int (*precise_resolution) (struct snd_timer * timer, unsigned long *num, unsigned long *den); + + ANDROID_KABI_RESERVE(1); }; struct snd_timer { @@ -77,6 +80,8 @@ struct snd_timer { struct work_struct task_work; int max_instances; /* upper limit of timer instances */ int num_instances; /* current number of timer instances */ + + ANDROID_KABI_RESERVE(1); }; struct snd_timer_instance { @@ -106,6 +111,8 @@ struct snd_timer_instance { struct list_head slave_list_head; struct list_head slave_active_head; struct snd_timer_instance *master; + + ANDROID_KABI_RESERVE(1); }; /* diff --git a/include/trace/events/OWNERS b/include/trace/events/OWNERS new file mode 100644 index 000000000000..a63dbf4659f2 --- /dev/null +++ b/include/trace/events/OWNERS @@ -0,0 +1 @@ +per-file f2fs**=file:/fs/f2fs/OWNERS diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 3d708dae1542..979dc7cdfe74 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -91,6 +91,51 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); +TRACE_EVENT(cma_alloc_info, + + TP_PROTO(const char *name, const struct page *page, unsigned int count, unsigned int align, struct cma_alloc_info *info), + + TP_ARGS(name, page, count, align, info), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long, pfn) + __field(unsigned int, count) + __field(unsigned int, align) + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_mapped) + __field(unsigned int, err_iso) + __field(unsigned int, err_mig) + __field(unsigned int, err_test) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->pfn = page ? page_to_pfn(page) : -1; + __entry->count = count; + __entry->align = align; + __entry->nr_migrated = info->nr_migrated; + __entry->nr_reclaimed = info->nr_reclaimed; + __entry->nr_mapped = info->nr_mapped; + __entry->err_iso = info->nr_isolate_fail; + __entry->err_mig = info->nr_migrate_fail; + __entry->err_test = info->nr_test_fail; + ), + + TP_printk("name=%s pfn=0x%lx count=%u align=%u nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu err_iso=%u err_mig=%u err_test=%u", + __get_str(name), + __entry->pfn, + __entry->count, + __entry->align, + __entry->nr_migrated, + __entry->nr_reclaimed, + __entry->nr_mapped, + __entry->err_iso, + __entry->err_mig, + __entry->err_test) +); + DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 2f422f4f1fb9..c79f1d4c39af 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -11,10 +11,10 @@ TRACE_EVENT(damon_aggregated, - TP_PROTO(struct damon_target *t, struct damon_region *r, - unsigned int nr_regions), + TP_PROTO(struct damon_target *t, unsigned int target_id, + struct damon_region *r, unsigned int nr_regions), - TP_ARGS(t, r, nr_regions), + TP_ARGS(t, target_id, r, nr_regions), TP_STRUCT__entry( __field(unsigned long, target_id) @@ -22,19 +22,22 @@ TRACE_EVENT(damon_aggregated, __field(unsigned long, start) __field(unsigned long, end) __field(unsigned int, nr_accesses) + __field(unsigned int, age) ), TP_fast_assign( - __entry->target_id = t->id; + __entry->target_id = target_id; __entry->nr_regions = nr_regions; __entry->start = r->ar.start; __entry->end = r->ar.end; __entry->nr_accesses = r->nr_accesses; + __entry->age = r->age; ), - 
TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u", + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u", __entry->target_id, __entry->nr_regions, - __entry->start, __entry->end, __entry->nr_accesses) + __entry->start, __entry->end, + __entry->nr_accesses, __entry->age) ); #endif /* _TRACE_DAMON_H */ diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index db4f2cec8360..16ae7b666810 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -24,7 +24,7 @@ struct erofs_map_blocks; #define show_mflags(flags) __print_flags(flags, "", \ { EROFS_MAP_MAPPED, "M" }, \ { EROFS_MAP_META, "I" }, \ - { EROFS_MAP_ZIPPED, "Z" }) + { EROFS_MAP_ENCODED, "E" }) TRACE_EVENT(erofs_lookup, diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 4cb055af1ec0..8e702647f30c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -15,10 +15,6 @@ TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); -TRACE_DEFINE_ENUM(INMEM); -TRACE_DEFINE_ENUM(INMEM_DROP); -TRACE_DEFINE_ENUM(INMEM_INVALIDATE); -TRACE_DEFINE_ENUM(INMEM_REVOKE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(HOT); @@ -52,6 +48,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); +TRACE_DEFINE_ENUM(EX_READ); +TRACE_DEFINE_ENUM(EX_BLOCK_AGE); #define show_block_type(type) \ __print_symbolic(type, \ @@ -59,10 +57,6 @@ TRACE_DEFINE_ENUM(CP_RESIZE); { DATA, "DATA" }, \ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }, \ - { INMEM, "INMEM" }, \ - { INMEM_DROP, "INMEM_DROP" }, \ - { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ - { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -162,6 +156,11 @@ TRACE_DEFINE_ENUM(CP_RESIZE); { COMPRESS_ZSTD, "ZSTD" }, \ { COMPRESS_LZORLE, "LZO-RLE" }) +#define show_extent_type(type) \ + __print_symbolic(type, \ + { EX_READ, "Read" }, \ + { EX_BLOCK_AGE, "Block Age" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -330,7 +329,7 @@ TRACE_EVENT(f2fs_unlink_enter, __field(ino_t, ino) __field(loff_t, size) __field(blkcnt_t, blocks) - __field(const char *, name) + __string(name, dentry->d_name.name) ), TP_fast_assign( @@ -338,7 +337,7 @@ TRACE_EVENT(f2fs_unlink_enter, __entry->ino = dir->i_ino; __entry->size = dir->i_size; __entry->blocks = dir->i_blocks; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); ), TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, " @@ -346,7 +345,7 @@ TRACE_EVENT(f2fs_unlink_enter, show_dev_ino(__entry), __entry->size, (unsigned long long)__entry->blocks, - __entry->name) + __get_str(name)) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, @@ -540,17 +539,17 @@ TRACE_EVENT(f2fs_truncate_partial_nodes, TRACE_EVENT(f2fs_file_write_iter, - TP_PROTO(struct inode *inode, unsigned long offset, - unsigned long length, int ret), + TP_PROTO(struct inode *inode, loff_t offset, size_t length, + ssize_t ret), TP_ARGS(inode, offset, length, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(unsigned long, offset) - __field(unsigned long, length) - __field(int, ret) + __field(loff_t, offset) + __field(size_t, length) + __field(ssize_t, ret) ), TP_fast_assign( @@ -562,7 +561,7 @@ TRACE_EVENT(f2fs_file_write_iter, ), TP_printk("dev = (%d,%d), ino = %lu, " - "offset = %lu, length = %lu, written(err) = %d", + "offset = %lld, length = %zu, written(err) = %zd", 
show_dev_ino(__entry), __entry->offset, __entry->length, @@ -570,9 +569,10 @@ TRACE_EVENT(f2fs_file_write_iter, ); TRACE_EVENT(f2fs_map_blocks, - TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int ret), + TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag, int ret), - TP_ARGS(inode, map, ret), + TP_ARGS(inode, map, create, flag, ret), TP_STRUCT__entry( __field(dev_t, dev) @@ -583,11 +583,14 @@ TRACE_EVENT(f2fs_map_blocks, __field(unsigned int, m_flags) __field(int, m_seg_type) __field(bool, m_may_create) + __field(bool, m_multidev_dio) + __field(int, create) + __field(int, flag) __field(int, ret) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev = map->m_bdev->bd_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; @@ -595,12 +598,16 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_flags = map->m_flags; __entry->m_seg_type = map->m_seg_type; __entry->m_may_create = map->m_may_create; + __entry->m_multidev_dio = map->m_multidev_dio; + __entry->create = create; + __entry->flag = flag; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " - "start blkaddr = 0x%llx, len = 0x%llx, flags = %u," - "seg_type = %d, may_create = %d, err = %d", + "start blkaddr = 0x%llx, len = 0x%llx, flags = %u, " + "seg_type = %d, may_create = %d, multidevice = %d, " + "create = %d, flag = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, @@ -608,6 +615,9 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_flags, __entry->m_seg_type, __entry->m_may_create, + __entry->m_multidev_dio, + __entry->create, + __entry->flag, __entry->ret) ); @@ -641,19 +651,22 @@ TRACE_EVENT(f2fs_background_gc, TRACE_EVENT(f2fs_gc_begin, - TP_PROTO(struct super_block *sb, bool sync, bool background, + TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, + unsigned int nr_free_secs, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), - TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + TP_ARGS(sb, gc_type, no_bg_gc, nr_free_secs, dirty_nodes, + dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) - __field(bool, sync) - __field(bool, background) + __field(int, gc_type) + __field(bool, no_bg_gc) + __field(unsigned int, nr_free_secs) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) @@ -665,8 +678,9 @@ TRACE_EVENT(f2fs_gc_begin, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->sync = sync; - __entry->background = background; + __entry->gc_type = gc_type; + __entry->no_bg_gc = no_bg_gc; + __entry->nr_free_secs = nr_free_secs; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; @@ -676,12 +690,13 @@ TRACE_EVENT(f2fs_gc_begin, __entry->prefree_seg = prefree_seg; ), - TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " - "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nr_free_secs = %u, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), - __entry->sync, - __entry->background, + show_gc_type(__entry->gc_type), + (__entry->gc_type == BG_GC) ? 
__entry->no_bg_gc : -1, + __entry->nr_free_secs, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, @@ -925,30 +940,39 @@ TRACE_EVENT(f2fs_fallocate, TRACE_EVENT(f2fs_direct_IO_enter, - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), + TP_PROTO(struct inode *inode, struct kiocb *iocb, long len, int rw), - TP_ARGS(inode, offset, len, rw), + TP_ARGS(inode, iocb, len, rw), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(loff_t, pos) + __field(loff_t, ki_pos) + __field(int, ki_flags) + __field(u16, ki_hint) + __field(u16, ki_ioprio) __field(unsigned long, len) __field(int, rw) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->pos = offset; - __entry->len = len; - __entry->rw = rw; + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->ki_pos = iocb->ki_pos; + __entry->ki_flags = iocb->ki_flags; + __entry->ki_hint = iocb->ki_hint; + __entry->ki_ioprio = iocb->ki_ioprio; + __entry->len = len; + __entry->rw = rw; ), - TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu rw = %d", + TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_hint = %x ki_ioprio = %x rw = %d", show_dev_ino(__entry), - __entry->pos, + __entry->ki_pos, __entry->len, + __entry->ki_flags, + __entry->ki_hint, + __entry->ki_ioprio, __entry->rw) ); @@ -1276,20 +1300,6 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_ARGS(page, type) ); -DEFINE_EVENT(f2fs__page, f2fs_register_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - -DEFINE_EVENT(f2fs__page, f2fs_commit_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - TRACE_EVENT(f2fs_filemap_fault, TP_PROTO(struct inode *inode, pgoff_t index, unsigned long ret), @@ -1408,26 +1418,26 @@ TRACE_EVENT(f2fs_readpages, TRACE_EVENT(f2fs_write_checkpoint, - TP_PROTO(struct super_block *sb, int reason, char *msg), + TP_PROTO(struct super_block *sb, int reason, const char *msg), TP_ARGS(sb, reason, msg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, reason) - __field(char *, msg) + __string(dest_msg, msg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->reason = reason; - __entry->msg = msg; + __assign_str(dest_msg, msg); ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", show_dev(__entry->dev), show_cpreason(__entry->reason), - __entry->msg) + __get_str(dest_msg)) ); DECLARE_EVENT_CLASS(f2fs_discard, @@ -1526,28 +1536,31 @@ TRACE_EVENT(f2fs_issue_flush, TRACE_EVENT(f2fs_lookup_extent_tree_start, - TP_PROTO(struct inode *inode, unsigned int pgofs), + TP_PROTO(struct inode *inode, unsigned int pgofs, enum extent_type type), - TP_ARGS(inode, pgofs), + TP_ARGS(inode, pgofs, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; + __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u", + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), - __entry->pgofs) + __entry->pgofs, + show_extent_type(__entry->type)) ); -TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, +TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, struct extent_info *ei), @@ -1561,8 +1574,8 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __field(ino_t, ino) __field(unsigned int, pgofs) 
__field(unsigned int, fofs) - __field(u32, blk) __field(unsigned int, len) + __field(u32, blk) ), TP_fast_assign( @@ -1570,25 +1583,65 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->fofs = ei->fofs; - __entry->blk = ei->blk; __entry->len = ei->len; + __entry->blk = ei->blk; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "ext_info(fofs: %u, blk: %u, len: %u)", + "read_ext_info(fofs: %u, len: %u, blk: %u)", show_dev_ino(__entry), __entry->pgofs, __entry->fofs, - __entry->blk, - __entry->len) + __entry->len, + __entry->blk) ); -TRACE_EVENT(f2fs_update_extent_tree_range, +TRACE_EVENT_CONDITION(f2fs_lookup_age_extent_tree_end, - TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr, - unsigned int len), + TP_PROTO(struct inode *inode, unsigned int pgofs, + struct extent_info *ei), - TP_ARGS(inode, pgofs, blkaddr, len), + TP_ARGS(inode, pgofs, ei), + + TP_CONDITION(ei), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, pgofs) + __field(unsigned int, fofs) + __field(unsigned int, len) + __field(unsigned long long, age) + __field(unsigned long long, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgofs = pgofs; + __entry->fofs = ei->fofs; + __entry->len = ei->len; + __entry->age = ei->age; + __entry->blocks = ei->last_blocks; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "age_ext_info(fofs: %u, len: %u, age: %llu, blocks: %llu)", + show_dev_ino(__entry), + __entry->pgofs, + __entry->fofs, + __entry->len, + __entry->age, + __entry->blocks) +); + +TRACE_EVENT(f2fs_update_read_extent_tree_range, + + TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, + block_t blkaddr, + unsigned int c_len), + + TP_ARGS(inode, pgofs, len, blkaddr, c_len), TP_STRUCT__entry( __field(dev_t, dev) @@ -1596,70 +1649,115 @@ TRACE_EVENT(f2fs_update_extent_tree_range, __field(unsigned int, pgofs) __field(u32, blk) __field(unsigned int, len) + __field(unsigned int, c_len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; - __entry->blk = blkaddr; __entry->len = len; + __entry->blk = blkaddr; + __entry->c_len = c_len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "blkaddr = %u, len = %u", + "len = %u, blkaddr = %u, c_len = %u", show_dev_ino(__entry), __entry->pgofs, + __entry->len, __entry->blk, - __entry->len) + __entry->c_len) +); + +TRACE_EVENT(f2fs_update_age_extent_tree_range, + + TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, + unsigned long long age, + unsigned long long last_blks), + + TP_ARGS(inode, pgofs, len, age, last_blks), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, pgofs) + __field(unsigned int, len) + __field(unsigned long long, age) + __field(unsigned long long, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgofs = pgofs; + __entry->len = len; + __entry->age = age; + __entry->blocks = last_blks; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "len = %u, age = %llu, blocks = %llu", + show_dev_ino(__entry), + __entry->pgofs, + __entry->len, + __entry->age, + __entry->blocks) ); TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, - unsigned int tree_cnt), + unsigned int tree_cnt, enum extent_type type), - TP_ARGS(sbi, node_cnt, tree_cnt), + 
TP_ARGS(sbi, node_cnt, tree_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, node_cnt) __field(unsigned int, tree_cnt) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->node_cnt = node_cnt; __entry->tree_cnt = tree_cnt; + __entry->type = type; ), - TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", + TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u, type = %s", show_dev(__entry->dev), __entry->node_cnt, - __entry->tree_cnt) + __entry->tree_cnt, + show_extent_type(__entry->type)) ); TRACE_EVENT(f2fs_destroy_extent_tree, - TP_PROTO(struct inode *inode, unsigned int node_cnt), + TP_PROTO(struct inode *inode, unsigned int node_cnt, + enum extent_type type), - TP_ARGS(inode, node_cnt), + TP_ARGS(inode, node_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, node_cnt) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->node_cnt = node_cnt; + __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u", + TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), - __entry->node_cnt) + __entry->node_cnt, + show_extent_type(__entry->type)) ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, @@ -1831,7 +1929,10 @@ TRACE_EVENT(f2fs_iostat, __field(unsigned long long, app_bio) __field(unsigned long long, app_wio) __field(unsigned long long, app_mio) + __field(unsigned long long, app_bcdio) + __field(unsigned long long, app_mcdio) __field(unsigned long long, fs_dio) + __field(unsigned long long, fs_cdio) __field(unsigned long long, fs_nio) __field(unsigned long long, fs_mio) __field(unsigned long long, fs_gc_dio) @@ -1843,6 +1944,8 @@ TRACE_EVENT(f2fs_iostat, __field(unsigned long long, app_brio) __field(unsigned long long, app_rio) __field(unsigned long long, app_mrio) + __field(unsigned long long, app_bcrio) + __field(unsigned long long, app_mcrio) __field(unsigned long long, fs_drio) __field(unsigned long long, fs_gdrio) __field(unsigned long long, fs_cdrio) @@ -1857,7 +1960,10 @@ TRACE_EVENT(f2fs_iostat, __entry->app_bio = iostat[APP_BUFFERED_IO]; __entry->app_wio = iostat[APP_WRITE_IO]; __entry->app_mio = iostat[APP_MAPPED_IO]; + __entry->app_bcdio = iostat[APP_BUFFERED_CDATA_IO]; + __entry->app_mcdio = iostat[APP_MAPPED_CDATA_IO]; __entry->fs_dio = iostat[FS_DATA_IO]; + __entry->fs_cdio = iostat[FS_CDATA_IO]; __entry->fs_nio = iostat[FS_NODE_IO]; __entry->fs_mio = iostat[FS_META_IO]; __entry->fs_gc_dio = iostat[FS_GC_DATA_IO]; @@ -1869,6 +1975,8 @@ TRACE_EVENT(f2fs_iostat, __entry->app_brio = iostat[APP_BUFFERED_READ_IO]; __entry->app_rio = iostat[APP_READ_IO]; __entry->app_mrio = iostat[APP_MAPPED_READ_IO]; + __entry->app_bcrio = iostat[APP_BUFFERED_CDATA_READ_IO]; + __entry->app_mcrio = iostat[APP_MAPPED_CDATA_READ_IO]; __entry->fs_drio = iostat[FS_DATA_READ_IO]; __entry->fs_gdrio = iostat[FS_GDATA_READ_IO]; __entry->fs_cdrio = iostat[FS_CDATA_READ_IO]; @@ -1878,20 +1986,24 @@ TRACE_EVENT(f2fs_iostat, ), TP_printk("dev = (%d,%d), " - "app [write=%llu (direct=%llu, buffered=%llu), mapped=%llu], " - "fs [data=%llu, node=%llu, meta=%llu, discard=%llu], " + "app [write=%llu (direct=%llu, buffered=%llu), mapped=%llu, " + "compr(buffered=%llu, mapped=%llu)], " + "fs [data=%llu, cdata=%llu, node=%llu, meta=%llu, discard=%llu], " "gc [data=%llu, node=%llu], " "cp [data=%llu, node=%llu, meta=%llu], " "app [read=%llu (direct=%llu, 
buffered=%llu), mapped=%llu], " - "fs [data=%llu, (gc_data=%llu, compr_data=%llu), " + "compr(buffered=%llu, mapped=%llu)], " + "fs [data=%llu, (gc_data=%llu, cdata=%llu), " "node=%llu, meta=%llu]", show_dev(__entry->dev), __entry->app_wio, __entry->app_dio, - __entry->app_bio, __entry->app_mio, __entry->fs_dio, + __entry->app_bio, __entry->app_mio, __entry->app_bcdio, + __entry->app_mcdio, __entry->fs_dio, __entry->fs_cdio, __entry->fs_nio, __entry->fs_mio, __entry->fs_discard, __entry->fs_gc_dio, __entry->fs_gc_nio, __entry->fs_cp_dio, __entry->fs_cp_nio, __entry->fs_cp_mio, __entry->app_rio, __entry->app_drio, __entry->app_brio, - __entry->app_mrio, __entry->fs_drio, __entry->fs_gdrio, + __entry->app_mrio, __entry->app_bcrio, __entry->app_mcrio, + __entry->fs_drio, __entry->fs_gdrio, __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); @@ -2054,6 +2166,100 @@ TRACE_EVENT(f2fs_fiemap, __entry->ret) ); +DECLARE_EVENT_CLASS(f2fs__rw_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command), + + TP_STRUCT__entry( + __string(pathbuf, pathname) + __field(loff_t, offset) + __field(int, bytes) + __field(loff_t, i_size) + __string(cmdline, command) + __field(pid_t, pid) + __field(ino_t, ino) + ), + + TP_fast_assign( + /* + * Replace the spaces in filenames and cmdlines + * because this screws up the tooling that parses + * the traces. + */ + __assign_str(pathbuf, pathname); + (void)strreplace(__get_str(pathbuf), ' ', '_'); + __entry->offset = offset; + __entry->bytes = bytes; + __entry->i_size = i_size_read(inode); + __assign_str(cmdline, command); + (void)strreplace(__get_str(cmdline), ' ', '_'); + __entry->pid = pid; + __entry->ino = inode->i_ino; + ), + + TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," + " pid %d, i_size %llu, ino %lu", + __get_str(pathbuf), __entry->offset, __entry->bytes, + __get_str(cmdline), __entry->pid, __entry->i_size, + (unsigned long) __entry->ino) +); + +DECLARE_EVENT_CLASS(f2fs__rw_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes), + + TP_STRUCT__entry( + __field(ino_t, ino) + __field(loff_t, offset) + __field(int, bytes) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->bytes = bytes; + ), + + TP_printk("ino %lu, offset %llu, bytes %d", + (unsigned long) __entry->ino, + __entry->offset, __entry->bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_dataread_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_dataread_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_datawrite_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index eeceafaaea4c..bb70f46656b2 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -160,6 +160,51 @@ DEFINE_EVENT(softirq, softirq_raise, 
TP_ARGS(vec_nr) ); +DECLARE_EVENT_CLASS(tasklet, + + TP_PROTO(void *func), + + TP_ARGS(func), + + TP_STRUCT__entry( + __field( void *, func) + ), + + TP_fast_assign( + __entry->func = func; + ), + + TP_printk("function=%ps", __entry->func) +); + +DEFINE_EVENT(tasklet, tasklet_entry, + + TP_PROTO(void *func), + + TP_ARGS(func) +); + +DEFINE_EVENT(tasklet, tasklet_exit, + + TP_PROTO(void *func), + + TP_ARGS(func) +); + +DEFINE_EVENT(tasklet, tasklet_hi_entry, + + TP_PROTO(void *func), + + TP_ARGS(func) +); + +DEFINE_EVENT(tasklet, tasklet_hi_exit, + + TP_PROTO(void *func), + + TP_ARGS(func) +); + #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 116ed4d5d0f8..9bd906d09788 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -49,12 +49,20 @@ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"}, \ - {(unsigned long)__GFP_SKIP_KASAN_POISON,"__GFP_SKIP_KASAN_POISON"}\ + {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ + +#ifdef CONFIG_KASAN_HW_TAGS +#define __def_gfpflag_names_kasan , \ + {(unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \ + {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ + {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} +#else +#define __def_gfpflag_names_kasan +#endif #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ - __def_gfpflag_names \ + __def_gfpflag_names __def_gfpflag_names_kasan \ ) : "none" #ifdef CONFIG_MMU @@ -83,8 +91,10 @@ #ifdef CONFIG_64BIT #define IF_HAVE_PG_ARCH_2(flag,string) ,{1UL << flag, string} +#define IF_HAVE_PG_OEM_RESERVED(flag,string) ,{1UL << flag, string} #else #define IF_HAVE_PG_ARCH_2(flag,string) +#define IF_HAVE_PG_OEM_RESERVED(flag,string) #endif #ifdef CONFIG_KASAN_HW_TAGS @@ -121,6 +131,7 @@ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ IF_HAVE_PG_IDLE(PG_idle, "idle" ) \ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) \ +IF_HAVE_PG_OEM_RESERVED(PG_oem_reserved,"oem_reserved" ) \ IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison") #define show_page_flags(flags) \ diff --git a/include/trace/events/rwmmio.h b/include/trace/events/rwmmio.h new file mode 100644 index 000000000000..f66397007258 --- /dev/null +++ b/include/trace/events/rwmmio.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. 
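/*
 * Sketch of how the tasklet entry/exit events above are expected to
 * wrap the handler invocation (simplified; the real call sites live in
 * kernel/softirq.c, and example_run_tasklet() is hypothetical).
 */
static void example_run_tasklet(struct tasklet_struct *t)
{
	trace_tasklet_entry(t->func);
	t->func(t->data);
	trace_tasklet_exit(t->func);
}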
+ */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rwmmio + +#if !defined(_TRACE_RWMMIO_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RWMMIO_H + +#include + +TRACE_EVENT(rwmmio_write, + + TP_PROTO(unsigned long caller, u64 val, u8 width, volatile void __iomem *addr), + + TP_ARGS(caller, val, width, addr), + + TP_STRUCT__entry( + __field(u64, caller) + __field(u64, val) + __field(u64, addr) + __field(u8, width) + ), + + TP_fast_assign( + __entry->caller = caller; + __entry->val = val; + __entry->addr = (unsigned long)(void *)addr; + __entry->width = width; + ), + + TP_printk("%pS width=%d val=%#llx addr=%#llx", + (void *)(unsigned long)__entry->caller, __entry->width, + __entry->val, __entry->addr) +); + +TRACE_EVENT(rwmmio_post_write, + + TP_PROTO(unsigned long caller, u64 val, u8 width, volatile void __iomem *addr), + + TP_ARGS(caller, val, width, addr), + + TP_STRUCT__entry( + __field(u64, caller) + __field(u64, val) + __field(u64, addr) + __field(u8, width) + ), + + TP_fast_assign( + __entry->caller = caller; + __entry->val = val; + __entry->addr = (unsigned long)(void *)addr; + __entry->width = width; + ), + + TP_printk("%pS width=%d val=%#llx addr=%#llx", + (void *)(unsigned long)__entry->caller, __entry->width, + __entry->val, __entry->addr) +); + +TRACE_EVENT(rwmmio_read, + + TP_PROTO(unsigned long caller, u8 width, const volatile void __iomem *addr), + + TP_ARGS(caller, width, addr), + + TP_STRUCT__entry( + __field(u64, caller) + __field(u64, addr) + __field(u8, width) + ), + + TP_fast_assign( + __entry->caller = caller; + __entry->addr = (unsigned long)(void *)addr; + __entry->width = width; + ), + + TP_printk("%pS width=%d addr=%#llx", + (void *)(unsigned long)__entry->caller, __entry->width, __entry->addr) +); + +TRACE_EVENT(rwmmio_post_read, + + TP_PROTO(unsigned long caller, u64 val, u8 width, const volatile void __iomem *addr), + + TP_ARGS(caller, val, width, addr), + + TP_STRUCT__entry( + __field(u64, caller) + __field(u64, val) + __field(u64, addr) + __field(u8, width) + ), + + TP_fast_assign( + __entry->caller = caller; + __entry->val = val; + __entry->addr = (unsigned long)(void *)addr; + __entry->width = width; + ), + + TP_printk("%pS width=%d val=%#llx addr=%#llx", + (void *)(unsigned long)__entry->caller, __entry->width, + __entry->val, __entry->addr) +); + +#endif /* _TRACE_RWMMIO_H */ + +#include diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 94640482cfe7..a059a6244179 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -484,6 +484,30 @@ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); +/* + * Tracepoint for recording the cause of uninterruptible sleep. + */ +TRACE_EVENT(sched_blocked_reason, + + TP_PROTO(struct task_struct *tsk), + + TP_ARGS(tsk), + + TP_STRUCT__entry( + __field( pid_t, pid ) + __field( void*, caller ) + __field( bool, io_wait ) + ), + + TP_fast_assign( + __entry->pid = tsk->pid; + __entry->caller = (void *)get_wchan(tsk); + __entry->io_wait = tsk->in_iowait; + ), + + TP_printk("pid=%d iowait=%d caller=%pS", __entry->pid, __entry->io_wait, __entry->caller) +); + /* * Tracepoint for accounting runtime (time the task is executing * on a CPU). 
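/*
 * Sketch of an instrumentation point for the rwmmio events above,
 * modeled on how MMIO accessor wrappers are expected to emit them
 * (simplified; example_traced_writel() is hypothetical).
 */
static inline void example_traced_writel(u32 val, volatile void __iomem *addr)
{
	trace_rwmmio_write(_THIS_IP_, val, 8 * sizeof(val), addr);
	writel(val, addr);
	trace_rwmmio_post_write(_THIS_IP_, val, 8 * sizeof(val), addr);
}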
diff --git a/include/trace/hooks/audio_usboffload.h b/include/trace/hooks/audio_usboffload.h new file mode 100644 index 000000000000..e948fc6c3ae7 --- /dev/null +++ b/include/trace/hooks/audio_usboffload.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM audio_usboffload + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_AUDIO_USBOFFLOAD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_AUDIO_USBOFFLOAD_H + +#include + +struct snd_usb_audio; +struct usb_interface; + +DECLARE_HOOK(android_vh_audio_usb_offload_vendor_set, + TP_PROTO(void *arg), + TP_ARGS(arg)); + +DECLARE_HOOK(android_vh_audio_usb_offload_ep_action, + TP_PROTO(void *arg, bool action), + TP_ARGS(arg, action)); + +DECLARE_HOOK(android_vh_audio_usb_offload_synctype, + TP_PROTO(void *arg, int attr, bool *need_ignore), + TP_ARGS(arg, attr, need_ignore)); + +DECLARE_HOOK(android_vh_audio_usb_offload_connect, + TP_PROTO(struct usb_interface *intf, struct snd_usb_audio *chip), + TP_ARGS(intf, chip)); + +DECLARE_RESTRICTED_HOOK(android_rvh_audio_usb_offload_disconnect, + TP_PROTO(struct usb_interface *intf), + TP_ARGS(intf), 1); + +#endif /* _TRACE_HOOK_AUDIO_USBOFFLOAD_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/avc.h b/include/trace/hooks/avc.h new file mode 100644 index 000000000000..5100dde72c9d --- /dev/null +++ b/include/trace/hooks/avc.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM avc + +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_AVC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_AVC_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct avc_node; +DECLARE_RESTRICTED_HOOK(android_rvh_selinux_avc_insert, + TP_PROTO(const struct avc_node *node), + TP_ARGS(node), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_selinux_avc_node_delete, + TP_PROTO(const struct avc_node *node), + TP_ARGS(node), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_selinux_avc_node_replace, + TP_PROTO(const struct avc_node *old, const struct avc_node *new), + TP_ARGS(old, new), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_selinux_avc_lookup, + TP_PROTO(const struct avc_node *node, u32 ssid, u32 tsid, u16 tclass), + TP_ARGS(node, ssid, tsid, tclass), 1); + +#endif /* _TRACE_HOOK_AVC_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/binder.h b/include/trace/hooks/binder.h new file mode 100644 index 000000000000..c80b3c15ddc9 --- /dev/null +++ b/include/trace/hooks/binder.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM binder +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_BINDER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_BINDER_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ + +struct binder_proc; +struct binder_thread; +struct binder_transaction; +struct task_struct; +struct binder_transaction_data; + +DECLARE_HOOK(android_vh_binder_transaction_init, + TP_PROTO(struct binder_transaction *t), + TP_ARGS(t)); +DECLARE_HOOK(android_vh_binder_priority_skip, + TP_PROTO(struct task_struct *task, bool *skip), + TP_ARGS(task, skip)); +DECLARE_HOOK(android_vh_binder_set_priority, + TP_PROTO(struct 
binder_transaction *t, struct task_struct *task), + TP_ARGS(t, task)); +DECLARE_HOOK(android_vh_binder_restore_priority, + TP_PROTO(struct binder_transaction *t, struct task_struct *task), + TP_ARGS(t, task)); +DECLARE_HOOK(android_vh_binder_wakeup_ilocked, + TP_PROTO(struct task_struct *task, bool sync, struct binder_proc *proc), + TP_ARGS(task, sync, proc)); +DECLARE_HOOK(android_vh_binder_wait_for_work, + TP_PROTO(bool do_proc_work, struct binder_thread *tsk, struct binder_proc *proc), + TP_ARGS(do_proc_work, tsk, proc)); +DECLARE_HOOK(android_vh_sync_txn_recvd, + TP_PROTO(struct task_struct *tsk, struct task_struct *from), + TP_ARGS(tsk, from)); +DECLARE_RESTRICTED_HOOK(android_rvh_binder_transaction, + TP_PROTO(struct binder_proc *target_proc, struct binder_proc *proc, + struct binder_thread *thread, struct binder_transaction_data *tr), + TP_ARGS(target_proc, proc, thread, tr), 1); +DECLARE_HOOK(android_vh_binder_print_transaction_info, + TP_PROTO(struct seq_file *m, struct binder_proc *proc, + const char *prefix, struct binder_transaction *t), + TP_ARGS(m, proc, prefix, t)); + +#endif /* _TRACE_HOOK_BINDER_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/block.h b/include/trace/hooks/block.h new file mode 100644 index 000000000000..e8bee374f989 --- /dev/null +++ b/include/trace/hooks/block.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM block + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_BLOCK_H + +#include + +struct blk_mq_tags; +struct blk_mq_alloc_data; +struct blk_mq_tag_set; + +DECLARE_HOOK(android_vh_blk_alloc_rqs, + TP_PROTO(size_t *rq_size, struct blk_mq_tag_set *set, + struct blk_mq_tags *tags), + TP_ARGS(rq_size, set, tags)); + +DECLARE_HOOK(android_vh_blk_rq_ctx_init, + TP_PROTO(struct request *rq, struct blk_mq_tags *tags, + struct blk_mq_alloc_data *data, u64 alloc_time_ns), + TP_ARGS(rq, tags, data, alloc_time_ns)); + +#endif /* _TRACE_HOOK_BLOCK_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/buffer.h b/include/trace/hooks/buffer.h new file mode 100644 index 000000000000..50e8f71dab9f --- /dev/null +++ b/include/trace/hooks/buffer.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM buffer + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_BUFFER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_BUFFER_H + +#include +#include + +DECLARE_HOOK(android_vh_bh_lru_install, + TP_PROTO(struct page *page, bool *flush), + TP_ARGS(page, flush)); + +/* macro versions of hooks are no longer required */ + +#endif /* _TRACE_HOOK_BUFFER_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/bug.h b/include/trace/hooks/bug.h new file mode 100644 index 000000000000..bbae73fd14c4 --- /dev/null +++ b/include/trace/hooks/bug.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bug +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_BUG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_BUG_H +#include + +DECLARE_RESTRICTED_HOOK(android_rvh_report_bug, + TP_PROTO(const char *file, unsigned int line, unsigned long bugaddr), + TP_ARGS(file, line, bugaddr), 1); + +#endif /* _TRACE_HOOK_BUG_H */ +/* This part must be outside protection */ +#include diff --git 
a/include/trace/hooks/cfg80211.h b/include/trace/hooks/cfg80211.h new file mode 100644 index 000000000000..26cef55c4e0f --- /dev/null +++ b/include/trace/hooks/cfg80211.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cfg80211 + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_CFG80211_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_CFG80211_H + +#include + +struct wiphy; +struct wireless_dev; + +DECLARE_HOOK(android_vh_cfg80211_set_context, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int context_id, + const void *data), + TP_ARGS(wiphy, wdev, context_id, data)); + +DECLARE_HOOK(android_vh_cfg80211_get_context, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int context_id, + void *data, size_t max_data_len), + TP_ARGS(wiphy, wdev, context_id, data, max_data_len)); + +#endif /* _TRACE_HOOK_CFG80211_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/cgroup.h b/include/trace/hooks/cgroup.h new file mode 100644 index 000000000000..ada3e064a8df --- /dev/null +++ b/include/trace/hooks/cgroup.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cgroup +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_CGROUP_H +#include + +struct cgroup; +struct cgroup_taskset; +struct cgroup_subsys; +struct cgroup_subsys_state; +struct task_struct; + +DECLARE_HOOK(android_vh_cgroup_set_task, + TP_PROTO(int ret, struct task_struct *task), + TP_ARGS(ret, task)); + +DECLARE_HOOK(android_vh_cgroup_attach, + TP_PROTO(struct cgroup_subsys *ss, struct cgroup_taskset *tset), + TP_ARGS(ss, tset)) +DECLARE_RESTRICTED_HOOK(android_rvh_cgroup_force_kthread_migration, + TP_PROTO(struct task_struct *tsk, struct cgroup *dst_cgrp, bool *force_migration), + TP_ARGS(tsk, dst_cgrp, force_migration), 1); +struct mem_cgroup; +DECLARE_HOOK(android_rvh_memcgv2_init, + TP_PROTO(struct mem_cgroup *memcg), + TP_ARGS(memcg)); + +DECLARE_HOOK(android_rvh_memcgv2_calc_decayed_watermark, + TP_PROTO(struct mem_cgroup *memcg), + TP_ARGS(memcg)); + +struct page_counter; +DECLARE_HOOK(android_rvh_update_watermark, + TP_PROTO(u64 new, struct page_counter *counter), + TP_ARGS(new, counter)); + +DECLARE_RESTRICTED_HOOK(android_rvh_cpu_cgroup_attach, + TP_PROTO(struct cgroup_taskset *tset), + TP_ARGS(tset), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_cpu_cgroup_can_attach, + TP_PROTO(struct cgroup_taskset *tset, int *retval), + TP_ARGS(tset, retval), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_cpu_cgroup_online, + TP_PROTO(struct cgroup_subsys_state *css), + TP_ARGS(css), 1); + +#endif + +#include diff --git a/include/trace/hooks/cpufreq.h b/include/trace/hooks/cpufreq.h new file mode 100644 index 000000000000..c6c2b19cdadf --- /dev/null +++ b/include/trace/hooks/cpufreq.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpufreq + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_CPUFREQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_CPUFREQ_H + +#include + +struct cpufreq_policy; + +DECLARE_RESTRICTED_HOOK(android_rvh_show_max_freq, + TP_PROTO(struct cpufreq_policy *policy, unsigned int *max_freq), + TP_ARGS(policy, max_freq), 1); + +DECLARE_HOOK(android_vh_freq_table_limits, + TP_PROTO(struct cpufreq_policy *policy, unsigned int min_freq, + unsigned int max_freq), 
+ TP_ARGS(policy, min_freq, max_freq)); + +DECLARE_RESTRICTED_HOOK(android_rvh_cpufreq_transition, + TP_PROTO(struct cpufreq_policy *policy), + TP_ARGS(policy), 1); + +DECLARE_HOOK(android_vh_cpufreq_resolve_freq, + TP_PROTO(struct cpufreq_policy *policy, unsigned int *target_freq, + unsigned int old_target_freq), + TP_ARGS(policy, target_freq, old_target_freq)); + +DECLARE_HOOK(android_vh_cpufreq_fast_switch, + TP_PROTO(struct cpufreq_policy *policy, unsigned int *target_freq, + unsigned int old_target_freq), + TP_ARGS(policy, target_freq, old_target_freq)); + +DECLARE_HOOK(android_vh_cpufreq_target, + TP_PROTO(struct cpufreq_policy *policy, unsigned int *target_freq, + unsigned int old_target_freq), + TP_ARGS(policy, target_freq, old_target_freq)); + +#endif /* _TRACE_HOOK_CPUFREQ_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/cpuidle.h b/include/trace/hooks/cpuidle.h new file mode 100644 index 000000000000..b1ee27ed6707 --- /dev/null +++ b/include/trace/hooks/cpuidle.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpuidle + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_CPUIDLE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_CPUIDLE_H + +#include + +struct cpuidle_device; + +DECLARE_HOOK(android_vh_cpu_idle_enter, + TP_PROTO(int *state, struct cpuidle_device *dev), + TP_ARGS(state, dev)) +DECLARE_HOOK(android_vh_cpu_idle_exit, + TP_PROTO(int state, struct cpuidle_device *dev), + TP_ARGS(state, dev)) + +#endif /* _TRACE_HOOK_CPUIDLE_H */ +/* This part must be outside protection */ +#include + diff --git a/include/trace/hooks/cpuidle_psci.h b/include/trace/hooks/cpuidle_psci.h new file mode 100644 index 000000000000..3ca307a3d82b --- /dev/null +++ b/include/trace/hooks/cpuidle_psci.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpuidle_psci +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_CPUIDLE_PSCI_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_CPUIDLE_PSCI_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ + +struct cpuidle_device; + +DECLARE_HOOK(android_vh_cpuidle_psci_enter, + TP_PROTO(struct cpuidle_device *dev, bool s2idle), + TP_ARGS(dev, s2idle)); + +DECLARE_HOOK(android_vh_cpuidle_psci_exit, + TP_PROTO(struct cpuidle_device *dev, bool s2idle), + TP_ARGS(dev, s2idle)); + +#endif /* _TRACE_HOOK_CPUIDLE_PSCI_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/creds.h b/include/trace/hooks/creds.h new file mode 100644 index 000000000000..da3d400f0e9e --- /dev/null +++ b/include/trace/hooks/creds.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM creds + +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_CREDS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_CREDS_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct cred; +struct task_struct; + +DECLARE_RESTRICTED_HOOK(android_rvh_commit_creds, + TP_PROTO(const struct task_struct *task, const struct cred *new), + TP_ARGS(task, new), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_exit_creds, + TP_PROTO(const struct task_struct *task, const struct cred *cred), + TP_ARGS(task, cred), 1); + 
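For reference, vendor code attaches to any of the plain vendor hooks (android_vh_*) declared in these headers through the register_trace_android_vh_<name>() helper that DECLARE_HOOK() generates from the standard tracepoint machinery: the probe receives the opaque pointer supplied at registration time, followed by the hook's TP_PROTO arguments. A minimal sketch against android_vh_cpu_idle_enter from cpuidle.h above; the module name, probe body, and the idle-state cap are illustrative assumptions, not part of this patch:

#include <linux/module.h>
#include <linux/cpuidle.h>
#include <trace/hooks/cpuidle.h>

/* Probe signature: the registered data pointer comes first, then the
 * hook's TP_PROTO arguments. Writing through *state before the kernel
 * consumes it is how this hook lets a vendor module steer idle selection. */
static void vendor_idle_enter(void *data, int *state, struct cpuidle_device *dev)
{
	if (dev->cpu == 0 && *state > 1)
		*state = 1;	/* hypothetical: cap CPU0 to a shallow idle state */
}

static int __init vendor_hook_demo_init(void)
{
	return register_trace_android_vh_cpu_idle_enter(vendor_idle_enter, NULL);
}
module_init(vendor_hook_demo_init);
MODULE_LICENSE("GPL");

Plain hooks can also be detached again with the matching unregister_trace_android_vh_<name>() helper, which is the key difference from the restricted (android_rvh_*) variants.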
+DECLARE_RESTRICTED_HOOK(android_rvh_override_creds, + TP_PROTO(const struct task_struct *task, const struct cred *new), + TP_ARGS(task, new), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_revert_creds, + TP_PROTO(const struct task_struct *task, const struct cred *old), + TP_ARGS(task, old), 1); + +#endif /* _TRACE_HOOK_CREDS_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/debug.h b/include/trace/hooks/debug.h new file mode 100644 index 000000000000..5a20141d742b --- /dev/null +++ b/include/trace/hooks/debug.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM debug + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_DEBUG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_DEBUG_H + +#include + +struct pt_regs; + +DECLARE_HOOK(android_vh_ipi_stop, + TP_PROTO(struct pt_regs *regs), + TP_ARGS(regs)) + +#endif /* _TRACE_HOOK_DEBUG_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/dmabuf.h b/include/trace/hooks/dmabuf.h new file mode 100644 index 000000000000..31ac68577522 --- /dev/null +++ b/include/trace/hooks/dmabuf.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dmabuf + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_DMABUF_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_DMABUF_H + +#include + +struct dma_buf; + +DECLARE_HOOK(android_vh_dma_buf_release, + TP_PROTO(struct dma_buf *data), + TP_ARGS(data)); +#endif /* _TRACE_HOOK_DMABUF_H */ +/* This part must be outside protection */ +#include + diff --git a/include/trace/hooks/drm_atomic.h b/include/trace/hooks/drm_atomic.h new file mode 100644 index 000000000000..cce7ee492a09 --- /dev/null +++ b/include/trace/hooks/drm_atomic.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM drm_atomic + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_DRM_ATOMIC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_DRM_ATOMIC_H + +#include + +struct drm_atomic_state; +struct drm_crtc; + +DECLARE_HOOK(android_vh_drm_atomic_check_modeset, + TP_PROTO(struct drm_atomic_state *state, struct drm_crtc *crtc, bool *allow), + TP_ARGS(state, crtc, allow)) + +#endif /* _TRACE_HOOK_DRM_ATOMIC_H */ +/* This part must be outside protection */ +#include + diff --git a/include/trace/hooks/drm_framebuffer.h b/include/trace/hooks/drm_framebuffer.h new file mode 100644 index 000000000000..2c2221dfa72c --- /dev/null +++ b/include/trace/hooks/drm_framebuffer.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM drm_framebuffer + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_DRM_FRAMEBUFFER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_DRM_FRAMEBUFFER_H + +#include + +struct drm_framebuffer; + +DECLARE_HOOK(android_vh_atomic_remove_fb, + TP_PROTO(struct drm_framebuffer *fb, bool *allow), + TP_ARGS(fb, allow)) + +#endif /* _TRACE_HOOK_DRM_FRAMEBUFFER_H */ +/* This part must be outside protection */ +#include + diff --git a/include/trace/hooks/dtask.h b/include/trace/hooks/dtask.h new file mode 100644 index 000000000000..58ca172d83be --- /dev/null +++ b/include/trace/hooks/dtask.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dtask +#define 
TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_DTASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_DTASK_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct mutex; +struct rt_mutex_base; +struct rw_semaphore; +struct task_struct; + +DECLARE_HOOK(android_vh_mutex_wait_start, + TP_PROTO(struct mutex *lock), + TP_ARGS(lock)); +DECLARE_HOOK(android_vh_mutex_wait_finish, + TP_PROTO(struct mutex *lock), + TP_ARGS(lock)); +DECLARE_HOOK(android_vh_mutex_init, + TP_PROTO(struct mutex *lock), + TP_ARGS(lock)); + +DECLARE_HOOK(android_vh_rtmutex_wait_start, + TP_PROTO(struct rt_mutex_base *lock), + TP_ARGS(lock)); +DECLARE_HOOK(android_vh_rtmutex_wait_finish, + TP_PROTO(struct rt_mutex_base *lock), + TP_ARGS(lock)); + +DECLARE_HOOK(android_vh_rwsem_read_wait_start, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_rwsem_read_wait_finish, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_rwsem_write_wait_start, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_rwsem_write_wait_finish, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); + +DECLARE_HOOK(android_vh_sched_show_task, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); + +struct mutex_waiter; +DECLARE_HOOK(android_vh_alter_mutex_list_add, + TP_PROTO(struct mutex *lock, + struct mutex_waiter *waiter, + struct list_head *list, + bool *already_on_list), + TP_ARGS(lock, waiter, list, already_on_list)); +DECLARE_HOOK(android_vh_mutex_unlock_slowpath, + TP_PROTO(struct mutex *lock), + TP_ARGS(lock)); + +#endif /* _TRACE_HOOK_DTASK_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/epoch.h b/include/trace/hooks/epoch.h new file mode 100644 index 000000000000..ccee2d8cd429 --- /dev/null +++ b/include/trace/hooks/epoch.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM epoch + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_EPOCH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_EPOCH_H + +#include + +DECLARE_HOOK(android_vh_show_suspend_epoch_val, + TP_PROTO(u64 suspend_ns, u64 suspend_cycles), + TP_ARGS(suspend_ns, suspend_cycles)); + +DECLARE_HOOK(android_vh_show_resume_epoch_val, + TP_PROTO(u64 resume_cycles), + TP_ARGS(resume_cycles)); + +#endif /* _TRACE_HOOK_EPOCH_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/evdev.h b/include/trace/hooks/evdev.h new file mode 100644 index 000000000000..48ac94aafc9d --- /dev/null +++ b/include/trace/hooks/evdev.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM evdev + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_EVDEV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_EVDEV_H + +#include + +DECLARE_HOOK(android_vh_pass_input_event, + TP_PROTO(int head, int tail, int bufsize, int type, int code, int value), + TP_ARGS(head, tail, bufsize, type, code, value)) + +#endif /* _TRACE_HOOK_EVDEV_H */ +/* This part must be outside protection */ +#include + diff --git a/include/trace/hooks/fault.h b/include/trace/hooks/fault.h new file mode 100644 index 000000000000..edfdf23422f5 --- /dev/null +++ b/include/trace/hooks/fault.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fault +#define 
TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_FAULT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FAULT_H +#include + +struct pt_regs; + +DECLARE_RESTRICTED_HOOK(android_rvh_die_kernel_fault, + TP_PROTO(const char *msg, unsigned long addr, unsigned int esr, struct pt_regs *regs), + TP_ARGS(msg, addr, esr, regs), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_do_sea, + TP_PROTO(unsigned long addr, unsigned int esr, struct pt_regs *regs), + TP_ARGS(addr, esr, regs), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_do_mem_abort, + TP_PROTO(unsigned long addr, unsigned int esr, struct pt_regs *regs), + TP_ARGS(addr, esr, regs), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_do_sp_pc_abort, + TP_PROTO(unsigned long addr, unsigned int esr, struct pt_regs *regs), + TP_ARGS(addr, esr, regs), + TP_CONDITION(!user_mode(regs))); + +DECLARE_HOOK(android_vh_handle_tlb_conf, + TP_PROTO(unsigned long addr, unsigned int esr, int *ret), + TP_ARGS(addr, esr, ret)); + +#endif /* _TRACE_HOOK_FAULT_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/fips140.h b/include/trace/hooks/fips140.h new file mode 100644 index 000000000000..fd4a42c013c7 --- /dev/null +++ b/include/trace/hooks/fips140.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fips140 +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_FIPS140_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FIPS140_H +#include + +struct crypto_aes_ctx; + +/* + * These hooks exist only for the benefit of the FIPS140 crypto module, which + * uses them to swap out the underlying implementation with one that is integrity + * checked as per FIPS 140 requirements. No other uses are allowed or + * supported. 
+ */ + +DECLARE_HOOK(android_vh_sha256, + TP_PROTO(const u8 *data, + unsigned int len, + u8 *out, + int *hook_inuse), + TP_ARGS(data, len, out, hook_inuse)); + +DECLARE_HOOK(android_vh_aes_expandkey, + TP_PROTO(struct crypto_aes_ctx *ctx, + const u8 *in_key, + unsigned int key_len, + int *err), + TP_ARGS(ctx, in_key, key_len, err)); + +DECLARE_HOOK(android_vh_aes_encrypt, + TP_PROTO(const struct crypto_aes_ctx *ctx, + u8 *out, + const u8 *in, + int *hook_inuse), + TP_ARGS(ctx, out, in, hook_inuse)); + +DECLARE_HOOK(android_vh_aes_decrypt, + TP_PROTO(const struct crypto_aes_ctx *ctx, + u8 *out, + const u8 *in, + int *hook_inuse), + TP_ARGS(ctx, out, in, hook_inuse)); + +#endif /* _TRACE_HOOK_FIPS140_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/fpsimd.h b/include/trace/hooks/fpsimd.h new file mode 100644 index 000000000000..10337180a7be --- /dev/null +++ b/include/trace/hooks/fpsimd.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fpsimd + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_FPSIMD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FPSIMD_H + +#include + +struct task_struct; + +DECLARE_HOOK(android_vh_is_fpsimd_save, + TP_PROTO(struct task_struct *prev, struct task_struct *next), + TP_ARGS(prev, next)) + +#endif /* _TRACE_HOOK_FPSIMD_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/ftrace_dump.h b/include/trace/hooks/ftrace_dump.h new file mode 100644 index 000000000000..3d0b980ee85e --- /dev/null +++ b/include/trace/hooks/ftrace_dump.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace_dump + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_FTRACE_DUMP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FTRACE_DUMP_H + +#include + +struct trace_seq; + +DECLARE_HOOK(android_vh_ftrace_oops_enter, + TP_PROTO(bool *ftrace_check), + TP_ARGS(ftrace_check)); + +DECLARE_HOOK(android_vh_ftrace_oops_exit, + TP_PROTO(bool *ftrace_check), + TP_ARGS(ftrace_check)); + +DECLARE_HOOK(android_vh_ftrace_size_check, + TP_PROTO(unsigned long size, bool *ftrace_check), + TP_ARGS(size, ftrace_check)); + +DECLARE_HOOK(android_vh_ftrace_format_check, + TP_PROTO(bool *ftrace_check), + TP_ARGS(ftrace_check)); + +DECLARE_HOOK(android_vh_ftrace_dump_buffer, + TP_PROTO(struct trace_seq *trace_buf, bool *dump_printk), + TP_ARGS(trace_buf, dump_printk)); + +#endif /* _TRACE_HOOK_FTRACE_DUMP_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/futex.h b/include/trace/hooks/futex.h new file mode 100644 index 000000000000..1be35fefb610 --- /dev/null +++ b/include/trace/hooks/futex.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM futex +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_FUTEX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FUTEX_H +#include + +struct plist_node; +struct plist_head; +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +DECLARE_HOOK(android_vh_alter_futex_plist_add, + TP_PROTO(struct plist_node *node, + struct plist_head *head, + bool *already_on_hb), + TP_ARGS(node, head, already_on_hb)); +#endif /* _TRACE_HOOK_FUTEX_H */ +/* This part must be outside protection */ +#include diff --git 
a/include/trace/hooks/gic.h b/include/trace/hooks/gic.h new file mode 100644 index 000000000000..b583d9fbe0dd --- /dev/null +++ b/include/trace/hooks/gic.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gic + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_GIC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_GIC_H + +#include + +struct irq_data; +struct gic_chip_data; + +DECLARE_HOOK(android_vh_gic_resume, + TP_PROTO(struct gic_chip_data *gd), + TP_ARGS(gd)); + +DECLARE_HOOK(android_vh_gic_set_affinity, + TP_PROTO(struct irq_data *d, const struct cpumask *mask_val, + bool force, u8 *gic_cpu_map, void __iomem *reg), + TP_ARGS(d, mask_val, force, gic_cpu_map, reg)); + +#endif /* _TRACE_HOOK_GIC_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/gic_v3.h b/include/trace/hooks/gic_v3.h new file mode 100644 index 000000000000..ae7f8d1bacdd --- /dev/null +++ b/include/trace/hooks/gic_v3.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gic_v3 +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_GIC_V3_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_GIC_V3_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct cpumask; +struct irq_data; + +DECLARE_HOOK(android_vh_gic_v3_affinity_init, + TP_PROTO(int irq, u32 offset, u64 *affinity), + TP_ARGS(irq, offset, affinity)); +DECLARE_RESTRICTED_HOOK(android_rvh_gic_v3_set_affinity, + TP_PROTO(struct irq_data *d, const struct cpumask *mask_val, + u64 *affinity, bool force, void __iomem *base, + void __iomem *rbase, u64 redist_stride), + TP_ARGS(d, mask_val, affinity, force, base, rbase, redist_stride), + 1); + +#endif /* _TRACE_HOOK_GIC_V3_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/hung_task.h b/include/trace/hooks/hung_task.h new file mode 100644 index 000000000000..115727d2916c --- /dev/null +++ b/include/trace/hooks/hung_task.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hung_task + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_HUNG_TASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_HUNG_TASK_H + +#include + +struct task_struct; + +DECLARE_HOOK(android_vh_check_uninterruptible_tasks, + TP_PROTO(struct task_struct *t, unsigned long timeout, + bool *need_check), + TP_ARGS(t, timeout, need_check)); + +DECLARE_HOOK(android_vh_check_uninterruptible_tasks_dn, + TP_PROTO(void *unused), + TP_ARGS(unused)); + +#endif /* _TRACE_HOOK_HUNG_TASK_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/iommu.h b/include/trace/hooks/iommu.h new file mode 100644 index 000000000000..e9bb6642a373 --- /dev/null +++ b/include/trace/hooks/iommu.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM iommu + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_IOMMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_IOMMU_H + +#include + +struct iova_domain; + +DECLARE_RESTRICTED_HOOK(android_rvh_iommu_setup_dma_ops, + TP_PROTO(struct device *dev, u64 dma_base, u64 dma_limit), + TP_ARGS(dev, dma_base, dma_limit), 1); + + +DECLARE_HOOK(android_vh_iommu_iovad_alloc_iova, + TP_PROTO(struct device *dev, struct iova_domain 
*iovad, dma_addr_t iova, size_t size), + TP_ARGS(dev, iovad, iova, size)); + +DECLARE_HOOK(android_vh_iommu_iovad_free_iova, + TP_PROTO(struct iova_domain *iovad, dma_addr_t iova, size_t size), + TP_ARGS(iovad, iova, size)); + +#endif /* _TRACE_HOOK_IOMMU_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/ipv4.h b/include/trace/hooks/ipv4.h new file mode 100644 index 000000000000..cc4e3488c013 --- /dev/null +++ b/include/trace/hooks/ipv4.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ipv4 +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_IPV4_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_IPV4_H +#include + +DECLARE_RESTRICTED_HOOK(android_rvh_tcp_sendmsg_locked, + TP_PROTO(struct sock *sk, int size), + TP_ARGS(sk, size), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_tcp_recvmsg, + TP_PROTO(struct sock *sk), + TP_ARGS(sk), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_udp_sendmsg, + TP_PROTO(struct sock *sk), + TP_ARGS(sk), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_udp_recvmsg, + TP_PROTO(struct sock *sk), + TP_ARGS(sk), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_tcp_recvmsg_stat, + TP_PROTO(struct sock *sk, int size), + TP_ARGS(sk, size), 1); + +#endif /* _TRACE_HOOK_IPV4_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/logbuf.h b/include/trace/hooks/logbuf.h new file mode 100644 index 000000000000..72c34f7fbad0 --- /dev/null +++ b/include/trace/hooks/logbuf.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM logbuf + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_LOGBUF_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_LOGBUF_H + +#include + +struct printk_ringbuffer; +struct printk_record; + +DECLARE_HOOK(android_vh_logbuf, + TP_PROTO(struct printk_ringbuffer *rb, struct printk_record *r), + TP_ARGS(rb, r)) + +DECLARE_HOOK(android_vh_logbuf_pr_cont, + TP_PROTO(struct printk_record *r, size_t text_len), + TP_ARGS(r, text_len)) + +#endif /* _TRACE_HOOK_LOGBUF_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/memory.h b/include/trace/hooks/memory.h new file mode 100644 index 000000000000..d7dbfdeacd4d --- /dev/null +++ b/include/trace/hooks/memory.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM memory + +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_MEMORY_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +DECLARE_HOOK(android_vh_set_memory_nx, + TP_PROTO(unsigned long addr, int nr_pages), + TP_ARGS(addr, nr_pages)); + +DECLARE_HOOK(android_vh_set_memory_rw, + TP_PROTO(unsigned long addr, int nr_pages), + TP_ARGS(addr, nr_pages)); + +#endif /* _TRACE_HOOK_MEMORY_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/mm.h b/include/trace/hooks/mm.h new file mode 100644 index 000000000000..0d58bb6ec0c6 --- /dev/null +++ b/include/trace/hooks/mm.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mm + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_MM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_MM_H + +#include + +struct oom_control; +struct 
slabinfo; +struct cma; +struct compact_control; + +DECLARE_RESTRICTED_HOOK(android_rvh_set_skip_swapcache_flags, + TP_PROTO(gfp_t *flags), + TP_ARGS(flags), 1); +DECLARE_RESTRICTED_HOOK(android_rvh_set_gfp_zone_flags, + TP_PROTO(gfp_t *flags), + TP_ARGS(flags), 1); +DECLARE_RESTRICTED_HOOK(android_rvh_set_readahead_gfp_mask, + TP_PROTO(gfp_t *flags), + TP_ARGS(flags), 1); +DECLARE_HOOK(android_vh_cma_alloc_start, + TP_PROTO(s64 *ts), + TP_ARGS(ts)); +DECLARE_HOOK(android_vh_cma_alloc_finish, + TP_PROTO(struct cma *cma, struct page *page, unsigned long count, + unsigned int align, gfp_t gfp_mask, s64 ts), + TP_ARGS(cma, page, count, align, gfp_mask, ts)); +DECLARE_HOOK(android_vh_meminfo_proc_show, + TP_PROTO(struct seq_file *m), + TP_ARGS(m)); +DECLARE_HOOK(android_vh_exit_mm, + TP_PROTO(struct mm_struct *mm), + TP_ARGS(mm)); +DECLARE_HOOK(android_vh_show_mem, + TP_PROTO(unsigned int filter, nodemask_t *nodemask), + TP_ARGS(filter, nodemask)); +DECLARE_HOOK(android_vh_alloc_pages_slowpath, + TP_PROTO(gfp_t gfp_mask, unsigned int order, unsigned long delta), + TP_ARGS(gfp_mask, order, delta)); +DECLARE_HOOK(android_vh_cma_alloc_adjust, + TP_PROTO(struct zone *zone, bool *is_cma_alloc), + TP_ARGS(zone, is_cma_alloc)); +DECLARE_HOOK(android_vh_print_slabinfo_header, + TP_PROTO(struct seq_file *m), + TP_ARGS(m)); +DECLARE_HOOK(android_vh_cache_show, + TP_PROTO(struct seq_file *m, struct slabinfo *sinfo, struct kmem_cache *s), + TP_ARGS(m, sinfo, s)); +DECLARE_HOOK(android_vh_oom_check_panic, + TP_PROTO(struct oom_control *oc, int *ret), + TP_ARGS(oc, ret)); +DECLARE_HOOK(android_vh_drain_all_pages_bypass, + TP_PROTO(gfp_t gfp_mask, unsigned int order, unsigned long alloc_flags, + int migratetype, unsigned long did_some_progress, + bool *bypass), + TP_ARGS(gfp_mask, order, alloc_flags, migratetype, did_some_progress, bypass)); +DECLARE_HOOK(android_vh_cma_drain_all_pages_bypass, + TP_PROTO(unsigned int migratetype, bool *bypass), + TP_ARGS(migratetype, bypass)); +DECLARE_HOOK(android_vh_pcplist_add_cma_pages_bypass, + TP_PROTO(int migratetype, bool *bypass), + TP_ARGS(migratetype, bypass)); +DECLARE_HOOK(android_vh_mmap_region, + TP_PROTO(struct vm_area_struct *vma, unsigned long addr), + TP_ARGS(vma, addr)); +DECLARE_HOOK(android_vh_try_to_unmap_one, + TP_PROTO(struct vm_area_struct *vma, struct page *page, unsigned long addr, bool ret), + TP_ARGS(vma, page, addr, ret)); +DECLARE_HOOK(android_vh_mm_compaction_begin, + TP_PROTO(struct compact_control *cc, long *vendor_ret), + TP_ARGS(cc, vendor_ret)); +DECLARE_HOOK(android_vh_mm_compaction_end, + TP_PROTO(struct compact_control *cc, long vendor_ret), + TP_ARGS(cc, vendor_ret)); +DECLARE_HOOK(android_vh_pagevec_drain, + TP_PROTO(struct page *page, bool *ret), + TP_ARGS(page, ret)); +DECLARE_HOOK(android_vh_zap_pte_range_tlb_start, + TP_PROTO(void *ret), + TP_ARGS(ret)); +DECLARE_HOOK(android_vh_zap_pte_range_tlb_force_flush, + TP_PROTO(struct page *page, bool *flush), + TP_ARGS(page, flush)); +DECLARE_HOOK(android_vh_zap_pte_range_tlb_end, + TP_PROTO(void *ret), + TP_ARGS(ret)); +DECLARE_HOOK(android_vh_skip_lru_disable, + TP_PROTO(bool *skip), + TP_ARGS(skip)); +DECLARE_HOOK(android_vh_do_madvise_blk_plug, + TP_PROTO(int behavior, bool *do_plug), + TP_ARGS(behavior, do_plug)); +DECLARE_HOOK(android_vh_shrink_inactive_list_blk_plug, + TP_PROTO(bool *do_plug), + TP_ARGS(do_plug)); +DECLARE_HOOK(android_vh_shrink_lruvec_blk_plug, + TP_PROTO(bool *do_plug), + TP_ARGS(do_plug)); +DECLARE_HOOK(android_vh_reclaim_pages_plug, + TP_PROTO(bool *do_plug), 
+ TP_ARGS(do_plug)); +struct mem_cgroup; +DECLARE_HOOK(android_vh_mem_cgroup_alloc, + TP_PROTO(struct mem_cgroup *memcg), + TP_ARGS(memcg)); +DECLARE_HOOK(android_vh_mem_cgroup_free, + TP_PROTO(struct mem_cgroup *memcg), + TP_ARGS(memcg)); +DECLARE_HOOK(android_vh_mem_cgroup_id_remove, + TP_PROTO(struct mem_cgroup *memcg), + TP_ARGS(memcg)); +struct cgroup_subsys_state; +DECLARE_HOOK(android_vh_mem_cgroup_css_online, + TP_PROTO(struct cgroup_subsys_state *css, struct mem_cgroup *memcg), + TP_ARGS(css, memcg)); +DECLARE_HOOK(android_vh_mem_cgroup_css_offline, + TP_PROTO(struct cgroup_subsys_state *css, struct mem_cgroup *memcg), + TP_ARGS(css, memcg)); +#endif /* _TRACE_HOOK_MM_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/mmc.h b/include/trace/hooks/mmc.h new file mode 100644 index 000000000000..e6cc0c59422f --- /dev/null +++ b/include/trace/hooks/mmc.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mmc +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_MMC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_MMC_H + +#include + +struct blk_mq_queue_data; +struct mmc_host; +struct mmc_card; +struct sdhci_host; + +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +DECLARE_HOOK(android_vh_mmc_check_status, + TP_PROTO(const struct blk_mq_queue_data *bd, int *ret), + TP_ARGS(bd, ret)); + +DECLARE_HOOK(android_vh_mmc_sdio_pm_flag_set, + TP_PROTO(struct mmc_host *host), + TP_ARGS(host)); +DECLARE_HOOK(android_vh_mmc_blk_reset, + TP_PROTO(struct mmc_host *host, int err), + TP_ARGS(host, err)); +DECLARE_HOOK(android_vh_mmc_blk_mq_rw_recovery, + TP_PROTO(struct mmc_card *card), + TP_ARGS(card)); +DECLARE_HOOK(android_vh_sd_update_bus_speed_mode, + TP_PROTO(struct mmc_card *card), + TP_ARGS(card)); +DECLARE_HOOK(android_vh_mmc_attach_sd, + TP_PROTO(struct mmc_host *host, u32 ocr, int err), + TP_ARGS(host, ocr, err)); +DECLARE_HOOK(android_vh_sdhci_get_cd, + TP_PROTO(struct sdhci_host *host, bool *allow), + TP_ARGS(host, allow)); +DECLARE_HOOK(android_vh_mmc_gpio_cd_irqt, + TP_PROTO(struct mmc_host *host, bool *allow), + TP_ARGS(host, allow)); + +DECLARE_RESTRICTED_HOOK(android_rvh_mmc_cache_card_properties, + TP_PROTO(struct mmc_host *host), + TP_ARGS(host), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_partial_init, + TP_PROTO(struct mmc_host *host, bool *partial_init), + TP_ARGS(host, partial_init), 1); + +#endif /* _TRACE_HOOK_MMC_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/module.h b/include/trace/hooks/module.h new file mode 100644 index 000000000000..2c6300222b75 --- /dev/null +++ b/include/trace/hooks/module.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM module + +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_MODULE_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct module; + +DECLARE_HOOK(android_vh_set_module_permit_before_init, + TP_PROTO(const struct module *mod), + TP_ARGS(mod)); + +DECLARE_HOOK(android_vh_set_module_permit_after_init, + TP_PROTO(const struct module *mod), + TP_ARGS(mod)); + +#endif /* _TRACE_HOOK_MODULE_H */ +/* This part must be outside 
protection */ +#include diff --git a/include/trace/hooks/mpam.h b/include/trace/hooks/mpam.h new file mode 100644 index 000000000000..d358fe3d79eb --- /dev/null +++ b/include/trace/hooks/mpam.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mpam +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_MPAM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_MPAM_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct task_struct; + +DECLARE_HOOK(android_vh_mpam_set, + TP_PROTO(struct task_struct *prev, struct task_struct *next), + TP_ARGS(prev, next)); + +#endif /* _TRACE_HOOK_MPAM_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/net.h b/include/trace/hooks/net.h new file mode 100644 index 000000000000..2c5e8ce17025 --- /dev/null +++ b/include/trace/hooks/net.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM net +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_NET_VH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_NET_VH_H +#include + +struct packet_type; +struct sk_buff; +struct list_head; + +DECLARE_HOOK(android_vh_ptype_head, + TP_PROTO(const struct packet_type *pt, struct list_head *vendor_pt), + TP_ARGS(pt, vendor_pt)); +DECLARE_HOOK(android_vh_kfree_skb, + TP_PROTO(struct sk_buff *skb), TP_ARGS(skb)); + +struct nf_conn; +struct sock; +DECLARE_RESTRICTED_HOOK(android_rvh_nf_conn_alloc, + TP_PROTO(struct nf_conn *nf_conn), TP_ARGS(nf_conn), 1); +DECLARE_RESTRICTED_HOOK(android_rvh_nf_conn_free, + TP_PROTO(struct nf_conn *nf_conn), TP_ARGS(nf_conn), 1); +DECLARE_RESTRICTED_HOOK(android_rvh_sk_alloc, + TP_PROTO(struct sock *sock), TP_ARGS(sock), 1); +DECLARE_RESTRICTED_HOOK(android_rvh_sk_free, + TP_PROTO(struct sock *sock), TP_ARGS(sock), 1); + +/* macro versions of hooks are no longer required */ + +#endif /* _TRACE_HOOK_NET_VH_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/pm_domain.h b/include/trace/hooks/pm_domain.h new file mode 100644 index 000000000000..44c4ae7f5363 --- /dev/null +++ b/include/trace/hooks/pm_domain.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM pm_domain + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_PM_DOMAIN_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_PM_DOMAIN_H + +#include + +struct generic_pm_domain; + +DECLARE_HOOK(android_vh_allow_domain_state, + TP_PROTO(struct generic_pm_domain *genpd, uint32_t idx, bool *allow), + TP_ARGS(genpd, idx, allow)) + +#endif /* _TRACE_HOOK_PM_DOMAIN_H */ + +#include diff --git a/include/trace/hooks/power.h b/include/trace/hooks/power.h new file mode 100644 index 000000000000..bcf8eab2cffb --- /dev/null +++ b/include/trace/hooks/power.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM power +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_POWER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_POWER_H +#include + +struct task_struct; + +DECLARE_HOOK(android_vh_try_to_freeze_todo, + TP_PROTO(unsigned int todo, unsigned int elapsed_msecs, bool wq_busy), + TP_ARGS(todo, elapsed_msecs, wq_busy)); + +DECLARE_HOOK(android_vh_try_to_freeze_todo_unfrozen, + TP_PROTO(struct task_struct *p), + 
TP_ARGS(p)); + +#endif /* _TRACE_HOOK_POWER_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/preemptirq.h b/include/trace/hooks/preemptirq.h new file mode 100644 index 000000000000..52308c882acb --- /dev/null +++ b/include/trace/hooks/preemptirq.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM preemptirq + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_PREEMPTIRQ_H + +#include + +DECLARE_RESTRICTED_HOOK(android_rvh_preempt_disable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_preempt_enable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_irqs_disable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_irqs_enable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip), 1); + +#endif /* _TRACE_HOOK_PREEMPTIRQ_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/printk.h b/include/trace/hooks/printk.h new file mode 100644 index 000000000000..b3e9598386c7 --- /dev/null +++ b/include/trace/hooks/printk.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM printk + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_PRINTK_H + +#include + +DECLARE_HOOK(android_vh_printk_hotplug, + TP_PROTO(int *flag), + TP_ARGS(flag)); + +DECLARE_HOOK(android_vh_printk_caller_id, + TP_PROTO(u32 *caller_id), + TP_ARGS(caller_id)); +DECLARE_HOOK(android_vh_printk_caller, + TP_PROTO(char *caller, size_t size, u32 id, int *ret), + TP_ARGS(caller, size, id, ret)); +DECLARE_HOOK(android_vh_printk_ext_header, + TP_PROTO(char *caller, size_t size, u32 id, int *ret), + TP_ARGS(caller, size, id, ret)); + +#endif /* _TRACE_HOOK_PRINTK_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/psci.h b/include/trace/hooks/psci.h new file mode 100644 index 000000000000..23090d9d2ef9 --- /dev/null +++ b/include/trace/hooks/psci.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM psci +#undef TRACE_INCLUDE_PATH + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_PSCI_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_PSCI_H + +#include + +DECLARE_HOOK(android_vh_psci_tos_resident_on, + TP_PROTO(int cpu, bool *resident), + TP_ARGS(cpu, resident)); + +DECLARE_HOOK(android_vh_psci_cpu_suspend, + TP_PROTO(u32 state, bool *deny), + TP_ARGS(state, deny)); + +/* macro versions of hooks are no longer required */ + +#endif /* _TRACE_HOOK_PSCI_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/regmap.h b/include/trace/hooks/regmap.h new file mode 100644 index 000000000000..13cdf942d66f --- /dev/null +++ b/include/trace/hooks/regmap.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM regmap +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_REGMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_REGMAP_H +#include + +struct regmap; +struct regmap_config; + 
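The DECLARE_RESTRICTED_HOOK() declarations take a fourth argument that is the tracepoint enable condition: 1 means always enabled, while hooks such as android_rvh_do_sp_pc_abort in fault.h pass an explicit TP_CONDITION(). Restricted hooks also differ from the android_vh_* variants in that they accept a single probe, the attachment is permanent (there is no unregister), and they may fire from contexts where ordinary tracepoints are off-limits, such as the preemption and IRQ paths hooked by preemptirq.h above, so probes must never block. A sketch, with the per-CPU counter and all names being illustrative assumptions:

#include <linux/module.h>
#include <linux/percpu.h>
#include <trace/hooks/preemptirq.h>

static DEFINE_PER_CPU(u64, vendor_preempt_off_count);

/* May run with preemption and/or IRQs disabled: keep the body lock-free,
 * non-sleeping, and as short as possible. */
static void vendor_preempt_disable(void *data, unsigned long ip,
				   unsigned long parent_ip)
{
	__this_cpu_inc(vendor_preempt_off_count);
}

static int __init vendor_rvh_demo_init(void)
{
	/* Permanent attachment: only sensible from built-in code or a
	 * module that is never unloaded. */
	return register_trace_android_rvh_preempt_disable(vendor_preempt_disable,
							  NULL);
}
module_init(vendor_rvh_demo_init);
MODULE_LICENSE("GPL");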
+/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +DECLARE_HOOK(android_vh_regmap_update, + TP_PROTO(const struct regmap_config *config, struct regmap *map), + TP_ARGS(config, map)); + +#endif /* _TRACE_HOOK_REGMAP_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/remoteproc.h b/include/trace/hooks/remoteproc.h new file mode 100644 index 000000000000..55fae7034379 --- /dev/null +++ b/include/trace/hooks/remoteproc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM remoteproc + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_RPROC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_RPROC_H + +#include + +struct rproc; + +/* When recovery succeeds */ +DECLARE_HOOK(android_vh_rproc_recovery, + TP_PROTO(struct rproc *rproc), + TP_ARGS(rproc)); + +/* When recovery mode is enabled or disabled by sysfs */ +DECLARE_HOOK(android_vh_rproc_recovery_set, + TP_PROTO(struct rproc *rproc), + TP_ARGS(rproc)); + +#endif /* _TRACE_HOOK_RPROC_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/rwsem.h b/include/trace/hooks/rwsem.h new file mode 100644 index 000000000000..99c18e7e55b4 --- /dev/null +++ b/include/trace/hooks/rwsem.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rwsem +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_RWSEM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_RWSEM_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ + +struct rw_semaphore; +struct rwsem_waiter; + +DECLARE_HOOK(android_vh_rwsem_init, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_rwsem_wake, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_rwsem_write_finished, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_alter_rwsem_list_add, + TP_PROTO(struct rwsem_waiter *waiter, + struct rw_semaphore *sem, + bool *already_on_list), + TP_ARGS(waiter, sem, already_on_list)); +#endif /* _TRACE_HOOK_RWSEM_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/sched.h b/include/trace/hooks/sched.h new file mode 100644 index 000000000000..10c537161edc --- /dev/null +++ b/include/trace/hooks/sched.h @@ -0,0 +1,432 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_SCHED_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ + +struct cgroup_taskset; +struct cgroup_subsys_state; +struct em_perf_domain; +enum uclamp_id; +struct sched_entity; +struct task_struct; +struct uclamp_se; +struct sched_attr; + +DECLARE_RESTRICTED_HOOK(android_rvh_select_task_rq_fair, + TP_PROTO(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, int *new_cpu), + TP_ARGS(p, prev_cpu, sd_flag, wake_flags, new_cpu), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_select_task_rq_rt, + TP_PROTO(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, int *new_cpu), + TP_ARGS(p, prev_cpu, sd_flag, wake_flags, new_cpu), 
1); + +DECLARE_RESTRICTED_HOOK(android_rvh_select_task_rq_dl, + TP_PROTO(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, int *new_cpu), + TP_ARGS(p, prev_cpu, sd_flag, wake_flags, new_cpu), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_select_fallback_rq, + TP_PROTO(int cpu, struct task_struct *p, int *new_cpu), + TP_ARGS(cpu, p, new_cpu), 1); + +struct rq; +DECLARE_HOOK(android_vh_scheduler_tick, + TP_PROTO(struct rq *rq), + TP_ARGS(rq)); + +DECLARE_RESTRICTED_HOOK(android_rvh_enqueue_task, + TP_PROTO(struct rq *rq, struct task_struct *p, int flags), + TP_ARGS(rq, p, flags), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_dequeue_task, + TP_PROTO(struct rq *rq, struct task_struct *p, int flags), + TP_ARGS(rq, p, flags), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_can_migrate_task, + TP_PROTO(struct task_struct *p, int dst_cpu, int *can_migrate), + TP_ARGS(p, dst_cpu, can_migrate), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_find_lowest_rq, + TP_PROTO(struct task_struct *p, struct cpumask *local_cpu_mask, + int ret, int *lowest_cpu), + TP_ARGS(p, local_cpu_mask, ret, lowest_cpu), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_prepare_prio_fork, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_finish_prio_fork, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_rtmutex_prepare_setprio, + TP_PROTO(struct task_struct *p, struct task_struct *pi_task), + TP_ARGS(p, pi_task), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_rto_next_cpu, + TP_PROTO(int rto_cpu, struct cpumask *rto_mask, int *cpu), + TP_ARGS(rto_cpu, rto_mask, cpu), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_is_cpu_allowed, + TP_PROTO(struct task_struct *p, int cpu, bool *allowed), + TP_ARGS(p, cpu, allowed), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_get_nohz_timer_target, + TP_PROTO(int *cpu, bool *done), + TP_ARGS(cpu, done), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_set_user_nice, + TP_PROTO(struct task_struct *p, long *nice, bool *allowed), + TP_ARGS(p, nice, allowed), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_setscheduler, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +struct sched_group; +DECLARE_RESTRICTED_HOOK(android_rvh_find_busiest_group, + TP_PROTO(struct sched_group *busiest, struct rq *dst_rq, int *out_balance), + TP_ARGS(busiest, dst_rq, out_balance), 1); + +DECLARE_HOOK(android_vh_dump_throttled_rt_tasks, + TP_PROTO(int cpu, u64 clock, ktime_t rt_period, u64 rt_runtime, + s64 rt_period_timer_expires), + TP_ARGS(cpu, clock, rt_period, rt_runtime, rt_period_timer_expires)); + +DECLARE_HOOK(android_vh_jiffies_update, + TP_PROTO(void *unused), + TP_ARGS(unused)); + +struct rq_flags; +DECLARE_RESTRICTED_HOOK(android_rvh_sched_newidle_balance, + TP_PROTO(struct rq *this_rq, struct rq_flags *rf, + int *pulled_task, int *done), + TP_ARGS(this_rq, rf, pulled_task, done), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_nohz_balancer_kick, + TP_PROTO(struct rq *rq, unsigned int *flags, int *done), + TP_ARGS(rq, flags, done), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_rebalance_domains, + TP_PROTO(struct rq *rq, int *continue_balancing), + TP_ARGS(rq, continue_balancing), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_find_busiest_queue, + TP_PROTO(int dst_cpu, struct sched_group *group, + struct cpumask *env_cpus, struct rq **busiest, + int *done), + TP_ARGS(dst_cpu, group, env_cpus, busiest, done), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_migrate_queued_task, + TP_PROTO(struct rq *rq, struct rq_flags *rf, + struct task_struct 
*p, int new_cpu, + int *detached), + TP_ARGS(rq, rf, p, new_cpu, detached), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_find_energy_efficient_cpu, + TP_PROTO(struct task_struct *p, int prev_cpu, int sync, int *new_cpu), + TP_ARGS(p, prev_cpu, sync, new_cpu), 1); + +DECLARE_HOOK(android_vh_set_sugov_sched_attr, + TP_PROTO(struct sched_attr *attr), + TP_ARGS(attr)); + +DECLARE_RESTRICTED_HOOK(android_rvh_set_iowait, + TP_PROTO(struct task_struct *p, struct rq *rq, int *should_iowait_boost), + TP_ARGS(p, rq, should_iowait_boost), 1); + +struct sugov_policy; +DECLARE_RESTRICTED_HOOK(android_rvh_set_sugov_update, + TP_PROTO(struct sugov_policy *sg_policy, unsigned int next_freq, bool *should_update), + TP_ARGS(sg_policy, next_freq, should_update), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_cpu_overutilized, + TP_PROTO(int cpu, int *overutilized), + TP_ARGS(cpu, overutilized), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_setaffinity, + TP_PROTO(struct task_struct *p, const struct cpumask *in_mask, int *retval), + TP_ARGS(p, in_mask, retval), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_getaffinity, + TP_PROTO(struct task_struct *p, struct cpumask *in_mask), + TP_ARGS(p, in_mask), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_update_cpus_allowed, + TP_PROTO(struct task_struct *p, cpumask_var_t cpus_requested, + const struct cpumask *new_mask, int *ret), + TP_ARGS(p, cpus_requested, new_mask, ret), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_set_task_cpu, + TP_PROTO(struct task_struct *p, unsigned int new_cpu), + TP_ARGS(p, new_cpu), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_try_to_wake_up, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_try_to_wake_up_success, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_fork, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_wake_up_new_task, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_new_task_stats, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_flush_task, + TP_PROTO(struct task_struct *prev), + TP_ARGS(prev), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_tick_entry, + TP_PROTO(struct rq *rq), + TP_ARGS(rq), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_schedule, + TP_PROTO(struct task_struct *prev, struct task_struct *next, struct rq *rq), + TP_ARGS(prev, next, rq), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_cpu_starting, + TP_PROTO(int cpu), + TP_ARGS(cpu), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_cpu_dying, + TP_PROTO(int cpu), + TP_ARGS(cpu), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_account_irq, + TP_PROTO(struct task_struct *curr, int cpu, s64 delta), + TP_ARGS(curr, cpu, delta), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_account_irq_start, + TP_PROTO(struct task_struct *curr, int cpu, s64 delta), + TP_ARGS(curr, cpu, delta), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_account_irq_end, + TP_PROTO(struct task_struct *curr, int cpu, s64 delta), + TP_ARGS(curr, cpu, delta), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_place_entity, + TP_PROTO(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial, u64 *vruntime), + TP_ARGS(cfs_rq, se, initial, vruntime), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_build_perf_domains, + TP_PROTO(bool *eas_check), + TP_ARGS(eas_check), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_update_cpu_capacity, + TP_PROTO(int cpu, unsigned long *capacity), + TP_ARGS(cpu, capacity), 1); + 
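Most of the scheduler hooks above follow an output-parameter convention: the kernel passes a pointer (*new_cpu, *can_migrate, *done, and so on) that a probe may write to override the default decision, and stock behaviour is preserved when the probe leaves it untouched. A sketch against android_rvh_select_task_rq_fair; the placement policy here (keeping kernel threads on their previous CPU) is a made-up example of the mechanism, not a recommendation:

#include <linux/sched.h>
#include <trace/hooks/sched.h>

static void vendor_select_rq_fair(void *data, struct task_struct *p,
				  int prev_cpu, int sd_flag,
				  int wake_flags, int *new_cpu)
{
	/* Only write *new_cpu when overriding; otherwise the fair class
	 * keeps the CPU it picked. */
	if (p->flags & PF_KTHREAD)
		*new_cpu = prev_cpu;
}

Registration mirrors the restricted-hook sketch earlier: register_trace_android_rvh_select_task_rq_fair(vendor_select_rq_fair, NULL), called from code that is never unloaded.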
+DECLARE_RESTRICTED_HOOK(android_rvh_update_misfit_status, + TP_PROTO(struct task_struct *p, struct rq *rq, bool *need_update), + TP_ARGS(p, rq, need_update), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_fork_init, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_ttwu_cond, + TP_PROTO(int cpu, bool *cond), + TP_ARGS(cpu, cond), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_schedule_bug, + TP_PROTO(void *unused), + TP_ARGS(unused), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_exec, + TP_PROTO(bool *cond), + TP_ARGS(cond), 1); + +DECLARE_HOOK(android_vh_map_util_freq, + TP_PROTO(unsigned long util, unsigned long freq, + unsigned long cap, unsigned long *next_freq), + TP_ARGS(util, freq, cap, next_freq)); + +DECLARE_HOOK(android_vh_em_cpu_energy, + TP_PROTO(struct em_perf_domain *pd, + unsigned long max_util, unsigned long sum_util, + unsigned long *energy), + TP_ARGS(pd, max_util, sum_util, energy)); + +DECLARE_HOOK(android_vh_build_sched_domains, + TP_PROTO(bool has_asym), + TP_ARGS(has_asym)); + +DECLARE_RESTRICTED_HOOK(android_rvh_check_preempt_tick, + TP_PROTO(struct task_struct *p, unsigned long *ideal_runtime, bool *skip_preempt, + unsigned long delta_exec, struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned int granularity), + TP_ARGS(p, ideal_runtime, skip_preempt, delta_exec, cfs_rq, curr, granularity), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_check_preempt_wakeup_ignore, + TP_PROTO(struct task_struct *p, bool *ignore), + TP_ARGS(p, ignore), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_replace_next_task_fair, + TP_PROTO(struct rq *rq, struct task_struct **p, struct sched_entity **se, bool *repick, + bool simple, struct task_struct *prev), + TP_ARGS(rq, p, se, repick, simple, prev), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_sched_balance_rt, + TP_PROTO(struct rq *rq, struct task_struct *p, int *done), + TP_ARGS(rq, p, done), 1); + +struct cfs_rq; +DECLARE_RESTRICTED_HOOK(android_rvh_pick_next_entity, + TP_PROTO(struct cfs_rq *cfs_rq, struct sched_entity *curr, + struct sched_entity **se), + TP_ARGS(cfs_rq, curr, se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_check_preempt_wakeup, + TP_PROTO(struct rq *rq, struct task_struct *p, bool *preempt, bool *nopreempt, + int wake_flags, struct sched_entity *se, struct sched_entity *pse, + int next_buddy_marked, unsigned int granularity), + TP_ARGS(rq, p, preempt, nopreempt, wake_flags, se, pse, next_buddy_marked, + granularity), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_set_cpus_allowed_ptr_locked, + TP_PROTO(const struct cpumask *cpu_valid_mask, const struct cpumask *new_mask, + unsigned int *dest_cpu), + TP_ARGS(cpu_valid_mask, new_mask, dest_cpu), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_do_sched_yield, + TP_PROTO(struct rq *rq), + TP_ARGS(rq), 1); + +DECLARE_HOOK(android_vh_free_task, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); + +DECLARE_HOOK(android_vh_irqtime_account_process_tick, + TP_PROTO(struct task_struct *p, struct rq *rq, int user_tick, int ticks), + TP_ARGS(p, rq, user_tick, ticks)); + +/* Conditionally defined upon CONFIG_UCLAMP_TASK */ +struct uclamp_se; +DECLARE_RESTRICTED_HOOK(android_rvh_uclamp_eff_get, + TP_PROTO(struct task_struct *p, enum uclamp_id clamp_id, + struct uclamp_se *uclamp_max, struct uclamp_se *uclamp_eff, int *ret), + TP_ARGS(p, clamp_id, uclamp_max, uclamp_eff, ret), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_after_enqueue_task, + TP_PROTO(struct rq *rq, struct task_struct *p, int flags), + TP_ARGS(rq, p, flags), 1); + 
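Hooks like android_vh_map_util_freq above let a vendor cpufreq governor integration substitute its own utilization-to-frequency mapping (the upstream map_util_freq() helper applies a 1.25x headroom) by handing back an overriding value through *next_freq. The 1.5x margin below is an arbitrary illustration, not a tuning suggestion:

#include <trace/hooks/sched.h>

static void vendor_map_util_freq(void *data, unsigned long util,
				 unsigned long freq, unsigned long cap,
				 unsigned long *next_freq)
{
	/* freq is the reference frequency for capacity cap; scale it by
	 * utilization with a 1.5x margin instead of the stock 1.25x. */
	*next_freq = (freq + (freq >> 1)) * util / cap;
}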
+DECLARE_RESTRICTED_HOOK(android_rvh_after_dequeue_task, + TP_PROTO(struct rq *rq, struct task_struct *p, int flags), + TP_ARGS(rq, p, flags), 1); + +struct cfs_rq; +struct rq_flags; +DECLARE_RESTRICTED_HOOK(android_rvh_enqueue_entity, + TP_PROTO(struct cfs_rq *cfs, struct sched_entity *se), + TP_ARGS(cfs, se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_dequeue_entity, + TP_PROTO(struct cfs_rq *cfs, struct sched_entity *se), + TP_ARGS(cfs, se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_entity_tick, + TP_PROTO(struct cfs_rq *cfs_rq, struct sched_entity *se), + TP_ARGS(cfs_rq, se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_enqueue_task_fair, + TP_PROTO(struct rq *rq, struct task_struct *p, int flags), + TP_ARGS(rq, p, flags), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_dequeue_task_fair, + TP_PROTO(struct rq *rq, struct task_struct *p, int flags), + TP_ARGS(rq, p, flags), 1); + +DECLARE_HOOK(android_vh_em_dev_register_pd, + TP_PROTO(bool *cond), + TP_ARGS(cond)); + +DECLARE_RESTRICTED_HOOK(android_rvh_post_init_entity_util_avg, + TP_PROTO(struct sched_entity *se), + TP_ARGS(se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_find_new_ilb, + TP_PROTO(struct cpumask *nohz_idle_cpus_mask, int *ilb), + TP_ARGS(nohz_idle_cpus_mask, ilb), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_util_est_update, + TP_PROTO(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep, int *ret), + TP_ARGS(cfs_rq, p, task_sleep, ret), 1); + +DECLARE_HOOK(android_vh_setscheduler_uclamp, + TP_PROTO(struct task_struct *tsk, int clamp_id, unsigned int value), + TP_ARGS(tsk, clamp_id, value)); + +DECLARE_HOOK(android_vh_do_wake_up_sync, + TP_PROTO(struct wait_queue_head *wq_head, int *done), + TP_ARGS(wq_head, done)); + +DECLARE_HOOK(android_vh_set_wake_flags, + TP_PROTO(int *wake_flags, unsigned int *mode), + TP_ARGS(wake_flags, mode)); + +DECLARE_RESTRICTED_HOOK(android_rvh_effective_cpu_util, + TP_PROTO(int cpu, unsigned long util_cfs, unsigned long max, int type, + struct task_struct *p, unsigned long *new_util), + TP_ARGS(cpu, util_cfs, max, type, p, new_util), 1); + +DECLARE_HOOK(android_vh_force_compatible_pre, + TP_PROTO(void *unused), + TP_ARGS(unused)); + +DECLARE_HOOK(android_vh_force_compatible_post, + TP_PROTO(void *unused), + TP_ARGS(unused)); + +DECLARE_HOOK(android_vh_dup_task_struct, + TP_PROTO(struct task_struct *tsk, struct task_struct *orig), + TP_ARGS(tsk, orig)); + +DECLARE_HOOK(android_vh_account_task_time, + TP_PROTO(struct task_struct *p, struct rq *rq, int user_tick), + TP_ARGS(p, rq, user_tick)); + +DECLARE_RESTRICTED_HOOK(android_rvh_attach_entity_load_avg, + TP_PROTO(struct cfs_rq *cfs_rq, struct sched_entity *se), + TP_ARGS(cfs_rq, se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_detach_entity_load_avg, + TP_PROTO(struct cfs_rq *cfs_rq, struct sched_entity *se), + TP_ARGS(cfs_rq, se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_update_load_avg, + TP_PROTO(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se), + TP_ARGS(now, cfs_rq, se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_remove_entity_load_avg, + TP_PROTO(struct cfs_rq *cfs_rq, struct sched_entity *se), + TP_ARGS(cfs_rq, se), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_update_blocked_fair, + TP_PROTO(struct rq *rq), + TP_ARGS(rq), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_update_rt_rq_load_avg, + TP_PROTO(u64 now, struct rq *rq, struct task_struct *tsk, int running), + TP_ARGS(now, rq, tsk, running), 1); + +/* macro versions of hooks are no longer required */ + +#endif /* _TRACE_HOOK_SCHED_H */ +/* This part must be outside 
protection */ +#include diff --git a/include/trace/hooks/selinux.h b/include/trace/hooks/selinux.h new file mode 100644 index 000000000000..0b65631b5dde --- /dev/null +++ b/include/trace/hooks/selinux.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM selinux + +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_SELINUX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_SELINUX_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct selinux_state; +DECLARE_RESTRICTED_HOOK(android_rvh_selinux_is_initialized, + TP_PROTO(const struct selinux_state *state), + TP_ARGS(state), 1); + +#endif /* _TRACE_HOOK_SELINUX_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/signal.h b/include/trace/hooks/signal.h new file mode 100644 index 000000000000..ec9c5e86661b --- /dev/null +++ b/include/trace/hooks/signal.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM signal +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_SIGNAL_H +#include +#include + +struct task_struct; +DECLARE_HOOK(android_vh_do_send_sig_info, + TP_PROTO(int sig, struct task_struct *killer, struct task_struct *dst), + TP_ARGS(sig, killer, dst)); +#endif /* _TRACE_HOOK_SIGNAL_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/softlockup.h b/include/trace/hooks/softlockup.h new file mode 100644 index 000000000000..52e41e3d2804 --- /dev/null +++ b/include/trace/hooks/softlockup.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM softlockup +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_SOFTLOCKUP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_SOFTLOCKUP_H +#include + +struct pt_regs; + +DECLARE_HOOK(android_vh_watchdog_timer_softlockup, + TP_PROTO(int duration, struct pt_regs *regs, bool is_panic), + TP_ARGS(duration, regs, is_panic)); + +#endif /* _TRACE_HOOK_SOFTLOCKUP_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/sys.h b/include/trace/hooks/sys.h new file mode 100644 index 000000000000..a14bd4135410 --- /dev/null +++ b/include/trace/hooks/sys.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sys +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_SYS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_SYS_H +#include + +struct task_struct; + +DECLARE_HOOK(android_vh_syscall_prctl_finished, + TP_PROTO(int option, struct task_struct *task), + TP_ARGS(option, task)); +#endif + +#include diff --git a/include/trace/hooks/syscall_check.h b/include/trace/hooks/syscall_check.h new file mode 100644 index 000000000000..c906ff6b8e7f --- /dev/null +++ b/include/trace/hooks/syscall_check.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM syscall_check + +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_SYSCALL_CHECK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_SYSCALL_CHECK_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ 
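
Annotation: hooks of this kind give a vendor module visibility into a core code path without patching it. As a concrete case, a diagnostic probe attached to android_vh_do_send_sig_info above could attribute unexpected process kills; a minimal sketch, with hypothetical my_* names (a production module would also unregister the probe on exit):

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/signal.h>
#include <trace/hooks/signal.h>

static void my_sig_probe(void *data, int sig,
			 struct task_struct *killer, struct task_struct *dst)
{
	/* Attribute unexpected kills, e.g. while debugging app deaths. */
	if (sig == SIGKILL)
		pr_info("SIGKILL: %s(%d) -> %s(%d)\n",
			killer->comm, task_pid_nr(killer),
			dst->comm, task_pid_nr(dst));
}

static int __init my_init(void)
{
	return register_trace_android_vh_do_send_sig_info(my_sig_probe, NULL);
}
module_init(my_init);
MODULE_LICENSE("GPL");
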
+struct file; +union bpf_attr; + +DECLARE_HOOK(android_vh_check_mmap_file, + TP_PROTO(const struct file *file, unsigned long prot, + unsigned long flag, unsigned long ret), + TP_ARGS(file, prot, flag, ret)); + +DECLARE_HOOK(android_vh_check_file_open, + TP_PROTO(const struct file *file), + TP_ARGS(file)); + +DECLARE_HOOK(android_vh_check_bpf_syscall, + TP_PROTO(int cmd, const union bpf_attr *attr, unsigned int size), + TP_ARGS(cmd, attr, size)); + +#endif /* _TRACE_HOOK_SYSCALL_CHECK_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/sysrqcrash.h b/include/trace/hooks/sysrqcrash.h new file mode 100644 index 000000000000..92e7bc7ca9c3 --- /dev/null +++ b/include/trace/hooks/sysrqcrash.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sysrqcrash +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_SYSRQCRASH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_SYSRQCRASH_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +DECLARE_HOOK(android_vh_sysrq_crash, + TP_PROTO(void *data), + TP_ARGS(data)); + +#endif /* _TRACE_HOOK_SYSRQCRASH_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/thermal.h b/include/trace/hooks/thermal.h new file mode 100644 index 000000000000..df5a5acd3684 --- /dev/null +++ b/include/trace/hooks/thermal.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM thermal + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_THERMAL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_THERMAL_H + +#include + +struct thermal_cooling_device; +struct thermal_zone_device; +struct cpufreq_policy; + +DECLARE_HOOK(android_vh_modify_thermal_request_freq, + TP_PROTO(struct cpufreq_policy *policy, unsigned long *request_freq), + TP_ARGS(policy, request_freq)); + +DECLARE_HOOK(android_vh_modify_thermal_target_freq, + TP_PROTO(struct cpufreq_policy *policy, unsigned int *target_freq), + TP_ARGS(policy, target_freq)); + +DECLARE_HOOK(android_vh_thermal_register, + TP_PROTO(struct cpufreq_policy *policy), + TP_ARGS(policy)); + +DECLARE_HOOK(android_vh_thermal_unregister, + TP_PROTO(struct cpufreq_policy *policy), + TP_ARGS(policy)); + +DECLARE_HOOK(android_vh_enable_thermal_power_throttle, + TP_PROTO(bool *enable, bool *override), + TP_ARGS(enable, override)); + +DECLARE_HOOK(android_vh_thermal_power_cap, + TP_PROTO(u32 *power_range), + TP_ARGS(power_range)); + +DECLARE_HOOK(android_vh_get_thermal_zone_device, + TP_PROTO(struct thermal_zone_device *tz), + TP_ARGS(tz)); + +DECLARE_HOOK(android_vh_disable_thermal_cooling_stats, + TP_PROTO(struct thermal_cooling_device *cdev, int *disable_stats), + TP_ARGS(cdev, disable_stats)); + +DECLARE_HOOK(android_vh_enable_thermal_genl_check, + TP_PROTO(int event, int tz_id, int *enable_thermal_genl), + TP_ARGS(event, tz_id, enable_thermal_genl)); + +#endif /* _TRACE_HOOK_THERMAL_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/timekeeping.h b/include/trace/hooks/timekeeping.h new file mode 100644 index 000000000000..6d2bad768df9 --- /dev/null +++ b/include/trace/hooks/timekeeping.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM timekeeping + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_TIMEKEEPING_H) || 
defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_TIMEKEEPING_H + +#include + +struct timekeeper; + +DECLARE_RESTRICTED_HOOK(android_rvh_tk_based_time_sync, + TP_PROTO(struct timekeeper *tk), + TP_ARGS(tk), 1); + +#endif /* _TRACE_HOOK_TIMEKEEPING_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/timer.h b/include/trace/hooks/timer.h new file mode 100644 index 000000000000..67ef865dad4a --- /dev/null +++ b/include/trace/hooks/timer.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM timer + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_TIMER_H + +#include + +DECLARE_HOOK(android_vh_timer_calc_index, + TP_PROTO(unsigned int lvl, unsigned long *expires), + TP_ARGS(lvl, expires)); + +#endif /* _TRACE_HOOK_TIMER_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/topology.h b/include/trace/hooks/topology.h new file mode 100644 index 000000000000..47bc431b789c --- /dev/null +++ b/include/trace/hooks/topology.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM topology + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_TOPOLOGY_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_TOPOLOGY_H + +#include + +struct cpumask; + +DECLARE_HOOK(android_vh_arch_set_freq_scale, + TP_PROTO(const struct cpumask *cpus, unsigned long freq, + unsigned long max, unsigned long *scale), + TP_ARGS(cpus, freq, max, scale)); + +DECLARE_HOOK(android_vh_update_topology_flags_workfn, + TP_PROTO(void *unused), + TP_ARGS(unused)); + +DECLARE_HOOK(android_vh_use_amu_fie, + TP_PROTO(bool *use_amu_fie), + TP_ARGS(use_amu_fie)); + +#endif /* _TRACE_HOOK_TOPOLOGY_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/traps.h b/include/trace/hooks/traps.h new file mode 100644 index 000000000000..4313a014d3ee --- /dev/null +++ b/include/trace/hooks/traps.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM traps +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_TRAPS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_TRAPS_H +#include + +struct pt_regs; + +DECLARE_RESTRICTED_HOOK(android_rvh_do_undefinstr, + TP_PROTO(struct pt_regs *regs), + TP_ARGS(regs), + TP_CONDITION(!user_mode(regs))); + +DECLARE_RESTRICTED_HOOK(android_rvh_do_ptrauth_fault, + TP_PROTO(struct pt_regs *regs, unsigned int esr), + TP_ARGS(regs, esr), + TP_CONDITION(!user_mode(regs))); + +DECLARE_RESTRICTED_HOOK(android_rvh_panic_unhandled, + TP_PROTO(struct pt_regs *regs, const char *vector, unsigned int esr), + TP_ARGS(regs, vector, esr), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_arm64_serror_panic, + TP_PROTO(struct pt_regs *regs, unsigned int esr), + TP_ARGS(regs, esr), 1); + +#endif /* _TRACE_HOOK_TRAPS_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/ufshcd.h b/include/trace/hooks/ufshcd.h new file mode 100644 index 000000000000..b0eaa1f68675 --- /dev/null +++ b/include/trace/hooks/ufshcd.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ufshcd +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_UFSHCD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_UFSHCD_H +#include +/* + * Following tracepoints are not exported in tracefs and 
provide a + * mechanism for vendor modules to hook and extend functionality + */ + +struct ufs_hba; +struct ufshcd_lrb; +struct uic_command; +struct request; +struct scsi_device; + +DECLARE_HOOK(android_vh_ufs_fill_prdt, + TP_PROTO(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, + unsigned int segments, int *err), + TP_ARGS(hba, lrbp, segments, err)); + +DECLARE_RESTRICTED_HOOK(android_rvh_ufs_complete_init, + TP_PROTO(struct ufs_hba *hba), + TP_ARGS(hba), 1); + +DECLARE_RESTRICTED_HOOK(android_rvh_ufs_reprogram_all_keys, + TP_PROTO(struct ufs_hba *hba, int *err), + TP_ARGS(hba, err), 1); + +DECLARE_HOOK(android_vh_ufs_prepare_command, + TP_PROTO(struct ufs_hba *hba, struct request *rq, + struct ufshcd_lrb *lrbp, int *err), + TP_ARGS(hba, rq, lrbp, err)); + +DECLARE_HOOK(android_vh_ufs_update_sysfs, + TP_PROTO(struct ufs_hba *hba), + TP_ARGS(hba)); + +DECLARE_HOOK(android_vh_ufs_send_command, + TP_PROTO(struct ufs_hba *hba, struct ufshcd_lrb *lrbp), + TP_ARGS(hba, lrbp)); + +DECLARE_HOOK(android_vh_ufs_compl_command, + TP_PROTO(struct ufs_hba *hba, struct ufshcd_lrb *lrbp), + TP_ARGS(hba, lrbp)); + +DECLARE_HOOK(android_vh_ufs_send_uic_command, + TP_PROTO(struct ufs_hba *hba, const struct uic_command *ucmd, + int str_t), + TP_ARGS(hba, ucmd, str_t)); + +DECLARE_HOOK(android_vh_ufs_send_tm_command, + TP_PROTO(struct ufs_hba *hba, int tag, int str_t), + TP_ARGS(hba, tag, str_t)); + +DECLARE_HOOK(android_vh_ufs_check_int_errors, + TP_PROTO(struct ufs_hba *hba, bool queue_eh_work), + TP_ARGS(hba, queue_eh_work)); + +DECLARE_HOOK(android_vh_ufs_update_sdev, + TP_PROTO(struct scsi_device *sdev), + TP_ARGS(sdev)); + +DECLARE_HOOK(android_vh_ufs_clock_scaling, + TP_PROTO(struct ufs_hba *hba, bool *force_out, bool *force_scaling, bool *scale_up), + TP_ARGS(hba, force_out, force_scaling, scale_up)); +#endif /* _TRACE_HOOK_UFSHCD_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/usb.h b/include/trace/hooks/usb.h new file mode 100644 index 000000000000..fe379b174289 --- /dev/null +++ b/include/trace/hooks/usb.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM usb +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_USB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_USB_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct usb_device; + +DECLARE_HOOK(android_vh_usb_new_device_added, + TP_PROTO(struct usb_device *udev, int *err), + TP_ARGS(udev, err)); +#endif /* _TRACE_HOOK_USB_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/vendor_hooks.h b/include/trace/hooks/vendor_hooks.h new file mode 100644 index 000000000000..9f71c75b8f8a --- /dev/null +++ b/include/trace/hooks/vendor_hooks.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Note: we intentionally omit include file ifdef protection + * This is due to the way trace events work. If a file includes two + * trace event headers under one "CREATE_TRACE_POINTS" the first include + * will override the DECLARE_RESTRICTED_HOOK and break the second include. 
+ */ + +#ifndef __GENKSYMS__ +#include +#endif + +#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_ANDROID_VENDOR_HOOKS) + +#define DECLARE_HOOK DECLARE_TRACE + +int android_rvh_probe_register(struct tracepoint *tp, void *probe, void *data); + +#ifdef TRACE_HEADER_MULTI_READ + +#define DEFINE_HOOK_FN(_name, _reg, _unreg, proto, args) \ + static const char __tpstrtab_##_name[] \ + __section("__tracepoints_strings") = #_name; \ + extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ + int __traceiter_##_name(void *__data, proto); \ + struct tracepoint __tracepoint_##_name __used \ + __section("__tracepoints") = { \ + .name = __tpstrtab_##_name, \ + .key = STATIC_KEY_INIT_FALSE, \ + .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ + .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ + .iterator = &__traceiter_##_name, \ + .regfunc = _reg, \ + .unregfunc = _unreg, \ + .funcs = NULL }; \ + __TRACEPOINT_ENTRY(_name); \ + int __nocfi __traceiter_##_name(void *__data, proto) \ + { \ + struct tracepoint_func *it_func_ptr; \ + void *it_func; \ + \ + it_func_ptr = (&__tracepoint_##_name)->funcs; \ + it_func = (it_func_ptr)->func; \ + do { \ + __data = (it_func_ptr)->data; \ + ((void(*)(void *, proto))(it_func))(__data, args); \ + it_func = READ_ONCE((++it_func_ptr)->func); \ + } while (it_func); \ + return 0; \ + } \ + DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); + +#undef DECLARE_RESTRICTED_HOOK +#define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ + DEFINE_HOOK_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args)) + +/* prevent additional recursion */ +#undef TRACE_HEADER_MULTI_READ +#else /* TRACE_HEADER_MULTI_READ */ + +#ifdef CONFIG_HAVE_STATIC_CALL +#define __DO_RESTRICTED_HOOK_CALL(name, args) \ + do { \ + struct tracepoint_func *it_func_ptr; \ + void *__data; \ + it_func_ptr = (&__tracepoint_##name)->funcs; \ + if (it_func_ptr) { \ + __data = (it_func_ptr)->data; \ + static_call(tp_func_##name)(__data, args); \ + } \ + } while (0) +#else +#define __DO_RESTRICTED_HOOK_CALL(name, args) __traceiter_##name(NULL, args) +#endif + +#define DO_RESTRICTED_HOOK(name, args, cond) \ + do { \ + if (!(cond)) \ + return; \ + \ + __DO_RESTRICTED_HOOK_CALL(name, TP_ARGS(args)); \ + } while (0) + +#define __DECLARE_RESTRICTED_HOOK(name, proto, args, cond, data_proto) \ + extern int __traceiter_##name(data_proto); \ + DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ + extern struct tracepoint __tracepoint_##name; \ + static inline void __nocfi trace_##name(proto) \ + { \ + if (static_key_false(&__tracepoint_##name.key)) \ + DO_RESTRICTED_HOOK(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond)); \ + } \ + static inline bool \ + trace_##name##_enabled(void) \ + { \ + return static_key_false(&__tracepoint_##name.key); \ + } \ + static inline int \ + register_trace_##name(void (*probe)(data_proto), void *data) \ + { \ + return android_rvh_probe_register(&__tracepoint_##name, \ + (void *)probe, data); \ + } \ + /* vendor hooks cannot be unregistered */ \ + +#undef DECLARE_RESTRICTED_HOOK +#define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ + __DECLARE_RESTRICTED_HOOK(name, PARAMS(proto), PARAMS(args), \ + cond, \ + PARAMS(void *__data, proto)) + +#endif /* TRACE_HEADER_MULTI_READ */ + +#else /* !CONFIG_TRACEPOINTS || !CONFIG_ANDROID_VENDOR_HOOKS */ +/* suppress trace hooks */ +#define DECLARE_HOOK DECLARE_EVENT_NOP +#define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ + DECLARE_EVENT_NOP(name, PARAMS(proto), PARAMS(args)) +#endif diff --git 
a/include/trace/hooks/vmscan.h b/include/trace/hooks/vmscan.h new file mode 100644 index 000000000000..de141abe1a29 --- /dev/null +++ b/include/trace/hooks/vmscan.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vmscan + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_VMSCAN_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_VMSCAN_H + +#include + +DECLARE_RESTRICTED_HOOK(android_rvh_set_balance_anon_file_reclaim, + TP_PROTO(bool *balance_anon_file_reclaim), + TP_ARGS(balance_anon_file_reclaim), 1); +DECLARE_HOOK(android_vh_kswapd_per_node, + TP_PROTO(int nid, bool *skip, bool run), + TP_ARGS(nid, skip, run)); +DECLARE_HOOK(android_vh_page_referenced_check_bypass, + TP_PROTO(struct page *page, unsigned long nr_to_scan, int lru, bool *bypass), + TP_ARGS(page, nr_to_scan, lru, bypass)); + +DECLARE_HOOK(android_vh_shrink_node_memcgs, + TP_PROTO(struct mem_cgroup *memcg, bool *skip), + TP_ARGS(memcg, skip)); +DECLARE_HOOK(android_vh_shrink_slab_bypass, + TP_PROTO(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority, bool *bypass), + TP_ARGS(gfp_mask, nid, memcg, priority, bypass)); +DECLARE_HOOK(android_vh_do_shrink_slab, + TP_PROTO(struct shrinker *shrinker, struct shrink_control *shrinkctl, int priority), + TP_ARGS(shrinker, shrinkctl, priority)); +#endif /* _TRACE_HOOK_VMSCAN_H */ +/* This part must be outside protection */ +#include diff --git a/include/trace/hooks/wqlockup.h b/include/trace/hooks/wqlockup.h new file mode 100644 index 000000000000..92e01ab0ad0e --- /dev/null +++ b/include/trace/hooks/wqlockup.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM wqlockup +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_WQLOCKUP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_WQLOCKUP_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +DECLARE_HOOK(android_vh_wq_lockup_pool, + TP_PROTO(int cpu, unsigned long pool_ts), + TP_ARGS(cpu, pool_ts)); + +#endif /* _TRACE_HOOK_WQLOCKUP_H */ +/* This part must be outside protection */ +#include diff --git a/include/uapi/linux/OWNERS b/include/uapi/linux/OWNERS new file mode 100644 index 000000000000..8aed640a4679 --- /dev/null +++ b/include/uapi/linux/OWNERS @@ -0,0 +1,3 @@ +per-file f2fs**=file:/fs/f2fs/OWNERS +per-file fuse**=file:/fs/fuse/OWNERS +per-file net**=file:/net/OWNERS diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 3246f2c74696..e748f20cb52f 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -38,10 +38,58 @@ enum { BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; -enum { +/** + * enum flat_binder_object_shifts: shift values for flat_binder_object_flags + * @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy. + * + */ +enum flat_binder_object_shifts { + FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9, +}; + +/** + * enum flat_binder_object_flags - flags for use in flat_binder_object.flags + */ +enum flat_binder_object_flags { + /** + * @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority + * + * These bits can be used to set the minimum scheduler priority + * at which transactions into this node should run. Valid values + * in these bits depend on the scheduler policy encoded in + * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK. 
+ * + * For SCHED_NORMAL/SCHED_BATCH, the valid range is between [-20..19] + * For SCHED_FIFO/SCHED_RR, the value can run between [1..99] + */ FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, + /** + * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds. + */ FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, + /** + * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy + * + * These two bits can be used to set the min scheduling policy at which + * transactions on this node should run. These match the UAPI + * scheduler policy values, eg: + * 00b: SCHED_NORMAL + * 01b: SCHED_FIFO + * 10b: SCHED_RR + * 11b: SCHED_BATCH + */ + FLAT_BINDER_FLAG_SCHED_POLICY_MASK = + 3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT, + + /** + * @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy + * + * Only when set, calls into this node will inherit a real-time + * scheduling policy from the caller (for synchronous transactions). + */ + FLAT_BINDER_FLAG_INHERIT_RT = 0x800, + /** * @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts * @@ -271,6 +319,7 @@ enum transaction_flags { TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */ TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */ TF_CLEAR_BUF = 0x20, /* clear buffer on txn complete */ + TF_UPDATE_TXN = 0x40, /* update the outdated pending async txn */ }; struct binder_transaction_data { @@ -433,7 +482,7 @@ enum binder_driver_return_protocol { BR_FROZEN_REPLY = _IO('r', 18), /* - * The target of the last transaction (either a bcTRANSACTION or + * The target of the last sync transaction (either a bcTRANSACTION or * a bcATTEMPT_ACQUIRE) is frozen. No parameters. */ @@ -443,6 +492,11 @@ enum binder_driver_return_protocol { * asynchronous transaction makes the allocated async buffer size exceed * detection threshold. No parameters. */ + + BR_TRANSACTION_PENDING_FROZEN = _IO('r', 20), + /* + * The target of the last async transaction is frozen. No parameters. + */ }; enum binder_driver_command_protocol { diff --git a/include/uapi/linux/android_fuse.h b/include/uapi/linux/android_fuse.h new file mode 100644 index 000000000000..221e30ea7f01 --- /dev/null +++ b/include/uapi/linux/android_fuse.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause WITH Linux-syscall-note */ +/* Copyright (c) 2022 Google LLC */ + +#ifndef _LINUX_ANDROID_FUSE_H +#define _LINUX_ANDROID_FUSE_H + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#define FUSE_ACTION_KEEP 0 +#define FUSE_ACTION_REMOVE 1 +#define FUSE_ACTION_REPLACE 2 + +struct fuse_entry_bpf_out { + uint64_t backing_action; + uint64_t backing_fd; + uint64_t bpf_action; + uint64_t bpf_fd; +}; + +struct fuse_entry_bpf { + struct fuse_entry_bpf_out out; + struct file *backing_file; + struct file *bpf_file; +}; + +struct fuse_read_out { + uint64_t offset; + uint32_t again; + uint32_t padding; +}; + +struct fuse_in_postfilter_header { + uint32_t len; + uint32_t opcode; + uint64_t unique; + uint64_t nodeid; + uint32_t uid; + uint32_t gid; + uint32_t pid; + uint32_t error_in; +}; + +/* + * Fuse BPF Args + * + * Used to communicate with bpf programs to allow checking or altering certain values. + * The end_offset allows the bpf verifier to check boundaries statically. This reflects + * the ends of the buffer. size shows the length that was actually used. 
+ * + */ + +/** One input argument of a request */ +struct fuse_bpf_in_arg { + uint32_t size; + const void *value; + const void *end_offset; +}; + +/** One output argument of a request */ +struct fuse_bpf_arg { + uint32_t size; + void *value; + void *end_offset; +}; + +#define FUSE_MAX_IN_ARGS 5 +#define FUSE_MAX_OUT_ARGS 3 + +#define FUSE_BPF_FORCE (1 << 0) +#define FUSE_BPF_OUT_ARGVAR (1 << 6) + +struct fuse_bpf_args { + uint64_t nodeid; + uint32_t opcode; + uint32_t error_in; + uint32_t in_numargs; + uint32_t out_numargs; + uint32_t flags; + struct fuse_bpf_in_arg in_args[FUSE_MAX_IN_ARGS]; + struct fuse_bpf_arg out_args[FUSE_MAX_OUT_ARGS]; +}; + +#define FUSE_BPF_USER_FILTER 1 +#define FUSE_BPF_BACKING 2 +#define FUSE_BPF_POST_FILTER 4 + +#define FUSE_OPCODE_FILTER 0x0ffff +#define FUSE_PREFILTER 0x10000 +#define FUSE_POSTFILTER 0x20000 + +struct bpf_prog *fuse_get_bpf_prog(struct file *file); + +#endif /* _LINUX_ANDROID_FUSE_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a887e582f0e7..7493f0fd0d73 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -949,6 +949,16 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + + /* + * Until fuse-bpf is upstreamed, this value must be at the end to allow for + * other recently-added upstreamed values to be correct. + * This works because no one should use this value directly, rather they must + * read the value from /sys/fs/fuse/bpf_prog_type_fuse + * Please maintain this value at the end of the list until fuse-bpf is + * upstreamed. + */ + BPF_PROG_TYPE_FUSE, }; enum bpf_attach_type { diff --git a/include/uapi/linux/dm-user.h b/include/uapi/linux/dm-user.h new file mode 100644 index 000000000000..6d8f535b3542 --- /dev/null +++ b/include/uapi/linux/dm-user.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: LGPL-2.0+ WITH Linux-syscall-note */ +/* + * Copyright (C) 2020 Google, Inc + * Copyright (C) 2020 Palmer Dabbelt + */ + +#ifndef _LINUX_DM_USER_H +#define _LINUX_DM_USER_H + +#include + +/* + * dm-user proxies device mapper ops between the kernel and userspace. It's + * essentially just an RPC mechanism: all kernel calls create a request, + * userspace handles that with a response. Userspace obtains requests via + * read() and provides responses via write(). + * + * See Documentation/block/dm-user.rst for more information. 
+ */ + +#define DM_USER_REQ_MAP_READ 0 +#define DM_USER_REQ_MAP_WRITE 1 +#define DM_USER_REQ_MAP_FLUSH 2 +#define DM_USER_REQ_MAP_DISCARD 3 +#define DM_USER_REQ_MAP_SECURE_ERASE 4 +#define DM_USER_REQ_MAP_WRITE_SAME 5 +#define DM_USER_REQ_MAP_WRITE_ZEROES 6 +#define DM_USER_REQ_MAP_ZONE_OPEN 7 +#define DM_USER_REQ_MAP_ZONE_CLOSE 8 +#define DM_USER_REQ_MAP_ZONE_FINISH 9 +#define DM_USER_REQ_MAP_ZONE_APPEND 10 +#define DM_USER_REQ_MAP_ZONE_RESET 11 +#define DM_USER_REQ_MAP_ZONE_RESET_ALL 12 + +#define DM_USER_REQ_MAP_FLAG_FAILFAST_DEV 0x00001 +#define DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT 0x00002 +#define DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER 0x00004 +#define DM_USER_REQ_MAP_FLAG_SYNC 0x00008 +#define DM_USER_REQ_MAP_FLAG_META 0x00010 +#define DM_USER_REQ_MAP_FLAG_PRIO 0x00020 +#define DM_USER_REQ_MAP_FLAG_NOMERGE 0x00040 +#define DM_USER_REQ_MAP_FLAG_IDLE 0x00080 +#define DM_USER_REQ_MAP_FLAG_INTEGRITY 0x00100 +#define DM_USER_REQ_MAP_FLAG_FUA 0x00200 +#define DM_USER_REQ_MAP_FLAG_PREFLUSH 0x00400 +#define DM_USER_REQ_MAP_FLAG_RAHEAD 0x00800 +#define DM_USER_REQ_MAP_FLAG_BACKGROUND 0x01000 +#define DM_USER_REQ_MAP_FLAG_NOWAIT 0x02000 +#define DM_USER_REQ_MAP_FLAG_CGROUP_PUNT 0x04000 +#define DM_USER_REQ_MAP_FLAG_NOUNMAP 0x08000 +#define DM_USER_REQ_MAP_FLAG_HIPRI 0x10000 +#define DM_USER_REQ_MAP_FLAG_DRV 0x20000 +#define DM_USER_REQ_MAP_FLAG_SWAP 0x40000 + +#define DM_USER_RESP_SUCCESS 0 +#define DM_USER_RESP_ERROR 1 +#define DM_USER_RESP_UNSUPPORTED 2 + +struct dm_user_message { + __u64 seq; + __u64 type; + __u64 flags; + __u64 sector; + __u64 len; + __u8 buf[]; +}; + +#endif diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 61bf4774b8f2..7ef574f3256a 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -427,6 +427,8 @@ typedef struct elf64_shdr { #define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ #define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ +#define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ +#define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 352a822d4370..955d440be104 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -13,7 +13,7 @@ #define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) -#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_ABORT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) #define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ @@ -42,6 +42,7 @@ struct f2fs_comp_option) #define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23) #define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24) +#define F2FS_IOC_START_ATOMIC_REPLACE _IO(F2FS_IOCTL_MAGIC, 25) /* * should be same as XFS_IOC_GOINGDOWN. 
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index 9f4428be3e36..024def3ad43d 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -26,8 +26,11 @@ #define FSCRYPT_MODE_AES_256_CTS 4 #define FSCRYPT_MODE_AES_128_CBC 5 #define FSCRYPT_MODE_AES_128_CTS 6 +#define FSCRYPT_MODE_SM4_XTS 7 +#define FSCRYPT_MODE_SM4_CTS 8 #define FSCRYPT_MODE_ADIANTUM 9 -/* If adding a mode number > 9, update FSCRYPT_MODE_MAX in fscrypt_private.h */ +#define FSCRYPT_MODE_AES_256_HCTR2 10 +/* If adding a mode number > 10, update FSCRYPT_MODE_MAX in fscrypt_private.h */ /* * Legacy policy version; ad-hoc KDF and no key verification. @@ -124,7 +127,10 @@ struct fscrypt_add_key_arg { struct fscrypt_key_specifier key_spec; __u32 raw_size; __u32 key_id; - __u32 __reserved[8]; + __u32 __reserved[7]; + /* N.B.: "temporary" flag, not reserved upstream */ +#define __FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED 0x00000001 + __u32 __flags; __u8 raw[]; }; @@ -184,8 +190,6 @@ struct fscrypt_get_key_status_arg { #define FS_ENCRYPTION_MODE_AES_256_CTS FSCRYPT_MODE_AES_256_CTS #define FS_ENCRYPTION_MODE_AES_128_CBC FSCRYPT_MODE_AES_128_CBC #define FS_ENCRYPTION_MODE_AES_128_CTS FSCRYPT_MODE_AES_128_CTS -#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* removed */ -#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* removed */ #define FS_ENCRYPTION_MODE_ADIANTUM FSCRYPT_MODE_ADIANTUM #define FS_KEY_DESC_PREFIX FSCRYPT_KEY_DESC_PREFIX #define FS_KEY_DESC_PREFIX_SIZE FSCRYPT_KEY_DESC_PREFIX_SIZE diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 36ed092227fa..430983c5b8dd 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -367,6 +367,7 @@ struct fuse_file_lock { #define FUSE_SUBMOUNTS (1 << 27) #define FUSE_HANDLE_KILLPRIV_V2 (1 << 28) #define FUSE_SETXATTR_EXT (1 << 29) +#define FUSE_PASSTHROUGH (1 << 31) /** * CUSE INIT request/reply flags @@ -513,6 +514,7 @@ enum fuse_opcode { FUSE_SETUPMAPPING = 48, FUSE_REMOVEMAPPING = 49, FUSE_SYNCFS = 50, + FUSE_CANONICAL_PATH = 2016, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -639,7 +641,7 @@ struct fuse_create_in { struct fuse_open_out { uint64_t fh; uint32_t open_flags; - uint32_t padding; + uint32_t passthrough_fh; }; struct fuse_release_in { @@ -923,6 +925,9 @@ struct fuse_notify_retrieve_in { /* Device ioctls: */ #define FUSE_DEV_IOC_MAGIC 229 #define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t) +/* 127 is reserved for the V1 interface implementation in Android (deprecated) */ +/* 126 is reserved for the V2 interface implementation in Android */ +#define FUSE_DEV_IOC_PASSTHROUGH_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 126, __u32) struct fuse_lseek_in { uint64_t fh; diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index 163c0998aec9..d3242d586e65 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -97,7 +97,11 @@ struct icmphdr { } echo; __be32 gateway; struct { +#ifdef __BIONIC__ + __be16 __linux_unused; +#else __be16 __unused; +#endif __be16 mtu; } frag; __u8 reserved[4]; diff --git a/include/uapi/linux/incrementalfs.h b/include/uapi/linux/incrementalfs.h new file mode 100644 index 000000000000..f8338aff82f5 --- /dev/null +++ b/include/uapi/linux/incrementalfs.h @@ -0,0 +1,590 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Userspace interface for Incremental FS. 
+ * + * Incremental FS is a special-purpose Linux virtual file system that allows + * execution of a program while its binary and resource files are still being + * lazily downloaded over the network, USB etc. + * + * Copyright 2019 Google LLC + */ +#ifndef _UAPI_LINUX_INCREMENTALFS_H +#define _UAPI_LINUX_INCREMENTALFS_H + +#include +#include +#include +#include + +/* ===== constants ===== */ +#define INCFS_NAME "incremental-fs" + +/* + * Magic number used in the file header and in the in-memory superblock. + * Note that it is a 5 byte unsigned long. Thus on 32 bit kernels, it is + * truncated to a 4 byte number. + */ +#define INCFS_MAGIC_NUMBER (0x5346434e49ul & ULONG_MAX) + +#define INCFS_DATA_FILE_BLOCK_SIZE 4096 +#define INCFS_HEADER_VER 1 + +/* TODO: This value is assumed in incfs_copy_signature_info_from_user to be the + * actual signature length. Set back to 64 when fixed. + */ +#define INCFS_MAX_HASH_SIZE 32 +#define INCFS_MAX_FILE_ATTR_SIZE 512 + +#define INCFS_INDEX_NAME ".index" +#define INCFS_INCOMPLETE_NAME ".incomplete" +#define INCFS_PENDING_READS_FILENAME ".pending_reads" +#define INCFS_LOG_FILENAME ".log" +#define INCFS_BLOCKS_WRITTEN_FILENAME ".blocks_written" +#define INCFS_XATTR_ID_NAME (XATTR_USER_PREFIX "incfs.id") +#define INCFS_XATTR_SIZE_NAME (XATTR_USER_PREFIX "incfs.size") +#define INCFS_XATTR_METADATA_NAME (XATTR_USER_PREFIX "incfs.metadata") +#define INCFS_XATTR_VERITY_NAME (XATTR_USER_PREFIX "incfs.verity") + +#define INCFS_MAX_SIGNATURE_SIZE 8096 +#define INCFS_SIGNATURE_VERSION 2 +#define INCFS_SIGNATURE_SECTIONS 2 + +#define INCFS_IOCTL_BASE_CODE 'g' + +/* ===== ioctl requests on the command dir ===== */ + +/* + * Create a new file + * May only be called on .pending_reads file + */ +#define INCFS_IOC_CREATE_FILE \ + _IOWR(INCFS_IOCTL_BASE_CODE, 30, struct incfs_new_file_args) + +/* Read file signature */ +#define INCFS_IOC_READ_FILE_SIGNATURE \ + _IOR(INCFS_IOCTL_BASE_CODE, 31, struct incfs_get_file_sig_args) + +/* + * Fill in one or more data blocks. This may only be called on a handle + * passed as a parameter to INCFS_IOC_PERMIT_FILL + * + * Returns the number of blocks filled in, or an error if none were + */ +#define INCFS_IOC_FILL_BLOCKS \ + _IOR(INCFS_IOCTL_BASE_CODE, 32, struct incfs_fill_blocks) + +/* + * Permit INCFS_IOC_FILL_BLOCKS on the given file descriptor + * May only be called on .pending_reads file + * + * Returns 0 on success or an error + */ +#define INCFS_IOC_PERMIT_FILL \ + _IOW(INCFS_IOCTL_BASE_CODE, 33, struct incfs_permit_fill) + +/* + * Fills the buffer with ranges of populated blocks + * + * Returns 0 if all ranges were written, + * an error otherwise + * + * Either way, range_buffer_size_out is set to the number + * of bytes written. It should be set to 0 by the caller. The ranges + * filled are valid, but if an error was returned there might + * be more ranges to come. + * + * Ranges are ranges of filled blocks: + * + * 1 2 7 9 + * + * means blocks 1, 2, 7, 8, 9 are filled, 0, 3, 4, 5, 6 and 10 onwards + * are not + * + * If hashing is enabled for the file, the hash blocks are simply + * treated as though they immediately followed the data blocks. + */ +#define INCFS_IOC_GET_FILLED_BLOCKS \ + _IOR(INCFS_IOCTL_BASE_CODE, 34, struct incfs_get_filled_blocks_args) + +/* + * Creates a new mapped file + * May only be called on .pending_reads file + */ +#define INCFS_IOC_CREATE_MAPPED_FILE \ + _IOWR(INCFS_IOCTL_BASE_CODE, 35, struct incfs_create_mapped_file_args) + +/* + * Get the number of blocks, total and filled + * May only be called on .pending_reads file + */ +#define INCFS_IOC_GET_BLOCK_COUNT \ + _IOR(INCFS_IOCTL_BASE_CODE, 36, struct incfs_get_block_count_args) + +/* + * Get per UID read timeouts + * May only be called on .pending_reads file + */ +#define INCFS_IOC_GET_READ_TIMEOUTS \ + _IOR(INCFS_IOCTL_BASE_CODE, 37, struct incfs_get_read_timeouts_args) + +/* + * Set per UID read timeouts + * May only be called on .pending_reads file + */ +#define INCFS_IOC_SET_READ_TIMEOUTS \ + _IOW(INCFS_IOCTL_BASE_CODE, 38, struct incfs_set_read_timeouts_args) + +/* + * Get last read error + * May only be called on .pending_reads file + */ +#define INCFS_IOC_GET_LAST_READ_ERROR \ + _IOW(INCFS_IOCTL_BASE_CODE, 39, struct incfs_get_last_read_error_args) + +/* ===== sysfs feature flags ===== */ +/* + * Each flag is represented by a file in /sys/fs/incremental-fs/features + * If the file exists, the feature is supported; the file contents will be + * the line "supported" + */ + +/* + * Basic flag stating that the core incfs file system is available + */ +#define INCFS_FEATURE_FLAG_COREFS "corefs" + +/* + * zstd compression support + */ +#define INCFS_FEATURE_FLAG_ZSTD "zstd" + +/* + * v2 feature set support. Covers: + * INCFS_IOC_CREATE_MAPPED_FILE + * INCFS_IOC_GET_BLOCK_COUNT + * INCFS_IOC_GET_READ_TIMEOUTS/INCFS_IOC_SET_READ_TIMEOUTS + * .blocks_written status file + * .incomplete folder + * report_uid mount option + */ +#define INCFS_FEATURE_FLAG_V2 "v2" + +enum incfs_compression_alg { + COMPRESSION_NONE = 0, + COMPRESSION_LZ4 = 1, + COMPRESSION_ZSTD = 2, +}; + +enum incfs_block_flags { + INCFS_BLOCK_FLAGS_NONE = 0, + INCFS_BLOCK_FLAGS_HASH = 1, +}; + +typedef struct { + __u8 bytes[16]; +} incfs_uuid_t __attribute__((aligned (8))); + +/* + * Description of a pending read. A pending read is a read call by + * a userspace program for which the filesystem does not currently have data. + * + * Reads from .pending_reads and .log return an array of these structures + */ +struct incfs_pending_read_info { + /* Id of a file that is being read from. */ + incfs_uuid_t file_id; + + /* Time of the read, in microseconds since system boot. */ + __aligned_u64 timestamp_us; + + /* Index of a file block that is being read. */ + __u32 block_index; + + /* A serial number of this pending read. */ + __u32 serial_number; +}; + +/* + * Description of a pending read. A pending read is a read call by + * a userspace program for which the filesystem does not currently have data. + * + * This version of incfs_pending_read_info is used whenever the file system is + * mounted with the report_uid flag + */ +struct incfs_pending_read_info2 { + /* Id of a file that is being read from. */ + incfs_uuid_t file_id; + + /* Time of the read, in microseconds since system boot. */ + __aligned_u64 timestamp_us; + + /* Index of a file block that is being read. */ + __u32 block_index; + + /* A serial number of this pending read. */ + __u32 serial_number; + + /* The UID of the reading process */ + __u32 uid; + + __u32 reserved; +}; + +/* + * Description of a data or hash block to add to a data file. + */ +struct incfs_fill_block { + /* Index of a data block. */ + __u32 block_index; + + /* Length of data */ + __u32 data_len; + + /* + * A pointer to the actual data for the block. + * + * Equivalent to: __u8 *data; + */ + __aligned_u64 data; + + /* + * Compression algorithm used to compress the data block. + * Values from enum incfs_compression_alg. + */ + __u8 compression; + + /* Values from enum incfs_block_flags */ + __u8 flags; + + __u16 reserved1; + + __u32 reserved2; + + __aligned_u64 reserved3; +}; + +/* + * Description of a number of blocks to add to a data file + * + * Argument for INCFS_IOC_FILL_BLOCKS + */ +struct incfs_fill_blocks { + /* Number of blocks */ + __u64 count; + + /* A pointer to an array of incfs_fill_block structs */ + __aligned_u64 fill_blocks; +}; + +/* + * Permit INCFS_IOC_FILL_BLOCKS on the given file descriptor + * May only be called on .pending_reads file + * + * Argument for INCFS_IOC_PERMIT_FILL + */ +struct incfs_permit_fill { + /* File to permit fills on */ + __u32 file_descriptor; +}; + +enum incfs_hash_tree_algorithm { + INCFS_HASH_TREE_NONE = 0, + INCFS_HASH_TREE_SHA256 = 1 +}; + +/* + * Create a new file or directory. + */ +struct incfs_new_file_args { + /* Id of a file to create. */ + incfs_uuid_t file_id; + + /* + * Total size of the new file. Ignored if S_ISDIR(mode). + */ + __aligned_u64 size; + + /* + * File mode. Permissions and dir flag. + */ + __u16 mode; + + __u16 reserved1; + + __u32 reserved2; + + /* + * A pointer to a null-terminated relative path to the file's parent + * dir. + * Max length: PATH_MAX + * + * Equivalent to: char *directory_path; + */ + __aligned_u64 directory_path; + + /* + * A pointer to a null-terminated file name. + * Max length: PATH_MAX + * + * Equivalent to: char *file_name; + */ + __aligned_u64 file_name; + + /* + * A pointer to a file attribute to be set on creation. + * + * Equivalent to: u8 *file_attr; + */ + __aligned_u64 file_attr; + + /* + * Length of the data buffer specified by file_attr. + * Max value: INCFS_MAX_FILE_ATTR_SIZE + */ + __u32 file_attr_len; + + __u32 reserved4; + + /* + * Points to an APK V4 Signature data blob. + * The signature must have two sections. + * Format is: + * u32 version + * u32 size_of_hash_info_section + * u8 hash_info_section[] + * u32 size_of_signing_info_section + * u8 signing_info_section[] + * + * Note that incfs does not care about what is in signing_info_section + * + * hash_info_section has the following format: + * u32 hash_algorithm; // Must be SHA256 == 1 + * u8 log2_blocksize; // Must be 12 for 4096 byte blocks + * u32 salt_size; + * u8 salt[]; + * u32 hash_size; + * u8 root_hash[]; + */ + __aligned_u64 signature_info; + + /* Size of signature_info */ + __aligned_u64 signature_size; + + __aligned_u64 reserved6; +}; + +/* + * Request a digital signature blob for a given file. + * Argument for INCFS_IOC_READ_FILE_SIGNATURE ioctl + */ +struct incfs_get_file_sig_args { + /* + * A pointer to the data buffer to save a signature blob to. + * + * Equivalent to: u8 *file_signature; + */ + __aligned_u64 file_signature; + + /* Size of the buffer at file_signature. */ + __u32 file_signature_buf_size; + + /* + * Number of bytes saved to the file_signature buffer. + * It is set when the ioctl completes. + */ + __u32 file_signature_len_out; +}; + +struct incfs_filled_range { + __u32 begin; + __u32 end; +}; + +/* + * Request ranges of filled blocks + * Argument for INCFS_IOC_GET_FILLED_BLOCKS + */ +struct incfs_get_filled_blocks_args { + /* + * A buffer to populate with ranges of filled blocks + * + * Equivalent to struct incfs_filled_range *range_buffer + */ + __aligned_u64 range_buffer; + + /* Size of range_buffer */ + __u32 range_buffer_size; + + /* Start index to read from */ + __u32 start_index; + + /* + * End index to read to. 0 means read to end. This is a range, + * so incfs will read from start_index to end_index - 1 + */ + __u32 end_index; + + /* Actual number of blocks in file */ + __u32 total_blocks_out; + + /* The number of data blocks in file */ + __u32 data_blocks_out; + + /* Number of bytes written to range buffer */ + __u32 range_buffer_size_out; + + /* Index scanned up to, if the call was interrupted */ + __u32 index_out; +}; + +/* + * Create a new mapped file + * Argument for INCFS_IOC_CREATE_MAPPED_FILE + */ +struct incfs_create_mapped_file_args { + /* + * Total size of the new file. + */ + __aligned_u64 size; + + /* + * File mode. Permissions and dir flag. + */ + __u16 mode; + + __u16 reserved1; + + __u32 reserved2; + + /* + * A pointer to a null-terminated relative path to the incfs mount + * point + * Max length: PATH_MAX + * + * Equivalent to: char *directory_path; + */ + __aligned_u64 directory_path; + + /* + * A pointer to a null-terminated file name. + * Max length: PATH_MAX + * + * Equivalent to: char *file_name; + */ + __aligned_u64 file_name; + + /* Id of source file to map. */ + incfs_uuid_t source_file_id; + + /* + * Offset in source file to start mapping. Must be a multiple of + * INCFS_DATA_FILE_BLOCK_SIZE + */ + __aligned_u64 source_offset; +}; + +/* + * Get information about the blocks in this file + * Argument for INCFS_IOC_GET_BLOCK_COUNT + */ +struct incfs_get_block_count_args { + /* Total number of data blocks in the file */ + __u32 total_data_blocks_out; + + /* Number of filled data blocks in the file */ + __u32 filled_data_blocks_out; + + /* Total number of hash blocks in the file */ + __u32 total_hash_blocks_out; + + /* Number of filled hash blocks in the file */ + __u32 filled_hash_blocks_out; +}; + +/* Description of timeouts for one UID */ +struct incfs_per_uid_read_timeouts { + /* UID to apply these timeouts to */ + __u32 uid; + + /* + * Min time in microseconds to read any block. Note that this doesn't + * apply to reads which are satisfied from the page cache. + */ + __u32 min_time_us; + + /* + * Min time in microseconds to satisfy a pending read. Any pending read + * which is filled before this time will be delayed so that the total + * read time >= this value. + */ + __u32 min_pending_time_us; + + /* + * Max time in microseconds to satisfy a pending read before the read + * times out. If set to U32_MAX, defaults to the mount option + * read_timeout_ms * 1000.
Must be >= min_pending_time_us + */ + __u32 max_pending_time_us; +}; + +/* + * Get the read timeouts array + * Argument for INCFS_IOC_GET_READ_TIMEOUTS + */ +struct incfs_get_read_timeouts_args { + /* + * A pointer to a buffer to fill with the current timeouts + * + * Equivalent to struct incfs_per_uid_read_timeouts * + */ + __aligned_u64 timeouts_array; + + /* Size of above buffer in bytes */ + __u32 timeouts_array_size; + + /* Size used in bytes, or size needed if -ENOMEM returned */ + __u32 timeouts_array_size_out; +}; + +/* + * Set the read timeouts array + * Arguments for INCFS_IOC_SET_READ_TIMEOUTS + */ +struct incfs_set_read_timeouts_args { + /* + * A pointer to an array containing the new timeouts + * This will replace any existing timeouts + * + * Equivalent to struct incfs_per_uid_read_timeouts * + */ + __aligned_u64 timeouts_array; + + /* Size of above array in bytes. Must be < 256 */ + __u32 timeouts_array_size; +}; + +/* + * Get last read error struct + * Arguments for INCFS_IOC_GET_LAST_READ_ERROR + */ +struct incfs_get_last_read_error_args { + /* File id of last file that had a read error */ + incfs_uuid_t file_id_out; + + /* Time of last read error, in us, from CLOCK_MONOTONIC */ + __u64 time_us_out; + + /* Index of page that was being read at last read error */ + __u32 page_out; + + /* errno of last read error */ + __u32 errno_out; + + /* uid of last read error */ + __u32 uid_out; + + __u32 reserved1; + __u64 reserved2; +}; + +#endif /* _UAPI_LINUX_INCREMENTALFS_H */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a067410ebea5..adc4e137c3bc 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -433,6 +433,8 @@ struct kvm_run { #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 +#define KVM_SYSTEM_EVENT_WAKEUP 4 +#define KVM_SYSTEM_EVENT_SUSPEND 5 __u32 type; __u64 flags; } system_event; @@ -616,6 +618,7 @@ struct kvm_vapic_addr { #define KVM_MP_STATE_OPERATING 7 #define KVM_MP_STATE_LOAD 8 #define KVM_MP_STATE_AP_RESET_HOLD 9 +#define KVM_MP_STATE_SUSPENDED 10 struct kvm_mp_state { __u32 mp_state; @@ -863,6 +866,12 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL #define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + +#define KVM_VM_TYPE_ARM_PROTECTED (1UL << 31) + +#define KVM_VM_TYPE_MASK (KVM_VM_TYPE_ARM_IPA_SIZE_MASK | \ + KVM_VM_TYPE_ARM_PROTECTED) + /* * ioctls for /dev/kvm fds: */ @@ -1112,6 +1121,30 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_BINARY_STATS_FD 203 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #define KVM_CAP_ARM_MTE 205 +#if 0 +/* Defined upstream but not backported to this kernel. 
*/ +#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 +#define KVM_CAP_VM_GPA_BITS 207 +#define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 +#define KVM_CAP_S390_MEM_OP_EXTENSION 211 +#define KVM_CAP_PMU_CAPABILITY 212 +#define KVM_CAP_DISABLE_QUIRKS2 213 +/* #define KVM_CAP_VM_TSC_CONTROL 214 */ +#define KVM_CAP_SYSTEM_EVENT_DATA 215 +#endif +#define KVM_CAP_ARM_SYSTEM_SUSPEND 216 +#if 0 +#define KVM_CAP_S390_PROTECTED_DUMP 217 +#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 +#define KVM_CAP_X86_NOTIFY_VMEXIT 219 +#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 +#define KVM_CAP_S390_ZPCI_OP 221 +#define KVM_CAP_S390_CPU_TOPOLOGY 222 +#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 +#endif +#define KVM_CAP_ARM_PROTECTED_VM 0xffbadab1 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h index 7bfb31a66fc9..104ac3250edd 100644 --- a/include/uapi/linux/netfilter/xt_IDLETIMER.h +++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h @@ -33,7 +33,7 @@ struct idletimer_tg_info_v1 { char label[MAX_IDLETIMER_LABEL_SIZE]; - __u8 send_nl_msg; /* unused: for compatibility with Android */ + __u8 send_nl_msg; __u8 timer_type; /* for kernel module internal use only */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c2efea98e060..afbd7905b8cd 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -300,6 +300,40 @@ * the interface goes down. */ + +/** + * DOC: FILS shared key crypto offload + * + * This feature is applicable to drivers running in AP mode. + * + * FILS shared key crypto offload can be advertised by drivers by setting + * the @NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD flag. The drivers that support + * FILS shared key crypto offload should be able to encrypt and decrypt + * association frames for FILS shared key authentication as per IEEE 802.11ai. + * With this capability, for FILS key derivation, drivers depend on userspace. + * + * After FILS key derivation, userspace shares the FILS AAD details with the + * driver and the driver stores the same to use in decryption of association + * request and in encryption of association response. The parameters below + * should be given to the driver in %NL80211_CMD_SET_FILS_AAD. + * %NL80211_ATTR_MAC - STA MAC address, used for storing FILS AAD per STA + * %NL80211_ATTR_FILS_KEK - Used for encryption or decryption + * %NL80211_ATTR_FILS_NONCES - Used for encryption or decryption + * (STA Nonce 16 bytes followed by AP Nonce 16 bytes) + * + * Once the association is done, the driver cleans the FILS AAD data. + */ + +/** + * DOC: Multi-Link Operation + * + * In Multi-Link Operation, a connection between two MLDs utilizes multiple + * links. To use this in nl80211, various commands and responses now need + * to or will include the new %NL80211_ATTR_MLO_LINKS attribute. + * Additionally, various commands that need to operate on a specific link + * now need to be given the %NL80211_ATTR_MLO_LINK_ID attribute, e.g. to + * use %NL80211_CMD_START_AP or similar functions.
+ */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -337,17 +371,28 @@ * @NL80211_CMD_DEL_INTERFACE: Virtual interface was deleted, has attributes * %NL80211_ATTR_IFINDEX and %NL80211_ATTR_WIPHY. Can also be sent from * userspace to request deletion of a virtual interface, then requires - * attribute %NL80211_ATTR_IFINDEX. + * attribute %NL80211_ATTR_IFINDEX. If multiple BSSID advertisements are + * enabled using %NL80211_ATTR_MBSSID_CONFIG, %NL80211_ATTR_MBSSID_ELEMS, + * and if this command is used for the transmitting interface, then all + * the non-transmitting interfaces are deleted as well. * * @NL80211_CMD_GET_KEY: Get sequence counter information for a key specified - * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. + * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. %NL80211_ATTR_MAC + * represents peer's MLD address for MLO pairwise key. For MLO group key, + * the link is identified by %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_SET_KEY: Set key attributes %NL80211_ATTR_KEY_DEFAULT, * %NL80211_ATTR_KEY_DEFAULT_MGMT, or %NL80211_ATTR_KEY_THRESHOLD. + * For MLO connection, the link to set default key is identified by + * %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_NEW_KEY: add a key with given %NL80211_ATTR_KEY_DATA, * %NL80211_ATTR_KEY_IDX, %NL80211_ATTR_MAC, %NL80211_ATTR_KEY_CIPHER, - * and %NL80211_ATTR_KEY_SEQ attributes. + * and %NL80211_ATTR_KEY_SEQ attributes. %NL80211_ATTR_MAC represents + * peer's MLD address for MLO pairwise key. The link to add MLO + * group key is identified by %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_DEL_KEY: delete a key identified by %NL80211_ATTR_KEY_IDX - * or %NL80211_ATTR_MAC. + * or %NL80211_ATTR_MAC. %NL80211_ATTR_MAC represents peer's MLD address + * for MLO pairwise key. The link to delete group key is identified by + * %NL80211_ATTR_MLO_LINK_ID. * * @NL80211_CMD_GET_BEACON: (not used) * @NL80211_CMD_SET_BEACON: change the beacon on an access point interface @@ -727,6 +772,13 @@ * %NL80211_ATTR_CSA_C_OFFSETS_TX is an array of offsets to CSA * counters which will be updated to the current value. This attribute * is used during CSA period. + * For TX on an MLD, the frequency can be omitted and the link ID be + * specified, or if transmitting to a known peer MLD (with MLD addresses + * in the frame) both can be omitted and the link will be selected by + * lower layers. + * For RX notification, %NL80211_ATTR_RX_HW_TIMESTAMP may be included to + * indicate the frame RX timestamp and %NL80211_ATTR_TX_HW_TIMESTAMP may + * be included to indicate the ack TX timestamp. * @NL80211_CMD_FRAME_WAIT_CANCEL: When an off-channel TX was requested, this * command may be used with the corresponding cookie to cancel the wait * time if it is known that it is no longer necessary. This command is @@ -737,7 +789,9 @@ * transmitted with %NL80211_CMD_FRAME. %NL80211_ATTR_COOKIE identifies * the TX command and %NL80211_ATTR_FRAME includes the contents of the * frame. %NL80211_ATTR_ACK flag is included if the recipient acknowledged - * the frame. + * the frame. %NL80211_ATTR_TX_HW_TIMESTAMP may be included to indicate the + * tx timestamp and %NL80211_ATTR_RX_HW_TIMESTAMP may be included to + * indicate the ack RX timestamp. * @NL80211_CMD_ACTION_TX_STATUS: Alias for @NL80211_CMD_FRAME_TX_STATUS for * backward compatibility. * @@ -1082,6 +1136,12 @@ * has been received. %NL80211_ATTR_FRAME is used to specify the * frame contents. The frame is the raw EAPoL data, without ethernet or * 802.11 headers. 
+ * For an MLD transmitter, the %NL80211_ATTR_MLO_LINK_ID may be given and + * its effect will depend on the destination: If the destination is known + * to be an MLD, this will be used as a hint to select the link to transmit + * the frame on. If the destination is not an MLD, this will select both + * the link to transmit on and the source address will be set to the link + * address of that link. * When used as an event indication %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT and %NL80211_ATTR_MAC are added * indicating the protocol type of the received frame; whether the frame @@ -1200,6 +1260,27 @@ * @NL80211_CMD_COLOR_CHANGE_COMPLETED: Notify userland that the color change * has completed * + * @NL80211_CMD_SET_FILS_AAD: Set FILS AAD data to the driver using - + * &NL80211_ATTR_MAC - for STA MAC address + * &NL80211_ATTR_FILS_KEK - for KEK + * &NL80211_ATTR_FILS_NONCES - for FILS Nonces + * (STA Nonce 16 bytes followed by AP Nonce 16 bytes) + * + * @NL80211_CMD_ASSOC_COMEBACK: notification about an association + * temporal rejection with comeback. The event includes %NL80211_ATTR_MAC + * to describe the BSSID address of the AP and %NL80211_ATTR_TIMEOUT to + * specify the timeout value. + * + * @NL80211_CMD_ADD_LINK: Add a new link to an interface. The + * %NL80211_ATTR_MLO_LINK_ID attribute is used for the new link. + * @NL80211_CMD_REMOVE_LINK: Remove a link from an interface. This may come + * without %NL80211_ATTR_MLO_LINK_ID as an easy way to remove all links + * in preparation for e.g. roaming to a regular (non-MLO) AP. + * + * @NL80211_CMD_ADD_LINK_STA: Add a link to an MLD station + * @NL80211_CMD_MODIFY_LINK_STA: Modify a link of an MLD station + * @NL80211_CMD_REMOVE_LINK_STA: Remove a link of an MLD station + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1440,6 +1521,17 @@ enum nl80211_commands { NL80211_CMD_COLOR_CHANGE_ABORTED, NL80211_CMD_COLOR_CHANGE_COMPLETED, + NL80211_CMD_SET_FILS_AAD, + + NL80211_CMD_ASSOC_COMEBACK, + + NL80211_CMD_ADD_LINK, + NL80211_CMD_REMOVE_LINK, + + NL80211_CMD_ADD_LINK_STA, + NL80211_CMD_MODIFY_LINK_STA, + NL80211_CMD_REMOVE_LINK_STA, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2299,8 +2391,10 @@ enum nl80211_commands { * * @NL80211_ATTR_IFTYPE_EXT_CAPA: Nested attribute of the following attributes: * %NL80211_ATTR_IFTYPE, %NL80211_ATTR_EXT_CAPA, - * %NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities per - * interface type. + * %NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities and + * other interface-type specific capabilities per interface type. For MLO, + * %NL80211_ATTR_EML_CAPABILITY and %NL80211_ATTR_MLD_CAPA_AND_OPS are + * present. * * @NL80211_ATTR_MU_MIMO_GROUP_DATA: array of 24 bytes that defines a MU-MIMO * groupID for monitor mode. @@ -2436,7 +2530,9 @@ enum nl80211_commands { * space supports external authentication. This attribute shall be used * with %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP request. The driver * may offload authentication processing to user space if this capability - * is indicated in the respective requests from the user space. + * is indicated in the respective requests from the user space. (This flag + * attribute deprecated for %NL80211_CMD_START_AP, use + * %NL80211_ATTR_AP_SETTINGS_FLAGS) * * @NL80211_ATTR_NSS: Station's New/updated RX_NSS value notified using this * u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED. 
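For context, a minimal userspace sender for the NL80211_CMD_SET_FILS_AAD command documented above might look like the sketch below, using the libnl-3 generic netlink helpers. This is not part of the patch; the interface-index handling, buffer lengths, and elided error paths are assumptions for illustration.

```c
/* Sketch only: push FILS AAD material to an AP-mode driver with
 * NL80211_CMD_SET_FILS_AAD via libnl-3. Error handling is elided and the
 * ifindex/KEK/nonce values are caller-supplied assumptions. */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nl80211.h>
#include <stddef.h>
#include <stdint.h>

static int set_fils_aad(int ifindex, const uint8_t sta[6],
			const uint8_t *kek, size_t kek_len,
			const uint8_t nonces[32])
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg = nlmsg_alloc();
	int family, err;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, "nl80211");

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NL80211_CMD_SET_FILS_AAD, 0);
	nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);
	nla_put(msg, NL80211_ATTR_MAC, 6, sta);	/* AAD is stored per STA */
	nla_put(msg, NL80211_ATTR_FILS_KEK, kek_len, kek);
	nla_put(msg, NL80211_ATTR_FILS_NONCES, 32, nonces); /* SNonce||ANonce */

	err = nl_send_auto(sk, msg);
	nlmsg_free(msg);
	nl_socket_free(sk);
	return err < 0 ? err : 0;
}
```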
@@ -2593,6 +2689,66 @@ enum nl80211_commands { * @NL80211_ATTR_COLOR_CHANGE_ELEMS: Nested set of attributes containing the IE * information for the time while performing a color switch. * + * @NL80211_ATTR_MBSSID_CONFIG: Nested attribute for multiple BSSID + * advertisements (MBSSID) parameters in AP mode. + * Kernel uses this attribute to indicate the driver's support for MBSSID + * and enhanced multi-BSSID advertisements (EMA AP) to the userspace. + * Userspace should use this attribute to configure per-interface MBSSID + * parameters. + * See &enum nl80211_mbssid_config_attributes for details. + * + * @NL80211_ATTR_MBSSID_ELEMS: Nested parameter to pass multiple BSSID elements. + * Mandatory parameter for the transmitting interface to enable MBSSID. + * Optional for the non-transmitting interfaces. + * + * @NL80211_ATTR_RADAR_BACKGROUND: Configure dedicated offchannel chain + * available for radar/CAC detection on some hw. This chain can't be used + * to transmit or receive frames and it is bound to a running wdev. + * Background radar/CAC detection allows avoiding the CAC downtime by + * switching to a different channel during CAC detection on the selected + * radar channel. + * + * @NL80211_ATTR_AP_SETTINGS_FLAGS: u32 attribute contains ap settings flags, + * enumerated in &enum nl80211_ap_settings_flags. This attribute shall be + * used with %NL80211_CMD_START_AP request. + * + * @NL80211_ATTR_EHT_CAPABILITY: EHT Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION). Can be set + * only if %NL80211_STA_FLAG_WME is set. + * + * @NL80211_ATTR_MLO_LINK_ID: A (u8) link ID for use with MLO, to be used with + * various commands that need a link ID to operate. + * @NL80211_ATTR_MLO_LINKS: A nested array of links, each containing some + * per-link information and a link ID. + * @NL80211_ATTR_MLD_ADDR: An MLD address, used with various commands such as + * authenticate/associate. + * + * @NL80211_ATTR_MLO_SUPPORT: Flag attribute to indicate user space supports MLO + * connection. Used with %NL80211_CMD_CONNECT. If this attribute is not + * included in NL80211_CMD_CONNECT, drivers must not perform MLO connection. + * + * @NL80211_ATTR_MAX_NUM_AKM_SUITES: U16 attribute. Indicates maximum number of + * AKM suites allowed for %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_START_AP in %NL80211_CMD_GET_WIPHY response. If this + * attribute is not present, userspace shall consider maximum number of AKM + * suites allowed as %NL80211_MAX_NR_AKM_SUITES which is the legacy maximum + * number prior to the introduction of this attribute. + * + * @NL80211_ATTR_EML_CAPABILITY: EML Capability information (u16) + * @NL80211_ATTR_MLD_CAPA_AND_OPS: MLD Capabilities and Operations (u16) + * + * @NL80211_ATTR_TX_HW_TIMESTAMP: Hardware timestamp for TX operation in + * nanoseconds (u64). This is the device clock timestamp so it will + * probably reset when the device is stopped or the firmware is reset. + * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the frame TX + * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates + * the ack TX timestamp. + * @NL80211_ATTR_RX_HW_TIMESTAMP: Hardware timestamp for RX operation in + * nanoseconds (u64). This is the device clock timestamp so it will + * probably reset when the device is stopped or the firmware is reset. + * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the ack RX + * timestamp.
When used with %NL80211_CMD_FRAME RX notification, indicates + * the incoming frame RX timestamp. * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3096,6 +3252,31 @@ enum nl80211_attrs { NL80211_ATTR_COLOR_CHANGE_COLOR, NL80211_ATTR_COLOR_CHANGE_ELEMS, + NL80211_ATTR_MBSSID_CONFIG, + NL80211_ATTR_MBSSID_ELEMS, + + NL80211_ATTR_RADAR_BACKGROUND, + + NL80211_ATTR_AP_SETTINGS_FLAGS, + + NL80211_ATTR_EHT_CAPABILITY, + + NL80211_ATTR_DISABLE_EHT, + + NL80211_ATTR_MLO_LINKS, + NL80211_ATTR_MLO_LINK_ID, + NL80211_ATTR_MLD_ADDR, + + NL80211_ATTR_MLO_SUPPORT, + + NL80211_ATTR_MAX_NUM_AKM_SUITES, + + NL80211_ATTR_EML_CAPABILITY, + NL80211_ATTR_MLD_CAPA_AND_OPS, + + NL80211_ATTR_TX_HW_TIMESTAMP, + NL80211_ATTR_RX_HW_TIMESTAMP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3150,7 +3331,14 @@ enum nl80211_attrs { #define NL80211_HE_MIN_CAPABILITY_LEN 16 #define NL80211_HE_MAX_CAPABILITY_LEN 54 #define NL80211_MAX_NR_CIPHER_SUITES 5 + +/* + * NL80211_MAX_NR_AKM_SUITES is obsolete when %NL80211_ATTR_MAX_NUM_AKM_SUITES + * present in %NL80211_CMD_GET_WIPHY response. + */ #define NL80211_MAX_NR_AKM_SUITES 2 +#define NL80211_EHT_MIN_CAPABILITY_LEN 13 +#define NL80211_EHT_MAX_CAPABILITY_LEN 51 #define NL80211_MIN_REMAIN_ON_CHANNEL_TIME 10 @@ -3319,6 +3507,56 @@ enum nl80211_he_ru_alloc { NL80211_RATE_INFO_HE_RU_ALLOC_2x996, }; +/** + * enum nl80211_eht_gi - EHT guard interval + * @NL80211_RATE_INFO_EHT_GI_0_8: 0.8 usec + * @NL80211_RATE_INFO_EHT_GI_1_6: 1.6 usec + * @NL80211_RATE_INFO_EHT_GI_3_2: 3.2 usec + */ +enum nl80211_eht_gi { + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_1_6, + NL80211_RATE_INFO_EHT_GI_3_2, +}; + +/** + * enum nl80211_eht_ru_alloc - EHT RU allocation values + * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_52: 52-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_52P26: 52+26-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_106: 106-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_106P26: 106+26 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_242: 242-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_484: 484-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_484P242: 484+242 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996: 996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996P484: 996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242: 996+484+242 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_2x996: 2x996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484: 2x996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_3x996: 3x996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484: 3x996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_4x996: 4x996-tone RU allocation + */ +enum nl80211_eht_ru_alloc { + NL80211_RATE_INFO_EHT_RU_ALLOC_26, + NL80211_RATE_INFO_EHT_RU_ALLOC_52, + NL80211_RATE_INFO_EHT_RU_ALLOC_52P26, + NL80211_RATE_INFO_EHT_RU_ALLOC_106, + NL80211_RATE_INFO_EHT_RU_ALLOC_106P26, + NL80211_RATE_INFO_EHT_RU_ALLOC_242, + NL80211_RATE_INFO_EHT_RU_ALLOC_484, + NL80211_RATE_INFO_EHT_RU_ALLOC_484P242, + NL80211_RATE_INFO_EHT_RU_ALLOC_996, + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242, + NL80211_RATE_INFO_EHT_RU_ALLOC_2x996, + NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484, + 
NL80211_RATE_INFO_EHT_RU_ALLOC_3x996, + NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_4x996, +}; + /** * enum nl80211_rate_info - bitrate information * @@ -3358,6 +3596,13 @@ enum nl80211_he_ru_alloc { * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1) * @NL80211_RATE_INFO_RU_ALLOC: HE RU allocation, if not present then * non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc) + * @NL80211_RATE_INFO_320_MHZ_WIDTH: 320 MHz bitrate + * @NL80211_RATE_INFO_EHT_MCS: EHT MCS index (u8, 0-15) + * @NL80211_RATE_INFO_EHT_NSS: EHT NSS value (u8, 1-8) + * @NL80211_RATE_INFO_EHT_GI: EHT guard interval identifier + * (u8, see &enum nl80211_eht_gi) + * @NL80211_RATE_INFO_EHT_RU_ALLOC: EHT RU allocation, if not present then + * non-OFDMA was used (u8, see &enum nl80211_eht_ru_alloc) * @__NL80211_RATE_INFO_AFTER_LAST: internal use */ enum nl80211_rate_info { @@ -3379,6 +3624,11 @@ enum nl80211_rate_info { NL80211_RATE_INFO_HE_GI, NL80211_RATE_INFO_HE_DCM, NL80211_RATE_INFO_HE_RU_ALLOC, + NL80211_RATE_INFO_320_MHZ_WIDTH, + NL80211_RATE_INFO_EHT_MCS, + NL80211_RATE_INFO_EHT_NSS, + NL80211_RATE_INFO_EHT_GI, + NL80211_RATE_INFO_EHT_RU_ALLOC, /* keep last */ __NL80211_RATE_INFO_AFTER_LAST, @@ -3689,13 +3939,20 @@ enum nl80211_mpath_info { * capabilities IE * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE: HE PPE thresholds information as * defined in HE capabilities IE - * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently - * defined * @NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA: HE 6GHz band capabilities (__le16), * given for all 6 GHz band channels * @NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS: vendor element capabilities that are * advertised on this band/for this iftype (binary) + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC: EHT MAC capabilities as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY: EHT PHY capabilities as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET: EHT supported NSS/MCS as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE: EHT PPE thresholds information as + * defined in EHT capabilities element * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use + * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band attribute currently defined */ enum nl80211_band_iftype_attr { __NL80211_BAND_IFTYPE_ATTR_INVALID, @@ -3707,6 +3964,10 @@ enum nl80211_band_iftype_attr { NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE, /* keep last */ __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, @@ -3851,6 +4112,10 @@ enum nl80211_wmm_rule { * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_16MHZ: 16 MHz operation is allowed * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_320MHZ: any 320 MHz channel using this channel + * as the primary or any of the secondary channels isn't possible + * @NL80211_FREQUENCY_ATTR_NO_EHT: EHT operation is not allowed on this channel + * in current regulatory domain. 
* @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3887,6 +4152,8 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_4MHZ, NL80211_FREQUENCY_ATTR_8MHZ, NL80211_FREQUENCY_ATTR_16MHZ, + NL80211_FREQUENCY_ATTR_NO_320MHZ, + NL80211_FREQUENCY_ATTR_NO_EHT, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4085,6 +4352,7 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed * @NL80211_RRF_NO_160MHZ: 160MHz operation not allowed * @NL80211_RRF_NO_HE: HE operation not allowed + * @NL80211_RRF_NO_320MHZ: 320MHz operation not allowed */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -4103,6 +4371,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_80MHZ = 1<<15, NL80211_RRF_NO_160MHZ = 1<<16, NL80211_RRF_NO_HE = 1<<17, + NL80211_RRF_NO_320MHZ = 1<<18, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR @@ -4600,6 +4869,8 @@ enum nl80211_key_mode { * @NL80211_CHAN_WIDTH_4: 4 MHz OFDM channel * @NL80211_CHAN_WIDTH_8: 8 MHz OFDM channel * @NL80211_CHAN_WIDTH_16: 16 MHz OFDM channel + * @NL80211_CHAN_WIDTH_320: 320 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 + * attribute must be provided as well */ enum nl80211_chan_width { NL80211_CHAN_WIDTH_20_NOHT, @@ -4615,6 +4886,7 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_4, NL80211_CHAN_WIDTH_8, NL80211_CHAN_WIDTH_16, + NL80211_CHAN_WIDTH_320, }; /** @@ -4686,6 +4958,8 @@ enum nl80211_bss_scan_width { * Contains a nested array of signal strength attributes (u8, dBm), * using the nesting index as the antenna number. * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz + * @NL80211_BSS_MLO_LINK_ID: MLO link ID of the BSS (u8). + * @NL80211_BSS_MLD_ADDR: MLD address of this BSS if connected to it. * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -4711,6 +4985,8 @@ enum nl80211_bss { NL80211_BSS_PARENT_BSSID, NL80211_BSS_CHAIN_SIGNAL, NL80211_BSS_FREQUENCY_OFFSET, + NL80211_BSS_MLO_LINK_ID, + NL80211_BSS_MLD_ADDR, /* keep last */ __NL80211_BSS_AFTER_LAST, @@ -4929,6 +5205,7 @@ enum nl80211_txrate_gi { * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz) * @NL80211_BAND_6GHZ: around 6 GHz band (5.9 - 7.2 GHz) * @NL80211_BAND_S1GHZ: around 900MHz, supported by S1G PHYs + * @NL80211_BAND_LC: light communication band (placeholder) * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace * since newer kernel versions may support more bands */ @@ -4938,6 +5215,7 @@ enum nl80211_band { NL80211_BAND_60GHZ, NL80211_BAND_6GHZ, NL80211_BAND_S1GHZ, + NL80211_BAND_LC, NUM_NL80211_BANDS, }; @@ -5686,13 +5964,15 @@ enum nl80211_tdls_operation { NL80211_TDLS_DISABLE_LINK, }; -/* +/** * enum nl80211_ap_sme_features - device-integrated AP features - * Reserved for future use, no bits are defined in - * NL80211_ATTR_DEVICE_AP_SME yet. -enum nl80211_ap_sme_features { -}; + * @NL80211_AP_SME_SA_QUERY_OFFLOAD: SA Query procedures offloaded to driver + * when user space indicates support for SA Query procedures offload during + * "start ap" with %NL80211_AP_SETTINGS_SA_QUERY_OFFLOAD_SUPPORT. */ +enum nl80211_ap_sme_features { + NL80211_AP_SME_SA_QUERY_OFFLOAD = 1 << 0, +}; /** * enum nl80211_feature_flags - device/driver features @@ -5995,6 +6275,22 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_BSS_COLOR: The driver supports BSS color collision * detection and change announcements.
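The new 320 MHz plumbing spans three enums above (the frequency attribute, the regulatory rule flag, and the channel width). A minimal consistency check, with a hypothetical helper name, ties them together:

```c
/* Minimal sketch (hypothetical helper): gate a requested channel width on
 * the regulatory rule flags, including the NL80211_RRF_NO_320MHZ flag
 * added above. Checks for other widths are elided. */
#include <linux/nl80211.h>
#include <stdbool.h>
#include <stdint.h>

static bool reg_rule_allows_width(uint32_t rrf_flags,
				  enum nl80211_chan_width width)
{
	if (width == NL80211_CHAN_WIDTH_320)
		return !(rrf_flags & NL80211_RRF_NO_320MHZ);
	if (width == NL80211_CHAN_WIDTH_160)
		return !(rrf_flags & NL80211_RRF_NO_160MHZ);
	return true;	/* narrower widths: further checks elided */
}
```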
* + * @NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD: Driver running in AP mode supports + * FILS encryption and decryption for (Re)Association Request and Response + * frames. Userspace has to share FILS AAD details with the driver by using + * @NL80211_CMD_SET_FILS_AAD. + * + * @NL80211_EXT_FEATURE_RADAR_BACKGROUND: Device supports background radar/CAC + * detection. + * + * @NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE: Device can perform a MAC address + * change without having to bring the underlying network device down + * first. For example, in station mode this can be used to vary the + * origin MAC address prior to a connection to a new AP for privacy + * or other reasons. Note that certain driver specific restrictions + * might apply, e.g. no scans in progress, no offchannel operations + * in progress, and no active connections. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6060,6 +6356,9 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SECURE_RTT, NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE, NL80211_EXT_FEATURE_BSS_COLOR, + NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD, + NL80211_EXT_FEATURE_RADAR_BACKGROUND, + NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -7349,4 +7648,76 @@ enum nl80211_sar_specs_attrs { NL80211_SAR_ATTR_SPECS_MAX = __NL80211_SAR_ATTR_SPECS_LAST - 1, }; +/** + * enum nl80211_mbssid_config_attributes - multiple BSSID (MBSSID) and enhanced + * multi-BSSID advertisements (EMA) in AP mode. + * Kernel uses some of these attributes to advertise the driver's support for + * MBSSID and EMA. + * Remaining attributes should be used by the userspace to configure the + * features. + * + * @__NL80211_MBSSID_CONFIG_ATTR_INVALID: Invalid + * + * @NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES: Used by the kernel to advertise + * the maximum number of MBSSID interfaces supported by the driver. + * Driver should indicate MBSSID support by setting + * wiphy->mbssid_max_interfaces to a value greater than or equal to 2. + * + * @NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY: Used by the kernel + * to advertise the maximum profile periodicity supported by the driver + * if EMA is enabled. Driver should indicate EMA support to the userspace + * by setting wiphy->ema_max_profile_periodicity to + * a non-zero value. + * + * @NL80211_MBSSID_CONFIG_ATTR_INDEX: Mandatory parameter to pass the index of + * this BSS (u8) in the multiple BSSID set. + * Value must be set to 0 for the transmitting interface and non-zero for + * all non-transmitting interfaces. The userspace will be responsible + * for using unique indices for the interfaces. + * Range: 0 to wiphy->mbssid_max_interfaces-1. + * + * @NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX: Mandatory parameter for + * a non-transmitted profile which provides the interface index (u32) of + * the transmitted profile. The value must match one of the interface + * indices advertised by the kernel. Optional if the interface being set up + * is the transmitting one, however, if provided then the value must match + * the interface index of the same. + * + * @NL80211_MBSSID_CONFIG_ATTR_EMA: Flag used to enable EMA AP feature. + * Setting this flag is permitted only if the driver advertises EMA support + * by setting wiphy->ema_max_profile_periodicity to non-zero.
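The three new indices land at the tail of enum nl80211_ext_feature_index above. Userspace discovers them through the NL80211_ATTR_EXT_FEATURES byte bitmap, conventionally tested bit by bit the way wpa_supplicant's ext_feature_isset() does; a sketch of that test (helper name assumed):

```c
/* Sketch: test a feature index against the NL80211_ATTR_EXT_FEATURES byte
 * bitmap returned by NL80211_CMD_GET_WIPHY. Assumed layout: bit N of the
 * array is feature N, matching wpa_supplicant's ext_feature_isset(). */
#include <linux/nl80211.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool ext_feature_isset(const uint8_t *ext_features, size_t len,
			      enum nl80211_ext_feature_index idx)
{
	if ((size_t)idx / 8 >= len)
		return false;
	return ext_features[idx / 8] & (1 << (idx % 8));
}

/* e.g. ext_feature_isset(buf, buflen, NL80211_EXT_FEATURE_RADAR_BACKGROUND) */
```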
+ * + * @__NL80211_MBSSID_CONFIG_ATTR_LAST: Internal + * @NL80211_MBSSID_CONFIG_ATTR_MAX: highest attribute + */ +enum nl80211_mbssid_config_attributes { + __NL80211_MBSSID_CONFIG_ATTR_INVALID, + + NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES, + NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY, + NL80211_MBSSID_CONFIG_ATTR_INDEX, + NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX, + NL80211_MBSSID_CONFIG_ATTR_EMA, + + /* keep last */ + __NL80211_MBSSID_CONFIG_ATTR_LAST, + NL80211_MBSSID_CONFIG_ATTR_MAX = __NL80211_MBSSID_CONFIG_ATTR_LAST - 1, +}; + +/** + * enum nl80211_ap_settings_flags - AP settings flags + * + * @NL80211_AP_SETTINGS_EXTERNAL_AUTH_SUPPORT: AP supports external + * authentication. + * @NL80211_AP_SETTINGS_SA_QUERY_OFFLOAD_SUPPORT: Userspace supports SA Query + * procedures offload to driver. If driver advertises + * %NL80211_AP_SME_SA_QUERY_OFFLOAD in AP SME features, userspace shall + * ignore SA Query procedures and validations when this flag is set by + * userspace. + */ +enum nl80211_ap_settings_flags { + NL80211_AP_SETTINGS_EXTERNAL_AUTH_SUPPORT = 1 << 0, + NL80211_AP_SETTINGS_SA_QUERY_OFFLOAD_SUPPORT = 1 << 1, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 43bd7f713c39..5de124e653a4 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -235,7 +235,7 @@ struct prctl_mm_map { #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) /* MTE tag check fault modes */ -# define PR_MTE_TCF_NONE 0 +# define PR_MTE_TCF_NONE 0UL # define PR_MTE_TCF_SYNC (1UL << 1) # define PR_MTE_TCF_ASYNC (1UL << 2) # define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC) @@ -269,4 +269,16 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ # define PR_SCHED_CORE_MAX 4 +/* arm64 Scalable Matrix Extension controls */ +/* Flag values must be in sync with SVE versions */ +#define PR_SME_SET_VL 63 /* set task vector length */ +# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SME_GET_VL 64 /* get task vector length */ +/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */ +# define PR_SME_VL_LEN_MASK 0xffff +# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ + +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h index 2fcad1dd0b0e..1529f3154d5d 100644 --- a/include/uapi/linux/psci.h +++ b/include/uapi/linux/psci.h @@ -51,6 +51,7 @@ #define PSCI_1_0_FN_SYSTEM_SUSPEND PSCI_0_2_FN(14) #define PSCI_1_0_FN_SET_SUSPEND_MODE PSCI_0_2_FN(15) #define PSCI_1_1_FN_SYSTEM_RESET2 PSCI_0_2_FN(18) +#define PSCI_1_1_FN_MEM_PROTECT PSCI_0_2_FN(19) #define PSCI_1_0_FN64_SYSTEM_SUSPEND PSCI_0_2_FN64(14) #define PSCI_1_1_FN64_SYSTEM_RESET2 PSCI_0_2_FN64(18) @@ -82,6 +83,10 @@ #define PSCI_0_2_TOS_UP_NO_MIGRATE 1 #define PSCI_0_2_TOS_MP 2 +/* PSCI v1.1 reset type encoding for SYSTEM_RESET2 */ +#define PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET 0 +#define PSCI_1_1_RESET_TYPE_VENDOR_START 0x80000000U + /* PSCI version decoding (independent of PSCI version) */ #define PSCI_VERSION_MAJOR_SHIFT 16 #define PSCI_VERSION_MINOR_MASK \ diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h index f5ca8740f3fb..1637e68177d9 100644 --- a/include/uapi/linux/rpmsg.h +++ b/include/uapi/linux/rpmsg.h @@ -33,4 +33,14 @@ struct rpmsg_endpoint_info { */ #define RPMSG_DESTROY_EPT_IOCTL _IO(0xb5, 0x2) +/** + * Instantiate a new local 
rpmsg service device. + */ +#define RPMSG_CREATE_DEV_IOCTL _IOW(0xb5, 0x3, struct rpmsg_endpoint_info) + +/** + * Release a local rpmsg device. + */ +#define RPMSG_RELEASE_DEV_IOCTL _IOW(0xb5, 0x4, struct rpmsg_endpoint_info) + #endif diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 4dfc05651c98..c00adf2fe868 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -43,10 +43,6 @@ #include #include -#ifndef __KERNEL__ -#include /* for ntohs etc. */ -#endif - /* * Configuration * @@ -269,33 +265,33 @@ static inline int TLV_OK(const void *tlv, __u16 space) */ return (space >= TLV_SPACE(0)) && - (ntohs(((struct tlv_desc *)tlv)->tlv_len) <= space); + (__be16_to_cpu(((struct tlv_desc *)tlv)->tlv_len) <= space); } static inline int TLV_CHECK(const void *tlv, __u16 space, __u16 exp_type) { return TLV_OK(tlv, space) && - (ntohs(((struct tlv_desc *)tlv)->tlv_type) == exp_type); + (__be16_to_cpu(((struct tlv_desc *)tlv)->tlv_type) == exp_type); } static inline int TLV_GET_LEN(struct tlv_desc *tlv) { - return ntohs(tlv->tlv_len); + return __be16_to_cpu(tlv->tlv_len); } static inline void TLV_SET_LEN(struct tlv_desc *tlv, __u16 len) { - tlv->tlv_len = htons(len); + tlv->tlv_len = __cpu_to_be16(len); } static inline int TLV_CHECK_TYPE(struct tlv_desc *tlv, __u16 type) { - return (ntohs(tlv->tlv_type) == type); + return (__be16_to_cpu(tlv->tlv_type) == type); } static inline void TLV_SET_TYPE(struct tlv_desc *tlv, __u16 type) { - tlv->tlv_type = htons(type); + tlv->tlv_type = __cpu_to_be16(type); } static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) @@ -305,8 +301,8 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) tlv_len = TLV_LENGTH(len); tlv_ptr = (struct tlv_desc *)tlv; - tlv_ptr->tlv_type = htons(type); - tlv_ptr->tlv_len = htons(tlv_len); + tlv_ptr->tlv_type = __cpu_to_be16(type); + tlv_ptr->tlv_len = __cpu_to_be16(tlv_len); if (len && data) { memcpy(TLV_DATA(tlv_ptr), data, len); memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); @@ -348,7 +344,7 @@ static inline void *TLV_LIST_DATA(struct tlv_list_desc *list) static inline void TLV_LIST_STEP(struct tlv_list_desc *list) { - __u16 tlv_space = TLV_ALIGN(ntohs(list->tlv_ptr->tlv_len)); + __u16 tlv_space = TLV_ALIGN(__be16_to_cpu(list->tlv_ptr->tlv_len)); list->tlv_ptr = (struct tlv_desc *)((char *)list->tlv_ptr + tlv_space); list->tlv_space -= tlv_space; @@ -404,9 +400,9 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags, msg_len = TCM_LENGTH(data_len); tcm_hdr = (struct tipc_cfg_msg_hdr *)msg; - tcm_hdr->tcm_len = htonl(msg_len); - tcm_hdr->tcm_type = htons(cmd); - tcm_hdr->tcm_flags = htons(flags); + tcm_hdr->tcm_len = __cpu_to_be32(msg_len); + tcm_hdr->tcm_type = __cpu_to_be16(cmd); + tcm_hdr->tcm_flags = __cpu_to_be16(flags); if (data_len && data) { memcpy(TCM_DATA(msg), data, data_len); memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); diff --git a/include/uapi/linux/usb/f_accessory.h b/include/uapi/linux/usb/f_accessory.h new file mode 100644 index 000000000000..0baeb7d0d74c --- /dev/null +++ b/include/uapi/linux/usb/f_accessory.h @@ -0,0 +1,146 @@ +/* + * Gadget Function Driver for Android USB accessories + * + * Copyright (C) 2011 Google, Inc. 
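The two rpmsg ioctls added above take the existing struct rpmsg_endpoint_info. A userspace sketch follows; the /dev/rpmsg_ctrl0 node name and the service name/addresses are illustrative assumptions, not values from this patch:

```c
/* Sketch: instantiate a local rpmsg device from userspace with the new
 * RPMSG_CREATE_DEV_IOCTL. Control node path and endpoint values are
 * illustrative; RPMSG_RELEASE_DEV_IOCTL undoes the creation. */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/rpmsg.h>

int create_rpmsg_dev(void)
{
	struct rpmsg_endpoint_info info = { .src = 0x400, .dst = 0x401 };
	int fd, ret;

	strncpy(info.name, "rpmsg-demo", sizeof(info.name) - 1);
	fd = open("/dev/rpmsg_ctrl0", O_RDWR);
	if (fd < 0)
		return -1;
	ret = ioctl(fd, RPMSG_CREATE_DEV_IOCTL, &info);
	close(fd);
	return ret;
}
```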
+ * Author: Mike Lockwood + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _UAPI_LINUX_USB_F_ACCESSORY_H +#define _UAPI_LINUX_USB_F_ACCESSORY_H + +/* Use Google Vendor ID when in accessory mode */ +#define USB_ACCESSORY_VENDOR_ID 0x18D1 + + +/* Product ID to use when in accessory mode */ +#define USB_ACCESSORY_PRODUCT_ID 0x2D00 + +/* Product ID to use when in accessory mode and adb is enabled */ +#define USB_ACCESSORY_ADB_PRODUCT_ID 0x2D01 + +/* Indexes for strings sent by the host via ACCESSORY_SEND_STRING */ +#define ACCESSORY_STRING_MANUFACTURER 0 +#define ACCESSORY_STRING_MODEL 1 +#define ACCESSORY_STRING_DESCRIPTION 2 +#define ACCESSORY_STRING_VERSION 3 +#define ACCESSORY_STRING_URI 4 +#define ACCESSORY_STRING_SERIAL 5 + +/* Control request for retrieving device's protocol version + * + * requestType: USB_DIR_IN | USB_TYPE_VENDOR + * request: ACCESSORY_GET_PROTOCOL + * value: 0 + * index: 0 + * data version number (16 bits little endian) + * 1 for original accessory support + * 2 adds HID and device to host audio support + */ +#define ACCESSORY_GET_PROTOCOL 51 + +/* Control request for host to send a string to the device + * + * requestType: USB_DIR_OUT | USB_TYPE_VENDOR + * request: ACCESSORY_SEND_STRING + * value: 0 + * index: string ID + * data zero terminated UTF8 string + * + * The device can later retrieve these strings via the + * ACCESSORY_GET_STRING_* ioctls + */ +#define ACCESSORY_SEND_STRING 52 + +/* Control request for starting device in accessory mode. + * The host sends this after setting all its strings to the device. + * + * requestType: USB_DIR_OUT | USB_TYPE_VENDOR + * request: ACCESSORY_START + * value: 0 + * index: 0 + * data none + */ +#define ACCESSORY_START 53 + +/* Control request for registering a HID device. + * Upon registering, a unique ID is sent by the accessory in the + * value parameter. This ID will be used for future commands for + * the device + * + * requestType: USB_DIR_OUT | USB_TYPE_VENDOR + * request: ACCESSORY_REGISTER_HID + * value: Accessory assigned ID for the HID device + * index: total length of the HID report descriptor + * data none + */ +#define ACCESSORY_REGISTER_HID 54 + +/* Control request for unregistering a HID device. + * + * requestType: USB_DIR_OUT | USB_TYPE_VENDOR + * request: ACCESSORY_UNREGISTER_HID + * value: Accessory assigned ID for the HID device + * index: 0 + * data none + */ +#define ACCESSORY_UNREGISTER_HID 55 + +/* Control request for sending the HID report descriptor. + * If the HID descriptor is longer than the endpoint zero max packet size, + * the descriptor will be sent in multiple ACCESSORY_SET_HID_REPORT_DESC + * commands. The data for the descriptor must be sent sequentially + * if multiple packets are needed.
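On the host side, the control requests defined above map directly onto vendor control transfers. A libusb-1.0 sketch of the GET_PROTOCOL / SEND_STRING / START handshake; device-handle acquisition and the manufacturer/model strings are assumptions:

```c
/* Host-side sketch (libusb-1.0): switch an attached Android device into
 * accessory mode using the vendor control requests defined above.
 * Strings are illustrative; error handling is minimal. */
#include <libusb-1.0/libusb.h>
#include <stdint.h>
#include <string.h>

#define REQ_IN  (LIBUSB_ENDPOINT_IN  | LIBUSB_REQUEST_TYPE_VENDOR)
#define REQ_OUT (LIBUSB_ENDPOINT_OUT | LIBUSB_REQUEST_TYPE_VENDOR)

static int start_accessory(libusb_device_handle *h)
{
	uint8_t ver[2];
	const char *mfg = "ExampleCo", *model = "DemoAccessory";

	/* 51: query protocol version (16-bit little endian reply) */
	if (libusb_control_transfer(h, REQ_IN, 51 /* ACCESSORY_GET_PROTOCOL */,
				    0, 0, ver, sizeof(ver), 1000) != 2)
		return -1;
	if (ver[0] == 0 && ver[1] == 0)
		return -1;	/* protocol 0: accessory mode unsupported */

	/* 52: send identifying strings; wIndex selects the string ID */
	libusb_control_transfer(h, REQ_OUT, 52, 0, 0 /* MANUFACTURER */,
				(unsigned char *)mfg, strlen(mfg) + 1, 1000);
	libusb_control_transfer(h, REQ_OUT, 52, 0, 1 /* MODEL */,
				(unsigned char *)model, strlen(model) + 1, 1000);

	/* 53: device re-enumerates as 0x18D1:0x2D00 (or 0x2D01 with adb) */
	return libusb_control_transfer(h, REQ_OUT, 53 /* ACCESSORY_START */,
				       0, 0, NULL, 0, 1000) < 0 ? -1 : 0;
}
```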
+ * + * requestType: USB_DIR_OUT | USB_TYPE_VENDOR + * request: ACCESSORY_SET_HID_REPORT_DESC + * value: Accessory assigned ID for the HID device + * index: offset of data in descriptor + * (needed when HID descriptor is too big for one packet) + * data the HID report descriptor + */ +#define ACCESSORY_SET_HID_REPORT_DESC 56 + +/* Control request for sending HID events. + * + * requestType: USB_DIR_OUT | USB_TYPE_VENDOR + * request: ACCESSORY_SEND_HID_EVENT + * value: Accessory assigned ID for the HID device + * index: 0 + * data the HID report for the event + */ +#define ACCESSORY_SEND_HID_EVENT 57 + +/* Control request for setting the audio mode. + * + * requestType: USB_DIR_OUT | USB_TYPE_VENDOR + * request: ACCESSORY_SET_AUDIO_MODE + * value: 0 - no audio + * 1 - device to host, 44100 16-bit stereo PCM + * index: 0 + * data none + */ +#define ACCESSORY_SET_AUDIO_MODE 58 + +/* ioctls for retrieving strings set by the host */ +#define ACCESSORY_GET_STRING_MANUFACTURER _IOW('M', 1, char[256]) +#define ACCESSORY_GET_STRING_MODEL _IOW('M', 2, char[256]) +#define ACCESSORY_GET_STRING_DESCRIPTION _IOW('M', 3, char[256]) +#define ACCESSORY_GET_STRING_VERSION _IOW('M', 4, char[256]) +#define ACCESSORY_GET_STRING_URI _IOW('M', 5, char[256]) +#define ACCESSORY_GET_STRING_SERIAL _IOW('M', 6, char[256]) +/* returns 1 if there is a start request pending */ +#define ACCESSORY_IS_START_REQUESTED _IO('M', 7) +/* returns audio mode (set via the ACCESSORY_SET_AUDIO_MODE control request) */ +#define ACCESSORY_GET_AUDIO_MODE _IO('M', 8) + +#endif /* _UAPI_LINUX_USB_F_ACCESSORY_H */ diff --git a/include/uapi/linux/usb/g_uvc.h b/include/uapi/linux/usb/g_uvc.h index 652f169a019e..8d7824dde1b2 100644 --- a/include/uapi/linux/usb/g_uvc.h +++ b/include/uapi/linux/usb/g_uvc.h @@ -21,6 +21,9 @@ #define UVC_EVENT_DATA (V4L2_EVENT_PRIVATE_START + 5) #define UVC_EVENT_LAST (V4L2_EVENT_PRIVATE_START + 5) +#define UVC_STRING_CONTROL_IDX 0 +#define UVC_STRING_STREAMING_IDX 1 + struct uvc_request_data { __s32 length; __u8 data[60]; diff --git a/include/uapi/linux/usb/video.h b/include/uapi/linux/usb/video.h index bfdae12cdacf..6e8e572c2980 100644 --- a/include/uapi/linux/usb/video.h +++ b/include/uapi/linux/usb/video.h @@ -466,7 +466,7 @@ struct uvc_format_uncompressed { __u8 bDefaultFrameIndex; __u8 bAspectRatioX; __u8 bAspectRatioY; - __u8 bmInterfaceFlags; + __u8 bmInterlaceFlags; __u8 bCopyProtect; } __attribute__((__packed__)); @@ -522,7 +522,7 @@ struct uvc_format_mjpeg { __u8 bDefaultFrameIndex; __u8 bAspectRatioX; __u8 bAspectRatioY; - __u8 bmInterfaceFlags; + __u8 bmInterlaceFlags; __u8 bCopyProtect; } __attribute__((__packed__)); diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index df3a8448375f..cff2d8aa031d 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -128,6 +128,7 @@ enum v4l2_colorfx { V4L2_COLORFX_SOLARIZATION = 13, V4L2_COLORFX_ANTIQUE = 14, V4L2_COLORFX_SET_CBCR = 15, + V4L2_COLORFX_SET_RGB = 16, }; #define V4L2_CID_AUTOBRIGHTNESS (V4L2_CID_BASE+32) #define V4L2_CID_BAND_STOP_FILTER (V4L2_CID_BASE+33) @@ -145,9 +146,10 @@ enum v4l2_colorfx { #define V4L2_CID_ALPHA_COMPONENT (V4L2_CID_BASE+41) #define V4L2_CID_COLORFX_CBCR (V4L2_CID_BASE+42) +#define V4L2_CID_COLORFX_RGB (V4L2_CID_BASE+43) /* last CID + 1 */ -#define V4L2_CID_LASTP1 (V4L2_CID_BASE+43) +#define V4L2_CID_LASTP1 (V4L2_CID_BASE+44) /* USER-class private control IDs */ diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 
bc9337b00c0f..786b42963601 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -70,7 +70,7 @@ * Common stuff for both V4L1 and V4L2 * Moved from videodev.h */ -#define VIDEO_MAX_FRAME 32 +#define VIDEO_MAX_FRAME 64 #define VIDEO_MAX_PLANES 8 /* diff --git a/include/uapi/scsi/scsi_bsg_ufs.h b/include/uapi/scsi/scsi_bsg_ufs.h index d55f2176dfd4..5e4891fd99c2 100644 --- a/include/uapi/scsi/scsi_bsg_ufs.h +++ b/include/uapi/scsi/scsi_bsg_ufs.h @@ -14,10 +14,27 @@ */ #define UFS_CDB_SIZE 16 -#define UPIU_TRANSACTION_UIC_CMD 0x1F /* uic commands are 4DW long, per UFSHCI V2.1 paragraph 5.6.1 */ #define UIC_CMD_SIZE (sizeof(__u32) * 4) +enum ufs_bsg_msg_code { + UPIU_TRANSACTION_UIC_CMD = 0x1F, + UPIU_TRANSACTION_ARPMB_CMD, +}; + +/* UFS RPMB Request Message Types */ +enum ufs_rpmb_op_type { + UFS_RPMB_WRITE_KEY = 0x01, + UFS_RPMB_READ_CNT = 0x02, + UFS_RPMB_WRITE = 0x03, + UFS_RPMB_READ = 0x04, + UFS_RPMB_READ_RESP = 0x05, + UFS_RPMB_SEC_CONF_WRITE = 0x06, + UFS_RPMB_SEC_CONF_READ = 0x07, + UFS_RPMB_PURGE_ENABLE = 0x08, + UFS_RPMB_PURGE_STATUS_READ = 0x09, +}; + /** * struct utp_upiu_header - UPIU header structure * @dword_0: UPIU header DW-0 @@ -79,6 +96,23 @@ struct utp_upiu_req { }; }; +struct ufs_arpmb_meta { + __u16 req_resp_type; + __u8 nonce[16]; + __u32 write_counter; + __u16 addr_lun; + __u16 block_count; + __u16 result; +} __attribute__((__packed__)); + +struct ufs_ehs { + __u8 length; + __u8 ehs_type; + __u16 ehssub_type; + struct ufs_arpmb_meta meta; + __u8 mac_key[32]; +} __attribute__((__packed__)); + /* request (CDB) structure of the sg_io_v4 */ struct ufs_bsg_request { __u32 msgcode; @@ -102,4 +136,14 @@ struct ufs_bsg_reply { struct utp_upiu_req upiu_rsp; }; + +struct ufs_rpmb_request { + struct ufs_bsg_request bsg_request; + struct ufs_ehs ehs_req; +}; + +struct ufs_rpmb_reply { + struct ufs_bsg_reply bsg_reply; + struct ufs_ehs ehs_rsp; +}; #endif /* UFS_BSG_H */ diff --git a/drivers/scsi/ufs/ufs.h b/include/ufs/ufs.h similarity index 90% rename from drivers/scsi/ufs/ufs.h rename to include/ufs/ufs.h index 8c6b38b1b142..651d493c4d60 100644 --- a/drivers/scsi/ufs/ufs.h +++ b/include/ufs/ufs.h @@ -13,6 +13,8 @@ #include #include +#include +#include #include #define GENERAL_UPIU_REQUEST_SIZE (sizeof(struct utp_upiu_req)) @@ -38,11 +40,21 @@ #define UFS_UPIU_MAX_UNIT_NUM_ID 0x7F #define UFS_MAX_LUNS (SCSI_W_LUN_BASE + UFS_UPIU_MAX_UNIT_NUM_ID) #define UFS_UPIU_WLUN_ID (1 << 7) -#define UFS_RPMB_UNIT 0xC4 /* WriteBooster buffer is available only for the logical unit from 0 to 7 */ #define UFS_UPIU_MAX_WB_LUN_ID 8 +/* + * WriteBooster buffer lifetime has a limit set by the vendor. + * If it is over the limit, the WriteBooster feature will be disabled.
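The new advanced-RPMB wrapper types in scsi_bsg_ufs.h compose as shown in the sketch below for a read-write-counter request. The surrounding sg_io_v4 BSG submission, MAC computation, and endianness handling are omitted, and the EHS type/length values are only indicative:

```c
/* Sketch: fill the new ufs_rpmb_request wrapper for a read-counter
 * operation. The 64-byte EHS is 2 units of 32 bytes; field encodings
 * here are illustrative, not a complete RPMB frame builder. */
#include <string.h>
#include <scsi/scsi_bsg_ufs.h>

static void prep_rpmb_read_cnt(struct ufs_rpmb_request *req,
			       const __u8 nonce[16])
{
	memset(req, 0, sizeof(*req));
	req->bsg_request.msgcode = UPIU_TRANSACTION_ARPMB_CMD;
	req->ehs_req.length = 2;		/* EHS length, 32-byte units */
	req->ehs_req.ehs_type = 1;		/* assumed EHS type value */
	req->ehs_req.meta.req_resp_type = UFS_RPMB_READ_CNT;
	memcpy(req->ehs_req.meta.nonce, nonce, 16);
	req->ehs_req.meta.block_count = 0;	/* not used for READ_CNT */
}
```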
+ */ +#define UFS_WB_EXCEED_LIFETIME 0x0B + +/* + * In UFS Spec, the Extra Header Segment (EHS) starts from byte 32 in UPIU request/response packet + */ +#define EHS_OFFSET_IN_RESPONSE 32 + /* Well known logical unit id in LUN field of UPIU */ enum { UFS_UPIU_REPORT_LUNS_WLUN = 0x81, @@ -152,10 +164,14 @@ enum attr_idn { QUERY_ATTR_IDN_PSA_STATE = 0x15, QUERY_ATTR_IDN_PSA_DATA_SIZE = 0x16, QUERY_ATTR_IDN_REF_CLK_GATING_WAIT_TIME = 0x17, + QUERY_ATTR_IDN_CASE_ROUGH_TEMP = 0x18, + QUERY_ATTR_IDN_HIGH_TEMP_BOUND = 0x19, + QUERY_ATTR_IDN_LOW_TEMP_BOUND = 0x1A, QUERY_ATTR_IDN_WB_FLUSH_STATUS = 0x1C, QUERY_ATTR_IDN_AVAIL_WB_BUFF_SIZE = 0x1D, QUERY_ATTR_IDN_WB_BUFF_LIFE_TIME_EST = 0x1E, QUERY_ATTR_IDN_CURR_WB_BUFF_SIZE = 0x1F, + QUERY_ATTR_IDN_EXT_IID_EN = 0x2A, }; /* Descriptor idn for Query requests */ @@ -203,6 +219,28 @@ enum unit_desc_param { UNIT_DESC_PARAM_WB_BUF_ALLOC_UNITS = 0x29, }; +/* RPMB Unit descriptor parameters offsets in bytes*/ +enum rpmb_unit_desc_param { + RPMB_UNIT_DESC_PARAM_LEN = 0x0, + RPMB_UNIT_DESC_PARAM_TYPE = 0x1, + RPMB_UNIT_DESC_PARAM_UNIT_INDEX = 0x2, + RPMB_UNIT_DESC_PARAM_LU_ENABLE = 0x3, + RPMB_UNIT_DESC_PARAM_BOOT_LUN_ID = 0x4, + RPMB_UNIT_DESC_PARAM_LU_WR_PROTECT = 0x5, + RPMB_UNIT_DESC_PARAM_LU_Q_DEPTH = 0x6, + RPMB_UNIT_DESC_PARAM_PSA_SENSITIVE = 0x7, + RPMB_UNIT_DESC_PARAM_MEM_TYPE = 0x8, + RPMB_UNIT_DESC_PARAM_REGION_EN = 0x9, + RPMB_UNIT_DESC_PARAM_LOGICAL_BLK_SIZE = 0xA, + RPMB_UNIT_DESC_PARAM_LOGICAL_BLK_COUNT = 0xB, + RPMB_UNIT_DESC_PARAM_REGION0_SIZE = 0x13, + RPMB_UNIT_DESC_PARAM_REGION1_SIZE = 0x14, + RPMB_UNIT_DESC_PARAM_REGION2_SIZE = 0x15, + RPMB_UNIT_DESC_PARAM_REGION3_SIZE = 0x16, + RPMB_UNIT_DESC_PARAM_PROVISIONING_TYPE = 0x17, + RPMB_UNIT_DESC_PARAM_PHY_MEM_RSRC_CNT = 0x18, +}; + /* Device descriptor parameters offsets in bytes*/ enum device_desc_param { DEVICE_DESC_PARAM_LEN = 0x0, @@ -338,8 +376,12 @@ enum { /* Possible values for dExtendedUFSFeaturesSupport */ enum { + UFS_DEV_LOW_TEMP_NOTIF = BIT(4), + UFS_DEV_HIGH_TEMP_NOTIF = BIT(5), + UFS_DEV_EXT_TEMP_NOTIF = BIT(6), UFS_DEV_HPB_SUPPORT = BIT(7), UFS_DEV_WRITE_BOOSTER_SUP = BIT(8), + UFS_DEV_EXT_IID_SUP = BIT(16), }; #define UFS_DEV_HPB_SUPPORT_VERSION 0x310 @@ -370,6 +412,7 @@ enum { MASK_EE_WRITEBOOSTER_EVENT = BIT(5), MASK_EE_PERFORMANCE_THROTTLING = BIT(6), }; +#define MASK_EE_URGENT_TEMP (MASK_EE_TOO_HIGH_TEMP | MASK_EE_TOO_LOW_TEMP) /* Background operation status */ enum bkops_status { @@ -402,11 +445,6 @@ enum ufs_ref_clk_freq { REF_CLK_FREQ_INVAL = -1, }; -struct ufs_ref_clk { - unsigned long freq_hz; - enum ufs_ref_clk_freq val; -}; - /* Query response result code */ enum { QUERY_RESULT_SUCCESS = 0x00, @@ -549,15 +587,6 @@ struct ufs_query_res { struct utp_upiu_query upiu_res; }; -#define UFS_VREG_VCC_MIN_UV 2700000 /* uV */ -#define UFS_VREG_VCC_MAX_UV 3600000 /* uV */ -#define UFS_VREG_VCC_1P8_MIN_UV 1700000 /* uV */ -#define UFS_VREG_VCC_1P8_MAX_UV 1950000 /* uV */ -#define UFS_VREG_VCCQ_MIN_UV 1140000 /* uV */ -#define UFS_VREG_VCCQ_MAX_UV 1260000 /* uV */ -#define UFS_VREG_VCCQ2_MIN_UV 1700000 /* uV */ -#define UFS_VREG_VCCQ2_MAX_UV 1950000 /* uV */ - /* * VCCQ & VCCQ2 current requirement when UFS device is in sleep state * and link is in Hibern8 state. 
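The new temperature attributes pair with the UFSHCD_CAP_TEMP_NOTIF/hwmon support elsewhere in this series; reading one follows the usual query-attribute path. A kernel-side sketch (error handling trimmed; the value-minus-80 Celsius encoding matches what the ufs-hwmon code applies):

```c
/* Kernel-side sketch, as it would sit inside the UFS core. Assumes
 * ufshcd_query_attr() is visible via the core's private header. */
#include <ufs/ufshcd.h>
#include "ufshcd-priv.h"	/* assumed location of ufshcd_query_attr() */

static int ufs_read_case_temp(struct ufs_hba *hba, long *temp_mc)
{
	u32 value = 0;
	int err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR,
				    QUERY_ATTR_IDN_CASE_ROUGH_TEMP, 0, 0,
				    &value);
	if (err)
		return err;
	*temp_mc = ((long)value - 80) * 1000;	/* millidegrees Celsius */
	return 0;
}
```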
@@ -569,8 +598,6 @@ struct ufs_vreg { const char *name; bool always_on; bool enabled; - int min_uV; - int max_uV; int max_uA; }; @@ -592,6 +619,8 @@ struct ufs_dev_info { u8 *model; u16 wspecversion; u32 clk_gating_wait_us; + /* Stores the depth of queue in UFS device */ + u8 bqueuedepth; /* UFS HPB related flag */ bool hpb_enabled; @@ -604,6 +633,15 @@ struct ufs_dev_info { bool b_rpm_dev_flush_capable; u8 b_presrv_uspc_en; + + bool b_advanced_rpmb_en; + + /* UFS EXT_IID Enable */ + bool b_ext_iid_en; + + ANDROID_KABI_RESERVE(1); + + ANDROID_OEM_DATA(1); }; /* @@ -623,23 +661,4 @@ enum ufs_trace_tsf_t { UFS_TSF_CDB, UFS_TSF_OSF, UFS_TSF_TM_INPUT, UFS_TSF_TM_OUTPUT }; -/** - * ufs_is_valid_unit_desc_lun - checks if the given LUN has a unit descriptor - * @dev_info: pointer of instance of struct ufs_dev_info - * @lun: LU number to check - * @return: true if the lun has a matching unit descriptor, false otherwise - */ -static inline bool ufs_is_valid_unit_desc_lun(struct ufs_dev_info *dev_info, - u8 lun, u8 param_offset) -{ - if (!dev_info || !dev_info->max_lu_supported) { - pr_err("Max General LU supported by UFS isn't initialized\n"); - return false; - } - /* WB is available only for the logical unit from 0 to 7 */ - if (param_offset == UNIT_DESC_PARAM_WB_BUF_ALLOC_UNITS) - return lun < UFS_UPIU_MAX_WB_LUN_ID; - return lun == UFS_UPIU_RPMB_WLUN || (lun < dev_info->max_lu_supported); -} - #endif /* End of Header */ diff --git a/drivers/scsi/ufs/ufs_quirks.h b/include/ufs/ufs_quirks.h similarity index 94% rename from drivers/scsi/ufs/ufs_quirks.h rename to include/ufs/ufs_quirks.h index 35ec9ea79869..bcb4f004bed5 100644 --- a/drivers/scsi/ufs/ufs_quirks.h +++ b/include/ufs/ufs_quirks.h @@ -19,25 +19,16 @@ #define UFS_VENDOR_WDC 0x145 /** - * ufs_dev_fix - ufs device quirk info + * ufs_dev_quirk - ufs device quirk info * @card: ufs card details * @quirk: device quirk */ -struct ufs_dev_fix { +struct ufs_dev_quirk { u16 wmanufacturerid; - u8 *model; + const u8 *model; unsigned int quirk; }; -#define END_FIX { } - -/* add specific device quirk */ -#define UFS_FIX(_vendor, _model, _quirk) { \ - .wmanufacturerid = (_vendor),\ - .model = (_model), \ - .quirk = (_quirk), \ -} - /* * Some vendor's UFS device sends back to back NACs for the DL data frames * causing the host controller to raise the DFES error status. 
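With UFS_FIX() and END_FIX removed above, quirk tables fall back to plain designated initializers plus an empty sentinel entry; a sketch of the resulting style (the vendor/model/quirk combination here is illustrative):

```c
/* Sketch: a ufs_dev_quirk table in the post-UFS_FIX() style. */
#include <ufs/ufs_quirks.h>

static const struct ufs_dev_quirk ufs_fixups[] = {
	{ .wmanufacturerid = UFS_VENDOR_SAMSUNG,
	  .model = UFS_ANY_MODEL,
	  .quirk = UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM },
	{}	/* empty sentinel replaces END_FIX */
};
```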
Sometimes diff --git a/drivers/scsi/ufs/ufshcd.h b/include/ufs/ufshcd.h similarity index 70% rename from drivers/scsi/ufs/ufshcd.h rename to include/ufs/ufshcd.h index c8513cc6c2bd..b7b318f77fb6 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -12,50 +12,28 @@ #ifndef _UFSHCD_H #define _UFSHCD_H -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include +#include #include -#include -#include "unipro.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ufs.h" -#include "ufs_quirks.h" -#include "ufshci.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include #define UFSHCD "ufshcd" -#define UFSHCD_DRIVER_VERSION "0.2" struct ufs_hba; enum dev_cmd_type { DEV_CMD_TYPE_NOP = 0x0, DEV_CMD_TYPE_QUERY = 0x1, + DEV_CMD_TYPE_RPMB = 0x2, }; enum ufs_event_type { @@ -181,15 +159,15 @@ struct ufs_pm_lvl_states { * @ucd_rsp_dma_addr: UPIU response dma address for debug * @ucd_req_dma_addr: UPIU request dma address for debug * @cmd: pointer to SCSI command - * @sense_buffer: pointer to sense buffer address of the SCSI command - * @sense_bufflen: Length of the sense buffer * @scsi_status: SCSI status of the command * @command_type: SCSI, UFS, Query. * @task_tag: Task tag of the command * @lun: LUN of the command * @intr_cmd: Interrupt command (doesn't participate in interrupt aggregation) - * @issue_time_stamp: time stamp for debug purposes - * @compl_time_stamp: time stamp for statistics + * @issue_time_stamp: time stamp for debug purposes (CLOCK_MONOTONIC) + * @issue_time_stamp_local_clock: time stamp for debug purposes (local_clock) + * @compl_time_stamp: time stamp for statistics (CLOCK_MONOTONIC) + * @compl_time_stamp_local_clock: time stamp for debug purposes (local_clock) * @crypto_key_slot: the key slot to use for inline crypto (-1 if none) * @data_unit_num: the data unit number for the first block for inline crypto * @req_abort_skip: skip request abort task flag @@ -206,8 +184,6 @@ struct ufshcd_lrb { dma_addr_t ucd_prdt_dma_addr; struct scsi_cmnd *cmd; - u8 *sense_buffer; - unsigned int sense_bufflen; int scsi_status; int command_type; @@ -215,13 +191,17 @@ struct ufshcd_lrb { u8 lun; /* UPIU LUN id field is only 8-bit wide */ bool intr_cmd; ktime_t issue_time_stamp; + u64 issue_time_stamp_local_clock; ktime_t compl_time_stamp; + u64 compl_time_stamp_local_clock; #ifdef CONFIG_SCSI_UFS_CRYPTO int crypto_key_slot; u64 data_unit_num; #endif bool req_abort_skip; + + ANDROID_KABI_RESERVE(1); }; /** @@ -241,12 +221,14 @@ struct ufs_query { * @type: device management command type - Query, NOP OUT * @lock: lock to allow one command at a time * @complete: internal commands completion + * @query: Device management query information */ struct ufs_dev_cmd { enum dev_cmd_type type; struct mutex lock; struct completion *complete; struct ufs_query query; + struct cq_entry *cqe; }; /** @@ -258,7 +240,7 @@ struct ufs_dev_cmd { * @min_freq: min frequency that can be used for clock scaling * @curr_freq: indicates the current frequency that it is set to * @keep_link_active: indicates that the clk should not be disabled if - link is active + * link is active * @enabled: variable to check against multiple enable/disable */ struct ufs_clk_info { @@ -313,13 +295,21 @@ struct ufs_pwr_mode_info { * to set some things * @hibern8_notify: called around hibern8 enter/exit * 
@apply_dev_quirks: called to apply device specific quirks + * @fixup_dev_quirks: called to modify device specific quirks * @suspend: called during host controller PM callback * @resume: called during host controller PM callback * @dbg_register_dump: used to dump controller debug information * @phy_initialization: used to initialize phys * @device_reset: called to issue a reset pulse on the UFS device + * @config_scaling_param: called to configure clock scaling parameters * @program_key: program or evict an inline encryption key * @event_notify: called to notify important events + * @reinit_notify: called to notify reinit of UFSHCD during max gear switch + * @mcq_config_resource: called to configure MCQ platform resources + * @get_hba_mac: called to get vendor specific mac value, mandatory for mcq mode + * @op_runtime_config: called to config Operation and runtime regs Pointers + * @get_outstanding_cqs: called to get outstanding completion queues + * @config_esi: called to config Event Specific Interrupt */ struct ufs_hba_variant_ops { const char *name; @@ -338,24 +328,38 @@ struct ufs_hba_variant_ops { enum ufs_notify_change_status status, struct ufs_pa_layer_attr *, struct ufs_pa_layer_attr *); - void (*setup_xfer_req)(struct ufs_hba *, int, bool); + void (*setup_xfer_req)(struct ufs_hba *hba, int tag, + bool is_scsi_cmd); void (*setup_task_mgmt)(struct ufs_hba *, int, u8); void (*hibern8_notify)(struct ufs_hba *, enum uic_cmd_dme, enum ufs_notify_change_status); int (*apply_dev_quirks)(struct ufs_hba *hba); void (*fixup_dev_quirks)(struct ufs_hba *hba); - int (*suspend)(struct ufs_hba *, enum ufs_pm_op); + int (*suspend)(struct ufs_hba *, enum ufs_pm_op, + enum ufs_notify_change_status); int (*resume)(struct ufs_hba *, enum ufs_pm_op); void (*dbg_register_dump)(struct ufs_hba *hba); int (*phy_initialization)(struct ufs_hba *); int (*device_reset)(struct ufs_hba *hba); void (*config_scaling_param)(struct ufs_hba *hba, - struct devfreq_dev_profile *profile, - void *data); + struct devfreq_dev_profile *profile, + struct devfreq_simple_ondemand_data *data); int (*program_key)(struct ufs_hba *hba, const union ufs_crypto_cfg_entry *cfg, int slot); void (*event_notify)(struct ufs_hba *hba, enum ufs_event_type evt, void *data); + void (*reinit_notify)(struct ufs_hba *); + int (*mcq_config_resource)(struct ufs_hba *hba); + int (*get_hba_mac)(struct ufs_hba *hba); + int (*op_runtime_config)(struct ufs_hba *hba); + int (*get_outstanding_cqs)(struct ufs_hba *hba, + unsigned long *ocqs); + int (*config_esi)(struct ufs_hba *hba); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; /* clock gating state */ @@ -382,6 +386,7 @@ enum clk_gating_state { * @is_initialized: Indicates whether clock gating is initialized or not * @active_reqs: number of requests that are pending and should be waited for * completion before gating clocks. + * @clk_gating_workq: workqueue for clock gating work. 
*/ struct ufs_clk_gating { struct delayed_work gate_work; @@ -395,6 +400,8 @@ struct ufs_clk_gating { bool is_initialized; int active_reqs; struct workqueue_struct *clk_gating_workq; + + ANDROID_KABI_RESERVE(1); }; struct ufs_saved_pwr_info { @@ -418,9 +425,9 @@ struct ufs_saved_pwr_info { * @resume_work: worker to resume devfreq * @min_gear: lowest HS gear to scale down to * @is_enabled: tracks if scaling is currently enabled or not, controlled by - clkscale_enable sysfs node + * clkscale_enable sysfs node * @is_allowed: tracks if scaling is currently allowed or not, used to block - clock scaling which is not invoked from devfreq governor + * clock scaling which is not invoked from devfreq governor * @is_initialized: Indicates whether clock scaling is initialized or not * @is_busy_started: tracks if busy period has started or not * @is_suspended: tracks if devfreq is suspended or not @@ -441,20 +448,22 @@ struct ufs_clk_scaling { bool is_initialized; bool is_busy_started; bool is_suspended; + + ANDROID_KABI_RESERVE(1); }; #define UFS_EVENT_HIST_LENGTH 8 /** * struct ufs_event_hist - keeps history of errors * @pos: index to indicate cyclic buffer position - * @reg: cyclic buffer for registers value + * @val: cyclic buffer for registers value * @tstamp: cyclic buffer for time stamp * @cnt: error counter */ struct ufs_event_hist { int pos; u32 val[UFS_EVENT_HIST_LENGTH]; - ktime_t tstamp[UFS_EVENT_HIST_LENGTH]; + u64 tstamp[UFS_EVENT_HIST_LENGTH]; unsigned long long cnt; }; @@ -466,13 +475,14 @@ struct ufs_event_hist { * reset this after link-startup. * @last_hibern8_exit_tstamp: Set time after the hibern8 exit. * Clear after the first successful command completion. + * @event: array with event history. */ struct ufs_stats { u32 last_intr_status; - ktime_t last_intr_ts; + u64 last_intr_ts; u32 hibern8_exit_cnt; - ktime_t last_hibern8_exit_tstamp; + u64 last_hibern8_exit_tstamp; struct ufs_event_hist event[UFS_EVT_CNT]; }; @@ -585,9 +595,62 @@ enum ufshcd_quirks { UFSHCD_QUIRK_SKIP_DEF_UNIPRO_TIMEOUT_SETTING = 1 << 13, /* - * This quirk allows only sg entries aligned with page size. + * Align DMA SG entries on a 4 KiB boundary. */ - UFSHCD_QUIRK_ALIGN_SG_WITH_PAGE_SIZE = 1 << 14, + UFSHCD_QUIRK_4KB_DMA_ALIGNMENT = 1 << 14, + + /* + * This quirk needs to be enabled if the host controller does not + * support UIC command + */ + UFSHCD_QUIRK_BROKEN_UIC_CMD = 1 << 15, + + /* + * This quirk needs to be enabled if the host controller cannot + * support physical host configuration. + */ + UFSHCD_QUIRK_SKIP_PH_CONFIGURATION = 1 << 16, + + /* + * This quirk needs to be enabled if the host controller has + * 64-bit addressing supported capability but it doesn't work. + */ + UFSHCD_QUIRK_BROKEN_64BIT_ADDRESS = 1 << 17, + + /* + * This quirk needs to be enabled if the host controller has + * auto-hibernate capability but it's FASTAUTO only. + */ + UFSHCD_QUIRK_HIBERN_FASTAUTO = 1 << 18, + + /* + * This quirk needs to be enabled if the host controller needs + * to reinit the device after switching to maximum gear. + */ + UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH = 1 << 19, + + /* + * This quirk needs to be enabled if the host controller supports inline + * encryption, but it needs to initialize the crypto capabilities in a + * nonstandard way and/or it needs to override blk_crypto_ll_ops. If + * enabled, the standard code won't initialize the blk_crypto_profile; + * ufs_hba_variant_ops::init() must do it instead. 
+ */ + UFSHCD_QUIRK_CUSTOM_CRYPTO_PROFILE = 1 << 20, + + /* + * This quirk needs to be enabled if the host controller supports inline + * encryption, but the CRYPTO_GENERAL_ENABLE bit is not implemented and + * breaks the HCE sequence if used. + */ + UFSHCD_QUIRK_BROKEN_CRYPTO_ENABLE = 1 << 21, + + /* + * This quirk needs to be enabled if the host controller requires that + * the PRDT be cleared after each encrypted request because encryption + * keys were stored in it. + */ + UFSHCD_QUIRK_KEYS_IN_PRDT = 1 << 22, }; enum ufshcd_caps { @@ -653,6 +716,18 @@ enum ufshcd_caps { * in order to exit DeepSleep state. */ UFSHCD_CAP_DEEPSLEEP = 1 << 10, + + /* + * This capability allows the host controller driver to use temperature + * notification if it is supported by the UFS device. + */ + UFSHCD_CAP_TEMP_NOTIF = 1 << 11, + + /* + * Enable WriteBooster when scaling up the clock and disable + * WriteBooster when scaling the clock down. + */ + UFSHCD_CAP_WB_WITH_CLK_SCALING = 1 << 12, }; struct ufs_hba_variant_params { @@ -706,6 +781,51 @@ struct ufs_hba_monitor { bool enabled; }; +/** + * struct ufshcd_res_info_t - MCQ related resource regions + * + * @name: resource name + * @resource: pointer to resource region + * @base: register base address + */ +struct ufshcd_res_info { + const char *name; + struct resource *resource; + void __iomem *base; +}; + +enum ufshcd_res { + RES_UFS, + RES_MCQ, + RES_MCQ_SQD, + RES_MCQ_SQIS, + RES_MCQ_CQD, + RES_MCQ_CQIS, + RES_MCQ_VS, + RES_MAX, +}; + +/** + * struct ufshcd_mcq_opr_info_t - Operation and Runtime registers + * + * @offset: Doorbell Address Offset + * @stride: Steps proportional to queue [0...31] + * @base: base address + */ +struct ufshcd_mcq_opr_info_t { + unsigned long offset; + unsigned long stride; + void __iomem *base; +}; + +enum ufshcd_mcq_opr { + OPR_SQD, + OPR_SQIS, + OPR_CQD, + OPR_CQIS, + OPR_MAX, +}; + /** * struct ufs_hba - per adapter private structure * @mmio_base: UFSHCI base register address @@ -717,28 +837,46 @@ struct ufs_hba_monitor { * @utmrdl_dma_addr: UTMRDL DMA address * @host: Scsi_Host instance of the driver * @dev: device handle + * @ufs_device_wlun: WLUN that controls the entire UFS device. + * @hwmon_device: device instance registered with the hwmon core. + * @curr_dev_pwr_mode: active UFS device power mode. + * @uic_link_state: active state of the link to the UFS device. + * @rpm_lvl: desired UFS power management level during runtime PM. + * @spm_lvl: desired UFS power management level during system PM. + * @pm_op_in_progress: whether or not a PM operation is in progress. + * @ahit: value of Auto-Hibernate Idle Timer register. * @lrb: local reference block - * @cmd_queue: Used to allocate command tags from hba->host->tag_set. * @outstanding_tasks: Bits representing outstanding task requests * @outstanding_lock: Protects @outstanding_reqs. * @outstanding_reqs: Bits representing outstanding transfer requests * @capabilities: UFS Controller Capabilities + * @mcq_capabilities: UFS Multi Circular Queue capabilities * @nutrs: Transfer Request Queue depth supported by controller * @nutmrs: Task Management Queue depth supported by controller * @reserved_slot: Used to submit device commands. Protected by @dev_cmd.lock. 
* @ufs_version: UFS Version to which controller complies * @vops: pointer to variant specific operations + * @vps: pointer to variant specific parameters * @priv: pointer to variant specific private data + * @sg_entry_size: size of struct ufshcd_sg_entry (may include variant fields) * @irq: Irq number of the controller - * @active_uic_cmd: handle of active UIC command - * @uic_cmd_mutex: mutex for UIC command + * @is_irq_enabled: whether or not the UFS controller interrupt is enabled. + * @dev_ref_clk_freq: reference clock frequency + * @quirks: bitmask with information about deviations from the UFSHCI standard. + * @dev_quirks: bitmask with information about deviations from the UFS standard. * @tmf_tag_set: TMF tag set. * @tmf_queue: Used to allocate TMF tags. - * @pwr_done: completion for power mode change + * @tmf_rqs: array with pointers to TMF requests while these are in progress. + * @active_uic_cmd: handle of active UIC command + * @uic_cmd_mutex: mutex for UIC command + * @uic_async_done: completion used during UIC processing * @ufshcd_state: UFSHCD state * @eh_flags: Error handling flags * @intr_mask: Interrupt Mask Bits * @ee_ctrl_mask: Exception event control mask + * @ee_drv_mask: Exception event mask for driver + * @ee_usr_mask: Exception event mask for user (set via debugfs) + * @ee_ctrl_mutex: Used to serialize exception event information. * @is_powered: flag to check if HBA is powered * @shutting_down: flag to check if shutdown has been invoked * @host_sem: semaphore used to serialize concurrent contexts @@ -749,26 +887,63 @@ struct ufs_hba_monitor { * @uic_error: UFS interconnect layer error status * @saved_err: sticky error mask * @saved_uic_err: sticky UIC error mask + * @ufs_stats: various error counters * @force_reset: flag to force eh_work perform a full reset * @force_pmc: flag to force a power mode change * @silence_err_logs: flag to silence error logs * @dev_cmd: ufs device management command information * @last_dme_cmd_tstamp: time stamp of the last completed DME command + * @nop_out_timeout: NOP OUT timeout value + * @dev_info: information about the UFS device * @auto_bkops_enabled: to track whether bkops is enabled in device * @vreg_info: UFS device voltage regulator information * @clk_list_head: UFS host controller clocks list node head + * @req_abort_count: number of times ufshcd_abort() has been called + * @lanes_per_direction: number of lanes per data direction between the UFS + * controller and the UFS device. * @pwr_info: holds current power mode * @max_pwr_info: keeps the device max valid pwm - * @desc_size: descriptor sizes reported by device + * @clk_gating: information related to clock gating + * @caps: bitmask with information about UFS controller capabilities + * @devfreq: frequency scaling information owned by the devfreq core + * @clk_scaling: frequency scaling information owned by the UFS driver + * @system_suspending: system suspend has been started and system resume has + * not yet finished. + * @is_sys_suspended: UFS device has been suspended because of system suspend * @urgent_bkops_lvl: keeps track of urgent bkops level for device * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for * device is known or not. 
* @wb_mutex: used to serialize devfreq and sysfs write booster toggling + * @clk_scaling_lock: used to serialize device commands and clock scaling * @scsi_block_reqs_cnt: reference counting for scsi block requests + * @bsg_dev: struct device associated with the BSG queue + * @bsg_queue: BSG queue associated with the UFS controller + * @rpm_dev_flush_recheck_work: used to suspend from RPM (runtime power + * management) after the UFS device has finished a WriteBooster buffer + * flush or auto BKOP. + * @ufshpb_dev: information related to HPB (Host Performance Booster). + * @monitor: statistics about UFS commands * @crypto_capabilities: Content of crypto capabilities register (0x100) * @crypto_cap_array: Array of crypto capabilities * @crypto_cfg_register: Start of the crypto cfg array - * @ksm: the keyslot manager tied to this hba + * @crypto_profile: the crypto profile of this hba (if applicable) + * @debugfs_root: UFS controller debugfs root directory + * @debugfs_ee_work: used to restore ee_ctrl_mask after a delay + * @debugfs_ee_rate_limit_ms: user configurable delay after which to restore + * ee_ctrl_mask + * @luns_avail: number of regular and well known LUNs supported by the UFS + * device + * @nr_hw_queues: number of hardware queues configured + * @nr_queues: number of queues of different queue types + * @complete_put: whether or not to call ufshcd_rpm_put() from inside + * ufshcd_resume_complete() + * @ext_iid_sup: whether EXT_IID is supported by the UFS host controller + * @scsi_host_added: whether the SCSI host has been registered + * @mcq_sup: whether MCQ is supported by the UFS host controller + * @mcq_enabled: whether MCQ is ready to accept requests + * @res: array of resource info of MCQ registers + * @mcq_base: Multi circular queue registers base address + * @uhq: array of supported hardware queues + * @dev_cmd_queue: Queue for issuing device management commands + * @mcq_opr: MCQ operation and runtime register bases */ struct ufs_hba { void __iomem *mmio_base; @@ -785,13 +960,11 @@ struct ufs_hba { struct Scsi_Host *host; struct device *dev; - struct request_queue *cmd_queue; - /* - * This field is to keep a reference to "scsi_device" corresponding to - * "UFS device" W-LU.
- */ - struct scsi_device *sdev_ufs_device; - struct scsi_device *sdev_rpmb; + struct scsi_device *ufs_device_wlun; + +#ifdef CONFIG_SCSI_UFS_HWMON + struct device *hwmon_device; +#endif enum ufs_dev_pwr_mode curr_dev_pwr_mode; enum uic_link_state uic_link_state; @@ -799,8 +972,6 @@ struct ufs_hba { enum ufs_pm_level rpm_lvl; /* Desired UFS power management level during system PM */ enum ufs_pm_level spm_lvl; - struct device_attribute rpm_lvl_attr; - struct device_attribute spm_lvl_attr; int pm_op_in_progress; /* Auto-Hibernate Idle Timer register value */ @@ -814,12 +985,16 @@ struct ufs_hba { u32 capabilities; int nutrs; + u32 mcq_capabilities; int nutmrs; u32 reserved_slot; u32 ufs_version; const struct ufs_hba_variant_ops *vops; struct ufs_hba_variant_params *vps; void *priv; +#ifdef CONFIG_SCSI_UFS_VARIABLE_SG_ENTRY_SIZE + size_t sg_entry_size; +#endif unsigned int irq; bool is_irq_enabled; enum ufs_ref_clk_freq dev_ref_clk_freq; @@ -840,9 +1015,9 @@ struct ufs_hba { enum ufshcd_state ufshcd_state; u32 eh_flags; u32 intr_mask; - u16 ee_ctrl_mask; /* Exception event mask */ - u16 ee_drv_mask; /* Exception event mask for driver */ - u16 ee_usr_mask; /* Exception event mask for user (via debugfs) */ + u16 ee_ctrl_mask; + u16 ee_drv_mask; + u16 ee_usr_mask; struct mutex ee_ctrl_mutex; bool is_powered; bool shutting_down; @@ -888,6 +1063,7 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; + bool system_suspending; bool is_sys_suspended; enum bkops_status urgent_bkops_lvl; @@ -895,7 +1071,6 @@ struct ufs_hba { struct mutex wb_mutex; struct rw_semaphore clk_scaling_lock; - unsigned char desc_size[QUERY_DESC_IDN_MAX]; atomic_t scsi_block_reqs_cnt; struct device bsg_dev; @@ -912,7 +1087,7 @@ struct ufs_hba { union ufs_crypto_capabilities crypto_capabilities; union ufs_crypto_cap_entry *crypto_cap_array; u32 crypto_cfg_register; - struct blk_keyslot_manager ksm; + struct blk_crypto_profile crypto_profile; #endif #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_root; @@ -920,9 +1095,96 @@ struct ufs_hba { u32 debugfs_ee_rate_limit_ms; #endif u32 luns_avail; + unsigned int nr_hw_queues; + unsigned int nr_queues[HCTX_MAX_TYPES]; bool complete_put; + bool ext_iid_sup; + bool scsi_host_added; + bool mcq_sup; + bool mcq_enabled; + struct ufshcd_res_info res[RES_MAX]; + void __iomem *mcq_base; + struct ufs_hw_queue *uhq; + struct ufs_hw_queue *dev_cmd_queue; + struct ufshcd_mcq_opr_info_t mcq_opr[OPR_MAX]; + + ANDROID_VENDOR_DATA(1); + ANDROID_OEM_DATA_ARRAY(1, 2); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; +/** + * struct ufs_hw_queue - per hardware queue structure + * @mcq_sq_head: base address of submission queue head pointer + * @mcq_sq_tail: base address of submission queue tail pointer + * @mcq_cq_head: base address of completion queue head pointer + * @mcq_cq_tail: base address of completion queue tail pointer + * @sqe_base_addr: submission queue entry base address + * @sqe_dma_addr: submission queue dma address + * @cqe_base_addr: completion queue base address + * @cqe_dma_addr: completion queue dma address + * @max_entries: max number of slots in this hardware queue + * @id: hardware queue ID + * @sq_tail_slot: current slot to which SQ tail pointer is pointing + * @sq_lock: serialize submission queue access + * @cq_tail_slot: current slot to which CQ tail pointer is pointing + * @cq_head_slot: current slot to which CQ head pointer is pointing + * @cq_lock: Synchronize between multiple polling
instances + */ +struct ufs_hw_queue { + void __iomem *mcq_sq_head; + void __iomem *mcq_sq_tail; + void __iomem *mcq_cq_head; + void __iomem *mcq_cq_tail; + + void *sqe_base_addr; + dma_addr_t sqe_dma_addr; + struct cq_entry *cqe_base_addr; + dma_addr_t cqe_dma_addr; + u32 max_entries; + u32 id; + u32 sq_tail_slot; + spinlock_t sq_lock; + u32 cq_tail_slot; + u32 cq_head_slot; + spinlock_t cq_lock; +}; + +static inline bool is_mcq_enabled(struct ufs_hba *hba) +{ + return hba->mcq_enabled; +} + +#ifdef CONFIG_SCSI_UFS_VARIABLE_SG_ENTRY_SIZE +static inline size_t ufshcd_sg_entry_size(const struct ufs_hba *hba) +{ + return hba->sg_entry_size; +} + +static inline void ufshcd_set_sg_entry_size(struct ufs_hba *hba, size_t sg_entry_size) +{ + WARN_ON_ONCE(sg_entry_size < sizeof(struct ufshcd_sg_entry)); + hba->sg_entry_size = sg_entry_size; +} +#else +static inline size_t ufshcd_sg_entry_size(const struct ufs_hba *hba) +{ + return sizeof(struct ufshcd_sg_entry); +} + +#define ufshcd_set_sg_entry_size(hba, sg_entry_size) \ + ({ (void)(hba); BUILD_BUG_ON(sg_entry_size != sizeof(struct ufshcd_sg_entry)); }) +#endif + +static inline size_t sizeof_utp_transfer_cmd_desc(const struct ufs_hba *hba) +{ + return sizeof(struct utp_transfer_cmd_desc) + SG_ALL * ufshcd_sg_entry_size(hba); +} + /* Returns true if clocks can be gated. Otherwise false */ static inline bool ufshcd_is_clkgating_allowed(struct ufs_hba *hba) { @@ -965,7 +1227,7 @@ static inline bool ufshcd_is_auto_hibern8_supported(struct ufs_hba *hba) static inline bool ufshcd_is_auto_hibern8_enabled(struct ufs_hba *hba) { - return FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, hba->ahit) ? true : false; + return FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, hba->ahit); } static inline bool ufshcd_is_wb_allowed(struct ufs_hba *hba) @@ -973,22 +1235,32 @@ static inline bool ufshcd_is_wb_allowed(struct ufs_hba *hba) return hba->caps & UFSHCD_CAP_WB_EN; } -static inline bool ufshcd_is_user_access_allowed(struct ufs_hba *hba) +static inline bool ufshcd_enable_wb_if_scaling_up(struct ufs_hba *hba) { - return !hba->shutting_down; + return hba->caps & UFSHCD_CAP_WB_WITH_CLK_SCALING; } +#define ufsmcq_writel(hba, val, reg) \ + writel((val), (hba)->mcq_base + (reg)) +#define ufsmcq_readl(hba, reg) \ + readl((hba)->mcq_base + (reg)) + +#define ufsmcq_writelx(hba, val, reg) \ + writel_relaxed((val), (hba)->mcq_base + (reg)) +#define ufsmcq_readlx(hba, reg) \ + readl_relaxed((hba)->mcq_base + (reg)) + #define ufshcd_writel(hba, val, reg) \ writel((val), (hba)->mmio_base + (reg)) #define ufshcd_readl(hba, reg) \ readl((hba)->mmio_base + (reg)) /** - * ufshcd_rmwl - read modify write into a register - * @hba - per adapter instance - * @mask - mask to apply on read value - * @val - actual value to write - * @reg - register address + * ufshcd_rmwl - perform read/modify/write for a controller register + * @hba: per adapter instance + * @mask: mask to apply on read value + * @val: actual value to write + * @reg: register address */ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg) { @@ -1007,25 +1279,23 @@ int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int); int ufshcd_link_recovery(struct ufs_hba *hba); int ufshcd_make_hba_operational(struct ufs_hba *hba); void ufshcd_remove(struct ufs_hba *); +int ufshcd_uic_hibern8_enter(struct ufs_hba *hba); int ufshcd_uic_hibern8_exit(struct ufs_hba *hba); void ufshcd_delay_us(unsigned long us, unsigned long tolerance); -int ufshcd_wait_for_register(struct ufs_hba *hba, u32 reg, u32 mask, - u32 val, 
unsigned long interval_us, - unsigned long timeout_ms); void ufshcd_parse_dev_ref_clk_freq(struct ufs_hba *hba, struct clk *refclk); void ufshcd_update_evt_hist(struct ufs_hba *hba, u32 id, u32 val); void ufshcd_hba_stop(struct ufs_hba *hba); - -static inline void check_upiu_size(void) -{ - BUILD_BUG_ON(ALIGNED_UPIU_SIZE < - GENERAL_UPIU_REQUEST_SIZE + QUERY_DESC_MAX_SIZE); -} +void ufshcd_schedule_eh_work(struct ufs_hba *hba); +void ufshcd_mcq_write_cqis(struct ufs_hba *hba, u32 val, int i); +unsigned long ufshcd_mcq_poll_cqe_nolock(struct ufs_hba *hba, + struct ufs_hw_queue *hwq); +void ufshcd_mcq_enable_esi(struct ufs_hba *hba); +void ufshcd_mcq_config_esi(struct ufs_hba *hba, struct msi_msg *msg); /** * ufshcd_set_variant - set variant specific data to the hba - * @hba - per adapter instance - * @variant - pointer to variant specific data + * @hba: per adapter instance + * @variant: pointer to variant specific data */ static inline void ufshcd_set_variant(struct ufs_hba *hba, void *variant) { @@ -1035,25 +1305,13 @@ static inline void ufshcd_set_variant(struct ufs_hba *hba, void *variant) /** * ufshcd_get_variant - get variant specific data from the hba - * @hba - per adapter instance + * @hba: per adapter instance */ static inline void *ufshcd_get_variant(struct ufs_hba *hba) { BUG_ON(!hba); return hba->priv; } -static inline bool ufshcd_keep_autobkops_enabled_except_suspend( - struct ufs_hba *hba) -{ - return hba->caps & UFSHCD_CAP_KEEP_AUTO_BKOPS_ENABLED_EXCEPT_SUSPEND; -} - -static inline u8 ufshcd_wb_get_query_index(struct ufs_hba *hba) -{ - if (hba->dev_info.wb_buffer_type == WB_BUF_MODE_LU_DEDICATED) - return hba->dev_info.wb_dedicated_lu; - return 0; -} #ifdef CONFIG_PM extern int ufshcd_runtime_suspend(struct device *dev); @@ -1062,8 +1320,12 @@ extern int ufshcd_runtime_resume(struct device *dev); #ifdef CONFIG_PM_SLEEP extern int ufshcd_system_suspend(struct device *dev); extern int ufshcd_system_resume(struct device *dev); +extern int ufshcd_system_freeze(struct device *dev); +extern int ufshcd_system_thaw(struct device *dev); +extern int ufshcd_system_restore(struct device *dev); #endif extern int ufshcd_shutdown(struct ufs_hba *hba); + extern int ufshcd_dme_configure_adapt(struct ufs_hba *hba, int agreed_gear, int adapt_val); @@ -1073,6 +1335,7 @@ extern int ufshcd_dme_get_attr(struct ufs_hba *hba, u32 attr_sel, u32 *mib_val, u8 peer); extern int ufshcd_config_pwr_mode(struct ufs_hba *hba, struct ufs_pa_layer_attr *desired_pwr_mode); +extern int ufshcd_uic_change_pwr_mode(struct ufs_hba *hba, u8 mode); /* UIC command interfaces for DME primitives */ #define DME_LOCAL 0 @@ -1150,12 +1413,19 @@ int ufshcd_query_attr_retry(struct ufs_hba *hba, enum query_opcode opcode, u32 *attr_val); int ufshcd_query_attr(struct ufs_hba *hba, enum query_opcode opcode, enum attr_idn idn, u8 index, u8 selector, u32 *attr_val); +int ufshcd_query_attr_retry(struct ufs_hba *hba, + enum query_opcode opcode, enum attr_idn idn, u8 index, u8 selector, + u32 *attr_val); int ufshcd_query_flag(struct ufs_hba *hba, enum query_opcode opcode, enum flag_idn idn, u8 index, bool *flag_res); +int ufshcd_query_flag_retry(struct ufs_hba *hba, + enum query_opcode opcode, enum flag_idn idn, u8 index, bool *flag_res); +int ufshcd_bkops_ctrl(struct ufs_hba *hba, enum bkops_status status); void ufshcd_auto_hibern8_enable(struct ufs_hba *hba); void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit); -void ufshcd_fixup_dev_quirks(struct ufs_hba *hba, struct ufs_dev_fix *fixups); +void 
ufshcd_fixup_dev_quirks(struct ufs_hba *hba, + const struct ufs_dev_quirk *fixups); #define SD_ASCII_STD true #define SD_RAW false int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index, @@ -1164,11 +1434,15 @@ int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index, int ufshcd_hold(struct ufs_hba *hba, bool async); void ufshcd_release(struct ufs_hba *hba); -void ufshcd_map_desc_id_to_length(struct ufs_hba *hba, enum desc_idn desc_id, - int *desc_length); +void ufshcd_clkgate_delay_set(struct device *dev, unsigned long value); + +int ufshcd_freeze_scsi_devs(struct ufs_hba *hba, u64 timeout_us); +void ufshcd_unfreeze_scsi_devs(struct ufs_hba *hba); u32 ufshcd_get_local_unipro_ver(struct ufs_hba *hba); +int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg); + int ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd); int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba, @@ -1177,19 +1451,17 @@ int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba, int msgcode, u8 *desc_buff, int *buff_len, enum query_opcode desc_op); - +int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *req_upiu, + struct utp_upiu_req *rsp_upiu, struct ufs_ehs *ehs_req, + struct ufs_ehs *ehs_rsp, int sg_cnt, + struct scatterlist *sg_list, enum dma_data_direction dir); int ufshcd_wb_toggle(struct ufs_hba *hba, bool enable); +int ufshcd_wb_toggle_buf_flush(struct ufs_hba *hba, bool enable); int ufshcd_suspend_prepare(struct device *dev); +int __ufshcd_suspend_prepare(struct device *dev, bool rpm_ok_for_spm); void ufshcd_resume_complete(struct device *dev); /* Wrapper functions for safely calling variant operations */ -static inline const char *ufshcd_get_var_name(struct ufs_hba *hba) -{ - if (hba->vops) - return hba->vops->name; - return ""; -} - static inline int ufshcd_vops_init(struct ufs_hba *hba) { if (hba->vops && hba->vops->init) @@ -1198,61 +1470,6 @@ static inline int ufshcd_vops_init(struct ufs_hba *hba) return 0; } -static inline void ufshcd_vops_exit(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->exit) - return hba->vops->exit(hba); -} - -static inline u32 ufshcd_vops_get_ufs_hci_version(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->get_ufs_hci_version) - return hba->vops->get_ufs_hci_version(hba); - - return ufshcd_readl(hba, REG_UFS_VERSION); -} - -static inline int ufshcd_vops_clk_scale_notify(struct ufs_hba *hba, - bool up, enum ufs_notify_change_status status) -{ - if (hba->vops && hba->vops->clk_scale_notify) - return hba->vops->clk_scale_notify(hba, up, status); - return 0; -} - -static inline void ufshcd_vops_event_notify(struct ufs_hba *hba, - enum ufs_event_type evt, - void *data) -{ - if (hba->vops && hba->vops->event_notify) - hba->vops->event_notify(hba, evt, data); -} - -static inline int ufshcd_vops_setup_clocks(struct ufs_hba *hba, bool on, - enum ufs_notify_change_status status) -{ - if (hba->vops && hba->vops->setup_clocks) - return hba->vops->setup_clocks(hba, on, status); - return 0; -} - -static inline int ufshcd_vops_hce_enable_notify(struct ufs_hba *hba, - bool status) -{ - if (hba->vops && hba->vops->hce_enable_notify) - return hba->vops->hce_enable_notify(hba, status); - - return 0; -} -static inline int ufshcd_vops_link_startup_notify(struct ufs_hba *hba, - bool status) -{ - if (hba->vops && hba->vops->link_startup_notify) - return hba->vops->link_startup_notify(hba, status); - - return 0; -} - static inline int ufshcd_vops_phy_initialization(struct ufs_hba *hba) { if (hba->vops && hba->vops->phy_initialization) @@ 
-1261,136 +1478,14 @@ static inline int ufshcd_vops_phy_initialization(struct ufs_hba *hba) return 0; } -static inline int ufshcd_vops_pwr_change_notify(struct ufs_hba *hba, - enum ufs_notify_change_status status, - struct ufs_pa_layer_attr *dev_max_params, - struct ufs_pa_layer_attr *dev_req_params) -{ - if (hba->vops && hba->vops->pwr_change_notify) - return hba->vops->pwr_change_notify(hba, status, - dev_max_params, dev_req_params); - - return -ENOTSUPP; -} - -static inline void ufshcd_vops_setup_task_mgmt(struct ufs_hba *hba, - int tag, u8 tm_function) -{ - if (hba->vops && hba->vops->setup_task_mgmt) - return hba->vops->setup_task_mgmt(hba, tag, tm_function); -} - -static inline void ufshcd_vops_hibern8_notify(struct ufs_hba *hba, - enum uic_cmd_dme cmd, - enum ufs_notify_change_status status) -{ - if (hba->vops && hba->vops->hibern8_notify) - return hba->vops->hibern8_notify(hba, cmd, status); -} - -static inline int ufshcd_vops_apply_dev_quirks(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->apply_dev_quirks) - return hba->vops->apply_dev_quirks(hba); - return 0; -} - -static inline void ufshcd_vops_fixup_dev_quirks(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->fixup_dev_quirks) - hba->vops->fixup_dev_quirks(hba); -} - -static inline int ufshcd_vops_suspend(struct ufs_hba *hba, enum ufs_pm_op op) -{ - if (hba->vops && hba->vops->suspend) - return hba->vops->suspend(hba, op); - - return 0; -} - -static inline int ufshcd_vops_resume(struct ufs_hba *hba, enum ufs_pm_op op) -{ - if (hba->vops && hba->vops->resume) - return hba->vops->resume(hba, op); - - return 0; -} - -static inline void ufshcd_vops_dbg_register_dump(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->dbg_register_dump) - hba->vops->dbg_register_dump(hba); -} - -static inline int ufshcd_vops_device_reset(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->device_reset) - return hba->vops->device_reset(hba); - - return -EOPNOTSUPP; -} - -static inline void ufshcd_vops_config_scaling_param(struct ufs_hba *hba, - struct devfreq_dev_profile - *profile, void *data) -{ - if (hba->vops && hba->vops->config_scaling_param) - hba->vops->config_scaling_param(hba, profile, data); -} - -extern struct ufs_pm_lvl_states ufs_pm_lvl_states[]; - -/* - * ufshcd_scsi_to_upiu_lun - maps scsi LUN to UPIU LUN - * @scsi_lun: scsi LUN id - * - * Returns UPIU LUN id - */ -static inline u8 ufshcd_scsi_to_upiu_lun(unsigned int scsi_lun) -{ - if (scsi_is_wlun(scsi_lun)) - return (scsi_lun & UFS_UPIU_MAX_UNIT_NUM_ID) - | UFS_UPIU_WLUN_ID; - else - return scsi_lun & UFS_UPIU_MAX_UNIT_NUM_ID; -} +extern const struct ufs_pm_lvl_states ufs_pm_lvl_states[]; int ufshcd_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, const char *prefix); int __ufshcd_write_ee_control(struct ufs_hba *hba, u32 ee_ctrl_mask); int ufshcd_write_ee_control(struct ufs_hba *hba); -int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, u16 *other_mask, - u16 set, u16 clr); - -static inline int ufshcd_update_ee_drv_mask(struct ufs_hba *hba, - u16 set, u16 clr) -{ - return ufshcd_update_ee_control(hba, &hba->ee_drv_mask, - &hba->ee_usr_mask, set, clr); -} - -static inline int ufshcd_update_ee_usr_mask(struct ufs_hba *hba, - u16 set, u16 clr) -{ - return ufshcd_update_ee_control(hba, &hba->ee_usr_mask, - &hba->ee_drv_mask, set, clr); -} - -static inline int ufshcd_rpm_get_sync(struct ufs_hba *hba) -{ - return pm_runtime_get_sync(&hba->sdev_ufs_device->sdev_gendev); -} - -static inline int ufshcd_rpm_put_sync(struct ufs_hba *hba) -{ - return 
pm_runtime_put_sync(&hba->sdev_ufs_device->sdev_gendev); -} - -static inline int ufshcd_rpm_put(struct ufs_hba *hba) -{ - return pm_runtime_put(&hba->sdev_ufs_device->sdev_gendev); -} +int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, + const u16 *other_mask, u16 set, u16 clr); #endif /* End of Header */ diff --git a/drivers/scsi/ufs/ufshci.h b/include/ufs/ufshci.h similarity index 89% rename from drivers/scsi/ufs/ufshci.h rename to include/ufs/ufshci.h index 8dbe9866ea6c..11424bb03814 100644 --- a/drivers/scsi/ufs/ufshci.h +++ b/include/ufs/ufshci.h @@ -11,6 +11,8 @@ #ifndef _UFSHCI_H #define _UFSHCI_H +#include <scsi/scsi_host.h> + enum { TASK_REQ_UPIU_SIZE_DWORDS = 8, TASK_RSP_UPIU_SIZE_DWORDS = 8, @@ -20,6 +22,7 @@ enum { /* UFSHCI Registers */ enum { REG_CONTROLLER_CAPABILITIES = 0x00, + REG_MCQCAP = 0x04, REG_UFS_VERSION = 0x08, REG_CONTROLLER_DEV_ID = 0x10, REG_CONTROLLER_PROD_ID = 0x14, @@ -54,6 +57,10 @@ enum { REG_UFS_CCAP = 0x100, REG_UFS_CRYPTOCAP = 0x104, + REG_UFS_MEM_CFG = 0x300, + REG_UFS_MCQ_CFG = 0x380, + REG_UFS_ESILBA = 0x384, + REG_UFS_ESIUBA = 0x388, UFSHCI_CRYPTO_REG_SPACE_SIZE = 0x400, }; @@ -61,11 +68,47 @@ enum { MASK_TRANSFER_REQUESTS_SLOTS = 0x0000001F, MASK_TASK_MANAGEMENT_REQUEST_SLOTS = 0x00070000, + MASK_EHSLUTRD_SUPPORTED = 0x00400000, MASK_AUTO_HIBERN8_SUPPORT = 0x00800000, MASK_64_ADDRESSING_SUPPORT = 0x01000000, MASK_OUT_OF_ORDER_DATA_DELIVERY_SUPPORT = 0x02000000, MASK_UIC_DME_TEST_MODE_SUPPORT = 0x04000000, MASK_CRYPTO_SUPPORT = 0x10000000, + MASK_MCQ_SUPPORT = 0x40000000, +}; + +/* MCQ capability mask */ +enum { + MASK_EXT_IID_SUPPORT = 0x00000400, +}; + +enum { + REG_SQATTR = 0x0, + REG_SQLBA = 0x4, + REG_SQUBA = 0x8, + REG_SQDAO = 0xC, + REG_SQISAO = 0x10, + + REG_CQATTR = 0x20, + REG_CQLBA = 0x24, + REG_CQUBA = 0x28, + REG_CQDAO = 0x2C, + REG_CQISAO = 0x30, +}; + +enum { + REG_SQHP = 0x0, + REG_SQTP = 0x4, +}; + +enum { + REG_CQHP = 0x0, + REG_CQTP = 0x4, +}; + +enum { + REG_CQIS = 0x0, + REG_CQIE = 0x4, }; #define UFS_MASK(mask, offset) ((mask) << (offset)) @@ -124,6 +167,7 @@ static inline u32 ufshci_version(u32 major, u32 minor) #define CONTROLLER_FATAL_ERROR 0x10000 #define SYSTEM_BUS_FATAL_ERROR 0x20000 #define CRYPTO_ENGINE_FATAL_ERROR 0x40000 +#define MCQ_CQ_EVENT_STATUS 0x100000 #define UFSHCD_UIC_HIBERN8_MASK (UIC_HIBERNATE_ENTER |\ UIC_HIBERNATE_EXIT) @@ -221,6 +265,9 @@ enum { /* UTMRLRSR - UTP Task Management Request Run-Stop Register 80h */ #define UTP_TASK_REQ_LIST_RUN_STOP_BIT 0x1 +/* CQISy - CQ y Interrupt Status Register */ +#define UFSHCD_MCQ_CQIS_TAIL_ENT_PUSH_STS 0x1 + /* UICCMD - UIC Command */ #define COMMAND_OPCODE_MASK 0xFF #define GEN_SELECTOR_INDEX_MASK 0xFFFF @@ -386,7 +433,7 @@ enum { }; /* Overall command status values */ -enum { +enum utp_ocs { OCS_SUCCESS = 0x0, OCS_INVALID_CMD_TABLE_ATTR = 0x1, OCS_INVALID_PRDT_ATTR = 0x2, @@ -399,6 +446,9 @@ enum { OCS_INVALID_CRYPTO_CONFIG = 0x9, OCS_GENERAL_CRYPTO_ERROR = 0xA, OCS_INVALID_COMMAND_STATUS = 0x0F, +}; + +enum { MASK_OCS = 0x0F, }; @@ -409,28 +459,31 @@ enum { /** * struct ufshcd_sg_entry - UFSHCI PRD Entry - * @base_addr: Lower 32bit physical address DW-0 - * @upper_addr: Upper 32bit physical address DW-1 + * @addr: Physical address; DW-0 and DW-1. * @reserved: Reserved for future use DW-2 * @size: size of physical segment DW-3 */ struct ufshcd_sg_entry { - __le32 base_addr; - __le32 upper_addr; + __le64 addr; __le32 reserved; __le32 size; + /* + * followed by variant-specific fields if + * CONFIG_SCSI_UFS_VARIABLE_SG_ENTRY_SIZE has been defined.
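To make the variant-sized entries above concrete, here is a sketch of how a host driver built with CONFIG_SCSI_UFS_VARIABLE_SG_ENTRY_SIZE might lay out and register its PRDT entries. The example_sg_entry layout is hypothetical; only ufshcd_set_sg_entry_size() and ufshcd_sg_entry_size() come from this patch:

/* hypothetical variant PRDT entry: the standard entry plus private words */
struct example_sg_entry {
	struct ufshcd_sg_entry entry;	/* the standard 16-byte descriptor */
	__le32 private[2];		/* controller-specific extension */
};

/* once at probe time, before command descriptors are allocated */
ufshcd_set_sg_entry_size(hba, sizeof(struct example_sg_entry));

Every later PRDT walk then advances by ufshcd_sg_entry_size(hba) instead of sizeof(struct ufshcd_sg_entry), which is also why sizeof_utp_transfer_cmd_desc() multiplies SG_ALL by the per-hba entry size.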
+ */ }; /** - * struct utp_transfer_cmd_desc - UFS Command Descriptor structure + * struct utp_transfer_cmd_desc - UTP Command Descriptor (UCD) * @command_upiu: Command UPIU Frame address * @response_upiu: Response UPIU Frame address - * @prd_table: Physical Region Descriptor + * @prd_table: Physical Region Descriptor: an array of SG_ALL struct + * ufshcd_sg_entry's. Variant-specific fields may be present after each. */ struct utp_transfer_cmd_desc { u8 command_upiu[ALIGNED_UPIU_SIZE]; u8 response_upiu[ALIGNED_UPIU_SIZE]; - struct ufshcd_sg_entry prd_table[SG_ALL]; + u8 prd_table[]; }; /** @@ -448,7 +501,7 @@ struct request_desc_header { }; /** - * struct utp_transfer_req_desc - UTRD structure + * struct utp_transfer_req_desc - UTP Transfer Request Descriptor (UTRD) * @header: UTRD header DW-0 to DW-3 * @command_desc_base_addr_lo: UCD base address low DW-4 * @command_desc_base_addr_hi: UCD base address high DW-5 @@ -475,6 +528,28 @@ struct utp_transfer_req_desc { __le16 prd_table_offset; }; +/* MCQ Completion Queue Entry */ +struct cq_entry { + /* DW 0-1 */ + __le64 command_desc_base_addr; + + /* DW 2 */ + __le16 response_upiu_length; + __le16 response_upiu_offset; + + /* DW 3 */ + __le16 prd_table_length; + __le16 prd_table_offset; + + /* DW 4 */ + __le32 status; + + /* DW 5-7 */ + __le32 reserved[3]; +}; + +static_assert(sizeof(struct cq_entry) == 32); + /* * UTMRD structure. */ diff --git a/drivers/scsi/ufs/unipro.h b/include/ufs/unipro.h similarity index 97% rename from drivers/scsi/ufs/unipro.h rename to include/ufs/unipro.h index 8e9e486a4f7b..dc9dd1d23f0f 100644 --- a/drivers/scsi/ufs/unipro.h +++ b/include/ufs/unipro.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * drivers/scsi/ufs/unipro.h - * * Copyright (C) 2013 Samsung Electronics Co., Ltd. 
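For orientation, the per-queue doorbell registers defined above (REG_SQHP/REG_SQTP and REG_CQHP/REG_CQTP) are banked once per hardware queue, and struct ufs_hw_queue caches their ioremapped addresses. A sketch of the submission-side hot path under these definitions; the helper is illustrative of the expected usage, not code lifted from the patch:

/* caller holds q->sq_lock */
static void example_inc_sq_tail(struct ufs_hw_queue *q)
{
	if (++q->sq_tail_slot == q->max_entries)
		q->sq_tail_slot = 0;
	/* the tail pointer register holds a byte offset, not a slot index */
	writel(q->sq_tail_slot * sizeof(struct utp_transfer_req_desc),
	       q->mcq_sq_tail);
}

On the completion side, UFSHCD_MCQ_CQIS_TAIL_ENT_PUSH_STS is acknowledged per queue through ufshcd_mcq_write_cqis(), declared earlier in this patch.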
*/ @@ -40,6 +38,18 @@ /* * M-RX Configuration Attributes */ +#define RX_HS_G1_SYNC_LENGTH_CAP 0x008B +#define RX_HS_G1_PREP_LENGTH_CAP 0x008C +#define RX_MIN_ACTIVATETIME_CAPABILITY 0x008F +#define RX_HIBERN8TIME_CAPABILITY 0x0092 +#define RX_HS_G2_SYNC_LENGTH_CAP 0x0094 +#define RX_HS_G3_SYNC_LENGTH_CAP 0x0095 +#define RX_HS_G2_PREP_LENGTH_CAP 0x0096 +#define RX_HS_G3_PREP_LENGTH_CAP 0x0097 +#define RX_ADV_GRANULARITY_CAP 0x0098 +#define RX_HIBERN8TIME_CAP 0x0092 +#define RX_ADV_HIBERN8TIME_CAP 0x0099 +#define RX_ADV_MIN_ACTIVATETIME_CAP 0x009A #define RX_MODE 0x00A1 #define RX_HSRATE_SERIES 0x00A2 #define RX_HSGEAR 0x00A3 @@ -49,32 +59,19 @@ #define RX_ENTER_HIBERN8 0x00A7 #define RX_BYPASS_8B10B_ENABLE 0x00A8 #define RX_TERMINATION_FORCE_ENABLE 0x00A9 -#define RX_MIN_ACTIVATETIME_CAPABILITY 0x008F -#define RX_HIBERN8TIME_CAPABILITY 0x0092 +#define RXCALCTRL 0x00B4 +#define RXSQCTRL 0x00B5 +#define CFGRXCDR8 0x00BA +#define CFGRXOVR8 0x00BD +#define CFGRXOVR6 0x00BF +#define RXDIRECTCTRL2 0x00C7 +#define CFGRXOVR4 0x00E9 #define RX_REFCLKFREQ 0x00EB #define RX_CFGCLKFREQVAL 0x00EC #define CFGWIDEINLN 0x00F0 -#define CFGRXCDR8 0x00BA #define ENARXDIRECTCFG4 0x00F2 -#define CFGRXOVR8 0x00BD -#define RXDIRECTCTRL2 0x00C7 #define ENARXDIRECTCFG3 0x00F3 -#define RXCALCTRL 0x00B4 #define ENARXDIRECTCFG2 0x00F4 -#define CFGRXOVR4 0x00E9 -#define RXSQCTRL 0x00B5 -#define CFGRXOVR6 0x00BF -#define RX_HS_G1_SYNC_LENGTH_CAP 0x008B -#define RX_HS_G1_PREP_LENGTH_CAP 0x008C -#define RX_HS_G2_SYNC_LENGTH_CAP 0x0094 -#define RX_HS_G3_SYNC_LENGTH_CAP 0x0095 -#define RX_HS_G2_PREP_LENGTH_CAP 0x0096 -#define RX_HS_G3_PREP_LENGTH_CAP 0x0097 -#define RX_ADV_GRANULARITY_CAP 0x0098 -#define RX_MIN_ACTIVATETIME_CAP 0x008F -#define RX_HIBERN8TIME_CAP 0x0092 -#define RX_ADV_HIBERN8TIME_CAP 0x0099 -#define RX_ADV_MIN_ACTIVATETIME_CAP 0x009A #define is_mphy_tx_attr(attr) (attr < RX_MODE) @@ -103,49 +100,51 @@ #define UNIPRO_CB_OFFSET(x) (0x8000 | x) /* - * PHY Adpater attributes + * PHY Adapter attributes */ -#define PA_ACTIVETXDATALANES 0x1560 -#define PA_ACTIVERXDATALANES 0x1580 -#define PA_TXTRAILINGCLOCKS 0x1564 #define PA_PHY_TYPE 0x1500 #define PA_AVAILTXDATALANES 0x1520 -#define PA_AVAILRXDATALANES 0x1540 -#define PA_MINRXTRAILINGCLOCKS 0x1543 -#define PA_TXPWRSTATUS 0x1567 -#define PA_RXPWRSTATUS 0x1582 -#define PA_TXFORCECLOCK 0x1562 -#define PA_TXPWRMODE 0x1563 -#define PA_LEGACYDPHYESCDL 0x1570 #define PA_MAXTXSPEEDFAST 0x1521 #define PA_MAXTXSPEEDSLOW 0x1522 #define PA_MAXRXSPEEDFAST 0x1541 #define PA_MAXRXSPEEDSLOW 0x1542 #define PA_TXLINKSTARTUPHS 0x1544 +#define PA_AVAILRXDATALANES 0x1540 +#define PA_MINRXTRAILINGCLOCKS 0x1543 #define PA_LOCAL_TX_LCC_ENABLE 0x155E +#define PA_ACTIVETXDATALANES 0x1560 +#define PA_CONNECTEDTXDATALANES 0x1561 +#define PA_TXFORCECLOCK 0x1562 +#define PA_TXPWRMODE 0x1563 +#define PA_TXTRAILINGCLOCKS 0x1564 #define PA_TXSPEEDFAST 0x1565 #define PA_TXSPEEDSLOW 0x1566 -#define PA_REMOTEVERINFO 0x15A0 +#define PA_TXPWRSTATUS 0x1567 #define PA_TXGEAR 0x1568 #define PA_TXTERMINATION 0x1569 #define PA_HSSERIES 0x156A +#define PA_LEGACYDPHYESCDL 0x1570 #define PA_PWRMODE 0x1571 +#define PA_ACTIVERXDATALANES 0x1580 +#define PA_CONNECTEDRXDATALANES 0x1581 +#define PA_RXPWRSTATUS 0x1582 #define PA_RXGEAR 0x1583 #define PA_RXTERMINATION 0x1584 #define PA_MAXRXPWMGEAR 0x1586 #define PA_MAXRXHSGEAR 0x1587 -#define PA_RXHSUNTERMCAP 0x15A5 -#define PA_RXLSTERMCAP 0x15A6 -#define PA_GRANULARITY 0x15AA #define PA_PACPREQTIMEOUT 0x1590 #define PA_PACPREQEOBTIMEOUT 0x1591 +#define 
PA_REMOTEVERINFO 0x15A0 +#define PA_LOGICALLANEMAP 0x15A1 +#define PA_SLEEPNOCONFIGTIME 0x15A2 +#define PA_STALLNOCONFIGTIME 0x15A3 +#define PA_SAVECONFIGTIME 0x15A4 +#define PA_RXHSUNTERMCAP 0x15A5 +#define PA_RXLSTERMCAP 0x15A6 #define PA_HIBERN8TIME 0x15A7 #define PA_LOCALVERINFO 0x15A9 #define PA_GRANULARITY 0x15AA #define PA_TACTIVATE 0x15A8 -#define PA_PACPFRAMECOUNT 0x15C0 -#define PA_PACPERRORCOUNT 0x15C1 -#define PA_PHYTESTCONTROL 0x15C2 #define PA_PWRMODEUSERDATA0 0x15B0 #define PA_PWRMODEUSERDATA1 0x15B1 #define PA_PWRMODEUSERDATA2 0x15B2 @@ -158,12 +157,9 @@ #define PA_PWRMODEUSERDATA9 0x15B9 #define PA_PWRMODEUSERDATA10 0x15BA #define PA_PWRMODEUSERDATA11 0x15BB -#define PA_CONNECTEDTXDATALANES 0x1561 -#define PA_CONNECTEDRXDATALANES 0x1581 -#define PA_LOGICALLANEMAP 0x15A1 -#define PA_SLEEPNOCONFIGTIME 0x15A2 -#define PA_STALLNOCONFIGTIME 0x15A3 -#define PA_SAVECONFIGTIME 0x15A4 +#define PA_PACPFRAMECOUNT 0x15C0 +#define PA_PACPERRORCOUNT 0x15C1 +#define PA_PHYTESTCONTROL 0x15C2 #define PA_TXHSADAPTTYPE 0x15D4 /* Adpat type for PA_TXHSADAPTTYPE attribute */ @@ -175,9 +171,9 @@ #define PA_HIBERN8_TIME_UNIT_US 100 /*Other attributes*/ +#define VS_POWERSTATE 0xD083 #define VS_MPHYCFGUPDT 0xD085 #define VS_DEBUGOMC 0xD09E -#define VS_POWERSTATE 0xD083 #define PA_GRANULARITY_MIN_VAL 1 #define PA_GRANULARITY_MAX_VAL 6 @@ -231,6 +227,7 @@ enum ufs_hs_gear_tag { UFS_HS_G2, /* HS Gear 2 */ UFS_HS_G3, /* HS Gear 3 */ UFS_HS_G4, /* HS Gear 4 */ + UFS_HS_G5 /* HS Gear 5 */ }; enum ufs_unipro_ver { @@ -248,27 +245,27 @@ enum ufs_unipro_ver { /* * Data Link Layer Attributes */ +#define DL_TXPREEMPTIONCAP 0x2000 +#define DL_TC0TXMAXSDUSIZE 0x2001 +#define DL_TC0RXINITCREDITVAL 0x2002 +#define DL_TC1TXMAXSDUSIZE 0x2003 +#define DL_TC1RXINITCREDITVAL 0x2004 +#define DL_TC0TXBUFFERSIZE 0x2005 +#define DL_TC1TXBUFFERSIZE 0x2006 #define DL_TC0TXFCTHRESHOLD 0x2040 #define DL_FC0PROTTIMEOUTVAL 0x2041 #define DL_TC0REPLAYTIMEOUTVAL 0x2042 #define DL_AFC0REQTIMEOUTVAL 0x2043 #define DL_AFC0CREDITTHRESHOLD 0x2044 #define DL_TC0OUTACKTHRESHOLD 0x2045 +#define DL_PEERTC0PRESENT 0x2046 +#define DL_PEERTC0RXINITCREVAL 0x2047 #define DL_TC1TXFCTHRESHOLD 0x2060 #define DL_FC1PROTTIMEOUTVAL 0x2061 #define DL_TC1REPLAYTIMEOUTVAL 0x2062 #define DL_AFC1REQTIMEOUTVAL 0x2063 #define DL_AFC1CREDITTHRESHOLD 0x2064 #define DL_TC1OUTACKTHRESHOLD 0x2065 -#define DL_TXPREEMPTIONCAP 0x2000 -#define DL_TC0TXMAXSDUSIZE 0x2001 -#define DL_TC0RXINITCREDITVAL 0x2002 -#define DL_TC0TXBUFFERSIZE 0x2005 -#define DL_PEERTC0PRESENT 0x2046 -#define DL_PEERTC0RXINITCREVAL 0x2047 -#define DL_TC1TXMAXSDUSIZE 0x2003 -#define DL_TC1RXINITCREDITVAL 0x2004 -#define DL_TC1TXBUFFERSIZE 0x2006 #define DL_PEERTC1PRESENT 0x2066 #define DL_PEERTC1RXINITCREVAL 0x2067 @@ -300,20 +297,6 @@ enum ufs_unipro_ver { #define T_TC0TXMAXSDUSIZE 0x4060 #define T_TC1TXMAXSDUSIZE 0x4061 -#ifdef FALSE -#undef FALSE -#endif - -#ifdef TRUE -#undef TRUE -#endif - -/* Boolean attribute values */ -enum { - FALSE = 0, - TRUE, -}; - /* CPort setting */ #define E2EFC_ON (1 << 0) #define E2EFC_OFF (0 << 0) diff --git a/init/Kconfig b/init/Kconfig index a4144393717b..9a002d0128c5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -62,13 +62,13 @@ config LLD_VERSION config CC_CAN_LINK bool - default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(m64-flag)) if 64BIT - default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(m32-flag)) + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) 
$(USERLDFLAGS) $(m64-flag)) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag)) config CC_CAN_LINK_STATIC bool - default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(m64-flag) -static) if 64BIT - default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(m32-flag) -static) + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag) -static) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag) -static) config CC_HAS_ASM_GOTO def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) @@ -144,7 +144,7 @@ config COMPILE_TEST config WERROR bool "Compile the kernel with warnings as errors" - default COMPILE_TEST + default y help A kernel build should not cause any compiler warnings, and this enables the '-Werror' flag to enforce that rule by default. @@ -1274,6 +1274,17 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config RT_SOFTINT_OPTIMIZATION + bool "Improve RT scheduling during long softint execution" + depends on ARM64 + depends on SMP + default n + help + Enable an optimization which tries to avoid placing RT tasks on CPUs + occupied by nonpreemptible tasks, such as a long softint, or CPUs + which may soon block preemption, such as a CPU running a ksoftirq + thread which handles slow softints. + config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS @@ -1804,6 +1815,10 @@ config HAVE_PERF_EVENTS help See tools/perf/design.txt for details. +config GUEST_PERF_EVENTS + bool + depends on HAVE_PERF_EVENTS + config PERF_USE_VMALLOC bool help @@ -2147,6 +2162,20 @@ config MODULE_SRCVERSION_ALL the version). With this option, such a "srcversion" field will be created for all modules. If unsure, say N. +config MODULE_SCMVERSION + bool "SCM version for modules" + depends on LOCALVERSION_AUTO + help + This enables the module attribute "scmversion" which can be used + by developers to identify the SCM version of a given module, e.g. + git sha1 or hg sha1. The SCM version can be queried by modinfo or + via the sysfs node: /sys/module/MODULENAME/scmversion. This is + useful when the kernel or kernel modules are updated separately + since that causes the vermagic of the kernel and the module to + differ. + + If unsure, say N. + config MODULE_SIG bool "Module signature verification" select MODULE_SIG_FORMAT @@ -2176,6 +2205,18 @@ config MODULE_SIG_FORCE Reject unsigned modules or signed modules for which we don't have a key. Without this, such modules will simply taint the kernel. +config MODULE_SIG_PROTECT + bool "Android GKI module protection" + depends on MODULE_SIG && !MODULE_SIG_FORCE + help + Enables Android GKI symbol protection support. + + This modifies the behavior of MODULE_SIG_FORCE as follows: + - Allows Android GKI Modules signed using MODULE_SIG_ALL during build. + - Allows other modules to load if they don't violate the access to + Android GKI protected symbols. Loading will fail and return + -EACCES (Permission denied) if symbol access conditions are not met. + config MODULE_SIG_ALL bool "Automatically sign all modules" default y @@ -2381,3 +2422,5 @@ config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE # .
config ARCH_HAS_SYSCALL_WRAPPER def_bool n + +source "init/Kconfig.gki" diff --git a/init/Kconfig.gki b/init/Kconfig.gki new file mode 100644 index 000000000000..ce622014516e --- /dev/null +++ b/init/Kconfig.gki @@ -0,0 +1,267 @@ +config GKI_HIDDEN_DRM_CONFIGS + bool "Hidden DRM configs needed for GKI" + select DRM_KMS_HELPER if (HAS_IOMEM && DRM) + select DRM_GEM_SHMEM_HELPER if (DRM) + select DRM_GEM_CMA_HELPER + select DRM_KMS_CMA_HELPER + select DRM_MIPI_DSI + select DRM_TTM if (HAS_IOMEM && DRM) + select VIDEOMODE_HELPERS + select WANT_DEV_COREDUMP + select INTERVAL_TREE + help + Dummy config option used to enable hidden DRM configs. + These are normally selected implicitly when including a + DRM module, but for GKI, the modules are built out-of-tree. + +config GKI_HIDDEN_REGMAP_CONFIGS + bool "Hidden Regmap configs needed for GKI" + select REGMAP_IRQ + select REGMAP_MMIO + select REGMAP_SPMI + select SPMI + help + Dummy config option used to enable hidden regmap configs. + These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_CRYPTO_CONFIGS + bool "Hidden CRYPTO configs needed for GKI" + select CRYPTO_ENGINE + help + Dummy config option used to enable hidden CRYPTO configs. + These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_SND_CONFIGS + bool "Hidden SND configs needed for GKI" + select SND_VMASTER + select SND_PCM_ELD + select SND_JACK + select SND_JACK_INPUT_DEV + select SND_INTEL_NHLT if (ACPI) + help + Dummy config option used to enable hidden SND configs. + These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_SND_SOC_CONFIGS + bool "Hidden SND_SOC configs needed for GKI" + select SND_SOC_GENERIC_DMAENGINE_PCM if (SND_SOC && SND) + select SND_PCM_IEC958 + select SND_SOC_COMPRESS if (SND_SOC && SND) + select SND_SOC_TOPOLOGY if (SND_SOC && SND) + select DMADEVICES + select DMA_VIRTUAL_CHANNELS + help + Dummy config option used to enable hidden SND_SOC configs. + These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_MMC_CONFIGS + bool "Hidden MMC configs needed for GKI" + select MMC_SDHCI_IO_ACCESSORS if (MMC_SDHCI) + help + Dummy config option used to enable hidden MMC configs. + These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_GPIO_CONFIGS + bool "Hidden GPIO configs needed for GKI" + select PINCTRL_SINGLE if (PINCTRL && OF && HAS_IOMEM) + select GPIO_PL061 if (HAS_IOMEM && ARM_AMBA && GPIOLIB) + help + Dummy config option used to enable hidden GPIO configs. + These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_QCOM_CONFIGS + bool "Hidden QCOM configs needed for GKI" + select QCOM_SMEM_STATE + select QCOM_GDSC if (ARCH_QCOM) + select IOMMU_IO_PGTABLE_LPAE if (ARCH_QCOM) + + help + Dummy config option used to enable hidden QCOM configs. + These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_MEDIA_CONFIGS + bool "Hidden Media configs needed for GKI" + select VIDEOBUF2_CORE + select V4L2_MEM2MEM_DEV + select MEDIA_CONTROLLER + select MEDIA_CONTROLLER_REQUEST_API + select MEDIA_SUPPORT + select FRAME_VECTOR + select CEC_CORE + select CEC_NOTIFIER + select CEC_PIN + select VIDEOBUF2_DMA_CONTIG + select VIDEOBUF2_DMA_SG + help + Dummy config option used to enable hidden media configs. 
+ These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_VIRTUAL_CONFIGS + bool "Hidden Virtual configs needed for GKI" + select HVC_DRIVER + help + Dummy config option used to enable hidden virtual device configs. + These are normally selected implicitly when a module + that relies on it is configured. + +# LEGACY_WEXT_ALLCONFIG Discussed upstream, soundly rejected as a unique +# problem for GKI to solve. It should be noted that these extensions are +# in-effect deprecated and generally unsupported and we should pressure +# the SOC vendors to drop any modules that require these extensions. +config GKI_LEGACY_WEXT_ALLCONFIG + bool "Hidden wireless extension configs needed for GKI" + select WIRELESS_EXT + select WEXT_CORE + select WEXT_PROC + select WEXT_SPY + select WEXT_PRIV + help + Dummy config option used to enable all the hidden legacy wireless + extensions to the core wireless network functionality used by + add-in modules. + + If you are not building a kernel to be used for a variety of + out-of-kernel built wireless modules, say N here. + +config GKI_HIDDEN_USB_CONFIGS + bool "Hidden USB configurations needed for GKI" + select USB_PHY + help + Dummy config option used to enable all USB related hidden configs. + These configurations are usually only selected by another config + option or a combination of them. + + If you are not building a kernel to be used for a variety of + out-of-kernel built USB drivers, say N here. + +config GKI_HIDDEN_SOC_BUS_CONFIGS + bool "Hidden SoC bus configuration needed for GKI" + select SOC_BUS + help + Dummy config option used to enable SOC_BUS hidden Kconfig. + The configuration is required for SoCs to register themselves to the bus. + + If you are not building a kernel to be used for a variety of SoCs and + out-of-tree drivers, say N here. + +config GKI_HIDDEN_RPMSG_CONFIGS + bool "Hidden RPMSG configuration needed for GKI" + select RPMSG + help + Dummy config option used to enable the hidden RPMSG config. + This configuration is usually only selected by another config + option or a combination of them. + + If you are not building a kernel to be used for a variety of + out-of-kernel built RPMSG drivers, say N here. + +config GKI_HIDDEN_GPU_CONFIGS + bool "Hidden GPU configuration needed for GKI" + select TRACE_GPU_MEM + help + Dummy config option used to enable the hidden GPU config. + These are normally selected implicitly when a module + that relies on it is configured. + +config GKI_HIDDEN_IRQ_CONFIGS + bool "Hidden IRQ configuration needed for GKI" + select GENERIC_IRQ_CHIP + select IRQ_DOMAIN_HIERARCHY + select IRQ_FASTEOI_HIERARCHY_HANDLERS + help + Dummy config option used to enable GENERIC_IRQ_CHIP hidden + config, required by various SoC platforms. This is usually + selected by ARCH_*. + +config GKI_HIDDEN_HYPERVISOR_CONFIGS + bool "Hidden hypervisor configuration needed for GKI" + select SYS_HYPERVISOR + help + Dummy config option used to enable the SYS_HYPERVISOR hidden + config, required by various SoC platforms. This is usually + selected by XEN or S390. + +config GKI_HIDDEN_NET_CONFIGS + bool "Hidden networking configuration needed for GKI" + select PAGE_POOL + select NET_PTP_CLASSIFY + help + Dummy config option used to enable the networking hidden + config, required by various SoC platforms.
+ +config GKI_HIDDEN_PHY_CONFIGS + bool "Hidden PHY configuration needed for GKI" + select GENERIC_PHY_MIPI_DPHY + help + Dummy config option used to enable the hidden PHY configs, + required by various SoC platforms. + +config GKI_HIDDEN_MM_CONFIGS + bool "Hidden MM configuration needed for GKI" + select PAGE_REPORTING + select BALLOON_COMPACTION + select MEMORY_BALLOON + help + Dummy config option used to enable hidden MM configs, + currently required for VIRTIO_BALLOON + +config GKI_HIDDEN_ETHERNET_CONFIGS + bool "Hidden Ethernet configuration needed for GKI" + select PHYLINK + help + Dummy config option used to enable the hidden Ethernet PHYLINK + configs, required by various ethernet devices. + +config GKI_HIDDEN_DMA_CONFIGS + bool "Hidden DMA configuration needed for GKI" + select ASYNC_TX_ENABLE_CHANNEL_SWITCH + help + Dummy config option used to enable the hidden DMA configs, + required by various SoC platforms. + +# Atrocities needed for +# a) building GKI modules in separate tree, or +# b) building drivers that are not modularizable +# +# All of these should be reworked into an upstream solution +# if possible. +# +config GKI_HACKS_TO_FIX + bool "GKI Dummy config options" + select GKI_HIDDEN_CRYPTO_CONFIGS + select GKI_HIDDEN_DRM_CONFIGS + select GKI_HIDDEN_REGMAP_CONFIGS + select GKI_HIDDEN_SND_CONFIGS + select GKI_HIDDEN_SND_SOC_CONFIGS + select GKI_HIDDEN_MMC_CONFIGS + select GKI_HIDDEN_GPIO_CONFIGS + select GKI_HIDDEN_QCOM_CONFIGS + select GKI_LEGACY_WEXT_ALLCONFIG + select GKI_HIDDEN_MEDIA_CONFIGS + select GKI_HIDDEN_VIRTUAL_CONFIGS + select GKI_HIDDEN_USB_CONFIGS + select GKI_HIDDEN_SOC_BUS_CONFIGS + select GKI_HIDDEN_RPMSG_CONFIGS + select GKI_HIDDEN_GPU_CONFIGS + select GKI_HIDDEN_IRQ_CONFIGS + select GKI_HIDDEN_HYPERVISOR_CONFIGS + select GKI_HIDDEN_NET_CONFIGS + select GKI_HIDDEN_PHY_CONFIGS + select GKI_HIDDEN_MM_CONFIGS + select GKI_HIDDEN_ETHERNET_CONFIGS + select GKI_HIDDEN_DMA_CONFIGS + + help + Dummy config option used to enable core functionality used by + modules that may not be selectable in this config. + + Unless you are building a GKI kernel to be used with modules + built from a different config, say N here. diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 533d81ed74d4..cd4a81114f75 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -83,7 +83,7 @@ static void __init handle_initrd(void) * In case that a resume from disk is carried out by linuxrc or one of * its children, we need to tell the freezer not to wait for us. 
*/ - current->flags |= PF_FREEZER_SKIP; + freezer_do_not_count(); info = call_usermodehelper_setup("/linuxrc", argv, envp_init, GFP_KERNEL, init_linuxrc, NULL, NULL); diff --git a/init/init_task.c b/init/init_task.c index 2d024066e27b..559eff84a620 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -214,6 +214,10 @@ struct task_struct init_task #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_ANDROID_VENDOR_OEM_DATA + .android_vendor_data1 = {0, }, + .android_oem_data1 = {0, }, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/kernel/.gitignore b/kernel/.gitignore index c6b299a6b786..43a96fcbfafc 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only /config_data /kheaders.md5 +/gki_module_protected_exports.h +/gki_module_unprotected.h diff --git a/kernel/Makefile b/kernel/Makefile index 599cb926449a..9dc7e32828ca 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,9 +41,6 @@ KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector -# Don't instrument error handlers -CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI) - obj-y += sched/ obj-y += locking/ obj-y += power/ @@ -69,6 +66,7 @@ obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o +obj-$(CONFIG_MODULE_SIG_PROTECT) += gki_module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o @@ -160,3 +158,28 @@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) clean-files := kheaders_data.tar.xz kheaders.md5 + +# +# ANDROID: GKI: Generate header files required for gki_module.o +# +# Dependencies on generated files need to be listed explicitly +$(obj)/gki_module.o: $(obj)/gki_module_protected_exports.h \ + $(obj)/gki_module_unprotected.h + +ALL_KMI_SYMBOLS := all_kmi_symbols + +$(obj)/gki_module_unprotected.h: $(ALL_KMI_SYMBOLS) \ + $(srctree)/scripts/gen_gki_modules_headers.sh + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/gen_gki_modules_headers.sh $@ \ + "$(srctree)" \ + $(ALL_KMI_SYMBOLS) + +# Generate symbol list with union of all symbol lists for arm64; empty for others +$(ALL_KMI_SYMBOLS): $(if $(filter arm64,$(ARCH)),$(wildcard $(srctree)/android/abi_gki_aarch64_*),) + $(if $(strip $^),cat $^ > $(ALL_KMI_SYMBOLS), echo "" > $(ALL_KMI_SYMBOLS)) + +$(obj)/gki_module_protected_exports.h: $(srctree)/android/abi_gki_protected_exports \ + $(srctree)/scripts/gen_gki_modules_headers.sh + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/gen_gki_modules_headers.sh $@ \ + "$(srctree)" \ + $< diff --git a/kernel/bounds.c b/kernel/bounds.c index 9795d75b09b2..b529182e8b04 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -22,6 +22,13 @@ int main(void) DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); +#ifdef CONFIG_LRU_GEN + DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); + DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2); +#else + DEFINE(LRU_GEN_WIDTH, 0); + DEFINE(__LRU_REFS_WIDTH, 0); +#endif /* End of constants */ return 0; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7f33098ca63f..0115967ac7f0 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,3 +36,6 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ +ifeq ($(CONFIG_FUSE_BPF),y) +obj-$(CONFIG_BPF_SYSCALL) +=
bpf_fuse.o +endif diff --git a/kernel/bpf/bpf_fuse.c b/kernel/bpf/bpf_fuse.c new file mode 100644 index 000000000000..c6aa670bc54c --- /dev/null +++ b/kernel/bpf/bpf_fuse.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2021 Google LLC + +#include +#include + +static const struct bpf_func_proto * +fuse_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + + default: + pr_debug("Invalid fuse bpf func %d\n", func_id); + return NULL; + } +} + +static bool fuse_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + int i; + + if (off < 0 || off > offsetofend(struct fuse_bpf_args, out_args)) + return false; + + /* TODO This is garbage. Do it properly */ + for (i = 0; i < 5; i++) { + if (off == offsetof(struct fuse_bpf_args, in_args[i].value)) { + info->reg_type = PTR_TO_BUF; + info->ctx_field_size = 256; + if (type != BPF_READ) + return false; + return true; + } + } + for (i = 0; i < 3; i++) { + if (off == offsetof(struct fuse_bpf_args, out_args[i].value)) { + info->reg_type = PTR_TO_BUF; + info->ctx_field_size = 256; + return true; + } + } + if (type != BPF_READ) + return false; + + return true; +} + +const struct bpf_verifier_ops fuse_verifier_ops = { + .get_func_proto = fuse_prog_func_proto, + .is_valid_access = fuse_prog_is_valid_access, +}; + +const struct bpf_prog_ops fuse_prog_ops = { +}; + +struct bpf_prog *fuse_get_bpf_prog(struct file *file) +{ + struct bpf_prog *bpf_prog = ERR_PTR(-EINVAL); + + if (!file || IS_ERR(file)) + return bpf_prog; + /** + * Two ways of getting a bpf prog from another task's fd, since + * bpf_prog_get_type_dev only works with an fd + * + * 1) Duplicate a little of the needed code. Requires access to + * bpf_prog_fops for validation, which is not exported for modules + * 2) Insert the bpf_file object into a fd from the current task + * Stupidly complex, but I think OK, as security checks are not run + * during the existence of the handle + * + * Best would be to upstream 1) into kernel/bpf/syscall.c and export it + * for use here. Failing that, we have to use 2, since fuse must be + * compilable as a module. 
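The consumer side that the comment above anticipates would look roughly like the sketch below (illustrative, not part of the patch): fuse_get_bpf_prog() hands back a referenced program or an ERR_PTR, and the reference is dropped with bpf_prog_put() when the filter goes away:

struct bpf_prog *prog = fuse_get_bpf_prog(file);

if (IS_ERR(prog))
	return PTR_ERR(prog);
/* ... hold prog for the lifetime of the fuse filter ... */
bpf_prog_put(prog);	/* on teardown */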
+ */ +#if 1 + if (file->f_op != &bpf_prog_fops) + goto out; + + bpf_prog = file->private_data; + if (bpf_prog->type == BPF_PROG_TYPE_FUSE) + bpf_prog_inc(bpf_prog); + else + bpf_prog = ERR_PTR(-EINVAL); + +#else + { + int task_fd = get_unused_fd_flags(file->f_flags); + + if (task_fd < 0) + goto out; + + fd_install(task_fd, file); + + bpf_prog = bpf_prog_get_type_dev(task_fd, BPF_PROG_TYPE_FUSE, + false); + + /* Close the fd, which also closes the file */ + __close_fd(current->files, task_fd); + file = NULL; + } +#endif + +out: + if (file) + fput(file); + return bpf_prog; +} +EXPORT_SYMBOL(fuse_get_bpf_prog); + + diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0c2fa93bd8d2..9e8a28e4da13 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -1299,12 +1300,18 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; - /* btf verifier prints all types it is processing via - * btf_verifier_log_type(..., fmt = NULL). - * Skip those prints for in-kernel BTF verification. - */ - if (log->level == BPF_LOG_KERNEL && !fmt) - return; + if (log->level == BPF_LOG_KERNEL) { + /* btf verifier prints all types it is processing via + * btf_verifier_log_type(..., fmt = NULL). + * Skip those prints for in-kernel BTF verification. + */ + if (!fmt) + return; + + /* Skip logging when loading module BTF with mismatches permitted */ + if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + return; + } __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, @@ -1343,8 +1350,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; - if (log->level == BPF_LOG_KERNEL && !fmt) - return; + if (log->level == BPF_LOG_KERNEL) { + if (!fmt) + return; + + /* Skip logging when loading module BTF with mismatches permitted */ + if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + return; + } + /* The CHECK_META phase already did a btf dump. 
* * If member is logged again, it must hit an error in @@ -6063,10 +6077,14 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, } btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); if (IS_ERR(btf)) { - pr_warn("failed to validate module [%s] BTF: %ld\n", - mod->name, PTR_ERR(btf)); kfree(btf_mod); - err = PTR_ERR(btf); + if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { + pr_warn("failed to validate module [%s] BTF: %ld\n", + mod->name, PTR_ERR(btf)); + err = PTR_ERR(btf); + } else { + pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n"); + } goto out; } err = btf_alloc_id(btf); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ad41b8230780..b5f17f9c72e5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -32,6 +32,8 @@ #include #include +#include + #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) @@ -4587,6 +4589,8 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; + trace_android_vh_check_bpf_syscall(cmd, &attr, size); + err = security_bpf(cmd, &attr, size); if (err < 0) return err; diff --git a/kernel/cfi.c b/kernel/cfi.c index 08102d19ec15..2789b061d70b 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -43,6 +43,8 @@ typedef u16 shadow_t; struct cfi_shadow { /* Page index for the beginning of the shadow */ unsigned long base; + /* rcu to free old cfi_shadow asynchronously */ + struct rcu_head rcu; /* An array of __cfi_check locations (as indices to the shadow) */ shadow_t shadow[1]; } __packed; @@ -182,6 +184,13 @@ static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod, } } +static void free_shadow(struct rcu_head *rcu) +{ + struct cfi_shadow *old = container_of(rcu, struct cfi_shadow, rcu); + + vfree(old); +} + typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *, unsigned long min_addr, unsigned long max_addr); @@ -211,11 +220,10 @@ static void update_shadow(struct module *mod, unsigned long base_addr, rcu_assign_pointer(cfi_shadow, next); mutex_unlock(&shadow_update_lock); - synchronize_rcu(); if (prev) { set_memory_rw((unsigned long)prev, SHADOW_PAGES); - vfree(prev); + call_rcu(&prev->rcu, free_shadow); } } @@ -311,7 +319,7 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) return fn; } -void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +static inline void __nocfi ___cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) { cfi_check_fn fn = find_check_fn((unsigned long)ptr); @@ -320,6 +328,11 @@ void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) else /* Don't allow unchecked modules */ handle_cfi_failure(ptr); } + +void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +{ + ___cfi_slowpath_diag(id, ptr, diag); +} EXPORT_SYMBOL(__cfi_slowpath_diag); #else /* !CONFIG_MODULES */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index d8fcc139ac05..c45ba890869b 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -165,7 +165,6 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; @@ -250,7 +249,8 @@ int 
cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + bool *locked, + struct cgroup *dst_cgrp); __acquires(&cgroup_threadgroup_rwsem); void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index ee8b3d80f19e..ce4b9872e7d0 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -17,6 +17,7 @@ #include #include +#include /* * pidlists linger the following amount before being destroyed. The goal @@ -505,7 +506,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &locked, cgrp); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -519,13 +520,15 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) ret = -EACCES; put_cred(tcred); if (ret) goto out_finish; ret = cgroup_attach_task(cgrp, task, threadgroup); + trace_android_vh_cgroup_set_task(ret, task); out_finish: cgroup_procs_write_finish(task, locked); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b58ece486896..340b45ade0f6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -61,6 +61,9 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS + +#include #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) @@ -2440,6 +2443,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, return cgroup_taskset_next(tset, dst_cssp); } +EXPORT_SYMBOL_GPL(cgroup_taskset_first); /** * cgroup_taskset_next - iterate to the next task in taskset @@ -2486,6 +2490,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, return NULL; } +EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** * cgroup_migrate_execute - migrate a taskset @@ -2556,6 +2561,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; + trace_android_vh_cgroup_attach(ss, tset); ss->attach(tset); } } while_each_subsys_mask(); @@ -2862,10 +2868,13 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + bool *threadgroup_locked, + struct cgroup *dst_cgrp) + __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; pid_t pid; + bool force_migration = false; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); @@ -2896,13 +2905,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, if (threadgroup) tsk = tsk->group_leader; + if (tsk->flags & PF_KTHREAD) + trace_android_rvh_cgroup_force_kthread_migration(tsk, dst_cgrp, &force_migration); + /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. 
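The restricted vendor hook called just above, android_rvh_cgroup_force_kthread_migration, is what lets a vendor module overrule the kthread check below for tasks it knows are safe to move. A sketch of a registration, following the usual Android vendor-hook pattern; the handler and its policy, including example_trusted_cgrp, are hypothetical:

static void example_force_kthread_migration(void *unused,
		struct task_struct *tsk, struct cgroup *dst_cgrp,
		bool *force_migration)
{
	/* hypothetical policy: permit kthread moves into one known cgroup */
	*force_migration = (dst_cgrp == example_trusted_cgrp);
}

/* from the vendor module's init: */
register_trace_android_rvh_cgroup_force_kthread_migration(
		example_force_kthread_migration, NULL);

As with other restricted vendor hooks, registration is effectively permanent, so this is a boot-time, vendor-module-only affair.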
*/ - if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { + if (!force_migration && (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY))) { tsk = ERR_PTR(-EINVAL); goto out_unlock_threadgroup; } @@ -4315,6 +4327,7 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } +EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes); /** * cgroup_file_notify - generate a file modified event for a cgroup_file @@ -4404,6 +4417,7 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, return next; return NULL; } +EXPORT_SYMBOL_GPL(css_next_child); /** * css_next_descendant_pre - find the next descendant for pre-order walk @@ -4979,7 +4993,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked, dst_cgrp); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 428820bf141d..e82cf5f28f63 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -67,6 +67,8 @@ #include #include +#include + DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -106,6 +108,7 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; + cpumask_var_t cpus_requested; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ @@ -350,17 +353,6 @@ static struct cpuset top_cpuset = { */ DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); - -void cpuset_read_lock(void) -{ - percpu_down_read(&cpuset_rwsem); -} - -void cpuset_read_unlock(void) -{ - percpu_up_read(&cpuset_rwsem); -} - static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; @@ -476,7 +468,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_requested, q->cpus_requested) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -513,8 +505,13 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; + if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_three; + return 0; +free_three: + free_cpumask_var(*pmask3); free_two: free_cpumask_var(*pmask2); free_one: @@ -531,6 +528,7 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { if (cs) { free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->cpus_requested); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->subparts_cpus); } @@ -559,6 +557,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) } cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->cpus_requested, cs->cpus_requested); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; } @@ -627,7 +626,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + 
cpumask_intersects(trial->cpus_requested, c->cpus_requested)) goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -1076,6 +1075,18 @@ void rebuild_sched_domains(void) cpus_read_unlock(); } +static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p, + const struct cpumask *new_mask) +{ + int ret = -EINVAL; + + trace_android_rvh_update_cpus_allowed(p, cs->cpus_requested, new_mask, &ret); + if (!ret) + return ret; + + return set_cpus_allowed_ptr(p, new_mask); +} + /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -1098,7 +1109,7 @@ static void update_tasks_cpumask(struct cpuset *cs) if (top_cs && (task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) continue; - set_cpus_allowed_ptr(task, cs->effective_cpus); + update_cpus_allowed(cs, task, cs->effective_cpus); } css_task_iter_end(&it); } @@ -1120,10 +1131,10 @@ static void compute_effective_cpumask(struct cpumask *new_cpus, if (parent->nr_subparts_cpus) { cpumask_or(new_cpus, parent->effective_cpus, parent->subparts_cpus); - cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); + cpumask_and(new_cpus, new_cpus, cs->cpus_requested); cpumask_and(new_cpus, new_cpus, cpu_active_mask); } else { - cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); + cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); } } @@ -1565,25 +1576,26 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EACCES; /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. + * An empty cpus_requested is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. */ if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); + cpumask_clear(trialcs->cpus_requested); } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_requested); if (retval < 0) return retval; - - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; } + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) + return -EINVAL; + + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); + /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) return 0; retval = validate_change(cs, trialcs); @@ -1611,6 +1623,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); /* * Make sure that subparts_cpus is a subset of cpus_allowed. @@ -2267,7 +2280,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * can_attach beforehand should guarantee that this doesn't * fail. 
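The cpus_requested/cpus_allowed split above keeps what userspace asked for separate from what can currently be granted: the effective mask is recomputed as the intersection of the request with the active CPUs, so a request is not silently lost across hotplug. A stand-alone sketch of that bookkeeping with plain bitmasks (the masks are invented for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long requested = 0xF0;	/* user asked for CPUs 4-7 */
	unsigned long active    = 0x3F;	/* only CPUs 0-5 are online */
	unsigned long allowed   = requested & active;	/* "cpus_allowed" */

	printf("requested=%#lx active=%#lx -> allowed=%#lx\n",
	       requested, active, allowed);

	/* after CPU 6 is hotplugged back in, the original request survives */
	active  = 0x7F;
	allowed = requested & active;
	printf("after hotplug-in: allowed=%#lx\n", allowed);
	return 0;
}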
TODO: have a better way to handle failure here */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); @@ -2491,7 +2504,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) switch (type) { case FILE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested)); break; case FILE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); @@ -2865,6 +2878,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->cpus_requested, parent->cpus_requested); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: @@ -2981,8 +2995,10 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_requested); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); diff --git a/kernel/cpu.c b/kernel/cpu.c index 393114c10c28..fdbd61613371 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -1344,6 +1345,25 @@ void cpuhp_online_idle(enum cpuhp_state state) complete_ap_thread(st, true); } +static int switch_to_rt_policy(void) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + unsigned int policy = current->policy; + + if (policy == SCHED_NORMAL) + /* Switch to SCHED_FIFO from SCHED_NORMAL. */ + return sched_setscheduler_nocheck(current, SCHED_FIFO, &param); + else + return 1; +} + +static int switch_to_fair_policy(void) +{ + struct sched_param param = { .sched_priority = 0 }; + + return sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); +} + /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { @@ -1408,6 +1428,7 @@ out: static int cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; + int switch_err; if (!cpu_possible(cpu)) { pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", @@ -1418,9 +1439,21 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target) return -EINVAL; } + /* + * A CPU hotplug operation consists of many steps, and each step + * calls a callback into a core kernel subsystem. A hotplug-in + * operation may be preempted by other CFS tasks, delaying the whole + * operation. Switch the + * current task to SCHED_FIFO from SCHED_NORMAL so that the + * hotplug-in operation completes quickly under heavy load + * and the new CPU can start handling work sooner.
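For reference, the same boost-then-restore idea as a userspace analogue, with sched_setscheduler(2) standing in for the kernel-internal sched_setscheduler_nocheck(); this is only a sketch, and the process needs CAP_SYS_NICE (or root) for the boost to succeed:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static int boost_to_fifo(void)
{
	struct sched_param p = {
		.sched_priority = sched_get_priority_max(SCHED_FIFO),
	};

	return sched_setscheduler(0, SCHED_FIFO, &p);	/* 0 = this process */
}

static int restore_fair(void)
{
	struct sched_param p = { .sched_priority = 0 };

	return sched_setscheduler(0, SCHED_OTHER, &p);
}

int main(void)
{
	int boosted = boost_to_fifo();	/* -1 (EPERM) without CAP_SYS_NICE */

	/* ... latency-critical section, e.g. driving a hotplug operation ... */

	if (boosted == 0 && restore_fair() != 0)
		perror("failed to drop back to SCHED_OTHER");
	return 0;
}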
+ */ + + switch_err = switch_to_rt_policy(); + err = try_online_node(cpu_to_node(cpu)); if (err) - return err; + goto switch_out; cpu_maps_update_begin(); @@ -1436,6 +1469,14 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target) err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); +switch_out: + if (!switch_err) { + switch_err = switch_to_fair_policy(); + if (switch_err) + pr_err("Hotplug policy switch err=%d Task %s pid=%d\n", + switch_err, current->comm, current->pid); + } + return err; } diff --git a/kernel/cred.c b/kernel/cred.c index 933155c96922..bd863bc50f0e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -17,6 +17,8 @@ #include #include +#include + #if 0 #define kdebug(FMT, ...) \ printk("[%-5.5s%5u] " FMT "\n", \ @@ -181,6 +183,7 @@ void exit_creds(struct task_struct *tsk) key_put(tsk->cached_requested_key); tsk->cached_requested_key = NULL; #endif + trace_android_rvh_exit_creds(tsk, cred); } /** @@ -499,6 +502,7 @@ int commit_creds(struct cred *new) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); + trace_android_rvh_commit_creds(task, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(old, -2); @@ -576,6 +580,7 @@ const struct cred *override_creds(const struct cred *new) get_new_cred((struct cred *)new); alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); + trace_android_rvh_override_creds(current, new); alter_cred_subscribers(old, -1); kdebug("override_creds() = %p{%d,%d}", old, @@ -604,6 +609,7 @@ void revert_creds(const struct cred *old) validate_creds(override); alter_cred_subscribers(old, 1); rcu_assign_pointer(current->cred, old); + trace_android_rvh_revert_creds(current, old); alter_cred_subscribers(override, -1); put_cred(override); } diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 25fc85a7aebe..ca05989d9901 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -75,7 +75,7 @@ out_unmap_membase: return ERR_PTR(-ENOMEM); } -static void dma_release_coherent_memory(struct dma_coherent_mem *mem) +static void _dma_release_coherent_memory(struct dma_coherent_mem *mem) { if (!mem) return; @@ -127,10 +127,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, ret = dma_assign_coherent_memory(dev, mem); if (ret) - dma_release_coherent_memory(mem); + _dma_release_coherent_memory(mem); return ret; } +void dma_release_coherent_memory(struct device *dev) +{ + if (dev) + _dma_release_coherent_memory(dev->dma_mem); +} + static void *__dma_alloc_from_coherent(struct device *dev, struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 3d63d91cba5c..da70a24d1b7a 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -58,6 +58,7 @@ #endif struct cma *dma_contiguous_default_area; +EXPORT_SYMBOL_GPL(dma_contiguous_default_area); /* * Default global CMA area size can be defined in kernel's .config. @@ -260,7 +261,8 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, no_warn); + return cma_alloc(dev_get_cma_area(dev), count, align, GFP_KERNEL | + (no_warn ? 
__GFP_NOWARN : 0)); } /** @@ -283,7 +285,8 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) { unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT); - return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN); + return cma_alloc(cma, size >> PAGE_SHIFT, align, + GFP_KERNEL | (gfp & __GFP_NOWARN)); } /** diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ed5dd9e02324..f33092303f47 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -60,7 +60,8 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, *phys_limit = dma_to_phys(dev, dma_limit); if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; - if (*phys_limit <= DMA_BIT_MASK(32)) + if (*phys_limit <= DMA_BIT_MASK(32) && + !zone_dma32_are_empty()) return GFP_DMA32; return 0; } @@ -141,7 +142,8 @@ again: if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && - !(gfp & (GFP_DMA32 | GFP_DMA))) { + !(gfp & (GFP_DMA32 | GFP_DMA)) && + !zone_dma32_are_empty()) { gfp |= GFP_DMA32; goto again; } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9478eccd1c8e..de839f6aa62c 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -415,6 +415,9 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) if (attrs & DMA_ATTR_WRITE_COMBINE) return pgprot_writecombine(prot); #endif + if (attrs & DMA_ATTR_SYS_CACHE_ONLY || + attrs & DMA_ATTR_SYS_CACHE_ONLY_NWA) + return pgprot_syscached(prot); return pgprot_dmacoherent(prot); } #endif /* CONFIG_MMU */ diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 4d40dcce7604..8e79903fbda8 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp) end = cma_get_base(cma) + size - 1; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) return end <= DMA_BIT_MASK(zone_dma_bits); - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty()) return end <= DMA_BIT_MASK(32); return true; } @@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work) if (IS_ENABLED(CONFIG_ZONE_DMA)) atomic_pool_resize(atomic_pool_dma, GFP_KERNEL | GFP_DMA); - if (IS_ENABLED(CONFIG_ZONE_DMA32)) + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) atomic_pool_resize(atomic_pool_dma32, GFP_KERNEL | GFP_DMA32); atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL); @@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void) if (!atomic_pool_dma) ret = -ENOMEM; } - if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) { atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA32); if (!atomic_pool_dma32) @@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init); static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) { if (prev == NULL) { - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty()) return atomic_pool_dma32; if (atomic_pool_dma && (gfp & GFP_DMA)) return atomic_pool_dma; diff --git a/kernel/events/core.c b/kernel/events/core.c index d8795036202a..4c72f68ff8d4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4607,6 +4607,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(perf_event_read_local); static int perf_event_read(struct perf_event *event, bool group) { @@ -6673,6 +6674,7 @@ static void perf_pending_task(struct 
callback_head *head) put_event(event); } +#ifdef CONFIG_GUEST_PERF_EVENTS /* * We assume there is only KVM supporting the callbacks. * Later on, we might change it to a list if there is @@ -6680,26 +6682,25 @@ static void perf_pending_task(struct callback_head *head) */ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) - return -EBUSY; + return; rcu_assign_pointer(perf_guest_cbs, cbs); - return 0; } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) - return -EINVAL; + return; rcu_assign_pointer(perf_guest_cbs, NULL); synchronize_rcu(); - return 0; } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +#endif static void perf_output_sample_regs(struct perf_output_handle *handle, diff --git a/kernel/exit.c b/kernel/exit.c index 80efdfda6662..e278891e6983 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,6 +69,7 @@ #include #include #include +#include /* * The default value should be high enough to not crash a system that randomly @@ -469,6 +470,7 @@ assign_new_owner: goto retry; } WRITE_ONCE(mm->owner, c); + lru_gen_migrate_mm(mm); task_unlock(c); put_task_struct(c); } @@ -545,6 +547,7 @@ static void exit_mm(void) task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); + trace_android_vh_exit_mm(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); diff --git a/kernel/fork.c b/kernel/fork.c index 3fb7e9e6a7b9..659dc5f5d00f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,7 @@ #include #include #include +#include #include #include @@ -109,6 +111,8 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include /* * Minimum number of threads to boot the kernel */ @@ -119,6 +123,8 @@ */ #define MAX_THREADS FUTEX_TID_MASK +EXPORT_TRACEPOINT_SYMBOL_GPL(task_newtask); + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -139,6 +145,7 @@ static const char * const resident_page_types[] = { DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL_GPL(tasklist_lock); #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) @@ -226,15 +233,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Mark stack accessible for KASAN. */ + /* Reset stack metadata. */ kasan_unpoison_range(s->addr, THREAD_SIZE); + stack = kasan_reset_tag(s->addr); + /* Clear stale pointers from reused stack. */ - memset(s->addr, 0, THREAD_SIZE); + memset(stack, 0, THREAD_SIZE); tsk->stack_vm_area = s; - tsk->stack = s->addr; - return s->addr; + tsk->stack = stack; + return stack; } /* @@ -254,6 +263,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) * so cache the vm_struct. 
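The cached-stack path above hands back a previously used stack after unpoisoning it for KASAN, resetting the pointer's KASAN tag, and wiping stale contents. A stand-alone userspace sketch of just the reuse-and-scrub idea (sizes and names invented; the KASAN steps have no userspace equivalent here):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define STACK_SIZE 4096
#define NR_CACHED  2	/* the kernel keeps NR_CACHED_STACKS per CPU */

static void *cache[NR_CACHED];

static void *stack_alloc(void)
{
	for (int i = 0; i < NR_CACHED; i++) {
		if (cache[i]) {
			void *s = cache[i];

			cache[i] = NULL;
			memset(s, 0, STACK_SIZE);	/* clear stale pointers */
			return s;
		}
	}
	return calloc(1, STACK_SIZE);
}

static void stack_free(void *s)
{
	for (int i = 0; i < NR_CACHED; i++) {
		if (!cache[i]) {
			cache[i] = s;	/* keep for reuse */
			return;
		}
	}
	free(s);
}

int main(void)
{
	char *a = stack_alloc();

	strcpy(a, "stale");
	stack_free(a);
	char *b = stack_alloc();	/* same buffer, now zeroed */
	printf("reused buffer is %s\n", *b ? "dirty" : "clean");
	free(b);
	return 0;
}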
*/ if (stack) { + stack = kasan_reset_tag(stack); tsk->stack_vm_area = find_vm_area(stack); tsk->stack = stack; } @@ -366,14 +376,47 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_anon_vma_name(orig, new); } return new; } -void vm_area_free(struct vm_area_struct *vma) +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +static void __free_vm_area_struct(struct rcu_head *head) +{ + struct vm_area_struct *vma = container_of(head, struct vm_area_struct, + vm_rcu); + kmem_cache_free(vm_area_cachep, vma); +} + +static inline void free_vm_area_struct(struct vm_area_struct *vma) +{ + call_rcu(&vma->vm_rcu, __free_vm_area_struct); +} +#else +static inline void free_vm_area_struct(struct vm_area_struct *vma) { kmem_cache_free(vm_area_cachep, vma); } +#endif + +void vm_area_free_no_check(struct vm_area_struct *vma) +{ + free_anon_vma_name(vma); + if (vma->vm_file) + fput(vma->vm_file); + free_vm_area_struct(vma); +} + +void vm_area_free(struct vm_area_struct *vma) +{ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + /* Free only after refcount dropped to negative */ + if (atomic_dec_return(&vma->file_ref_count) >= 0) + return; +#endif + vm_area_free_no_check(vma); +} static void account_kernel_stack(struct task_struct *tsk, int account) { @@ -449,9 +492,11 @@ void free_task(struct task_struct *tsk) #ifdef CONFIG_SECCOMP WARN_ON_ONCE(tsk->seccomp.filter); #endif + cpufreq_task_times_exit(tsk); release_user_cpus_ptr(tsk); scs_release(tsk); + trace_android_vh_free_task(tsk); #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, @@ -631,6 +676,7 @@ fail_uprobe_end: fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: + tmp->vm_file = NULL; /* prevents fput within vm_area_free() */ vm_area_free(tmp); fail_nomem: retval = -ENOMEM; @@ -973,6 +1019,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif +#ifdef CONFIG_ANDROID_VENDOR_OEM_DATA + memset(&tsk->android_vendor_data1, 0, sizeof(tsk->android_vendor_data1)); + memset(&tsk->android_oem_data1, 0, sizeof(tsk->android_oem_data1)); +#endif + trace_android_vh_dup_task_struct(tsk, orig); return tsk; free_stack: @@ -1060,7 +1111,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); mm_init_pasid(mm); RCU_INIT_POINTER(mm->exe_file, NULL); - mmu_notifier_subscriptions_init(mm); + if (!mmu_notifier_subscriptions_init(mm)) + goto fail_nopgd; init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; @@ -1083,6 +1135,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, goto fail_nocontext; mm->user_ns = get_user_ns(user_ns); + lru_gen_init_mm(mm); return mm; fail_nocontext: @@ -1125,6 +1178,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + lru_gen_del_mm(mm); mmdrop(mm); } @@ -2039,6 +2093,8 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + cpufreq_task_times_init(p); + /* * This _must_ happen before we call free_task(), i.e. before we jump * to any of the bad_fork_* labels. 
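Both the kernel/cfi.c hunk earlier (which frees the old shadow with vfree() from an RCU callback) and the CONFIG_SPECULATIVE_PAGE_FAULT path above use the same deferral: embed a struct rcu_head in the object and let call_rcu() run the real free after a grace period, instead of blocking the caller in synchronize_rcu(). A kernel-style sketch with an invented struct, for illustration only:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct thing {
	int payload;
	struct rcu_head rcu;	/* storage for the deferred-free callback */
};

static void thing_free_rcu(struct rcu_head *rcu)
{
	struct thing *t = container_of(rcu, struct thing, rcu);

	kfree(t);
}

static void thing_release(struct thing *t)
{
	/*
	 * Readers traversing under rcu_read_lock() may still see t, so
	 * schedule the kfree() for after a grace period instead of
	 * blocking here in synchronize_rcu().
	 */
	call_rcu(&t->rcu, thing_free_rcu);
}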
This is to avoid freeing @@ -2604,6 +2660,8 @@ pid_t kernel_clone(struct kernel_clone_args *args) if (IS_ERR(p)) return PTR_ERR(p); + cpufreq_task_times_alloc(p); + /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -2622,6 +2680,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + /* lock the task to synchronize with memcg migration */ + task_lock(p); + lru_gen_add_mm(p->mm); + task_unlock(p); + } + wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ diff --git a/kernel/freezer.c b/kernel/freezer.c index 45ab36ffd0e7..9c34194db56f 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -11,6 +11,7 @@ #include #include #include +#include /* total number of freezing conditions in effect */ atomic_t system_freezing_cnt = ATOMIC_INIT(0); @@ -146,9 +147,16 @@ bool freeze_task(struct task_struct *p) void __thaw_task(struct task_struct *p) { unsigned long flags; + const struct cpumask *mask = task_cpu_possible_mask(p); spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) + /* + * Wake up frozen tasks. On asymmetric systems where tasks cannot + * run on all CPUs, ttwu() may have deferred a wakeup generated + * before thaw_secondary_cpus() had completed so we generate + * additional wakeups here for tasks in the PF_FREEZER_SKIP state. + */ + if (frozen(p) || (frozen_or_skipped(p) && mask != cpu_possible_mask)) wake_up_process(p); spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 764e73622b38..bcf0940a2e11 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -43,6 +43,7 @@ #include #include "../locking/rtmutex_common.h" +#include /* * READ this before attempting to hack on futexes! @@ -2475,6 +2476,7 @@ queue_unlock(struct futex_hash_bucket *hb) static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; + bool already_on_hb = false; /* * The priority used to register this element is @@ -2487,7 +2489,9 @@ static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); - plist_add(&q->list, &hb->chain); + trace_android_vh_alter_futex_plist_add(&q->list, &hb->chain, &already_on_hb); + if (!already_on_hb) + plist_add(&q->list, &hb->chain); q->task = current; } diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 1966a749e0d9..0c78e64f747d 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -74,7 +74,7 @@ fi # of tree builds having stale headers in srctree. Just silence CPIO for now. 
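The futex hunk above queues waiters on a plist, a priority-sorted list in which lower numeric priority sorts first and equal priorities queue FIFO. A stand-alone userspace model of that insertion rule (singly linked for brevity; the priorities are invented):

#include <stdio.h>
#include <stdlib.h>

struct waiter {
	int prio;
	struct waiter *next;
};

/* insert after all entries with prio <= w->prio (FIFO within one prio) */
static void plist_style_add(struct waiter **head, struct waiter *w)
{
	while (*head && (*head)->prio <= w->prio)
		head = &(*head)->next;
	w->next = *head;
	*head = w;
}

int main(void)
{
	struct waiter *head = NULL;
	int prios[] = { 120, 10, 99, 10 };

	for (size_t i = 0; i < sizeof(prios) / sizeof(prios[0]); i++) {
		struct waiter *w = malloc(sizeof(*w));

		w->prio = prios[i];
		plist_style_add(&head, w);
	}
	for (struct waiter *w = head; w; w = w->next)
		printf("%d ", w->prio);	/* 10 10 99 120 */
	printf("\n");
	while (head) {
		struct waiter *next = head->next;

		free(head);
		head = next;
	}
	return 0;
}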
for f in $dir_list; do find "$f" -name "*.h"; -done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 +done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1 # Remove comments except SPDX lines find $cpio_dir -type f -print0 | diff --git a/kernel/gki_module.c b/kernel/gki_module.c new file mode 100644 index 000000000000..4f124f9a14ec --- /dev/null +++ b/kernel/gki_module.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022 Google LLC + * Author: ramjiyani@google.com (Ramji Jiyani) + */ + +#include +#include +#include +#include +#include + +/* + * Build time generated header files + * + * gki_module_protected_exports.h -- Symbols protected from being _exported_ by unsigned modules + * gki_module_unprotected.h -- Symbols unsigned modules are allowed to _access_ + */ +#include "gki_module_protected_exports.h" +#include "gki_module_unprotected.h" + +#define MAX_STRCMP_LEN (max(MAX_UNPROTECTED_NAME_LEN, MAX_PROTECTED_EXPORTS_NAME_LEN)) + +/* bsearch() comparison callback */ +static int cmp_name(const void *sym, const void *protected_sym) +{ + return strncmp(sym, protected_sym, MAX_STRCMP_LEN); +} + +/** + * gki_is_module_protected_export - Is a symbol exported from a protected GKI module? + * + * @name: Symbol being checked against exported symbols from protected GKI modules + */ +bool gki_is_module_protected_export(const char *name) +{ + if (NR_UNPROTECTED_SYMBOLS) { + return bsearch(name, gki_protected_exports_symbols, NR_PROTECTED_EXPORTS_SYMBOLS, + MAX_PROTECTED_EXPORTS_NAME_LEN, cmp_name) != NULL; + } else { + /* + * If there are no symbols in the unprotected list, there is no + * KMI enforcement, so we don't need to protect exports. + * Treat everything as exportable in this case. + */ + return false; + } +} + +/** + * gki_is_module_unprotected_symbol - Is a symbol unprotected for unsigned module? + * + * @name: Symbol being checked in list of unprotected symbols + */ +bool gki_is_module_unprotected_symbol(const char *name) +{ + if (NR_UNPROTECTED_SYMBOLS) { + return bsearch(name, gki_unprotected_symbols, NR_UNPROTECTED_SYMBOLS, + MAX_UNPROTECTED_NAME_LEN, cmp_name) != NULL; + } else { + /* + * If there are no symbols in the unprotected list, there is no + * KMI enforcement for the kernel. + * Treat everything as accessible in this case. + */ + return true; + } +} diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c76..f71b00aca9df 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -24,6 +24,8 @@ #include #include +#undef CREATE_TRACE_POINTS +#include /* * The number of tasks checked: @@ -93,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * Ensure the task is not frozen. * Also, skip vfork and any other user process that freezer should skip.
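gki_module.c above resolves both questions with bsearch() over build-time-generated, sorted arrays of fixed-width names. A stand-alone model of the same lookup technique (table contents and width invented):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NAME_LEN 32	/* fixed record width, like MAX_*_NAME_LEN */

/* must stay sorted, exactly like the generated headers */
static const char symbols[][NAME_LEN] = {
	"memcpy",
	"printk",
	"snprintf",
};

static int cmp_name(const void *key, const void *elem)
{
	return strncmp(key, elem, NAME_LEN);
}

static bool symbol_listed(const char *name)
{
	return bsearch(name, symbols, sizeof(symbols) / sizeof(symbols[0]),
		       sizeof(symbols[0]), cmp_name) != NULL;
}

int main(void)
{
	printf("printk:  %d\n", symbol_listed("printk"));	/* 1 */
	printf("kmalloc: %d\n", symbol_listed("kmalloc"));	/* 0 */
	return 0;
}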
*/ - if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) - return; + if (unlikely(frozen_or_skipped(t))) + return; /* * When a freshly created task is scheduled once, changes its state to @@ -178,6 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; + bool need_check = true; /* * If the system crashed already then all bets are off, @@ -196,10 +199,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) - check_hung_task(t, timeout); + trace_android_vh_check_uninterruptible_tasks(t, timeout, &need_check); + if (need_check) + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) + check_hung_task(t, timeout); } + trace_android_vh_check_uninterruptible_tasks_dn(NULL); unlock: rcu_read_unlock(); if (hung_task_show_lock) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 00d58588ea95..5bf36cb608ca 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -110,6 +110,9 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR config GENERIC_IRQ_RESERVATION_MODE bool +config ARCH_WANTS_IRQ_RAW + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f3920374f71c..aa087b08499a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -510,8 +511,22 @@ static bool irq_may_run(struct irq_desc *desc) * If the interrupt is not in progress and is not an armed * wakeup interrupt, proceed. 
*/ - if (!irqd_has_set(&desc->irq_data, mask)) + if (!irqd_has_set(&desc->irq_data, mask)) { +#ifdef CONFIG_PM_SLEEP + if (unlikely(desc->no_suspend_depth && + irqd_is_wakeup_set(&desc->irq_data))) { + unsigned int irq = irq_desc_get_irq(desc); + const char *name = "(unnamed)"; + + if (desc->action && desc->action->name) + name = desc->action->name; + + log_abnormal_wakeup_reason("misconfigured IRQ %u %s", + irq, name); + } +#endif return true; + } /* * If the interrupt is an armed wakeup source, mark it pending diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index e4cff358b437..f53475d88072 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -140,6 +140,7 @@ static const struct irq_bit_descr irqdesc_states[] = { BIT_MASK_DESCR(_IRQ_IS_POLLED), BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY), BIT_MASK_DESCR(_IRQ_HIDDEN), + BIT_MASK_DESCR(_IRQ_RAW), }; static const struct irq_bit_descr irqdesc_istates[] = { diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7a45fd593245..ef5445146aa7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -355,9 +355,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } -#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); -#endif static void delete_irq_desc(unsigned int irq) { @@ -698,16 +696,22 @@ int handle_domain_irq(struct irq_domain *domain, struct irq_desc *desc; int ret = 0; - irq_enter(); /* The irqdomain code provides boundary checks */ desc = irq_resolve_mapping(domain, hwirq); - if (likely(desc)) - handle_irq_desc(desc); + if (likely(desc)) { + if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW) && + unlikely(irq_settings_is_raw(desc))) { + handle_irq_desc(desc); + } else { + irq_enter(); + handle_irq_desc(desc); + irq_exit(); + } + } else ret = -EINVAL; - irq_exit(); set_irq_regs(old_regs); return ret; } @@ -937,6 +941,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) return desc && desc->kstat_irqs ? 
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } +EXPORT_SYMBOL_GPL(kstat_irqs_cpu); static bool irq_is_nmi(struct irq_desc *desc) { @@ -994,3 +999,4 @@ void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class } EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); #endif +EXPORT_SYMBOL_GPL(kstat_irqs_usr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9862372e0f01..b208e802a695 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -294,6 +294,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +EXPORT_SYMBOL_GPL(irq_do_set_affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ static inline int irq_set_affinity_pending(struct irq_data *data, diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 7b7efb1a114b..94110c8025a1 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -19,6 +19,7 @@ enum { _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQ_HIDDEN = IRQ_HIDDEN, _IRQ_NO_DEBUG = IRQ_NO_DEBUG, + _IRQ_RAW = IRQ_RAW, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -35,6 +36,7 @@ enum { #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #define IRQ_HIDDEN GOT_YOU_MORON #define IRQ_NO_DEBUG GOT_YOU_MORON +#define IRQ_RAW GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -186,3 +188,16 @@ static inline bool irq_settings_no_debug(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NO_DEBUG; } + +static inline bool irq_settings_is_raw(struct irq_desc *desc) +{ + if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW)) + return desc->status_use_accessors & _IRQ_RAW; + + /* + * Using IRQ_RAW on architectures that don't expect it is + * likely to be wrong. + */ + WARN_ON_ONCE(1); + return false; +} diff --git a/kernel/irq_work.c b/kernel/irq_work.c index db8c248ebc8c..65ee62d0722d 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -113,7 +113,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) return true; #endif /* CONFIG_SMP */ } - +EXPORT_SYMBOL_GPL(irq_work_queue_on); bool irq_work_needs_cpu(void) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 5b37a8567168..31278089674c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -482,6 +482,7 @@ void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } +EXPORT_SYMBOL_GPL(kthread_bind_mask); /** * kthread_bind - bind a just-created kthread to a cpu. 
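The handle_domain_irq() change above dispatches descriptors flagged IRQ_RAW without the irq_enter()/irq_exit() bracket that normally does RCU and preemption bookkeeping, which is why the flag is gated behind ARCH_WANTS_IRQ_RAW and warned about in irq_settings_is_raw(). A stand-alone model of just that dispatch decision (stub functions, illustration only):

#include <stdbool.h>
#include <stdio.h>

#define ARCH_WANTS_IRQ_RAW 1	/* pretend the architecture opted in */

static void irq_enter(void) { puts("irq_enter"); }
static void irq_exit(void)  { puts("irq_exit"); }
static void handle_irq_desc(int irq) { printf("handle %d\n", irq); }

static void dispatch(int irq, bool raw)
{
	if (ARCH_WANTS_IRQ_RAW && raw) {
		handle_irq_desc(irq);	/* no RCU/preempt bookkeeping */
	} else {
		irq_enter();
		handle_irq_desc(irq);
		irq_exit();
	}
}

int main(void)
{
	dispatch(10, false);	/* normal: bracketed by irq_enter/irq_exit */
	dispatch(11, true);	/* raw: handler only */
	return 0;
}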
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d456579d0952..3b9b4f206719 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -39,6 +39,8 @@ # define MUTEX_WARN_ON(cond) #endif +#include + void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -49,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) osq_lock_init(&lock->osq); #endif + trace_android_vh_mutex_init(lock); debug_mutex_init(lock, name, key); } EXPORT_SYMBOL(__mutex_init); @@ -199,9 +202,12 @@ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { + bool already_on_list = false; debug_mutex_add_waiter(lock, waiter, current); - list_add_tail(&waiter->list, list); + trace_android_vh_alter_mutex_list_add(lock, waiter, list, &already_on_list); + if (!already_on_list) + list_add_tail(&waiter->list, list); if (__mutex_waiter_is_first(lock, waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); } @@ -636,6 +642,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } + trace_android_vh_mutex_wait_start(lock); set_current_state(state); for (;;) { bool first; @@ -685,6 +692,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas raw_spin_lock(&lock->wait_lock); acquired: __set_current_state(TASK_RUNNING); + trace_android_vh_mutex_wait_finish(lock); if (ww_ctx) { /* @@ -713,6 +721,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); + trace_android_vh_mutex_wait_finish(lock); __mutex_remove_waiter(lock, &waiter); err_early_kill: raw_spin_unlock(&lock->wait_lock); @@ -890,6 +899,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); + trace_android_vh_mutex_unlock_slowpath(lock); raw_spin_unlock(&lock->wait_lock); wake_up_q(&wake_q); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 70a32a576f3f..26121ef6265c 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, @@ -162,7 +164,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { if (__percpu_down_read_trylock(sem)) return true; @@ -211,7 +213,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) return true; } -void percpu_down_write(struct percpu_rw_semaphore *sem) +void __sched percpu_down_write(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -268,3 +270,34 @@ void percpu_up_write(struct percpu_rw_semaphore *sem) rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); + +static LIST_HEAD(destroy_list); +static DEFINE_SPINLOCK(destroy_list_lock); + +static void destroy_list_workfn(struct work_struct *work) +{ + struct percpu_rw_semaphore *sem, *sem2; + LIST_HEAD(to_destroy); + + spin_lock(&destroy_list_lock); + list_splice_init(&destroy_list, &to_destroy); + spin_unlock(&destroy_list_lock); + + if (list_empty(&to_destroy)) + return; + + list_for_each_entry_safe(sem, sem2, &to_destroy, destroy_list_entry) { + percpu_free_rwsem(sem); + kfree(sem); + } +} + +static DECLARE_WORK(destroy_list_work, 
destroy_list_workfn); + +void percpu_rwsem_async_destroy(struct percpu_rw_semaphore *sem) +{ + spin_lock(&destroy_list_lock); + list_add_tail(&sem->destroy_list_entry, &destroy_list); + spin_unlock(&destroy_list_lock); + schedule_work(&destroy_list_work); +} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ea5a701ab240..e23a101ed69e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rtmutex_common.h" @@ -1479,6 +1480,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + trace_android_vh_rtmutex_wait_start(lock); for (;;) { /* Try to acquire the lock: */ if (try_to_take_rt_mutex(lock, current, waiter)) @@ -1512,6 +1514,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, set_current_state(state); } + trace_android_vh_rtmutex_wait_finish(lock); __set_current_state(TASK_RUNNING); return ret; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 4cc73e6f8974..2d3bce4ce755 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -30,6 +30,8 @@ #ifndef CONFIG_PREEMPT_RT #include "lock_events.h" +#include +#include /* * The least significant 2 bits of the owner value has the following @@ -322,21 +324,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif + trace_android_vh_rwsem_init(sem); } EXPORT_SYMBOL(__init_rwsem); -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; - unsigned long timeout; - bool handoff_set; -}; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -951,6 +942,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); bool wake = false; + bool already_on_list = false; /* * To prevent a constant stream of readers from starving a sleeping @@ -1008,7 +1000,12 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - rwsem_add_waiter(sem, &waiter); + + trace_android_vh_alter_rwsem_list_add( + &waiter, + sem, &already_on_list); + if (!already_on_list) + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1027,10 +1024,12 @@ queue: (adjustment & RWSEM_FLAG_WAITERS))) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + trace_android_vh_rwsem_wake(sem); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); /* wait to be given the lock */ + trace_android_vh_rwsem_read_wait_start(sem); for (;;) { set_current_state(state); if (!smp_load_acquire(&waiter.task)) { @@ -1050,6 +1049,7 @@ queue: } __set_current_state(TASK_RUNNING); + trace_android_vh_rwsem_read_wait_finish(sem); lockevent_inc(rwsem_rlock); return sem; @@ -1057,6 +1057,7 @@ out_nolock: rwsem_del_waiter(sem, &waiter); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); + trace_android_vh_rwsem_read_wait_finish(sem); lockevent_inc(rwsem_rlock_fail); return ERR_PTR(-EINTR); } @@ -1064,12 +1065,13 @@ out_nolock: /* * Wait until we successfully acquire the write lock */ -static struct rw_semaphore * +static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; struct rwsem_waiter waiter; 
DEFINE_WAKE_Q(wake_q); + bool already_on_list = false; /* do optimistic spinning and steal lock if possible */ if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) { @@ -1087,7 +1089,12 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - rwsem_add_waiter(sem, &waiter); + + trace_android_vh_alter_rwsem_list_add( + &waiter, + sem, &already_on_list); + if (!already_on_list) + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ if (rwsem_first_waiter(sem) != &waiter) { @@ -1123,7 +1130,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) } wait: + trace_android_vh_rwsem_wake(sem); /* wait until we successfully acquire the lock */ + trace_android_vh_rwsem_write_wait_start(sem); set_current_state(state); for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { @@ -1162,12 +1171,14 @@ trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); + trace_android_vh_rwsem_write_wait_finish(sem); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); return sem; out_nolock: __set_current_state(TASK_RUNNING); + trace_android_vh_rwsem_write_wait_finish(sem); raw_spin_lock_irq(&sem->wait_lock); rwsem_del_waiter(sem, &waiter); if (!list_empty(&sem->wait_list)) @@ -1583,6 +1594,7 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, _RET_IP_); + trace_android_vh_rwsem_write_finished(sem); __up_write(sem); } EXPORT_SYMBOL(up_write); @@ -1593,6 +1605,7 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); + trace_android_vh_rwsem_write_finished(sem); __downgrade_write(sem); } EXPORT_SYMBOL(downgrade_write); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 33783abc377b..321070747f88 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -29,3 +29,17 @@ struct load_info { }; extern int mod_verify_sig(const void *mod, struct load_info *info); + +#ifdef CONFIG_MODULE_SIG_PROTECT +extern bool gki_is_module_unprotected_symbol(const char *name); +extern bool gki_is_module_protected_export(const char *name); +#else +static inline bool gki_is_module_unprotected_symbol(const char *name) +{ + return 1; +} +static inline bool gki_is_module_protected_export(const char *name) +{ + return 0; +} +#endif /* CONFIG_MODULE_SIG_PROTECT */ diff --git a/kernel/module.c b/kernel/module.c index 8a1766c69c6e..8770b263e2e8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -63,6 +63,10 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include +#include + #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 #endif @@ -267,7 +271,7 @@ static void module_assert_mutex_or_preempt(void) #endif } -#ifdef CONFIG_MODULE_SIG +#if defined(CONFIG_MODULE_SIG) && !defined(CONFIG_MODULE_SIG_PROTECT) static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); @@ -743,6 +747,7 @@ static struct module_attribute modinfo_##field = { \ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); +MODINFO_ATTR(scmversion); static char last_unloaded_module[MODULE_NAME_LEN+1]; @@ -1206,6 +1211,7 @@ static struct module_attribute *modinfo_attrs[] = { &module_uevent, &modinfo_version, &modinfo_srcversion, + &modinfo_scmversion, &modinfo_initstate, &modinfo_coresize, &modinfo_initsize, @@ -1434,6 +1440,21 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, goto getname; } + /* + * 
ANDROID: GKI: + * In case of an unsigned module symbol resolves only if: + * 1. Symbol is in the list of unprotected symbol list OR + * 2. If symbol owner is not NULL i.e. owner is another module; + * it has to be an unsigned module and not signed GKI module + * to protect symbols exported by signed GKI modules. + */ + if (!mod->sig_ok && + !gki_is_module_unprotected_symbol(name) && + fsa.owner && fsa.owner->sig_ok) { + fsa.sym = ERR_PTR(-EACCES); + goto getname; + } + err = ref_module(mod, fsa.owner); if (err) { fsa.sym = ERR_PTR(err); @@ -2200,6 +2221,10 @@ static void free_module(struct module *mod) /* This may be empty, but that's OK */ module_arch_freeing_init(mod); + trace_android_vh_set_memory_rw((unsigned long)mod->init_layout.base, + (mod->init_layout.size)>>PAGE_SHIFT); + trace_android_vh_set_memory_nx((unsigned long)mod->init_layout.base, + (mod->init_layout.size)>>PAGE_SHIFT); module_memfree(mod->init_layout.base); kfree(mod->args); percpu_modfree(mod); @@ -2208,6 +2233,10 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ + trace_android_vh_set_memory_rw((unsigned long)mod->core_layout.base, + (mod->core_layout.size)>>PAGE_SHIFT); + trace_android_vh_set_memory_nx((unsigned long)mod->core_layout.base, + (mod->core_layout.size)>>PAGE_SHIFT); module_memfree(mod->core_layout.base); } @@ -2253,6 +2282,14 @@ static int verify_exported_symbols(struct module *mod) .name = kernel_symbol_name(s), .gplok = true, }; + + if (!mod->sig_ok && gki_is_module_protected_export( + kernel_symbol_name(s))) { + pr_err("%s: exports protected symbol %s\n", + mod->name, kernel_symbol_name(s)); + return -EACCES; + } + if (find_symbol(&fsa)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2333,9 +2370,15 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) ignore_undef_symbol(info->hdr->e_machine, name))) break; - ret = PTR_ERR(ksym) ?: -ENOENT; - pr_warn("%s: Unknown symbol %s (err %d)\n", - mod->name, name, ret); + if (PTR_ERR(ksym) == -EACCES) { + ret = -EACCES; + pr_warn("%s: Protected symbol: %s (err %d)\n", + mod->name, name, ret); + } else { + ret = PTR_ERR(ksym) ?: -ENOENT; + pr_warn("%s: Unknown symbol %s (err %d)\n", + mod->name, name, ret); + } break; default: @@ -2931,7 +2974,15 @@ static int module_sig_check(struct load_info *info, int flags) return -EKEYREJECTED; } +/* + * ANDROID: GKI: Do not prevent loading of unsigned modules; + * as all modules except GKI modules are not signed. 
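The resolve_symbol() hunk above denies a symbol to an unsigned module only when the symbol is protected and its owner is a signed (GKI) module; symbols owned by the kernel image or by other unsigned modules still resolve. A stand-alone model of that decision table (struct and helper are invented, with the protected case hard-coded):

#include <stdbool.h>
#include <stdio.h>

struct mod { const char *name; bool sig_ok; };

/* stand-in for gki_is_module_unprotected_symbol(); pretend all protected */
static bool symbol_unprotected(const char *sym)
{
	(void)sym;
	return false;
}

static bool may_resolve(const struct mod *importer, const struct mod *owner,
			const char *sym)
{
	if (importer->sig_ok)		/* signed GKI modules see everything */
		return true;
	if (symbol_unprotected(sym))
		return true;
	/* deny only exports owned by a signed module */
	return !owner || !owner->sig_ok;
}

int main(void)
{
	struct mod gki = { "gki_mod", true }, vendor = { "vendor_mod", false };

	printf("vendor -> GKI export:    %d\n", may_resolve(&vendor, &gki, "a"));
	printf("vendor -> vendor export: %d\n", may_resolve(&vendor, &vendor, "b"));
	printf("vendor -> vmlinux sym:   %d\n", may_resolve(&vendor, NULL, "c"));
	return 0;
}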
+ */ +#ifndef CONFIG_MODULE_SIG_PROTECT return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); +#else + return 0; +#endif } #else /* !CONFIG_MODULE_SIG */ static int module_sig_check(struct load_info *info, int flags) @@ -3624,7 +3675,15 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); + trace_android_vh_set_memory_rw((unsigned long)mod->init_layout.base, + (mod->init_layout.size)>>PAGE_SHIFT); + trace_android_vh_set_memory_nx((unsigned long)mod->init_layout.base, + (mod->init_layout.size)>>PAGE_SHIFT); module_memfree(mod->init_layout.base); + trace_android_vh_set_memory_rw((unsigned long)mod->core_layout.base, + (mod->core_layout.size)>>PAGE_SHIFT); + trace_android_vh_set_memory_nx((unsigned long)mod->core_layout.base, + (mod->core_layout.size)>>PAGE_SHIFT); module_memfree(mod->core_layout.base); } @@ -3768,8 +3827,13 @@ static noinline int do_init_module(struct module *mod) rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif module_enable_ro(mod, true); + trace_android_vh_set_module_permit_after_init(mod); mod_tree_remove_init(mod); module_arch_freeing_init(mod); + trace_android_vh_set_memory_rw((unsigned long)mod->init_layout.base, + (mod->init_layout.size)>>PAGE_SHIFT); + trace_android_vh_set_memory_nx((unsigned long)mod->init_layout.base, + (mod->init_layout.size)>>PAGE_SHIFT); mod->init_layout.base = NULL; mod->init_layout.size = 0; mod->init_layout.ro_size = 0; @@ -3895,6 +3959,7 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); module_enable_x(mod); + trace_android_vh_set_module_permit_before_init(mod); /* * Mark state as coming so strong_try_module_get() ignores us, @@ -4033,6 +4098,8 @@ static int load_module(struct load_info *info, const char __user *uargs, "kernel\n", mod->name); add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); } +#else + mod->sig_ok = 0; #endif /* To avoid stressing percpu allocator, do this once we're unique. */ @@ -4795,6 +4862,22 @@ void print_modules(void) pr_cont("\n"); } +#ifdef CONFIG_ANDROID_DEBUG_SYMBOLS +void android_debug_for_each_module(int (*fn)(const char *mod_name, void *mod_addr, void *data), + void *data) +{ + struct module *module; + preempt_disable(); + list_for_each_entry_rcu(module, &modules, list) { + if (fn(module->name, module->core_layout.base, data)) + goto out; + } +out: + preempt_enable(); +} +EXPORT_SYMBOL_NS_GPL(android_debug_for_each_module, MINIDUMP); +#endif + #ifdef CONFIG_MODVERSIONS /* * Generate the signature for all relevant module structures here. 
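android_debug_for_each_module() above walks the module list under preempt_disable() and stops as soon as the callback returns non-zero. A stand-alone model of that iterate-until-nonzero callback contract (a static table stands in for the RCU-protected list; names invented):

#include <stdio.h>
#include <stddef.h>

struct demo_module { const char *name; void *base; };

static struct demo_module modules[] = {
	{ "first",  (void *)0x1000 },
	{ "target", (void *)0x2000 },
	{ "last",   (void *)0x3000 },
};

static void for_each_module(int (*fn)(const char *name, void *base, void *data),
			    void *data)
{
	for (size_t i = 0; i < sizeof(modules) / sizeof(modules[0]); i++)
		if (fn(modules[i].name, modules[i].base, data))
			return;		/* callback asked to stop early */
}

static int visit(const char *name, void *base, void *data)
{
	(void)data;
	printf("visiting %s at %p\n", name, base);
	return name[0] == 't';		/* stop at the first 't' module */
}

int main(void)
{
	for_each_module(visit, NULL);	/* prints "first" then "target" */
	return 0;
}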
diff --git a/kernel/pid.c b/kernel/pid.c index efe87db44683..de13b4690aaf 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -421,6 +421,7 @@ struct task_struct *find_task_by_vpid(pid_t vnr) { return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } +EXPORT_SYMBOL_GPL(find_task_by_vpid); struct task_struct *find_get_task_by_vpid(pid_t nr) { diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 5899260a8bef..97705757f9c6 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -17,4 +17,5 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o +obj-$(CONFIG_SUSPEND) += wakeup_reason.o obj-$(CONFIG_ENERGY_MODEL) += energy_model.o diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 97e62469a6b3..b2efe6c5fb2a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * Mutex serializing the registrations of performance domains and letting @@ -279,6 +280,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, { unsigned long cap, prev_cap = 0; int cpu, ret; + bool cond = false; if (!dev || !nr_states || !cb) return -EINVAL; @@ -307,6 +309,10 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, ret = -EEXIST; goto unlock; } + + trace_android_vh_em_dev_register_pd(&cond); + if (cond) + continue; /* * All CPUs of a domain must have the same * micro-architecture since they all share the same diff --git a/kernel/power/main.c b/kernel/power/main.c index 7e646079fbeb..b6219c8f6d76 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -23,7 +23,7 @@ void lock_system_sleep(void) { - current->flags |= PF_FREEZER_SKIP; + freezer_do_not_count(); mutex_lock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(lock_system_sleep); diff --git a/kernel/power/process.c b/kernel/power/process.c index ee78a39463e6..5db1955c7734 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -23,6 +23,8 @@ #include #include +#include + /* * Timeout for stopping processes */ @@ -85,26 +87,33 @@ static int try_to_freeze_tasks(bool user_only) elapsed = ktime_sub(end, start); elapsed_msecs = ktime_to_ms(elapsed); - if (todo) { + if (wakeup) { pr_cont("\n"); - pr_err("Freezing of tasks %s after %d.%03d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", - wakeup ? 
"aborted" : "failed", + pr_err("Freezing of tasks aborted after %d.%03d seconds", + elapsed_msecs / 1000, elapsed_msecs % 1000); + } else if (todo) { + pr_cont("\n"); + pr_err("Freezing of tasks failed after %d.%03d seconds" + " (%d tasks refusing to freeze, wq_busy=%d):\n", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (wq_busy) show_workqueue_state(); - if (!wakeup || pm_debug_messages_on) { + if (pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) - && freezing(p) && !frozen(p)) + && freezing(p) && !frozen(p)) { sched_show_task(p); + trace_android_vh_try_to_freeze_todo_unfrozen(p); + } } read_unlock(&tasklist_lock); } + + trace_android_vh_try_to_freeze_todo(todo, elapsed_msecs, wq_busy); } else { pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, elapsed_msecs % 1000); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 13d905dd3267..f167af45526a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "power.h" @@ -138,6 +139,7 @@ static void s2idle_loop(void) break; } + clear_wakeup_reasons(); s2idle_enter(); } @@ -359,6 +361,7 @@ static int suspend_prepare(suspend_state_t state) if (!error) return 0; + log_suspend_abort_reason("One or more tasks refusing to freeze"); suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); pm_notifier_call_chain(PM_POST_SUSPEND); @@ -388,7 +391,7 @@ void __weak arch_suspend_enable_irqs(void) */ static int suspend_enter(suspend_state_t state, bool *wakeup) { - int error; + int error, last_dev; error = platform_suspend_prepare(state); if (error) @@ -396,7 +399,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; pr_err("late suspend of devices failed\n"); + log_suspend_abort_reason("late suspend of %s device failed", + suspend_stats.failed_devs[last_dev]); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -405,7 +412,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; pr_err("noirq suspend of devices failed\n"); + log_suspend_abort_reason("noirq suspend of %s device failed", + suspend_stats.failed_devs[last_dev]); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); @@ -421,8 +432,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } error = suspend_disable_secondary_cpus(); - if (error || suspend_test(TEST_CPUS)) + if (error || suspend_test(TEST_CPUS)) { + log_suspend_abort_reason("Disabling non-boot cpus failed"); goto Enable_cpus; + } arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -493,6 +506,8 @@ int suspend_devices_and_enter(suspend_state_t state) error = dpm_suspend_start(PMSG_SUSPEND); if (error) { pr_err("Some devices failed to suspend, or early wake event detected\n"); + log_suspend_abort_reason( + "Some devices failed to suspend, or early wake event detected"); goto Recover_platform; } suspend_test_finish("suspend devices"); diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000000000000..8fefaa3fdba2 --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,438 @@ +/* + * 
kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2020 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * struct wakeup_irq_node - stores data and relationships for IRQs logged as + * either base or nested wakeup reasons during suspend/resume flow. + * @siblings - for membership on leaf or parent IRQ lists + * @irq - the IRQ number + * @irq_name - the name associated with the IRQ, or a default if none + */ +struct wakeup_irq_node { + struct list_head siblings; + int irq; + const char *irq_name; +}; + +enum wakeup_reason_flag { + RESUME_NONE = 0, + RESUME_IRQ, + RESUME_ABORT, + RESUME_ABNORMAL, +}; + +static DEFINE_SPINLOCK(wakeup_reason_lock); + +static LIST_HEAD(leaf_irqs); /* kept in ascending IRQ sorted order */ +static LIST_HEAD(parent_irqs); /* unordered */ + +static struct kmem_cache *wakeup_irq_nodes_cache; + +static const char *default_irq_name = "(unnamed)"; + +static struct kobject *kobj; + +static bool capture_reasons; +static int wakeup_reason; +static char non_irq_wake_reason[MAX_SUSPEND_ABORT_LEN]; + +static ktime_t last_monotime; /* monotonic time before last suspend */ +static ktime_t curr_monotime; /* monotonic time after last suspend */ +static ktime_t last_stime; /* monotonic boottime offset before last suspend */ +static ktime_t curr_stime; /* monotonic boottime offset after last suspend */ + +static void init_node(struct wakeup_irq_node *p, int irq) +{ + struct irq_desc *desc; + + INIT_LIST_HEAD(&p->siblings); + + p->irq = irq; + desc = irq_to_desc(irq); + if (desc && desc->action && desc->action->name) + p->irq_name = desc->action->name; + else + p->irq_name = default_irq_name; +} + +static struct wakeup_irq_node *create_node(int irq) +{ + struct wakeup_irq_node *result; + + result = kmem_cache_alloc(wakeup_irq_nodes_cache, GFP_ATOMIC); + if (unlikely(!result)) + pr_warn("Failed to log wakeup IRQ %d\n", irq); + else + init_node(result, irq); + + return result; +} + +static void delete_list(struct list_head *head) +{ + struct wakeup_irq_node *n; + + while (!list_empty(head)) { + n = list_first_entry(head, struct wakeup_irq_node, siblings); + list_del(&n->siblings); + kmem_cache_free(wakeup_irq_nodes_cache, n); + } +} + +static bool add_sibling_node_sorted(struct list_head *head, int irq) +{ + struct wakeup_irq_node *n = NULL; + struct list_head *predecessor = head; + + if (unlikely(WARN_ON(!head))) + return false; + + if (!list_empty(head)) + list_for_each_entry(n, head, siblings) { + if (n->irq < irq) + predecessor = &n->siblings; + else if (n->irq == irq) + return true; + else + break; + } + + n = create_node(irq); + if (n) { + list_add(&n->siblings, predecessor); + return true; + } + + return false; +} + +static struct wakeup_irq_node *find_node_in_list(struct list_head *head, + int irq) +{ + struct wakeup_irq_node *n; + + if (unlikely(WARN_ON(!head))) + return NULL; + + list_for_each_entry(n, head, siblings) + if (n->irq == irq) + 
return n; + + return NULL; +} + +void log_irq_wakeup_reason(int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&wakeup_reason_lock, flags); + if (wakeup_reason == RESUME_ABNORMAL || wakeup_reason == RESUME_ABORT) { + spin_unlock_irqrestore(&wakeup_reason_lock, flags); + return; + } + + if (!capture_reasons) { + spin_unlock_irqrestore(&wakeup_reason_lock, flags); + return; + } + + if (find_node_in_list(&parent_irqs, irq) == NULL) + add_sibling_node_sorted(&leaf_irqs, irq); + + wakeup_reason = RESUME_IRQ; + spin_unlock_irqrestore(&wakeup_reason_lock, flags); +} + +void log_threaded_irq_wakeup_reason(int irq, int parent_irq) +{ + struct wakeup_irq_node *parent; + unsigned long flags; + + /* + * Intentionally unsynchronized. Calls that come in after we have + * resumed should have a fast exit path since there's no work to be + * done, and any coherence issue that could cause a wrong value here is + * both highly improbable - given the set/clear timing - and very low + * impact (parent IRQ gets logged instead of the specific child). + */ + if (!capture_reasons) + return; + + spin_lock_irqsave(&wakeup_reason_lock, flags); + + if (wakeup_reason == RESUME_ABNORMAL || wakeup_reason == RESUME_ABORT) { + spin_unlock_irqrestore(&wakeup_reason_lock, flags); + return; + } + + if (!capture_reasons || (find_node_in_list(&leaf_irqs, irq) != NULL)) { + spin_unlock_irqrestore(&wakeup_reason_lock, flags); + return; + } + + parent = find_node_in_list(&parent_irqs, parent_irq); + if (parent != NULL) + add_sibling_node_sorted(&leaf_irqs, irq); + else { + parent = find_node_in_list(&leaf_irqs, parent_irq); + if (parent != NULL) { + list_del_init(&parent->siblings); + list_add_tail(&parent->siblings, &parent_irqs); + add_sibling_node_sorted(&leaf_irqs, irq); + } + } + + spin_unlock_irqrestore(&wakeup_reason_lock, flags); +} +EXPORT_SYMBOL_GPL(log_threaded_irq_wakeup_reason); + +static void __log_abort_or_abnormal_wake(bool abort, const char *fmt, + va_list args) +{ + unsigned long flags; + + spin_lock_irqsave(&wakeup_reason_lock, flags); + + /* Suspend abort or abnormal wake reason has already been logged. */ + if (wakeup_reason != RESUME_NONE) { + spin_unlock_irqrestore(&wakeup_reason_lock, flags); + return; + } + + if (abort) + wakeup_reason = RESUME_ABORT; + else + wakeup_reason = RESUME_ABNORMAL; + + vsnprintf(non_irq_wake_reason, MAX_SUSPEND_ABORT_LEN, fmt, args); + + spin_unlock_irqrestore(&wakeup_reason_lock, flags); +} + +void log_suspend_abort_reason(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __log_abort_or_abnormal_wake(true, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(log_suspend_abort_reason); + +void log_abnormal_wakeup_reason(const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + __log_abort_or_abnormal_wake(false, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(log_abnormal_wakeup_reason); + +void clear_wakeup_reasons(void) +{ + unsigned long flags; + + spin_lock_irqsave(&wakeup_reason_lock, flags); + + delete_list(&leaf_irqs); + delete_list(&parent_irqs); + wakeup_reason = RESUME_NONE; + capture_reasons = true; + + spin_unlock_irqrestore(&wakeup_reason_lock, flags); +} + +static void print_wakeup_sources(void) +{ + struct wakeup_irq_node *n; + unsigned long flags; + + spin_lock_irqsave(&wakeup_reason_lock, flags); + + capture_reasons = false; + + if (wakeup_reason == RESUME_ABORT) { + pr_info("Abort: %s\n", non_irq_wake_reason); + spin_unlock_irqrestore(&wakeup_reason_lock, flags); + return; + } + + if (wakeup_reason == RESUME_IRQ && !list_empty(&leaf_irqs)) + list_for_each_entry(n, &leaf_irqs, siblings) + pr_info("Resume caused by IRQ %d, %s\n", n->irq, + n->irq_name); + else if (wakeup_reason == RESUME_ABNORMAL) + pr_info("Resume caused by %s\n", non_irq_wake_reason); + else + pr_info("Resume cause unknown\n"); + + spin_unlock_irqrestore(&wakeup_reason_lock, flags); +} + +static ssize_t last_resume_reason_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t buf_offset = 0; + struct wakeup_irq_node *n; + unsigned long flags; + + spin_lock_irqsave(&wakeup_reason_lock, flags); + + if (wakeup_reason == RESUME_ABORT) { + buf_offset = scnprintf(buf, PAGE_SIZE, "Abort: %s", + non_irq_wake_reason); + spin_unlock_irqrestore(&wakeup_reason_lock, flags); + return buf_offset; + } + + if (wakeup_reason == RESUME_IRQ && !list_empty(&leaf_irqs)) + list_for_each_entry(n, &leaf_irqs, siblings) + buf_offset += scnprintf(buf + buf_offset, + PAGE_SIZE - buf_offset, + "%d %s\n", n->irq, n->irq_name); + else if (wakeup_reason == RESUME_ABNORMAL) + buf_offset = scnprintf(buf, PAGE_SIZE, "-1 %s", + non_irq_wake_reason); + + spin_unlock_irqrestore(&wakeup_reason_lock, flags); + + return buf_offset; +} + +static ssize_t last_suspend_time_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct timespec64 sleep_time; + struct timespec64 total_time; + struct timespec64 suspend_resume_time; + + /* + * total_time is calculated from monotonic boottime offsets because, + * unlike CLOCK_MONOTONIC, it includes the time spent in the suspend state. + */ + total_time = ktime_to_timespec64(ktime_sub(curr_stime, last_stime)); + + /* + * suspend_resume_time is calculated as the monotonic (CLOCK_MONOTONIC) + * interval between just before entering suspend and just after resume. + */ + suspend_resume_time = + ktime_to_timespec64(ktime_sub(curr_monotime, last_monotime)); + + /* sleep_time = total_time - suspend_resume_time */ + sleep_time = timespec64_sub(total_time, suspend_resume_time); + + /* Export suspend_resume_time and sleep_time in pair here. 
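+ * Both values are printed as seconds.nanoseconds, separated by a space.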
*/ + return sprintf(buf, "%llu.%09lu %llu.%09lu\n", + (unsigned long long)suspend_resume_time.tv_sec, + suspend_resume_time.tv_nsec, + (unsigned long long)sleep_time.tv_sec, + sleep_time.tv_nsec); +} + +static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason); +static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time); + +static struct attribute *attrs[] = { + &resume_reason.attr, + &suspend_time.attr, + NULL, +}; +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +/* Detects a suspend and clears all the previous wake up reasons*/ +static int wakeup_reason_pm_event(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_SUSPEND_PREPARE: + /* monotonic time since boot */ + last_monotime = ktime_get(); + /* monotonic time since boot including the time spent in suspend */ + last_stime = ktime_get_boottime(); + clear_wakeup_reasons(); + break; + case PM_POST_SUSPEND: + /* monotonic time since boot */ + curr_monotime = ktime_get(); + /* monotonic time since boot including the time spent in suspend */ + curr_stime = ktime_get_boottime(); + print_wakeup_sources(); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block wakeup_reason_pm_notifier_block = { + .notifier_call = wakeup_reason_pm_event, +}; + +static int __init wakeup_reason_init(void) +{ + if (register_pm_notifier(&wakeup_reason_pm_notifier_block)) { + pr_warn("[%s] failed to register PM notifier\n", __func__); + goto fail; + } + + kobj = kobject_create_and_add("wakeup_reasons", kernel_kobj); + if (!kobj) { + pr_warn("[%s] failed to create a sysfs kobject\n", __func__); + goto fail_unregister_pm_notifier; + } + + if (sysfs_create_group(kobj, &attr_group)) { + pr_warn("[%s] failed to create a sysfs group\n", __func__); + goto fail_kobject_put; + } + + wakeup_irq_nodes_cache = + kmem_cache_create("wakeup_irq_node_cache", + sizeof(struct wakeup_irq_node), 0, 0, NULL); + if (!wakeup_irq_nodes_cache) + goto fail_remove_group; + + return 0; + +fail_remove_group: + sysfs_remove_group(kobj, &attr_group); +fail_kobject_put: + kobject_put(kobj); +fail_unregister_pm_notifier: + unregister_pm_notifier(&wakeup_reason_pm_notifier_block); +fail: + return 1; +} + +late_initcall(wakeup_reason_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8d856b7c2e5a..cb63d863ad80 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -54,6 +54,9 @@ #include #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include +#include #include "printk_ringbuffer.h" #include "console_cmdline.h" @@ -556,10 +559,14 @@ static ssize_t info_print_ext_header(char *buf, size_t size, u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER + int vh_ret = 0; u32 id = info->caller_id; - snprintf(caller, sizeof(caller), ",caller=%c%u", - id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); + trace_android_vh_printk_ext_header(caller, sizeof(caller), id, &vh_ret); + + if (!vh_ret) + snprintf(caller, sizeof(caller), ",caller=%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif @@ -1279,9 +1286,12 @@ static size_t print_time(u64 ts, char *buf) static size_t print_caller(u32 id, char *buf) { char caller[12]; + int vh_ret = 0; - snprintf(caller, sizeof(caller), "%c%u", - id & 0x80000000 ? 
'C' : 'T', id & ~0x80000000); + trace_android_vh_printk_caller(caller, sizeof(caller), id, &vh_ret); + if (!vh_ret) + snprintf(caller, sizeof(caller), "%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else @@ -2041,6 +2051,12 @@ static inline void printk_delay(void) static inline u32 printk_caller_id(void) { + u32 caller_id = 0; + + trace_android_vh_printk_caller_id(&caller_id); + if (caller_id) + return caller_id; + return in_task() ? task_pid_nr(current) : 0x80000000 + raw_smp_processor_id(); } @@ -2185,6 +2201,7 @@ int vprintk_store(int facility, int level, prb_commit(&e); } + trace_android_vh_logbuf_pr_cont(&r, text_len); ret = text_len; goto out; } @@ -2224,6 +2241,7 @@ int vprintk_store(int facility, int level, else prb_final_commit(&e); + trace_android_vh_logbuf(prb, &r); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); @@ -2524,6 +2542,12 @@ void resume_console(void) */ static int console_cpu_notify(unsigned int cpu) { + int flag = 0; + + trace_android_vh_printk_hotplug(&flag); + if (flag) + return 0; + if (!cpuhp_tasks_frozen) { /* If trylock fails, someone else is doing the printing */ if (console_trylock()) @@ -3317,6 +3341,7 @@ int _printk_deferred(const char *fmt, ...) return r; } +EXPORT_SYMBOL_GPL(_printk_deferred); /* * printk rate limiting, lifted from the networking subsystem. diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 8a7b7362c0dd..2b7b6ddab4f7 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -474,8 +474,10 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ - memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, - sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + if (desc_out) { + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) @@ -528,7 +530,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: - atomic_long_set(&desc_out->state_var, state_val); + if (desc_out) + atomic_long_set(&desc_out->state_var, state_val); return d_state; } @@ -1449,6 +1452,9 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ + + /* Best effort to remember the last finalized @id. */ + atomic_long_set(&desc_ring->last_finalized_id, id); } /** @@ -1657,7 +1663,12 @@ void prb_commit(struct prb_reserved_entry *e) */ void prb_final_commit(struct prb_reserved_entry *e) { + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + _prb_commit(e, desc_finalized); + + /* Best effort to remember the last finalized @id. */ + atomic_long_set(&desc_ring->last_finalized_id, e->id); } /* @@ -2005,9 +2016,39 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) */ u64 prb_next_seq(struct printk_ringbuffer *rb) { - u64 seq = 0; + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + unsigned long id; + u64 seq; - /* Search forward from the oldest descriptor. */ + /* Check if the cached @id still points to a valid @seq. 
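+	 * (The hint is best effort: desc_make_final() and prb_final_commit() store it with a plain atomic_long_set(), so it may lag behind the true last finalized record.)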
*/ + id = atomic_long_read(&desc_ring->last_finalized_id); + d_state = desc_read(desc_ring, id, NULL, &seq, NULL); + + if (d_state == desc_finalized || d_state == desc_reusable) { + /* + * Begin searching after the last finalized record. + * + * On 0, the search must begin at 0 because, due to hack#2 + * of the bootstrapping phase, it is not known whether a + * record at index 0 exists. + */ + if (seq != 0) + seq++; + } else { + /* + * The information about the last finalized sequence number + * is gone. This should happen only when there is a flood of + * new messages and the ringbuffer is rapidly recycled. + * Give up and start from the beginning. + */ + seq = 0; + } + + /* + * The information about the last finalized @seq might be inaccurate. + * Search forward to find the current one. + */ while (_prb_read_valid(rb, &seq, NULL, NULL)) seq++; @@ -2044,6 +2085,7 @@ void prb_init(struct printk_ringbuffer *rb, rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits)); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 73cc80e01cef..18cd25e489b8 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -75,6 +75,7 @@ struct prb_desc_ring { struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; + atomic_long_t last_finalized_id; }; /* @@ -258,6 +259,7 @@ static struct printk_ringbuffer name = { \ .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \ }, \ .text_data_ring = { \ .size_bits = (avgtextbits) + (descbits), \ diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index f73cf17fcee9..066be3bb9d77 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -214,6 +214,20 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_EXP_KTHREAD + bool "Perform RCU expedited work in a real-time kthread" + depends on RCU_BOOST && RCU_EXPERT + default !PREEMPT_RT && NR_CPUS <= 32 + help + Use this option to further reduce the latencies of expedited + grace periods at the expense of being more disruptive. + + This option is disabled by default on PREEMPT_RT=y kernels which + disable expedited grace periods after boot by unconditionally + setting rcupdate.rcu_normal_after_boot=1. + + Accept the default if unsure. 
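For reference, a .config fragment that makes the new option selectable and turns it on might look like this (a sketch: RCU_EXP_KTHREAD is only visible once RCU_BOOST and RCU_EXPERT are set, and RCU_BOOST itself needs a preemptible RCU flavour):

CONFIG_PREEMPT=y
CONFIG_RCU_EXPERT=y
CONFIG_RCU_BOOST=y
CONFIG_RCU_EXP_KTHREAD=y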
+ config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..5510d2231c32 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -534,7 +534,12 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +#ifdef CONFIG_RCU_EXP_KTHREAD +extern struct kthread_worker *rcu_exp_gp_kworker; +extern struct kthread_worker *rcu_exp_par_gp_kworker; +#else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; +#endif /* CONFIG_RCU_EXP_KTHREAD */ #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cf101da389b0..cbb02e8c2b57 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2990,7 +2990,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); - kasan_record_aux_stack(head); + kasan_record_aux_stack_noalloc(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ @@ -3556,7 +3556,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) return; } - kasan_record_aux_stack(ptr); + kasan_record_aux_stack_noalloc(ptr); success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); @@ -4404,6 +4404,51 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + kthread_destroy_worker(rcu_exp_gp_kworker); + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + &param); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Spawn the kthreads that handle RCU's grace periods. */ @@ -4449,6 +4494,8 @@ static int __init rcu_spawn_gp_kthread(void) rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); rcu_spawn_core_kthreads(); + /* Create kthread worker for expedited GPs */ + rcu_start_exp_gp_kworkers(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4671,7 +4718,6 @@ static void __init rcu_dump_rcu_node_tree(void) } struct workqueue_struct *rcu_gp_wq; -struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { @@ -4736,8 +4782,7 @@ void __init rcu_init(void) /* Create workqueue for Tree SRCU and for expedited GPs. 
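 * (With CONFIG_RCU_EXP_KTHREAD=y, rcu_alloc_par_gp_wq() below is a no-op and the expedited work runs on the kthread workers instead.)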
*/ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 305cf6aeb408..f0a16b4a832b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -23,7 +24,11 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { unsigned long rew_s; +#ifdef CONFIG_RCU_EXP_KTHREAD + struct kthread_work rew_work; +#else struct work_struct rew_work; +#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 16f94118ca34..5aa462b435da 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -334,15 +334,13 @@ fastpath: * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) { int cpu; unsigned long flags; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -417,13 +415,119 @@ retry_ipi: rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } +static void rcu_exp_sel_wait_wake(unsigned long s); + +#ifdef CONFIG_RCU_EXP_KTHREAD +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_par_gp_kworker); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* + * Use rcu_exp_par_gp_kworker, because flushing a work item from + * another work item on the same kthread worker can result in + * deadlock. + */ + kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + kthread_flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. 
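+ * (This is the kthread_worker flavour, used when CONFIG_RCU_EXP_KTHREAD=y; an equivalent workqueue flavour follows in the #else branch.)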
+ */ +static void wait_rcu_exp_gp(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp); + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_par_gp_wq); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); + + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi - rnp->grplo)) + cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ + destroy_work_on_stack(&rew->rew_work); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ static void sync_rcu_exp_select_cpus(void) { - int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); @@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!READ_ONCE(rcu_par_gp_wq) || + if (!rcu_gp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { - /* No workqueues yet or last leaf, do direct call. */ + /* No worker started yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + sync_rcu_exp_select_cpus_queue_work(rnp); rnp->exp_need_flush = true; } - /* Wait for workqueue jobs (if any) to complete. */ + /* Wait for jobs (if any) to complete. */ rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) - flush_work(&rnp->rew.rew_work); + sync_rcu_exp_select_cpus_flush_work(rnp); } /* @@ -619,17 +716,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } -/* - * Work-queue handler to drive an expedited grace period forward. 
- */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - #ifdef CONFIG_PREEMPT_RCU /* @@ -845,20 +931,19 @@ void synchronize_rcu_expedited(void) } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); + synchronize_rcu_expedited_queue_work(&rew); } /* Wait for expedited grace period to complete. */ rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ + smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); if (likely(!boottime)) - destroy_work_on_stack(&rew.rew_work); + synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/reboot.c b/kernel/reboot.c index f7440c0c7e43..957ef52b8af4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -33,7 +33,9 @@ EXPORT_SYMBOL(cad_pid); #define DEFAULT_REBOOT_MODE #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +EXPORT_SYMBOL_GPL(reboot_mode); enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; +EXPORT_SYMBOL_GPL(panic_reboot_mode); /* * This variable is used privately to keep track of whether or not diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 978fcfca5871..660d6d6d0837 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_CORE) += core_sched.o +obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o diff --git a/kernel/sched/OWNERS b/kernel/sched/OWNERS new file mode 100644 index 000000000000..09a9f8e85576 --- /dev/null +++ b/kernel/sched/OWNERS @@ -0,0 +1,4 @@ +connoro@google.com +elavila@google.com +qperret@google.com +tkjos@google.com diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1458fa8beb3..43bd448d62af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -27,6 +27,10 @@ #include "pelt.h" #include "smp.h" +#include +#include +#include + /* * Export tracepoints that act as a bare tracehook (ie: have no trace event * associated with them) to allow external modules to probe them. 
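To illustrate how the restricted vendor hooks (trace_android_rvh_*) added throughout this file are consumed, here is a minimal sketch of a vendor module attaching a handler; it assumes the register_trace_android_rvh_* helper that the DECLARE_RESTRICTED_HOOK machinery generates, and the handler name and body are hypothetical:

#include <linux/module.h>
#include <trace/hooks/sched.h>

/* Hypothetical handler: invoked at the top of scheduler_tick() via the hook. */
static void vh_tick_entry(void *unused, struct rq *rq)
{
	/* vendor-specific accounting before the regular tick work runs */
}

static int __init vendor_sched_init(void)
{
	/* Restricted vendor hooks can be attached once but never detached. */
	return register_trace_android_rvh_tick_entry(vh_tick_entry, NULL);
}
module_init(vendor_sched_init);
MODULE_LICENSE("GPL");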
@@ -42,8 +46,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +EXPORT_SYMBOL_GPL(runqueues); #ifdef CONFIG_SCHED_DEBUG /* @@ -58,6 +64,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); const_debug unsigned int sysctl_sched_features = #include "features.h" 0; +EXPORT_SYMBOL_GPL(sysctl_sched_features); #undef SCHED_FEAT /* @@ -489,6 +496,7 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass) raw_spin_unlock(lock); } } +EXPORT_SYMBOL_GPL(raw_spin_rq_lock_nested); bool raw_spin_rq_trylock(struct rq *rq) { @@ -518,6 +526,7 @@ void raw_spin_rq_unlock(struct rq *rq) { raw_spin_unlock(rq_lockp(rq)); } +EXPORT_SYMBOL_GPL(raw_spin_rq_unlock); #ifdef CONFIG_SMP /* @@ -536,6 +545,7 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } +EXPORT_SYMBOL_GPL(double_rq_lock); #endif /* @@ -561,6 +571,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) cpu_relax(); } } +EXPORT_SYMBOL_GPL(__task_rq_lock); /* * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. @@ -603,6 +614,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) cpu_relax(); } } +EXPORT_SYMBOL_GPL(task_rq_lock); /* * RQ-clock updating methods: @@ -683,6 +695,7 @@ void update_rq_clock(struct rq *rq) rq->clock += delta; update_rq_clock_task(rq, delta); } +EXPORT_SYMBOL_GPL(update_rq_clock); #ifdef CONFIG_SCHED_HRTICK /* @@ -887,6 +900,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) */ *head->lastp = node; head->lastp = &node->next; + head->count++; return true; } @@ -942,12 +956,14 @@ void wake_up_q(struct wake_q_head *head) /* Task can safely be re-inserted now: */ node = node->next; task->wake_q.next = NULL; + task->wake_q_count = head->count; /* * wake_up_process() executes a full barrier, which pairs with * the queueing in wake_q_add() so as not to miss wakeups. 
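	 * (The head->count snapshot published via p->wake_q_count just above is only valid across this wakeup and is cleared again right after.)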
*/ wake_up_process(task); + task->wake_q_count = 0; put_task_struct(task); } } @@ -982,6 +998,7 @@ void resched_curr(struct rq *rq) else trace_sched_wake_idle_without_ipi(cpu); } +EXPORT_SYMBOL_GPL(resched_curr); void resched_cpu(int cpu) { @@ -1009,6 +1026,11 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; const struct cpumask *hk_mask; + bool done = false; + + trace_android_rvh_get_nohz_timer_target(&cpu, &done); + if (done) + return cpu; if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { if (!idle_cpu(cpu)) @@ -1284,6 +1306,7 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT]; * * An admin modifying the cgroup cpu.uclamp.{min, max} */ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); +EXPORT_SYMBOL_GPL(sched_uclamp_used); /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -1457,6 +1480,12 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; + struct uclamp_se uc_eff; + int ret = 0; + + trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret); + if (ret) + return uc_eff; /* System default restrictions always apply */ if (unlikely(uc_req.value > uc_max.value)) @@ -1477,6 +1506,7 @@ unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) return (unsigned long)uc_eff.value; } +EXPORT_SYMBOL_GPL(uclamp_eff_value); /* * When a task is enqueued on a rq, the clamp bucket currently defined by the @@ -1779,7 +1809,7 @@ done: } static int uclamp_validate(struct task_struct *p, - const struct sched_attr *attr) + const struct sched_attr *attr, bool user) { int util_min = p->uclamp_req[UCLAMP_MIN].value; int util_max = p->uclamp_req[UCLAMP_MAX].value; @@ -1804,11 +1834,19 @@ static int uclamp_validate(struct task_struct *p, /* * We have valid uclamp attributes; make sure uclamp is enabled. * - * We need to do that here, because enabling static branches is a - * blocking operation which obviously cannot be done while holding + * We need to do that here, because enabling static branches is + * a blocking operation which obviously cannot be done while holding * scheduler locks. + * + * We only enable the static key if this was initiated by user space + * request. There should be no in-kernel users of uclamp except to + * implement things like inheritance, as in binder. These in-kernel + * callers can rightfully be called in in_atomic() context sometimes, + * which is an invalid context to enable the key in. The enabling path + * unconditionally holds the cpus_read_lock() which might_sleep(). 
*/ - static_branch_enable(&sched_uclamp_used); + if (user) + static_branch_enable(&sched_uclamp_used); return 0; } @@ -1870,12 +1908,14 @@ static void __setscheduler_uclamp(struct task_struct *p, attr->sched_util_min != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); + trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min); } if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && attr->sched_util_max != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); + trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max); } } @@ -1947,7 +1987,7 @@ static void __init init_uclamp(void) static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline int uclamp_validate(struct task_struct *p, - const struct sched_attr *attr) + const struct sched_attr *attr, bool user) { return -EOPNOTSUPP; } @@ -1974,7 +2014,9 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) } uclamp_rq_inc(rq, p); + trace_android_rvh_enqueue_task(rq, p, flags); p->sched_class->enqueue_task(rq, p, flags); + trace_android_rvh_after_enqueue_task(rq, p, flags); if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); @@ -1994,7 +2036,9 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) } uclamp_rq_dec(rq, p); + trace_android_rvh_dequeue_task(rq, p, flags); p->sched_class->dequeue_task(rq, p, flags); + trace_android_rvh_after_dequeue_task(rq, p, flags); } void activate_task(struct rq *rq, struct task_struct *p, int flags) @@ -2003,6 +2047,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) p->on_rq = TASK_ON_RQ_QUEUED; } +EXPORT_SYMBOL_GPL(activate_task); void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { @@ -2010,6 +2055,7 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } +EXPORT_SYMBOL_GPL(deactivate_task); static inline int __normal_prio(int policy, int rt_prio, int nice) { @@ -2102,6 +2148,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) rq_clock_skip_update(rq); } +EXPORT_SYMBOL_GPL(check_preempt_curr); #ifdef CONFIG_SMP @@ -2181,6 +2228,8 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { + bool allowed = true; + /* When not in the task's cpumask, no point in looking further. */ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; @@ -2189,14 +2238,20 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) if (is_migration_disabled(p)) return cpu_online(cpu); + /* check for all cases */ + trace_android_rvh_is_cpu_allowed(p, cpu, &allowed); + /* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) - return cpu_active(cpu) && task_cpu_possible(cpu, p); + return cpu_active(cpu) && task_cpu_possible(cpu, p) && allowed; /* KTHREAD_IS_PER_CPU is always allowed. */ if (kthread_is_per_cpu(p)) return cpu_online(cpu); + if (!allowed) + return false; + /* Regular kernel threads don't get to stay during offline. 
*/ if (cpu_dying(cpu)) return false; @@ -2227,12 +2282,24 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu) { + int detached = 0; + lockdep_assert_rq_held(rq); + /* + * The vendor hook may drop the lock temporarily, so + * pass the rq flags to unpin lock. We expect the + * rq lock to be held after return. + */ + trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached); + if (detached) + goto attach; + deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); - rq_unlock(rq, rf); +attach: + rq_unlock(rq, rf); rq = cpu_rq(new_cpu); rq_lock(rq, rf); @@ -2270,8 +2337,8 @@ struct set_affinity_pending { * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. */ -static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, - struct task_struct *p, int dest_cpu) +struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int dest_cpu) { /* Affinity changed (again). */ if (!is_cpu_allowed(p, dest_cpu)) @@ -2282,6 +2349,7 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, return rq; } +EXPORT_SYMBOL_GPL(__migrate_task); /* * migration_cpu_stop - this will be executed by a highprio stopper thread @@ -2837,6 +2905,8 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, * immediately required to distribute the tasks within their new mask. */ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); + trace_android_rvh_set_cpus_allowed_ptr_locked(cpu_valid_mask, new_mask, &dest_cpu); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; @@ -2958,6 +3028,7 @@ void force_compatible_cpus_allowed_ptr(struct task_struct *p) * offlining of the chosen destination CPU, so take the hotplug * lock to ensure that the migration succeeds. */ + trace_android_vh_force_compatible_pre(NULL); cpus_read_lock(); if (!cpumask_available(new_mask)) goto out_set_mask; @@ -2982,6 +3053,7 @@ out_set_mask: WARN_ON(set_cpus_allowed_ptr(p, override_mask)); out_free_mask: cpus_read_unlock(); + trace_android_vh_force_compatible_post(NULL); free_cpumask_var(new_mask); } @@ -3066,12 +3138,13 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); + trace_android_rvh_set_task_cpu(p, new_cpu); } __set_task_cpu(p, new_cpu); } +EXPORT_SYMBOL_GPL(set_task_cpu); -#ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { @@ -3186,7 +3259,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, out: return ret; } -#endif /* CONFIG_NUMA_BALANCING */ +EXPORT_SYMBOL_GPL(migrate_swap); /* * wait_task_inactive - wait for a thread to unschedule. @@ -3343,12 +3416,16 @@ EXPORT_SYMBOL_GPL(kick_process); * select_task_rq() below may allow selection of !active CPUs in order * to satisfy the above rules. 
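 * (Android: select_fallback_rq() is exported below, and trace_android_rvh_select_fallback_rq() may supply a destination CPU before the default walk runs.)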
*/ -static int select_fallback_rq(int cpu, struct task_struct *p) +int select_fallback_rq(int cpu, struct task_struct *p) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; - int dest_cpu; + int dest_cpu = -1; + + trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu); + if (dest_cpu >= 0) + return dest_cpu; /* * If the node that the CPU is on has been offlined, cpu_to_node() @@ -3413,6 +3490,7 @@ out: return dest_cpu; } +EXPORT_SYMBOL_GPL(select_fallback_rq); /* * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. @@ -3589,6 +3667,9 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, { int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + if (wake_flags & WF_SYNC) + en_flags |= ENQUEUE_WAKEUP_SYNC; + lockdep_assert_rq_held(rq); if (p->sched_contributes_to_load) @@ -3734,6 +3815,7 @@ void wake_up_if_idle(int cpu) out: rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(wake_up_if_idle); bool cpus_share_cache(int this_cpu, int that_cpu) { @@ -3785,7 +3867,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) { + bool cond = false; + + trace_android_rvh_ttwu_cond(cpu, &cond); + + if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) || cond) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -4025,6 +4111,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (!ttwu_state_match(p, state, &success)) goto unlock; +#ifdef CONFIG_FREEZER + /* + * If we're going to wake up a thread which may be frozen, then + * we can only do so if we have an active CPU which is capable of + * running it. This may not be the case when resuming from suspend, + * as the secondary CPUs may not yet be back online. See __thaw_task() + * for the actual wakeup. 
+ */ + if (unlikely(frozen_or_skipped(p)) && + !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p))) + goto unlock; +#endif + trace_sched_waking(p); /* @@ -4053,6 +4152,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; + if (READ_ONCE(p->__state) & TASK_UNINTERRUPTIBLE) + trace_sched_blocked_reason(p); + #ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be @@ -4121,6 +4223,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); + trace_android_rvh_try_to_wake_up(p); + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); if (task_cpu(p) != cpu) { if (p->in_iowait) { @@ -4140,8 +4244,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: - if (success) + if (success) { + trace_android_rvh_try_to_wake_up_success(p); ttwu_stat(p, task_cpu(p), wake_flags); + } preempt_enable(); return success; @@ -4236,6 +4342,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.cfs_rq = NULL; #endif + trace_android_rvh_sched_fork_init(p); + #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -4370,6 +4478,8 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { + trace_android_rvh_sched_fork(p); + __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -4382,6 +4492,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Make sure we do not leak PI boosting priority to the child. */ p->prio = current->normal_prio; + trace_android_rvh_prepare_prio_fork(p); uclamp_fork(p); @@ -4414,6 +4525,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; init_entity_runnable_average(&p->se); + trace_android_rvh_finish_prio_fork(p); + #ifdef CONFIG_SCHED_INFO @@ -4493,6 +4606,8 @@ void wake_up_new_task(struct task_struct *p) struct rq_flags rf; struct rq *rq; + trace_android_rvh_wake_up_new_task(p); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP @@ -4511,6 +4626,7 @@ void wake_up_new_task(struct task_struct *p) rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); + trace_android_rvh_new_task_stats(p); activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); @@ -4684,6 +4800,7 @@ struct callback_head balance_push_callback = { .next = NULL, .func = (void (*)(struct callback_head *))balance_push, }; +EXPORT_SYMBOL_GPL(balance_push_callback); static inline struct callback_head * __splice_balance_callbacks(struct rq *rq, bool split) @@ -4930,6 +5047,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) * task and put them back on the free list. */ kprobe_flush_task(prev); + trace_android_rvh_flush_task(prev); /* Task is done with its stack. */ put_task_stack(prev); @@ -5007,6 +5125,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * finish_task_switch()'s mmdrop(). */ switch_mm_irqs_off(prev->active_mm, next->mm, next); + lru_gen_use_mm(next->mm); if (!prev->mm) { // from kernel /* will mmdrop() in finish_task_switch(). 
*/ @@ -5135,6 +5254,11 @@ void sched_exec(void) struct task_struct *p = current; unsigned long flags; int dest_cpu; + bool cond = false; + + trace_android_rvh_sched_exec(&cond); + if (cond) + return; raw_spin_lock_irqsave(&p->pi_lock, flags); dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); @@ -5220,6 +5344,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +EXPORT_SYMBOL_GPL(task_sched_runtime); #ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) @@ -5289,6 +5414,8 @@ void scheduler_tick(void) rq_lock(rq, &rf); update_rq_clock(rq); + trace_android_rvh_tick_entry(rq); + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); @@ -5307,6 +5434,8 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); #endif + + trace_android_vh_scheduler_tick(rq); } #ifdef CONFIG_NO_HZ_FULL @@ -5562,6 +5691,8 @@ static noinline void __schedule_bug(struct task_struct *prev) } check_panic_on_warn("scheduling while atomic"); + trace_android_rvh_schedule_bug(prev); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -6337,6 +6468,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) rq->last_seen_need_resched_ns = 0; #endif + trace_android_rvh_schedule(prev, next, rq); if (likely(prev != next)) { rq->nr_switches++; /* @@ -6781,7 +6913,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -6834,6 +6966,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) struct rq_flags rf; struct rq *rq; + trace_android_rvh_rtmutex_prepare_setprio(p, pi_task); /* XXX used to be waiter->prio, not waiter->task->prio */ prio = __rt_effective_prio(pi_task, p->normal_prio); @@ -6952,12 +7085,13 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; + bool queued, running, allowed = false; int old_prio; struct rq_flags rf; struct rq *rq; - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + trace_android_rvh_set_user_nice(p, &nice, &allowed); + if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed) return; /* * We have to be careful, if called from sys_setpriority(), @@ -7110,6 +7244,7 @@ int available_idle_cpu(int cpu) return 1; } +EXPORT_SYMBOL_GPL(available_idle_cpu); /** * idle_task - return the idle task for a given CPU. 
@@ -7149,6 +7284,11 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, { unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); + unsigned long new_util = ULONG_MAX; + + trace_android_rvh_effective_cpu_util(cpu, util_cfs, max, type, p, &new_util); + if (new_util != ULONG_MAX) + return new_util; if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { @@ -7384,6 +7524,10 @@ recheck: /* Normal users shall not reset the sched_reset_on_fork flag: */ if (p->sched_reset_on_fork && !reset_on_fork) return -EPERM; + + /* Can't change util-clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + return -EPERM; } if (user) { @@ -7397,14 +7541,11 @@ recheck: /* Update task specific "requested" clamps */ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { - retval = uclamp_validate(p, attr); + retval = uclamp_validate(p, attr, user); if (retval) return retval; } - if (pi) - cpuset_read_lock(); - /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -7479,8 +7620,6 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); goto recheck; } @@ -7546,10 +7685,8 @@ change: head = splice_balance_callbacks(rq); task_rq_unlock(rq, p, &rf); - if (pi) { - cpuset_read_unlock(); + if (pi) rt_mutex_adjust_pi(p); - } /* Run balance callbacks after we've adjusted the PI chain: */ balance_callbacks(rq, head); @@ -7559,8 +7696,6 @@ change: unlock: task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); return retval; } @@ -7599,11 +7734,13 @@ int sched_setscheduler(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, true); } +EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, true, true); } +EXPORT_SYMBOL_GPL(sched_setattr); int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { @@ -7629,6 +7766,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); /* * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally @@ -7690,14 +7828,9 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); - - if (likely(p)) { + if (p != NULL) retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); - } + rcu_read_unlock(); return retval; } @@ -8095,6 +8228,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_put_task; retval = __sched_setaffinity(p, in_mask); + trace_android_rvh_sched_setaffinity(p, in_mask, &retval); + out_put_task: put_task_struct(p); return retval; @@ -8154,6 +8289,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); + trace_android_rvh_sched_getaffinity(p, mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -8209,6 +8345,8 @@ static void do_sched_yield(void) schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); + trace_android_rvh_do_sched_yield(rq); + preempt_disable(); rq_unlock_irq(rq, &rf); sched_preempt_enable_no_resched(); @@ -8622,6 +8760,7 @@ void sched_show_task(struct task_struct *p) 
print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); + trace_android_vh_sched_show_task(p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } @@ -8880,6 +9019,24 @@ void idle_task_exit(void) /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } +struct task_struct *pick_migrate_task(struct rq *rq) +{ + const struct sched_class *class; + struct task_struct *next; + + for_each_class(class) { + next = class->pick_next_task(rq); + if (next) { + next->sched_class->put_prev_task(rq, next); + return next; + } + } + + /* The idle class should always have a runnable task */ + BUG(); +} +EXPORT_SYMBOL_GPL(pick_migrate_task); + static int __balance_push_cpu_stop(void *arg) { struct task_struct *p = arg; @@ -9222,6 +9379,7 @@ int sched_cpu_starting(unsigned int cpu) sched_core_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); + trace_android_rvh_sched_cpu_starting(cpu); return 0; } @@ -9295,6 +9453,8 @@ int sched_cpu_dying(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); + trace_android_rvh_sched_cpu_dying(cpu); + calc_load_migrate(rq); update_max_interval(); hrtick_clear(rq); @@ -9355,7 +9515,9 @@ int in_sched_functions(unsigned long addr) * Every task in system belongs to this group at bootup. */ struct task_group root_task_group; +EXPORT_SYMBOL_GPL(root_task_group); LIST_HEAD(task_groups); +EXPORT_SYMBOL_GPL(task_groups); /* Cacheline aligned slab cache for task_group */ static struct kmem_cache *task_group_cache __read_mostly; @@ -9619,6 +9781,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } + + trace_android_rvh_schedule_bug(NULL); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -9998,6 +10163,7 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) mutex_unlock(&uclamp_mutex); #endif + trace_android_rvh_cpu_cgroup_online(css); return 0; } @@ -10063,6 +10229,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (ret) break; } + + trace_android_rvh_cpu_cgroup_can_attach(tset, &ret); + return ret; } @@ -10073,6 +10242,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + trace_android_rvh_cpu_cgroup_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -10250,6 +10421,27 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) cpu_uclamp_print(sf, UCLAMP_MAX); return 0; } + +static int cpu_uclamp_ls_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 ls) +{ + struct task_group *tg; + + if (ls > 1) + return -EINVAL; + tg = css_tg(css); + tg->latency_sensitive = (unsigned int) ls; + + return 0; +} + +static u64 cpu_uclamp_ls_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return (u64) tg->latency_sensitive; +} #endif /* CONFIG_UCLAMP_TASK_GROUP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -10686,6 +10878,12 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, + { + .name = "uclamp.latency_sensitive", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_uclamp_ls_read_u64, + .write_u64 = cpu_uclamp_ls_write_u64, + }, #endif { } /* Terminate */ }; @@ -10880,6 +11078,12 @@ static struct cftype cpu_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, + { + .name = "uclamp.latency_sensitive", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_uclamp_ls_read_u64, + 
.write_u64 = cpu_uclamp_ls_write_u64, + }, #endif { } /* terminate */ }; diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 7c2fe50fd76d..3d5f5a80b401 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -75,3 +75,4 @@ bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy) (policy->dvfs_possible_from_any_cpu && rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data))); } +EXPORT_SYMBOL_GPL(cpufreq_this_cpu_can_update); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7f6bb37d3a2f..cdc6b3f31f76 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,6 +12,7 @@ #include #include +#include #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) @@ -103,11 +104,17 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { + bool should_update = true; + if (sg_policy->need_freq_update) sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); else if (sg_policy->next_freq == next_freq) return false; + trace_android_rvh_set_sugov_update(sg_policy, next_freq, &should_update); + if (!should_update) + return false; + sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -150,9 +157,14 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; + unsigned long next_freq = 0; util = map_util_perf(util); - freq = map_util_freq(util, freq, max); + trace_android_vh_map_util_freq(util, freq, max, &next_freq); + if (next_freq) + freq = next_freq; + else + freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; @@ -597,6 +609,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) if (policy->fast_switch_enabled) return 0; + trace_android_vh_set_sugov_sched_attr(&attr); kthread_init_work(&sg_policy->work, sugov_work); kthread_init_worker(&sg_policy->worker); thread = kthread_create(kthread_worker_fn, &sg_policy->worker, @@ -836,29 +849,3 @@ struct cpufreq_governor *cpufreq_default_governor(void) #endif cpufreq_governor_init(schedutil_gov); - -#ifdef CONFIG_ENERGY_MODEL -static void rebuild_sd_workfn(struct work_struct *work) -{ - rebuild_sched_domains_energy(); -} -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - struct cpufreq_governor *old_gov) -{ - if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) { - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). 
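The get_next_freq() hunk above gives a vendor handler first refusal on the utilization-to-frequency mapping: the hook receives &next_freq preset to 0, and the stock map_util_freq() path runs only when no handler wrote a value. A minimal userspace sketch of that arithmetic, assuming the usual kernel definitions (map_util_perf() adds ~25% headroom, map_util_freq() scales linearly); all numbers below are invented for the demo:

```c
#include <stdio.h>

/* Stock kernel mapping, reproduced here for illustration:
 * map_util_perf() adds ~25% headroom, map_util_freq() scales
 * utilization linearly into the frequency range.
 */
static unsigned long map_util_perf(unsigned long util)
{
	return util + (util >> 2);
}

static unsigned long map_util_freq(unsigned long util,
				   unsigned long freq, unsigned long cap)
{
	return freq * util / cap;
}

int main(void)
{
	unsigned long max_freq = 2400000;	/* kHz, hypothetical policy max */
	unsigned long cap = 1024;		/* SCHED_CAPACITY_SCALE */
	unsigned long util = 512;		/* half-loaded CPU */
	unsigned long next_freq = 0;		/* what a vendor hook would fill in */

	/* trace_android_vh_map_util_freq(util, freq, max, &next_freq);
	 * A registered vendor handler may write a non-zero next_freq here.
	 */
	if (!next_freq)
		next_freq = map_util_freq(map_util_perf(util), max_freq, cap);

	printf("util=%lu -> freq=%lu kHz\n", util, next_freq);
	return 0;
}
```

With util at 512 of 1024 and a 2.4 GHz ceiling this selects 1.5 GHz, i.e. the 25%-headroom point above half speed.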
- */ - schedule_work(&rebuild_sd_work); - } - -} -#endif diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d583f2aa744e..a4a0c2181493 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -65,8 +65,29 @@ static int convert_prio(int prio) return cpupri; } +#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION +/** + * drop_nopreempt_cpus - remove likely non-preemptible CPUs from the mask + * @lowest_mask: mask with selected CPUs (non-NULL) + */ +static void +drop_nopreempt_cpus(struct cpumask *lowest_mask) +{ + unsigned int cpu = cpumask_first(lowest_mask); + while (cpu < nr_cpu_ids) { + /* unlocked access */ + struct task_struct *task = READ_ONCE(cpu_rq(cpu)->curr); + if (task_may_not_preempt(task, cpu)) { + cpumask_clear_cpu(cpu, lowest_mask); + } + cpu = cpumask_next(cpu, lowest_mask); + } +} +#endif + static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask, int idx) + struct cpumask *lowest_mask, int idx, + bool drop_nopreempts) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; int skip = 0; @@ -103,6 +124,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); +#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION + if (drop_nopreempts) + drop_nopreempt_cpus(lowest_mask); +#endif + /* * We have to ensure that we have at least one bit * still set in the array, since the map could have @@ -147,12 +173,16 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, { int task_pri = convert_prio(p->prio); int idx, cpu; + bool drop_nopreempts = task_pri <= MAX_RT_PRIO; BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); +#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION +retry: +#endif for (idx = 0; idx < task_pri; idx++) { - if (!__cpupri_find(cp, p, lowest_mask, idx)) + if (!__cpupri_find(cp, p, lowest_mask, idx, drop_nopreempts)) continue; if (!lowest_mask || !fitness_fn) @@ -174,6 +204,17 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, return 1; } + /* + * If we couldn't find any preemptible CPUs, retry the search with the + * non-preemptible ones included so we can still find the + * lowest-priority target and avoid priority inversion. + */ +#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION + if (drop_nopreempts) { + drop_nopreempts = false; + goto retry; + } +#endif + /* * If we failed to find a fitting lowest_mask, kick off a new search * but without taking into account any fitness criteria this time. @@ -196,6 +237,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, return 0; } +EXPORT_SYMBOL_GPL(cpupri_find_fitness); /** * cpupri_set - update the CPU priority setting @@ -314,3 +356,17 @@ void cpupri_cleanup(struct cpupri *cp) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } + +#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION +/* + * cpupri_check_rt - check if the CPU is running an RT task + * Must be called from an rcu-sched read-side section. + */ +bool cpupri_check_rt(void) +{ + int cpu = raw_smp_processor_id(); + + return (cpu_rq(cpu)->rd->cpupri.cpu_to_pri[cpu] > CPUPRI_NORMAL) && + (cpu_rq(cpu)->rt.rt_throttled == 0); +} +#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 042a6dbce8f3..be02fcf9285e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,7 +2,9 @@ /* * Simple CPU accounting cgroup controller */ +#include #include "sched.h" +#include #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -18,6 +20,7 @@ * compromise in place of having locks on each irq in account_system_time.
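The cpupri changes above implement a two-pass search: the first pass filters out CPUs whose current task looks non-preemptible (busy in long softirqs), and if that leaves no candidate at any priority level the search restarts unfiltered, so a lowest-priority push target is still found. A simplified userspace model of that retry structure (CPU states and priorities invented for the demo):

```c
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical per-cpu state for the sketch. */
static bool cpu_in_long_softirq[NR_CPUS] = { [2] = true, [3] = true };
static int  cpu_prio[NR_CPUS] = { 5, 5, 1, 1, 7, 5, 5, 5 };

static unsigned int find_candidates(int task_prio, bool drop_nopreempts)
{
	unsigned int mask = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu_prio[cpu] >= task_prio)
			continue;		/* running something more important */
		if (drop_nopreempts && cpu_in_long_softirq[cpu])
			continue;		/* likely non-preemptible right now */
		mask |= 1u << cpu;
	}
	return mask;
}

int main(void)
{
	int task_prio = 2;
	unsigned int mask = find_candidates(task_prio, true);

	/* Mirror of the retry in cpupri_find_fitness(): if filtering out
	 * softirq-busy CPUs left nothing, search again without the filter.
	 */
	if (!mask)
		mask = find_candidates(task_prio, false);

	printf("lowest-prio candidates: 0x%x\n", mask);
	return 0;
}
```

Here CPUs 2 and 3 are the only ones running lower-priority work, but both are softirq-busy, so only the second pass produces a target (0xc).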
*/ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); +EXPORT_PER_CPU_SYMBOL_GPL(cpu_irqtime); static int sched_clock_irqtime; @@ -53,6 +56,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) unsigned int pc; s64 delta; int cpu; + bool irq_start = true; if (!sched_clock_irqtime) return; @@ -68,10 +72,20 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (pc & HARDIRQ_MASK) + if (pc & HARDIRQ_MASK) { irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) + irq_start = false; + } else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) { irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); + irq_start = false; + } + + trace_android_rvh_account_irq(curr, cpu, delta); + + if (irq_start) + trace_android_rvh_account_irq_start(curr, cpu, delta); + else + trace_android_rvh_account_irq_end(curr, cpu, delta); } static u64 irqtime_tick_accounted(u64 maxtime) @@ -130,6 +144,9 @@ void account_user_time(struct task_struct *p, u64 cputime) /* Account for user time used */ acct_account_cputime(p); + + /* Account power usage for user time */ + cpufreq_acct_update_power(p, cputime); } /* @@ -174,6 +191,9 @@ void account_system_index_time(struct task_struct *p, /* Account for system time used */ acct_account_cputime(p); + + /* Account power usage for system time */ + cpufreq_acct_update_power(p, cputime); } /* @@ -389,6 +409,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, } else { account_system_index_time(p, cputime, CPUTIME_SYSTEM); } + trace_android_vh_irqtime_account_process_tick(p, this_rq(), user_tick, ticks); } static void irqtime_account_idle_ticks(int ticks) @@ -458,6 +479,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) *ut = cputime.utime; *st = cputime.stime; } +EXPORT_SYMBOL_GPL(thread_group_cputime_adjusted); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ @@ -472,6 +494,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (vtime_accounting_enabled_this_cpu()) return; + trace_android_vh_account_task_time(p, this_rq(), user_tick); if (sched_clock_irqtime) { irqtime_account_process_tick(p, user_tick, 1); @@ -627,6 +650,8 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(thread_group_cputime_adjusted); + #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2a2f32eaffcc..16a8c7898266 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ */ #include "sched.h" #include "pelt.h" +#include struct dl_bandwidth def_dl_bandwidth; @@ -1676,6 +1677,12 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) struct task_struct *curr; bool select_rq; struct rq *rq; + int target_cpu = -1; + + trace_android_rvh_select_task_rq_dl(p, cpu, flags & 0xF, + flags, &target_cpu); + if (target_cpu >= 0) + return target_cpu; if (!(flags & WF_TTWU)) goto out; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 34c5ff3a0669..3d1b5810cd99 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -48,9 +48,10 @@ static unsigned long nsec_low(unsigned long long nsec) #define SCHED_FEAT(name, enabled) \ 
#name , -static const char * const sched_feat_names[] = { +const char * const sched_feat_names[] = { #include "features.h" }; +EXPORT_SYMBOL_GPL(sched_feat_names); #undef SCHED_FEAT @@ -79,6 +80,7 @@ static int sched_feat_show(struct seq_file *m, void *v) struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; +EXPORT_SYMBOL_GPL(sched_feat_keys); #undef SCHED_FEAT diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6648683cd964..437b95273e80 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -22,6 +22,8 @@ */ #include "sched.h" +#include + /* * Targeted preemption latency for CPU-bound tasks: * @@ -36,6 +38,7 @@ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_latency = 6000000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_latency); static unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* @@ -577,11 +580,13 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + trace_android_rvh_enqueue_entity(cfs_rq, se); rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + trace_android_rvh_dequeue_entity(cfs_rq, se); rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } @@ -804,6 +809,8 @@ void post_init_entity_util_avg(struct task_struct *p) return; } + /* Hook before this se's util is attached to cfs_rq's util */ + trace_android_rvh_post_init_entity_util_avg(se); attach_entity_cfs_rq(se); } @@ -3117,6 +3124,7 @@ void reweight_task(struct task_struct *p, int prio) reweight_entity(cfs_rq, se, weight); load->inv_weight = sched_prio_to_wmult[prio]; } +EXPORT_SYMBOL_GPL(reweight_task); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP @@ -3800,6 +3808,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s else se->avg.load_sum = 1; + trace_android_rvh_attach_entity_load_avg(cfs_rq, se); + enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; @@ -3829,6 +3839,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ u32 divider = get_pelt_divider(&cfs_rq->avg); + trace_android_rvh_detach_entity_load_avg(cfs_rq, se); + dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; @@ -3865,6 +3877,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); + trace_android_rvh_update_load_avg(now, cfs_rq, se); + if (!se->avg.last_update_time && (flags & DO_ATTACH)) { /* @@ -3936,6 +3950,8 @@ static void remove_entity_load_avg(struct sched_entity *se) sync_entity_load_avg(se); + trace_android_rvh_remove_entity_load_avg(cfs_rq, se); + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; @@ -4040,6 +4056,11 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, { long last_ewma_diff, last_enqueued_diff; struct util_est ue; + int ret = 0; + + trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret); + if (ret) + return; if (!sched_feat(UTIL_EST)) return; @@ -4253,7 +4274,10 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if 
(!sched_asym_cpucap_active()) + bool need_update = true; + + trace_android_rvh_update_misfit_status(p, rq, &need_update); + if (!sched_asym_cpucap_active() || !need_update) return; if (!p || p->nr_cpus_allowed == 1) { @@ -4357,6 +4381,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* ensure we never gain time by being placed backwards. */ se->vruntime = max_vruntime(se->vruntime, vruntime); + trace_android_rvh_place_entity(cfs_rq, se, initial, &vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -4580,9 +4605,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) unsigned long ideal_runtime, delta_exec; struct sched_entity *se; s64 delta; + bool skip_preempt = false; ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt, + delta_exec, cfs_rq, curr, sysctl_sched_min_granularity); + if (skip_preempt) + return; if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); /* @@ -4611,8 +4641,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) resched_curr(rq_of(cfs_rq)); } -static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { clear_buddies(cfs_rq, se); @@ -4645,6 +4674,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +EXPORT_SYMBOL_GPL(set_next_entity); static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); @@ -4660,7 +4690,11 @@ static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { struct sched_entity *left = __pick_first_entity(cfs_rq); - struct sched_entity *se; + struct sched_entity *se = NULL; + + trace_android_rvh_pick_next_entity(cfs_rq, curr, &se); + if (se) + goto done; /* * If curr is set we have to see if its left of the leftmost entity @@ -4702,6 +4736,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = cfs_rq->last; } +done: return se; } @@ -4764,6 +4799,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); + trace_android_rvh_entity_tick(cfs_rq, curr); } @@ -5692,6 +5728,12 @@ static inline unsigned long cpu_util(int cpu); static inline bool cpu_overutilized(int cpu) { + int overutilized = -1; + + trace_android_rvh_cpu_overutilized(cpu, &overutilized); + if (overutilized != -1) + return overutilized; + return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); } @@ -5732,6 +5774,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); int task_new = !(flags & ENQUEUE_WAKEUP); + int should_iowait_boost; /* * The code below (indirectly) updates schedutil which looks at @@ -5746,7 +5789,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * utilization updates, so do it here explicitly with the IOWAIT flag * passed. 
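Nearly every hook in this patch follows the shape visible in cpu_overutilized() just above: initialize an out-parameter to a sentinel, fire the hook, and fall through to the stock logic only when no handler wrote a verdict. A compact sketch of the idiom; the 80% margin mirrors the kernel's fits_capacity() definition ((cap) * 1280 < (max) * 1024), while the handler itself is hypothetical:

```c
#include <stdbool.h>
#include <stdio.h>

/* Kernel's 80% margin: util "fits" if it stays below max * 1024/1280. */
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)

/* Optional vendor handler; NULL means "no module registered". */
static void (*overutilized_hook)(int cpu, int *overutilized);

static bool cpu_overutilized(int cpu, unsigned long util, unsigned long cap)
{
	int overutilized = -1;		/* sentinel: "no vendor verdict" */

	if (overutilized_hook)
		overutilized_hook(cpu, &overutilized);
	if (overutilized != -1)
		return overutilized;	/* vendor decided */

	return !fits_capacity(util, cap);	/* stock policy */
}

/* Hypothetical vendor policy: never report CPU 0 as overutilized. */
static void my_hook(int cpu, int *overutilized)
{
	if (cpu == 0)
		*overutilized = 0;
}

int main(void)
{
	overutilized_hook = my_hook;
	printf("cpu0: %d\n", cpu_overutilized(0, 900, 1024));	/* vendor: 0 */
	printf("cpu1: %d\n", cpu_overutilized(1, 900, 1024));	/* stock: 1 */
	return 0;
}
```

The sentinel (-1, ULONG_MAX, INT_MAX, or a NULL pointer, depending on the hook) is what lets an unregistered hook cost essentially nothing and leave stock behavior untouched.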
*/ - if (p->in_iowait) + should_iowait_boost = p->in_iowait; + trace_android_rvh_set_iowait(p, rq, &should_iowait_boost); + if (should_iowait_boost) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); for_each_sched_entity(se) { @@ -5768,6 +5813,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags = ENQUEUE_WAKEUP; } + trace_android_rvh_enqueue_task_fair(rq, p, flags); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5880,6 +5926,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags |= DEQUEUE_SLEEP; } + trace_android_rvh_dequeue_task_fair(rq, p, flags); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -6837,6 +6884,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); unsigned long max_util = 0, sum_util = 0; unsigned long _cpu_cap = cpu_cap; + unsigned long energy = 0; int cpu; _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask)); @@ -6893,7 +6941,11 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) max_util = max(max_util, min(cpu_util, _cpu_cap)); } - return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap); + trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy); + if (!energy) + energy = em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap); + + return energy; } /* @@ -6935,20 +6987,39 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path. */ -static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) +static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync) { unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1; int cpu, best_energy_cpu = prev_cpu, target = -1; + unsigned long max_spare_cap_ls = 0, target_cap; unsigned long cpu_cap, util, base_energy = 0; + bool boosted, latency_sensitive = false; + unsigned int min_exit_lat = UINT_MAX; + struct cpuidle_state *idle; struct sched_domain *sd; struct perf_domain *pd; + int new_cpu = INT_MAX; + + sync_entity_load_avg(&p->se); + trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu); + if (new_cpu != INT_MAX) + return new_cpu; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) goto unlock; + cpu = smp_processor_id(); + if (sync && cpu_rq(cpu)->nr_running == 1 && + cpumask_test_cpu(cpu, p->cpus_ptr) && + task_fits_cpu(p, cpu)) { + rcu_read_unlock(); + return cpu; + } + /* * Energy-aware wake-up happens on the lowest sched_domain starting * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. @@ -6961,10 +7032,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = prev_cpu; - sync_entity_load_avg(&p->se); if (!task_util_est(p)) goto unlock; + latency_sensitive = uclamp_latency_sensitive(p); + boosted = uclamp_boosted(p); + target_cap = boosted ? 
0 : ULONG_MAX; + for (; pd; pd = pd->next) { unsigned long cur_delta, spare_cap, max_spare_cap = 0; bool compute_prev_delta = false; @@ -6991,7 +7065,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!fits_capacity(util, cpu_cap)) continue; - if (cpu == prev_cpu) { + if (!latency_sensitive && cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ compute_prev_delta = true; } else if (spare_cap > max_spare_cap) { @@ -7002,9 +7076,32 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; } + + if (!latency_sensitive) + continue; + + if (idle_cpu(cpu)) { + cpu_cap = capacity_orig_of(cpu); + if (boosted && cpu_cap < target_cap) + continue; + if (!boosted && cpu_cap > target_cap) + continue; + idle = idle_get_state(cpu_rq(cpu)); + if (idle && idle->exit_latency > min_exit_lat && + cpu_cap == target_cap) + continue; + + if (idle) + min_exit_lat = idle->exit_latency; + target_cap = cpu_cap; + best_idle_cpu = cpu; + } else if (spare_cap > max_spare_cap_ls) { + max_spare_cap_ls = spare_cap; + max_spare_cap_cpu_ls = cpu; + } } - if (max_spare_cap_cpu < 0 && !compute_prev_delta) + if (!latency_sensitive && max_spare_cap_cpu < 0 && !compute_prev_delta) continue; /* Compute the 'base' energy of the pd, without @p */ @@ -7034,6 +7131,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } rcu_read_unlock(); + if (latency_sensitive) + return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls; + /* * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu. @@ -7068,9 +7168,18 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; + int target_cpu = -1; /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; + if (trace_android_rvh_select_task_rq_fair_enabled() && + !(sd_flag & SD_BALANCE_FORK)) + sync_entity_load_avg(&p->se); + trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag, + wake_flags, &target_cpu); + if (target_cpu >= 0) + return target_cpu; + /* * required for stable ->cpus_allowed */ @@ -7079,7 +7188,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) record_wakee(p); if (sched_energy_enabled()) { - new_cpu = find_energy_efficient_cpu(p, prev_cpu); + new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync); if (new_cpu >= 0) return new_cpu; new_cpu = prev_cpu; @@ -7288,9 +7397,14 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; + bool ignore = false; + bool preempt = false; if (unlikely(se == pse)) return; + trace_android_rvh_check_preempt_wakeup_ignore(curr, &ignore); + if (ignore) + return; /* * This is possible from callers such as attach_tasks(), in which we @@ -7347,6 +7461,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; update_curr(cfs_rq_of(se)); + trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &ignore, + wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity); + if (preempt) + goto preempt; + if (ignore) + return; + if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -7414,9 +7535,10 @@ struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags 
*rf) { struct cfs_rq *cfs_rq = &rq->cfs; - struct sched_entity *se; - struct task_struct *p; + struct sched_entity *se = NULL; + struct task_struct *p = NULL; int new_tasks; + bool repick = false; again: if (!sched_fair_runnable(rq)) @@ -7470,7 +7592,7 @@ again: } while (cfs_rq); p = task_of(se); - + trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev); /* * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the @@ -7503,6 +7625,10 @@ simple: if (prev) put_prev_task(rq, prev); + trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev); + if (repick) + goto done; + do { se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); @@ -7744,7 +7870,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) * rewrite all of this once again.] */ -static unsigned long __read_mostly max_load_balance_interval = HZ/10; +unsigned long __read_mostly max_load_balance_interval = HZ/10; +EXPORT_SYMBOL_GPL(max_load_balance_interval); enum fbq_type { regular, remote, all }; @@ -7824,6 +7951,7 @@ struct lb_env { enum fbq_type fbq_type; enum migration_type migration_type; struct list_head tasks; + struct rq_flags *src_rq_rf; }; /* @@ -7938,9 +8066,14 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; + int can_migrate = 1; lockdep_assert_rq_held(env->src_rq); + trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate); + if (!can_migrate) + return 0; + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or @@ -8028,8 +8161,20 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) */ static void detach_task(struct task_struct *p, struct lb_env *env) { + int detached = 0; + lockdep_assert_rq_held(env->src_rq); + /* + * The vendor hook may drop the lock temporarily, so + * pass the rq flags to unpin lock. We expect the + * rq lock to be held after return. + */ + trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p, + env->dst_cpu, &detached); + if (detached) + return; + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -8336,6 +8481,8 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) bool decayed = false; int cpu = cpu_of(rq); + trace_android_rvh_update_blocked_fair(rq); + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. 
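The detach_task() hunk above defines a hand-off contract: the vendor hook may perform the migration itself (temporarily dropping the rq lock via the rq_flags passed in env->src_rq_rf) and reports that through `detached`, in which case the caller must skip the default deactivate/set_task_cpu pair. A toy restatement of the contract, with all kernel types reduced to stand-ins:

```c
#include <stdbool.h>
#include <stdio.h>

struct task { int cpu; bool queued; };

/* Optional vendor handler: returns true if it performed the migration
 * itself (the kernel version reports this through an int *detached).
 */
static bool (*migrate_hook)(struct task *p, int dst_cpu);

static void detach_task(struct task *p, int dst_cpu)
{
	/* The vendor may take over the whole migration... */
	if (migrate_hook && migrate_hook(p, dst_cpu))
		return;

	/* ...otherwise the default path runs: dequeue, then retarget. */
	p->queued = false;
	p->cpu = dst_cpu;
}

/* Hypothetical vendor handler; a real one could pick a different CPU. */
static bool my_migrate(struct task *p, int dst_cpu)
{
	p->queued = false;
	p->cpu = dst_cpu;
	return true;
}

int main(void)
{
	struct task t = { .cpu = 0, .queued = true };

	migrate_hook = my_migrate;
	detach_task(&t, 3);
	printf("task now on cpu %d, queued=%d\n", t.cpu, t.queued);
	return 0;
}
```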
@@ -8555,6 +8702,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) if (!capacity) capacity = 1; + trace_android_rvh_update_cpu_capacity(cpu, &capacity); cpu_rq(cpu)->cpu_capacity = capacity; trace_sched_cpu_capacity_tp(cpu_rq(cpu)); @@ -9667,8 +9815,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (sched_energy_enabled()) { struct root_domain *rd = env->dst_rq->rd; + int out_balance = 1; - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq, + &out_balance); + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized) + && out_balance) goto out_balanced; } @@ -9787,7 +9939,12 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct rq *busiest = NULL, *rq; unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; unsigned int busiest_nr = 0; - int i; + int i, done = 0; + + trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus, + &busiest, &done); + if (done) + return busiest; for_each_cpu_and(i, sched_group_span(group), env->cpus) { unsigned long capacity, load, util; @@ -10078,6 +10235,7 @@ redo: more_balance: rq_lock_irqsave(busiest, &rf); + env.src_rq_rf = &rf; update_rq_clock(busiest); /* @@ -10371,6 +10529,7 @@ static int active_load_balance_cpu_stop(void *data) .src_rq = busiest_rq, .idle = CPU_IDLE, .flags = LBF_ACTIVE_LB, + .src_rq_rf = &rf, }; schedstat_inc(sd->alb_count); @@ -10428,6 +10587,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) int need_serialize, need_decay = 0; u64 max_cost = 0; + trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing); + if (!continue_balancing) + return; + rcu_read_lock(); for_each_domain(cpu, sd) { /* @@ -10519,9 +10682,13 @@ static inline int on_null_domain(struct rq *rq) static inline int find_new_ilb(void) { - int ilb; + int ilb = -1; const struct cpumask *hk_mask; + trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb); + if (ilb >= 0) + return ilb; + hk_mask = housekeeping_cpumask(HK_FLAG_MISC); for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { @@ -10583,6 +10750,7 @@ static void nohz_balancer_kick(struct rq *rq) struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; unsigned int flags = 0; + int done = 0; if (unlikely(rq->idle_balance)) return; @@ -10607,6 +10775,10 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; + trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done); + if (done) + goto out; + if (rq->nr_running >= 2) { flags = NOHZ_KICK_MASK; goto out; @@ -10999,6 +11171,11 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) struct sched_domain *sd; int pulled_task = 0; u64 curr_cost = 0; + int done = 0; + + trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done); + if (done) + return pulled_task; update_misfit_status(NULL, this_rq); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a554e3bbab2b..4f8871ae95f6 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -180,9 +180,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... 
[re-labeling u_i --> u_{i+1}] */ -static __always_inline int -___update_load_sum(u64 now, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) +int ___update_load_sum(u64 now, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -232,6 +231,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, return 1; } +EXPORT_SYMBOL_GPL(___update_load_sum); /* * When syncing *_avg with *_sum, we must take into account the current @@ -257,8 +257,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity * if it's more convenient. */ -static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load) +void ___update_load_avg(struct sched_avg *sa, unsigned long load) { u32 divider = get_pelt_divider(sa); @@ -269,6 +268,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) sa->runnable_avg = div_u64(sa->runnable_sum, divider); WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } +EXPORT_SYMBOL_GPL(___update_load_avg); /* * sched_entity: @@ -306,6 +306,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) return 0; } +EXPORT_SYMBOL_GPL(__update_load_avg_blocked_se); int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -471,3 +472,43 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } #endif + +unsigned int sysctl_sched_pelt_multiplier = 1; +__read_mostly unsigned int sched_pelt_lshift; + +int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + static DEFINE_MUTEX(mutex); + unsigned int old; + int ret; + + mutex_lock(&mutex); + + old = sysctl_sched_pelt_multiplier; + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret) + goto undo; + if (!write) + goto done; + + switch (sysctl_sched_pelt_multiplier) { + case 1: + fallthrough; + case 2: + fallthrough; + case 4: + WRITE_ONCE(sched_pelt_lshift, + sysctl_sched_pelt_multiplier >> 1); + goto done; + default: + ret = -EINVAL; + } + +undo: + sysctl_sched_pelt_multiplier = old; +done: + mutex_unlock(&mutex); + + return ret; +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 4ff2ed4f8fa1..74ef53753df7 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -61,6 +61,8 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } +extern unsigned int sched_pelt_lshift; + /* * The clock_pelt scales the time to reflect the effective amount of * computation done during the running delta time but then sync back to @@ -75,9 +77,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg) */ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) { + delta <<= READ_ONCE(sched_pelt_lshift); + + rq->clock_task_mult += delta; + if (unlikely(is_idle_task(rq->curr))) { /* The rq is idle, we can sync to clock_task */ - rq->clock_pelt = rq_clock_task(rq); + rq->clock_pelt = rq_clock_task_mult(rq); return; } @@ -129,7 +135,8 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) * rq's clock_task. 
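The sched_pelt_multiplier handler above admits only 1, 2 or 4 and stores `multiplier >> 1` (0, 1 or 2) as a left-shift that update_rq_clock_pelt() then applies to the clock delta, so PELT signals ramp two or four times faster. Assuming the conventional 32 ms PELT half-life (an assumption; the constant is not shown in this patch), the effect looks like this:

```c
#include <stdio.h>

int main(void)
{
	const unsigned int halflife_ms = 32;	/* default PELT half-life */
	const unsigned int multipliers[] = { 1, 2, 4 };

	for (int i = 0; i < 3; i++) {
		unsigned int m = multipliers[i];
		unsigned int shift = m >> 1;		/* 1->0, 2->1, 4->2 */
		unsigned long long delta_ns = 1000000;	/* 1 ms wall delta */

		delta_ns <<= shift;	/* what update_rq_clock_pelt() does */
		printf("multiplier %u: shift %u, 1 ms looks like %llu ms, "
		       "effective half-life ~%u ms\n",
		       m, shift, delta_ns / 1000000, halflife_ms / m);
	}
	return 0;
}
```

Stretching the clock rather than reworking the decay tables is what keeps the change confined to update_rq_clock_pelt() and the rq_clock_task_mult() accessor added in sched.h.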
*/ if (util_sum >= divider) - rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; + rq->lost_idle_time += rq_clock_task_mult(rq) - + rq->clock_pelt; } static inline u64 rq_clock_pelt(struct rq *rq) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index cad2a1b34ed0..a01c03496e23 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -202,6 +202,7 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); @@ -566,18 +567,17 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* Schedule polling if it's not already scheduled. */ -static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +/* Schedule polling if it's not already scheduled or forced. */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay, + bool force) { struct task_struct *task; /* - * Do not reschedule if already scheduled. - * Possible race with a timer scheduled after this check but before - * mod_timer below can be tolerated because group->polling_next_update - * will keep updates on schedule. + * atomic_xchg should be called even when !force to provide a + * full memory barrier (see the comment inside psi_poll_work). */ - if (timer_pending(&group->poll_timer)) + if (atomic_xchg(&group->poll_scheduled, 1) && !force) return; rcu_read_lock(); @@ -589,12 +589,15 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) */ if (likely(task)) mod_timer(&group->poll_timer, jiffies + delay); + else + atomic_set(&group->poll_scheduled, 0); rcu_read_unlock(); } static void psi_poll_work(struct psi_group *group) { + bool force_reschedule = false; u32 changed_states; u64 now; @@ -602,6 +605,43 @@ static void psi_poll_work(struct psi_group *group) now = sched_clock(); + if (now > group->polling_until) { + /* + * We are either about to start or might stop polling if no + * state change was recorded. Resetting poll_scheduled leaves + * a small window for psi_group_change to sneak in and schedule + * an immediate poll_work before we get to rescheduling. One + * potential extra wakeup at the end of the polling window + * should be negligible and polling_next_update still keeps + * updates correctly on schedule. + */ + atomic_set(&group->poll_scheduled, 0); + /* + * A task change can race with the poll worker that is supposed to + * report on it. To avoid missing events, ensure ordering between + * poll_scheduled and the task state accesses, such that if the poll + * worker misses the state update, the task change is guaranteed to + * reschedule the poll worker: + * + * poll worker: + * atomic_set(poll_scheduled, 0) + * smp_mb() + * LOAD states + * + * task change: + * STORE states + * if atomic_xchg(poll_scheduled, 1) == 0: + * schedule poll worker + * + * The atomic_xchg() implies a full barrier.
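The ordering described in the comment above maps directly onto C11 atomics; a minimal userspace rendering of the two halves (state collapsed to a single int, whereas the kernel reads per-CPU seqcount-protected records):

```c
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int poll_scheduled;
static atomic_int task_state;	/* stand-in for the per-CPU PSI state */

/* Poll worker side: reset the flag, then (after a full fence) read state. */
static int poll_worker_window_end(void)
{
	atomic_store(&poll_scheduled, 0);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	return atomic_load(&task_state);
}

/* Task-change side: publish the state, then try to claim the schedule.
 * atomic_exchange is a full barrier, so if the worker's load missed our
 * store, our exchange must observe the worker's reset and we reschedule.
 */
static bool task_change(int new_state)
{
	atomic_store(&task_state, new_state);
	if (atomic_exchange(&poll_scheduled, 1) == 0)
		return true;	/* we own (re)scheduling the poll worker */
	return false;		/* already scheduled; worker will see the state */
}

int main(void)
{
	task_change(1);
	return poll_worker_window_end() ? 0 : 1;
}
```

Either the worker sees the new state, or the task change wins the exchange and reschedules the worker; the full barriers rule out the case where both sides miss each other.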
+ */ + smp_mb(); + } else { + /* Polling window is not over, keep rescheduling */ + force_reschedule = true; + } + + collect_percpu_times(group, PSI_POLL, &changed_states); if (changed_states & group->poll_states) { @@ -627,7 +667,8 @@ static void psi_poll_work(struct psi_group *group) group->polling_next_update = update_triggers(group, now); psi_schedule_poll_work(group, - nsecs_to_jiffies(group->polling_next_update - now) + 1); + nsecs_to_jiffies(group->polling_next_update - now) + 1, + force_reschedule); out: mutex_unlock(&group->trigger_lock); @@ -752,7 +793,7 @@ static void psi_group_change(struct psi_group *group, int cpu, write_seqcount_end(&groupc->seq); if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + psi_schedule_poll_work(group, 1, false); if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); @@ -1218,6 +1259,7 @@ void psi_trigger_destroy(struct psi_trigger *t) * can no longer be found through group->poll_task. */ kthread_stop(task_to_destroy); + atomic_set(&group->poll_scheduled, 0); } kfree(t); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index add67f811e00..0ededdea98b2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,8 @@ #include "pelt.h" +#include + int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ @@ -984,6 +986,13 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (likely(rt_b->rt_runtime)) { rt_rq->rt_throttled = 1; printk_deferred_once("sched: RT throttling activated\n"); + + trace_android_vh_dump_throttled_rt_tasks( + raw_smp_processor_id(), + rq_clock(rq_of_rt_rq(rt_rq)), + sched_rt_period(rt_rq), + runtime, + hrtimer_get_expires_ns(&rt_b->rt_period_timer)); } else { /* * In case we did anyway, make it go away, @@ -1384,6 +1393,27 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) enqueue_top_rt_rq(&rq->rt); } +#ifdef CONFIG_SMP +static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p, + bool sync) +{ + /* + * If the waker is CFS, then an RT sync wakeup would preempt the waker + * and force it to run for a likely small time after the RT wakee is + * done. So, only honor RT sync wakeups from RT wakers. + */ + return sync && task_has_rt_policy(rq->curr) && + p->prio <= rq->rt.highest_prio.next && + rq->rt.rt_nr_running <= 2; +} +#else +static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p, + bool sync) +{ + return 0; +} +#endif + /* * Adding/removing a task to/from a priority array: */ @@ -1391,13 +1421,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; + bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC); if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1 && + !should_honor_rt_sync(rq, p, sync)) enqueue_pushable_task(rq, p); } @@ -1448,12 +1480,43 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); +#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION +/* + * Return whether the task on the given cpu is currently non-preemptible + * while handling a potentially long softint, or if the task is likely + * to block preemptions soon because it is a ksoftirq thread that is + * handling slow softints. 
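should_honor_rt_sync(), added a few hunks up, accepts a sync wakeup on the waker's CPU only when the waker itself is RT, the wakee is at least as important as the next queued RT task, and the RT queue is nearly empty. A standalone restatement of the predicate (field names shortened; lower prio value means more important, per kernel convention):

```c
#include <stdbool.h>
#include <stdio.h>

struct rt_rq_view {
	bool curr_is_rt;	/* task_has_rt_policy(rq->curr) */
	int  highest_prio_next;	/* rq->rt.highest_prio.next */
	int  rt_nr_running;	/* rq->rt.rt_nr_running */
};

static bool should_honor_rt_sync(const struct rt_rq_view *rq,
				 int wakee_prio, bool sync)
{
	/* A CFS waker would be preempted by an RT sync wakee and then run
	 * only briefly once the wakee is done, so only RT wakers qualify.
	 */
	return sync && rq->curr_is_rt &&
	       wakee_prio <= rq->highest_prio_next &&
	       rq->rt_nr_running <= 2;
}

int main(void)
{
	struct rt_rq_view rq = {
		.curr_is_rt = true, .highest_prio_next = 50, .rt_nr_running = 1,
	};

	printf("%d\n", should_honor_rt_sync(&rq, 40, true));	/* 1 */
	printf("%d\n", should_honor_rt_sync(&rq, 60, true));	/* 0 */
	return 0;
}
```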
+ */ +bool +task_may_not_preempt(struct task_struct *task, int cpu) +{ + __u32 softirqs = per_cpu(active_softirqs, cpu) | + local_softirq_pending(); + + struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu); + return ((softirqs & LONG_SOFTIRQ_MASK) && + (task == cpu_ksoftirqd || + task_thread_info(task)->preempt_count & SOFTIRQ_MASK)); +} +EXPORT_SYMBOL_GPL(task_may_not_preempt); +#endif /* CONFIG_RT_SOFTINT_OPTIMIZATION */ + static int select_task_rq_rt(struct task_struct *p, int cpu, int flags) { struct task_struct *curr; struct rq *rq; + struct rq *this_cpu_rq; bool test; + int target_cpu = -1; + bool may_not_preempt; + bool sync = !!(flags & WF_SYNC); + int this_cpu; + + trace_android_rvh_select_task_rq_rt(p, cpu, flags & 0xF, + flags, &target_cpu); + if (target_cpu >= 0) + return target_cpu; /* For anything but wake ups, just return the task_cpu */ if (!(flags & (WF_TTWU | WF_FORK))) @@ -1463,9 +1526,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + this_cpu = smp_processor_id(); + this_cpu_rq = cpu_rq(this_cpu); /* - * If the current task on @p's runqueue is an RT task, then + * If the current task on @p's runqueue is a softirq task, + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -1490,9 +1560,19 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * requirement of the task - which is only important on heterogeneous * systems like big.LITTLE. */ - test = curr && - unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + may_not_preempt = task_may_not_preempt(curr, cpu); + test = (curr && (may_not_preempt || + (unlikely(rt_task(curr)) && + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio)))); + + /* + * Respect the sync flag as long as the task can run on this CPU. + */ + if (should_honor_rt_sync(this_cpu_rq, p, sync) && + cpumask_test_cpu(this_cpu, p->cpus_ptr)) { + cpu = this_cpu; + goto out_unlock; + } if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); @@ -1505,11 +1585,14 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) goto out_unlock; /* - * Don't bother moving it if the destination CPU is + * If cpu is non-preemptible, prefer remote cpu + * even if it's running a higher-prio task. + * Otherwise: Don't bother moving it if the destination CPU is * not running a lower priority task. */ if (target != -1 && - p->prio < cpu_rq(target)->rt.highest_prio.curr) + (may_not_preempt || + p->prio < cpu_rq(target)->rt.highest_prio.curr)) cpu = target; } @@ -1550,6 +1633,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + int done = 0; + /* * This is OK, because current is on_cpu, which avoids it being * picked for load-balance and preemption/IRQs are still @@ -1557,7 +1642,9 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * not yet started the picking loop. 
*/ rq_unpin_lock(rq, rf); - pull_rt_task(rq); + trace_android_rvh_sched_balance_rt(rq, p, &done); + if (!done) + pull_rt_task(rq); rq_repin_lock(rq, rf); } @@ -1610,6 +1697,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f */ if (rq->curr->sched_class != &rt_sched_class) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + trace_android_rvh_update_rt_rq_load_avg(rq_clock_pelt(rq), rq, p, 0); rt_queue_push_tasks(rq); } @@ -1672,6 +1760,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) update_curr_rt(rq); update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); + trace_android_rvh_update_rt_rq_load_avg(rq_clock_pelt(rq), rq, p, 1); /* * The previous task needs to be made eligible for pushing @@ -1699,7 +1788,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) * Return the highest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise */ -static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) +struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { struct plist_head *head = &rq->rt.pushable_tasks; struct task_struct *p; @@ -1714,6 +1803,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; } +EXPORT_SYMBOL_GPL(pick_highest_pushable_task); static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); @@ -1722,7 +1812,7 @@ static int find_lowest_rq(struct task_struct *task) struct sched_domain *sd; struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); - int cpu = task_cpu(task); + int cpu = -1; int ret; /* Make sure the mask is initialized first */ @@ -1747,9 +1837,15 @@ static int find_lowest_rq(struct task_struct *task) task, lowest_mask); } + trace_android_rvh_find_lowest_rq(task, lowest_mask, ret, &cpu); + if (cpu >= 0) + return cpu; + if (!ret) return -1; /* No targets found */ + cpu = task_cpu(task); + /* * At this point we have built a mask of CPUs representing the * lowest priority tasks in the system. Now we want to elect @@ -2081,6 +2177,9 @@ static int rto_next_cpu(struct root_domain *rd) /* When rto_cpu is -1 this acts like cpumask_first() */ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); + /* this will be any CPU in the rd->rto_mask, and can be a halted cpu update it */ + trace_android_rvh_rto_next_cpu(rd->rto_cpu, rd->rto_mask, &cpu); + rd->rto_cpu = cpu; if (cpu < nr_cpu_ids) @@ -2478,6 +2577,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) update_curr_rt(rq); update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); + trace_android_rvh_update_rt_rq_load_avg(rq_clock_pelt(rq), rq, p, 1); watchdog(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e1f46ed412bc..683429f6a0f5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -67,6 +67,8 @@ #include #include #include +#include +#include #include @@ -435,8 +437,16 @@ struct task_group { struct uclamp_se uclamp_req[UCLAMP_CNT]; /* Effective clamp values used for a task group */ struct uclamp_se uclamp[UCLAMP_CNT]; + /* Latency-sensitive flag used for a task group */ + unsigned int latency_sensitive; + + ANDROID_VENDOR_DATA_ARRAY(1, 4); #endif + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -855,6 +865,13 @@ struct root_domain { * CPUs of the rd. Protected by RCU. 
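The ANDROID_VENDOR_DATA_ARRAY and ANDROID_KABI_RESERVE members added to task_group, root_domain and rq in this area pad the structs so that later vendor or LTS changes can claim space without moving the offsets of existing fields (the GKI frozen-ABI scheme). A sketch of the underlying idea, assuming the macros expand to plain u64 slots roughly as in include/linux/android_kabi.h and android_vendor.h (the exact expansions are not shown in this patch):

```c
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Approximation of the Android macros: each reserve is one u64 slot,
 * vendor data is an array of u64s.
 */
#define ANDROID_KABI_RESERVE(n)		uint64_t android_kabi_reserved##n
#define ANDROID_VENDOR_DATA_ARRAY(n, s)	uint64_t android_vendor_data##n[s]

struct example {
	int real_field;
	ANDROID_VENDOR_DATA_ARRAY(1, 4);
	ANDROID_KABI_RESERVE(1);
	ANDROID_KABI_RESERVE(2);
};

int main(void)
{
	/* Offsets (and sizeof) stay fixed even if a later patch starts
	 * using a reserved slot, so out-of-tree modules keep working.
	 */
	printf("sizeof=%zu reserved1@%zu\n", sizeof(struct example),
	       offsetof(struct example, android_kabi_reserved1));
	return 0;
}
```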
*/ struct perf_domain __rcu *pd; + + ANDROID_VENDOR_DATA_ARRAY(1, 4); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; extern void init_defrootdomain(void); @@ -866,6 +883,7 @@ extern void sched_put_rd(struct root_domain *rd); #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif +extern struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu); #endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK @@ -983,6 +1001,7 @@ struct rq { u64 clock; /* Ensure that all clocks are in the same cache line */ u64 clock_task ____cacheline_aligned; + u64 clock_task_mult; u64 clock_pelt; unsigned long lost_idle_time; @@ -1110,6 +1129,14 @@ struct rq { unsigned char core_forceidle; unsigned int core_forceidle_seq; #endif + + ANDROID_VENDOR_DATA_ARRAY(1, 96); + ANDROID_OEM_DATA_ARRAY(1, 16); + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1479,6 +1506,14 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +static inline u64 rq_clock_task_mult(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_task_mult; +} + /** * By default the decay is the default pelt decay period. * The decay shift can change the decay period in @@ -1526,6 +1561,11 @@ struct rq_flags { #endif }; +#ifdef CONFIG_SMP +extern struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int dest_cpu); +#endif + extern struct callback_head balance_push_callback; /* @@ -1698,8 +1738,6 @@ enum numa_faults_stats { }; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); -extern int migrate_swap(struct task_struct *p, struct task_struct *t, - int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); #else static inline void @@ -1710,6 +1748,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); + static inline void queue_balance_callback(struct rq *rq, struct callback_head *head, @@ -1984,6 +2025,8 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ #undef SCHED_FEAT extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; +extern const char * const sched_feat_names[__SCHED_FEAT_NR]; + #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !CONFIG_JUMP_LABEL */ @@ -2058,6 +2101,8 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_ANDROID_VENDOR 0x1000 /* Vendor specific for Android */ + #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); static_assert(WF_FORK == SD_BALANCE_FORK); @@ -2116,6 +2161,8 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_MIGRATED 0x00 #endif +#define ENQUEUE_WAKEUP_SYNC 0x80 + #define RETRY_TASK ((void *)-1UL) struct sched_class { @@ -2284,6 +2331,7 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); +extern unsigned long __read_mostly max_load_balance_interval; #endif #ifdef CONFIG_CPU_IDLE @@ -2908,6 +2956,11 @@ out: return clamp(util, min_util, max_util); } +static inline bool uclamp_boosted(struct 
task_struct *p) +{ + return uclamp_eff_value(p, UCLAMP_MIN) > 0; +} + /* * When uclamp is compiled in, the aggregation at rq level is 'turned off' * by default in the fast path and only gets turned on once userspace performs @@ -2937,12 +2990,36 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, return util; } +static inline bool uclamp_boosted(struct task_struct *p) +{ + return false; +} + static inline bool uclamp_is_used(void) { return false; } #endif /* CONFIG_UCLAMP_TASK */ +#ifdef CONFIG_UCLAMP_TASK_GROUP +static inline bool uclamp_latency_sensitive(struct task_struct *p) +{ + struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id); + struct task_group *tg; + + if (!css) + return false; + tg = container_of(css, struct task_group, css); + + return tg->latency_sensitive; +} +#else +static inline bool uclamp_latency_sensitive(struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true @@ -3102,3 +3179,14 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +/* + * task_may_not_preempt - check whether a task may not be preemptible soon + */ +#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION +extern bool task_may_not_preempt(struct task_struct *task, int cpu); +#else +static inline bool task_may_not_preempt(struct task_struct *task, int cpu) +{ + return false; +} +#endif /* CONFIG_RT_SOFTINT_OPTIMIZATION */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4e8698e62f07..bceebda44f1f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -4,6 +4,8 @@ */ #include "sched.h" +#include + DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ @@ -327,8 +329,7 @@ static void sched_energy_set(bool has_eas) * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. no SMT is detected. * 4. the EM complexity is low enough to keep scheduling overheads low; - * 5. schedutil is driving the frequency of all CPUs of the rd; - * 6. frequency invariance support is present; + * 5. frequency invariance support is present; * * The complexity of the Energy Model is defined as: * @@ -348,21 +349,23 @@ static void sched_energy_set(bool has_eas) */ #define EM_MAX_COMPLEXITY 2048 -extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; + bool eas_check = false; if (!sysctl_sched_energy_aware) goto free; - /* EAS is enabled for asymmetric CPU capacity topologies. */ - if (!per_cpu(sd_asym_cpucapacity, cpu)) { + /* + * EAS is enabled for asymmetric CPU capacity topologies. + * Allow vendor to override if desired. + */ + trace_android_rvh_build_perf_domains(&eas_check); + if (!per_cpu(sd_asym_cpucapacity, cpu) && !eas_check) { if (sched_debug()) { pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", cpumask_pr_args(cpu_map)); @@ -390,19 +393,6 @@ static bool build_perf_domains(const struct cpumask *cpu_map) if (find_pd(pd, i)) continue; - /* Do not attempt EAS if schedutil is not being used. 
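uclamp_latency_sensitive() and uclamp_boosted() above are the predicates behind the placement special case added to find_energy_efficient_cpu() earlier: latency-sensitive tasks bypass the energy comparison and take the best idle CPU, where boosted tasks chase the largest capacity, unboosted tasks the smallest, and a shallower idle-state exit latency breaks capacity ties. A condensed restatement of that selection loop (capacities and latencies invented for the demo):

```c
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static unsigned long cap[NR_CPUS]      = { 512, 512, 1024, 1024 };
static unsigned int  exit_lat[NR_CPUS] = { 100, 50, 200, 150 };	/* us */
static bool          is_idle[NR_CPUS]  = { true, true, false, true };

static int best_idle_cpu(bool boosted)
{
	unsigned long target_cap = boosted ? 0 : ~0ul;
	unsigned int min_exit_lat = ~0u;
	int best = -1;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!is_idle[cpu])
			continue;
		/* Boosted: chase bigger CPUs; unboosted: chase smaller. */
		if (boosted && cap[cpu] < target_cap)
			continue;
		if (!boosted && cap[cpu] > target_cap)
			continue;
		/* Same capacity: prefer the shallowest idle state. */
		if (exit_lat[cpu] > min_exit_lat && cap[cpu] == target_cap)
			continue;

		min_exit_lat = exit_lat[cpu];
		target_cap = cap[cpu];
		best = cpu;
	}
	return best;
}

int main(void)
{
	printf("boosted -> cpu %d, unboosted -> cpu %d\n",
	       best_idle_cpu(true), best_idle_cpu(false));
	return 0;
}
```

With this data the boosted task lands on the big idle CPU 3, while the unboosted one takes little CPU 1, whose idle state is shallower than CPU 0's.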
*/ - policy = cpufreq_cpu_get(i); - if (!policy) - goto free; - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (rd->pd) - pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_map)); - goto free; - } - /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) @@ -2259,6 +2249,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } + trace_android_vh_build_sched_domains(has_asym); ret = 0; error: diff --git a/kernel/sched/vendor_hooks.c b/kernel/sched/vendor_hooks.c new file mode 100644 index 000000000000..4b79d6eff596 --- /dev/null +++ b/kernel/sched/vendor_hooks.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* vendor_hook.c + * + * Copyright 2022 Google LLC + */ +#include "sched.h" +#include "pelt.h" +#include "smp.h" + +#define CREATE_TRACE_POINTS +#include +#include +#include +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_select_task_rq_fair); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_select_task_rq_rt); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_select_task_rq_dl); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_select_fallback_rq); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_scheduler_tick); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_enqueue_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_dequeue_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_can_migrate_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_find_lowest_rq); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_prepare_prio_fork); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_finish_prio_fork); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_rtmutex_prepare_setprio); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_rto_next_cpu); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_is_cpu_allowed); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_get_nohz_timer_target); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_user_nice); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_setscheduler); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_find_busiest_group); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dump_throttled_rt_tasks); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_jiffies_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_newidle_balance); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_nohz_balancer_kick); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_rebalance_domains); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_find_busiest_queue); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_migrate_queued_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_find_energy_efficient_cpu); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_sugov_sched_attr); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_iowait); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_sugov_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_overutilized); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_setaffinity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_getaffinity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_cpus_allowed); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_task_cpu); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_try_to_wake_up); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_try_to_wake_up_success); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_fork); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_wake_up_new_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_new_task_stats); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_flush_task); 
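Everything this file exports is a restricted vendor hook: a tracepoint-shaped attachment point that a vendor module binds once at load time. A sketch of how such a module might attach, assuming the register_trace_android_rvh_* helpers that the DECLARE_RESTRICTED_HOOK machinery normally generates; the handler signature is inferred from the find_lowest_rq() call site, and the policy shown is purely hypothetical:

```c
// SPDX-License-Identifier: GPL-2.0-only
/* Hypothetical vendor module attaching to one restricted hook. */
#include <linux/module.h>
#include <trace/hooks/sched.h>

/* Mirrors the call in find_lowest_rq(): writing a CPU >= 0 into
 * *lowest_cpu overrides the kernel's own choice.
 */
static void vendor_find_lowest_rq(void *data, struct task_struct *p,
				  struct cpumask *lowest_mask,
				  int ret, int *lowest_cpu)
{
	if (ret && cpumask_test_cpu(0, lowest_mask))
		*lowest_cpu = 0;	/* toy policy: pin RT pushes to CPU0 */
}

static int __init vendor_sched_init(void)
{
	/* Restricted hooks cannot be unregistered, hence module init only. */
	return register_trace_android_rvh_find_lowest_rq(vendor_find_lowest_rq,
							 NULL);
}
module_init(vendor_sched_init);
MODULE_LICENSE("GPL");
```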
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tick_entry); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_schedule); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_cpu_starting); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_cpu_dying); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_account_irq); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_account_irq_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_account_irq_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_place_entity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_build_perf_domains); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_cpu_capacity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_misfit_status); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_fork_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ttwu_cond); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_schedule_bug); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_exec); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_map_util_freq); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_em_cpu_energy); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_build_sched_domains); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_check_preempt_tick); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_check_preempt_wakeup_ignore); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_replace_next_task_fair); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_balance_rt); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_pick_next_entity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_check_preempt_wakeup); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_cpus_allowed_ptr_locked); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sched_yield); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_free_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_irqtime_account_process_tick); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_uclamp_eff_get); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_after_enqueue_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_after_dequeue_task); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_enqueue_entity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_dequeue_entity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_entity_tick); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_enqueue_task_fair); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_dequeue_task_fair); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_em_dev_register_pd); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_post_init_entity_util_avg); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_find_new_ilb); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_util_est_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_setscheduler_uclamp); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_wake_up_sync); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_wake_flags); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_effective_cpu_util); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_force_compatible_pre); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_force_compatible_post); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dup_task_struct); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_account_task_time); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_attach_entity_load_avg); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_detach_entity_load_avg); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_load_avg); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_remove_entity_load_avg); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_blocked_fair); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_rt_rq_load_avg); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index eca38107b32f..7f4c5e0aad3e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -5,6 +5,7 @@ * (C) 2004 Nadia Yvette Chambers, Oracle */ #include "sched.h" 
+#include void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { @@ -199,10 +200,13 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { + int wake_flags = WF_SYNC; + if (unlikely(!wq_head)) return; - __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key); + trace_android_vh_set_wake_flags(&wake_flags, &mode); + __wake_up_common_lock(wq_head, mode, 1, wake_flags, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); @@ -411,7 +415,8 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) +__sched int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); @@ -447,7 +452,7 @@ static inline bool is_kthread_should_stop(void) * } smp_mb(); // C * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN; */ -long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) +__sched long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout) { /* * The below executes an smp_mb(), which matches with the full barrier @@ -472,7 +477,8 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) +__sched int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *key) { /* Pairs with the smp_store_mb() in wait_woken(). */ smp_mb(); /* C */ diff --git a/kernel/scs.c b/kernel/scs.c index 579841be8864..b7e1b096d906 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -32,15 +32,19 @@ static void *__scs_alloc(int node) for (i = 0; i < NR_CACHED_SCS; i++) { s = this_cpu_xchg(scs_cache[i], NULL); if (s) { - kasan_unpoison_vmalloc(s, SCS_SIZE); + s = kasan_unpoison_vmalloc(s, SCS_SIZE, + KASAN_VMALLOC_PROT_NORMAL); memset(s, 0, SCS_SIZE); - return s; + goto out; } } - return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, + s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_SCS, PAGE_KERNEL, 0, node, __builtin_return_address(0)); + +out: + return kasan_reset_tag(s); } void *scs_alloc(int node) @@ -78,7 +82,7 @@ void scs_free(void *s) if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) return; - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL); vfree_atomic(s); } diff --git a/kernel/signal.c b/kernel/signal.c index c7dbb19219b9..37fad322283a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -56,6 +56,8 @@ #include #include /* for syscall_get_* */ +#undef CREATE_TRACE_POINTS +#include /* * SLAB caches for signal bits. 
*/ @@ -1289,7 +1291,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p { unsigned long flags; int ret = -ESRCH; - + trace_android_vh_do_send_sig_info(sig, current, p); if (lock_task_sighand(p, &flags)) { ret = send_signal(sig, info, p, type); unlock_task_sighand(p, &flags); diff --git a/kernel/softirq.c b/kernel/softirq.c index 322b65d45676..1de7823db47f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -33,6 +33,13 @@ #define CREATE_TRACE_POINTS #include +EXPORT_TRACEPOINT_SYMBOL_GPL(irq_handler_entry); +EXPORT_TRACEPOINT_SYMBOL_GPL(irq_handler_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(softirq_entry); +EXPORT_TRACEPOINT_SYMBOL_GPL(softirq_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(tasklet_entry); +EXPORT_TRACEPOINT_SYMBOL_GPL(tasklet_exit); + /* - No shared variables, all the data are CPU local. - If a softirq needs serialization, let it serialize itself @@ -59,6 +66,14 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +EXPORT_PER_CPU_SYMBOL_GPL(ksoftirqd); + +/* + * active_softirqs -- per cpu, a mask of softirqs that are being handled, + * with the expectation that approximate answers are acceptable and therefore + * no synchronization. + */ +DEFINE_PER_CPU(__u32, active_softirqs); const char * const softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", @@ -80,21 +95,6 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -/* - * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness, - * unless we're doing some of the synchronous softirqs. - */ -#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) -static bool ksoftirqd_running(unsigned long pending) -{ - struct task_struct *tsk = __this_cpu_read(ksoftirqd); - - if (pending & SOFTIRQ_NOW_MASK) - return false; - return tsk && task_is_running(tsk) && !__kthread_should_park(tsk); -} - #ifdef CONFIG_TRACE_IRQFLAGS DEFINE_PER_CPU(int, hardirqs_enabled); DEFINE_PER_CPU(int, hardirq_context); @@ -236,7 +236,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) goto out; pending = local_softirq_pending(); - if (!pending || ksoftirqd_running(pending)) + if (!pending) goto out; /* @@ -419,9 +419,6 @@ static inline bool should_wake_ksoftirqd(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running(local_softirq_pending())) - return; - if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* @@ -455,7 +452,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running(pending)) + if (pending) do_softirq_own_stack(); local_irq_restore(flags); @@ -512,6 +509,17 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif +static inline __u32 softirq_deferred_for_rt(__u32 *pending) +{ + __u32 deferred = 0; + + if (cpupri_check_rt()) { + deferred = *pending & LONG_SOFTIRQ_MASK; + *pending &= ~LONG_SOFTIRQ_MASK; + } + return deferred; +} + asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; @@ -519,6 +527,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) int max_restart = MAX_SOFTIRQ_RESTART; struct softirq_action *h; bool in_hardirq; + __u32 deferred; __u32 pending; int softirq_bit; @@ 
-531,13 +540,15 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) pending = local_softirq_pending(); + deferred = softirq_deferred_for_rt(&pending); softirq_handle_begin(); in_hardirq = lockdep_softirq_start(); account_softirq_enter(current); restart: /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); + set_softirq_pending(deferred); + __this_cpu_write(active_softirqs, pending); local_irq_enable(); @@ -567,6 +578,7 @@ restart: pending >>= softirq_bit; } + __this_cpu_write(active_softirqs, 0); if (!IS_ENABLED(CONFIG_PREEMPT_RT) && __this_cpu_read(ksoftirqd) == current) rcu_softirq_qs(); @@ -574,14 +586,17 @@ restart: local_irq_disable(); pending = local_softirq_pending(); + deferred = softirq_deferred_for_rt(&pending); + if (pending) { if (time_before(jiffies, end) && !need_resched() && --max_restart) goto restart; - - wakeup_softirqd(); } + if (pending | deferred) + wakeup_softirqd(); + account_softirq_exit(current); lockdep_softirq_end(in_hardirq); softirq_handle_end(); @@ -779,10 +794,15 @@ static void tasklet_action_common(struct softirq_action *a, if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { if (tasklet_clear_sched(t)) { - if (t->use_callback) + if (t->use_callback) { + trace_tasklet_entry(t->callback); t->callback(t); - else + trace_tasklet_exit(t->callback); + } else { + trace_tasklet_entry(t->func); t->func(t->data); + trace_tasklet_exit(t->func); + } } tasklet_unlock(t); continue; diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9c625257023d..6a2fe3490931 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, put_task_stack(tsk); return c.len; } +EXPORT_SYMBOL_GPL(stack_trace_save_tsk); /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array @@ -174,6 +175,7 @@ unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, arch_stack_walk(consume_entry, &c, current, regs); return c.len; } +EXPORT_SYMBOL_GPL(stack_trace_save_regs); #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE /** diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index cbc30271ea4d..ac3fc7f9dd91 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -152,6 +152,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) wait_for_completion(&done.completion); return done.ret; } +EXPORT_SYMBOL_GPL(stop_one_cpu); /* This controls the threads on each CPU. 
*/ enum multi_stop_state { @@ -387,6 +388,7 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; return cpu_stop_queue_work(cpu, work_buf); } +EXPORT_SYMBOL_GPL(stop_one_cpu_nowait); static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, diff --git a/kernel/sys.c b/kernel/sys.c index 2d2bc6396515..1463a77d2dee 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -74,6 +75,8 @@ #include "uid16.h" +#include + #ifndef SET_UNALIGN_CTL # define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif @@ -116,6 +119,12 @@ #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif +#ifndef SME_SET_VL +# define SME_SET_VL(a) (-EINVAL) +#endif +#ifndef SME_GET_VL +# define SME_GET_VL() (-EINVAL) +#endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif @@ -2271,6 +2280,70 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 256 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int prctl_set_vma(unsigned long opt, unsigned long addr, + unsigned long size, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + const char __user *uname; + struct anon_vma_name *anon_name = NULL; + int error; + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + uname = (const char __user *)arg; + if (uname) { + char *name, *pch; + + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + /* anon_vma has its own copy */ + anon_name = anon_vma_name_alloc(name); + kfree(name); + if (!anon_name) + return -ENOMEM; + + } + + mmap_write_lock(mm); + error = madvise_set_anon_name(mm, addr, size, anon_name); + mmap_write_unlock(mm); + anon_vma_name_put(anon_name); + break; + default: + error = -EINVAL; + } + + return error; +} + +#else /* CONFIG_ANON_VMA_NAME */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long size, unsigned long arg) +{ + return -EINVAL; +} +#endif /* CONFIG_ANON_VMA_NAME */ + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2473,6 +2546,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SVE_GET_VL: error = SVE_GET_VL(); break; + case PR_SME_SET_VL: + error = SME_SET_VL(arg2); + break; + case PR_SME_GET_VL: + error = SME_GET_VL(); + break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; @@ -2540,10 +2619,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; } + trace_android_vh_syscall_prctl_finished(option, me); return error; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 928798f89ca1..21acf51976d8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1850,6 +1850,15 @@ static struct ctl_table 
kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, +#ifdef CONFIG_SMP + { + .procname = "sched_pelt_multiplier", + .data = &sysctl_sched_pelt_multiplier, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_pelt_multiplier, + }, +#endif #ifdef CONFIG_UCLAMP_TASK { .procname = "sched_util_clamp_min", diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index b1b9b12899f5..6740dd02a402 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "timekeeping.h" @@ -149,8 +150,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) return HRTIMER_RESTART; } -void __init -sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -224,6 +224,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pS as sched_clock source\n", read); } +EXPORT_SYMBOL_GPL(sched_clock_register); void __init generic_sched_clock_init(void) { @@ -270,6 +271,7 @@ int sched_clock_suspend(void) update_sched_clock(); hrtimer_cancel(&sched_clock_timer); rd->read_sched_clock = suspended_sched_clock_read; + trace_android_vh_show_suspend_epoch_val(rd->epoch_ns, rd->epoch_cyc); return 0; } @@ -281,6 +283,7 @@ void sched_clock_resume(void) rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); rd->read_sched_clock = cd.actual_read_sched_clock; + trace_android_vh_show_resume_epoch_val(rd->epoch_cyc); } static struct syscore_ops sched_clock_ops = { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 46789356f856..5c47cc50f9e9 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -95,6 +96,7 @@ static void tick_periodic(int cpu) write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); update_wall_time(); + trace_android_vh_jiffies_update(NULL); } update_process_times(user_mode(get_irq_regs())); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9c6f661fb436..6041ee08896c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -193,8 +194,10 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) #endif /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) + if (tick_do_timer_cpu == cpu) { tick_do_update_jiffies64(now); + trace_android_vh_jiffies_update(NULL); + } if (ts->inidle) ts->got_idle_tick = 1; @@ -1201,6 +1204,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) return ktime_sub(next_event, now); } +EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length); /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value @@ -1214,6 +1218,7 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) return ts->idle_calls; } +EXPORT_SYMBOL_GPL(tick_nohz_get_idle_calls_cpu); /** * tick_nohz_get_idle_calls - return the current idle calls counter value diff --git a/kernel/time/time.c b/kernel/time/time.c index 29923b20e0e4..f2e92bddb4f5 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,7 +449,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) 
+struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); struct __kernel_old_timeval tv; @@ -503,7 +503,7 @@ EXPORT_SYMBOL(set_normalized_timespec64); * * Returns the timespec64 representation of the nsec parameter. */ -struct timespec64 ns_to_timespec64(const s64 nsec) +struct timespec64 ns_to_timespec64(s64 nsec) { struct timespec64 ts = { 0, 0 }; s32 rem; @@ -686,6 +686,7 @@ u64 nsec_to_clock_t(u64 x) return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); #endif } +EXPORT_SYMBOL_GPL(nsec_to_clock_t); u64 jiffies64_to_nsecs(u64 j) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d6a0ff68df41..476898e15b8c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "tick-internal.h" #include "ntp_internal.h" @@ -1038,9 +1039,11 @@ noinstr time64_t __ktime_get_real_seconds(void) void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; + u32 mono_mult, mono_shift; unsigned int seq; ktime_t base_raw; ktime_t base_real; + ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; @@ -1055,14 +1058,21 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); + base_boot = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + mono_mult = tk->tkr_mono.mult; + mono_shift = tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); + systime_snapshot->mono_shift = mono_shift; + systime_snapshot->mono_mult = mono_mult; } EXPORT_SYMBOL_GPL(ktime_get_snapshot); @@ -1324,6 +1334,8 @@ out: write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + trace_android_rvh_tk_based_time_sync(tk); + /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9dd2a39cb3b0..8c62e3814d9f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -55,6 +55,11 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(hrtimer_expire_entry); +EXPORT_TRACEPOINT_SYMBOL_GPL(hrtimer_expire_exit); __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; @@ -502,6 +507,7 @@ static inline unsigned calc_index(unsigned long expires, unsigned lvl, * * Round up with level granularity to prevent this. */ + trace_android_vh_timer_calc_index(lvl, &expires); expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4265d125d50f..ab9267980031 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -95,6 +95,13 @@ config RING_BUFFER_ALLOW_SWAP Allow the use of ring_buffer_swap_cpu. Adds a very slight overhead to tracing when enabled. 
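For context on the kernel/sys.c hunk above: PR_SET_VMA with PR_SET_VMA_ANON_NAME lets userspace label an anonymous mapping, and the label then appears as [anon:<name>] in /proc/<pid>/maps. A minimal userspace sketch; the two constants match the uapi <linux/prctl.h> values, and the fallback defines are only for building against older headers:

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41	/* "SVMA" */
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* After this, /proc/self/maps shows the region as [anon:my-heap]. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, (unsigned long)"my-heap")) {
		perror("prctl(PR_SET_VMA)");
		return 1;
	}
	return 0;
}

Note that the name must be printable ASCII, must avoid the characters rejected by is_valid_name_char() above, and is bounded by ANON_VMA_NAME_MAX_LEN.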
+config TRACE_MMIO_ACCESS + bool "Register read/write tracing" + depends on TRACING && ARCH_HAVE_TRACE_MMIO_ACCESS + help + Create tracepoints for MMIO read/write operations. These trace events + can be used for logging all MMIO read/write operations. + config PREEMPTIRQ_TRACEPOINTS bool depends on TRACE_PREEMPT_TOGGLE || TRACE_IRQFLAGS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bedc5caceec7..a3d16e1a5abd 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -99,5 +99,6 @@ obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +obj-$(CONFIG_TRACE_MMIO_ACCESS) += trace_readwrite.o libftrace-y := ftrace.o diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 21bb161c2316..3ca551c11ae1 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -18,4 +18,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); - +EXPORT_TRACEPOINT_SYMBOL_GPL(device_pm_callback_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(device_pm_callback_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(clock_set_rate); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ffc8696e6746..4d5ce7f6c578 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5,6 +5,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -124,23 +125,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -283,10 +267,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -303,12 +283,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer @@ -356,18 +330,6 @@ static void free_buffer_page(struct buffer_page *bpage) kfree(bpage); } -/* - * We need to fit the time_stamp delta into 27 bits. 
- */ -static inline int test_time_stamp(u64 delta) -{ - if (delta & TS_DELTA_TEST) - return 1; - return 0; -} - -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) - /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) @@ -546,6 +508,8 @@ struct trace_buffer { struct rb_irq_work irq_work; bool time_stamp_abs; + + struct ring_buffer_ext_cb *ext_cb; }; struct ring_buffer_iter { @@ -744,6 +708,16 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) } #endif +static inline bool has_ext_writer(struct trace_buffer *buffer) +{ + return !!buffer->ext_cb; +} + +static inline bool rb_has_ext_writer(struct ring_buffer_per_cpu *cpu_buffer) +{ + return has_ext_writer(cpu_buffer->buffer); +} + /* * Enable this to make sure that the event passed to * ring_buffer_event_time_stamp() is not committed and also @@ -1859,6 +1833,26 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); +struct trace_buffer *ring_buffer_alloc_ext(unsigned long size, + struct ring_buffer_ext_cb *cb) +{ + struct trace_buffer *buffer; + + if (!cb || !cb->update_footers || !cb->swap_reader) + return NULL; + + buffer = ring_buffer_alloc(size, RB_FL_OVERWRITE); + if (!buffer) + return NULL; + + WARN_ON(cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, + &buffer->node)); + buffer->ext_cb = cb; + atomic_set(&buffer->record_disabled, 1); + + return buffer; +} + /** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. @@ -1868,7 +1862,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; - cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (!has_ext_writer(buffer)) + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, + &buffer->node); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); @@ -2137,6 +2133,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, unsigned long nr_pages; int cpu, err; + if (unlikely(has_ext_writer(buffer))) + return -EINVAL; /* * Always succeed at resizing a non-existent buffer: */ @@ -3450,7 +3448,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, return; /* - * If this interrupted another event, + * If this interrupted another event, */ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; @@ -3860,6 +3858,9 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer; int cpu; + if (unlikely(has_ext_writer(buffer))) + return; + /* The event is discarded regardless */ rb_event_discard(event); @@ -4015,6 +4016,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); */ void ring_buffer_record_enable(struct trace_buffer *buffer) { + if (unlikely(has_ext_writer(buffer))) + return; + atomic_dec(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable); @@ -4058,6 +4062,9 @@ void ring_buffer_record_on(struct trace_buffer *buffer) unsigned int rd; unsigned int new_rd; + if (unlikely(has_ext_writer(buffer))) + return; + do { rd = atomic_read(&buffer->record_disabled); new_rd = rd & ~RB_BUFFER_OFF; @@ -4500,49 +4507,119 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; } -static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +static void __set_head_page_flag(struct buffer_page *head, int flag) { - struct buffer_page *reader = NULL; - unsigned long overwrite; - unsigned long flags; - int nr_loops = 0; - int ret; + struct list_head *prev = 
head->list.prev; - local_irq_save(flags); - arch_spin_lock(&cpu_buffer->lock); + prev->next = (struct list_head *)(((unsigned long)prev->next & ~RB_FLAG_MASK) | flag); +} + +static int __read_footer_reader_status(struct buffer_page *bpage) +{ + struct rb_ext_page_footer *footer = rb_ext_page_get_footer(bpage->page); + + return atomic_read(&footer->reader_status); +} + +static int __read_footer_writer_status(struct buffer_page *bpage) +{ + struct rb_ext_page_footer *footer = rb_ext_page_get_footer(bpage->page); + + return atomic_read(&footer->writer_status); +} + +static struct buffer_page * +ring_buffer_search_footer(struct buffer_page *start, unsigned long flag) +{ + bool search_writer = flag == RB_PAGE_FT_COMMIT; + struct buffer_page *bpage = start; + unsigned long status; + int cnt = 0; +again: + do { + status = search_writer ? __read_footer_writer_status(bpage) : + __read_footer_reader_status(bpage); + if (flag & status) + return bpage; + + rb_inc_page(&bpage); + } while (bpage != start); - again: /* - * This should normally only loop twice. But because the - * start of the reader inserts an empty page, it causes - * a case where we will loop three times. There should be no - * reason to loop four times (that I know of). + * There's a chance the writer is in the middle of moving the flag and + * we might not find anything after a first round. Let's try again. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { - reader = NULL; - goto out; + if (cnt++ < 3) + goto again; + + return NULL; +} + +static struct buffer_page * +noinline rb_swap_reader_page_ext(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *new_rb_page, *new_head; + struct rb_ext_page_footer *footer; + unsigned long overrun; + + if (cpu_buffer->buffer->ext_cb->swap_reader(cpu_buffer->cpu)) { + WARN_ON(1); + return NULL; } - reader = cpu_buffer->reader_page; + new_rb_page = cpu_buffer->reader_page; - /* If there's more to read, return this page */ - if (cpu_buffer->reader_page->read < rb_page_size(reader)) - goto out; + /* + * Find what page is the new reader... starting with the latest known + * head. + */ + new_reader = ring_buffer_search_footer(cpu_buffer->head_page, + RB_PAGE_FT_READER); + if (!new_reader) { + WARN_ON(1); + return NULL; + } - /* Never should we have an index greater than the size */ - if (RB_WARN_ON(cpu_buffer, - cpu_buffer->reader_page->read > rb_page_size(reader))) - goto out; + /* ... and install it into the ring buffer in place of the old head */ + rb_list_head_clear(&new_reader->list); + new_rb_page->list.next = new_reader->list.next; + new_rb_page->list.prev = new_reader->list.prev; + new_rb_page->list.next->prev = &new_rb_page->list; + new_rb_page->list.prev->next = &new_rb_page->list; - /* check if we caught up to the tail */ - reader = NULL; - if (cpu_buffer->commit_page == cpu_buffer->reader_page) - goto out; + cpu_buffer->reader_page = new_reader; + cpu_buffer->reader_page->read = 0; - /* Don't bother swapping if the ring buffer is empty */ - if (rb_num_of_entries(cpu_buffer) == 0) - goto out; + /* Install the new head page */ + new_head = new_rb_page; + rb_inc_page(&new_head); + cpu_buffer->head_page = new_head; + + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidentally swap it. 
+ */ + cpu_buffer->pages = &new_head->list; + + __set_head_page_flag(new_head, RB_PAGE_HEAD); + + footer = rb_ext_page_get_footer(new_reader->page); + overrun = footer->stats.overrun; + if (overrun != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overrun - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overrun; + } + + return new_reader; +} + +static struct buffer_page * +rb_swap_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader; + unsigned long overwrite; + int ret; /* * Reset the reader page to size zero. @@ -4558,7 +4635,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ reader = rb_set_head_page(cpu_buffer); if (!reader) - goto out; + return NULL; + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; @@ -4622,7 +4700,60 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->last_overrun = overwrite; } - goto again; + return reader; +} + +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = NULL; + unsigned long flags; + int nr_loops = 0; + unsigned int page_size; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + again: + /* + * This should normally only loop twice. But because the + * start of the reader inserts an empty page, it causes + * a case where we will loop three times. There should be no + * reason to loop four times (that I know of). + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { + reader = NULL; + goto out; + } + + reader = cpu_buffer->reader_page; + + /* If there's more to read, return this page */ + if (cpu_buffer->reader_page->read < rb_page_size(reader)) + goto out; + + page_size = rb_page_size(reader); + /* Never should we have an index greater than the size */ + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > page_size)) + goto out; + + /* check if we caught up to the tail */ + reader = NULL; + if (cpu_buffer->commit_page == cpu_buffer->reader_page) + goto out; + + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + + if (rb_has_ext_writer(cpu_buffer)) + reader = rb_swap_reader_page_ext(cpu_buffer); + else + reader = rb_swap_reader_page(cpu_buffer); + + if (reader) + goto again; out: /* Update the read_stamp on the first event */ @@ -5040,6 +5171,73 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, } EXPORT_SYMBOL_GPL(ring_buffer_consume); +static void ring_buffer_update_view(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct rb_ext_page_footer *footer; + struct buffer_page *bpage; + + if (!rb_has_ext_writer(cpu_buffer)) + return; + + raw_spin_lock_irq(&cpu_buffer->reader_lock); + arch_spin_lock(&cpu_buffer->lock); + + cpu_buffer->buffer->ext_cb->update_footers(cpu_buffer->cpu); + + bpage = cpu_buffer->reader_page; + footer = rb_ext_page_get_footer(bpage->page); + + local_set(&cpu_buffer->entries, footer->stats.entries); + local_set(&cpu_buffer->pages_touched, footer->stats.pages_touched); + local_set(&cpu_buffer->overrun, footer->stats.overrun); + + /* Update the commit page */ + bpage = ring_buffer_search_footer(cpu_buffer->commit_page, + RB_PAGE_FT_COMMIT); + if (!bpage) { + WARN_ON(1); + goto unlock; + } + cpu_buffer->commit_page = bpage; + + /* Update the head page */ + bpage = ring_buffer_search_footer(cpu_buffer->head_page, + RB_PAGE_FT_HEAD); + if (!bpage) { + WARN_ON(1); + goto unlock; + } + + /* Reset the previous 
RB_PAGE_HEAD flag */ + __set_head_page_flag(cpu_buffer->head_page, RB_PAGE_NORMAL); + + /* Set RB_PAGE_HEAD flag pointing to the new head */ + __set_head_page_flag(bpage, RB_PAGE_HEAD); + + cpu_buffer->reader_page->list.next = &cpu_buffer->head_page->list; + + cpu_buffer->head_page = bpage; + +unlock: + arch_spin_unlock(&cpu_buffer->lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); +} + +int ring_buffer_poke(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + ring_buffer_update_view(cpu_buffer); + rb_wakeups(buffer, cpu_buffer); + + return 0; +} + /** * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from @@ -5086,6 +5284,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) atomic_inc(&cpu_buffer->resize_disabled); + ring_buffer_update_view(cpu_buffer); + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); @@ -5446,6 +5646,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, struct ring_buffer_per_cpu *cpu_buffer_b; int ret = -EINVAL; + if (unlikely(has_ext_writer(buffer_a) || has_ext_writer(buffer_b))) + return -EINVAL; + if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; @@ -5746,14 +5949,21 @@ int ring_buffer_read_page(struct trace_buffer *buffer, cpu_buffer->read += rb_page_entries(reader); cpu_buffer->read_bytes += BUF_PAGE_SIZE; - /* swap the pages */ - rb_init_page(bpage); - bpage = reader->page; - reader->page = *data_page; - local_set(&reader->write, 0); - local_set(&reader->entries, 0); - reader->read = 0; - *data_page = bpage; + if (unlikely(has_ext_writer(buffer))) { + u64 commit = local_read(&reader->page->commit); + + memcpy(bpage, reader->page, + BUF_PAGE_HDR_SIZE + commit); + } else { + /* swap the pages */ + rb_init_page(bpage); + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + local_set(&reader->entries, 0); + reader->read = 0; + *data_page = bpage; + } /* * Use the real_end for the data size, @@ -5841,6 +6051,74 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) return 0; } +#define TRACE_BUFFER_PACK_HDR_SIZE offsetof(struct trace_buffer_pack, __data) +#define RING_BUFFER_PACK_HDR_SIZE offsetof(struct ring_buffer_pack, page_va) + +size_t trace_buffer_pack_size(struct trace_buffer *trace_buffer) +{ + size_t size = 0; + int cpu; + + for_each_buffer_cpu(trace_buffer, cpu) { + struct ring_buffer_per_cpu *rb = trace_buffer->buffers[cpu]; + size += rb->nr_pages * sizeof(unsigned long); + size += RING_BUFFER_PACK_HDR_SIZE; + } + + size += TRACE_BUFFER_PACK_HDR_SIZE; + + return size; +} + +int trace_buffer_pack(struct trace_buffer *trace_buffer, + struct trace_buffer_pack *pack) +{ + struct ring_buffer_pack *cpu_pack; + int cpu = -1, pack_cpu, j; + + if (!has_ext_writer(trace_buffer)) + return -EINVAL; + + pack->nr_cpus = cpumask_weight(trace_buffer->cpumask); + pack->total_pages = 0; + + for_each_ring_buffer_pack(cpu_pack, pack_cpu, pack) { + struct ring_buffer_per_cpu *rb; + unsigned long flags, nr_pages; + struct buffer_page *bpage; + + cpu = cpumask_next(cpu, trace_buffer->cpumask); + if (cpu > nr_cpu_ids) { + WARN_ON(1); + break; + } + + rb = trace_buffer->buffers[cpu]; + + local_irq_save(flags); + arch_spin_lock(&rb->lock); + + bpage = rb->head_page; + nr_pages = rb->nr_pages; + + pack->total_pages += nr_pages + 1; + + 
cpu_pack->cpu = cpu; + cpu_pack->reader_page_va = (unsigned long)rb->reader_page->page; + cpu_pack->nr_pages = nr_pages; + + for (j = 0; j < nr_pages; j++) { + cpu_pack->page_va[j] = (unsigned long)bpage->page; + rb_inc_page(&bpage); + } + + arch_spin_unlock(&rb->lock); + local_irq_restore(flags); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* * This is a basic integrity check of the ring buffer. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06ff4dea34d0..e5182cb62b3a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -5646,6 +5647,7 @@ static const char readme_msg[] = #ifdef CONFIG_HIST_TRIGGERS " hist trigger\t- If set, event hits are aggregated into a hash table\n" "\t Format: hist:keys=\n" + "\t [:=[,=...]]\n" "\t [:values=]\n" "\t [:sort=]\n" "\t [:size=#entries]\n" @@ -5657,6 +5659,16 @@ static const char readme_msg[] = "\t common_timestamp - to record current timestamp\n" "\t common_cpu - to record the CPU the event happened on\n" "\n" + "\t A hist trigger variable can be:\n" + "\t - a reference to a field e.g. x=current_timestamp,\n" + "\t - a reference to another variable e.g. y=$x,\n" + "\t - a numeric literal: e.g. ms_per_sec=1000,\n" + "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n" + "\n" + "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" + "\t multiplication(*) and division(/) operators. An operand can be either a\n" + "\t variable reference, field or numeric literal.\n" + "\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" @@ -9808,8 +9820,17 @@ fs_initcall(tracer_init_tracefs); static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { + bool ftrace_check = false; + + trace_android_vh_ftrace_oops_enter(&ftrace_check); + + if (ftrace_check) + return NOTIFY_OK; + if (ftrace_dump_on_oops) ftrace_dump(ftrace_dump_on_oops); + + trace_android_vh_ftrace_oops_exit(&ftrace_check); return NOTIFY_OK; } @@ -9823,6 +9844,13 @@ static int trace_die_handler(struct notifier_block *self, unsigned long val, void *data) { + bool ftrace_check = false; + + trace_android_vh_ftrace_oops_enter(&ftrace_check); + + if (ftrace_check) + return NOTIFY_OK; + switch (val) { case DIE_OOPS: if (ftrace_dump_on_oops) @@ -9831,6 +9859,8 @@ static int trace_die_handler(struct notifier_block *self, default: break; } + + trace_android_vh_ftrace_oops_exit(&ftrace_check); return NOTIFY_OK; } @@ -9855,6 +9885,8 @@ static struct notifier_block trace_die_notifier = { void trace_printk_seq(struct trace_seq *s) { + bool dump_printk = true; + /* Probably should print a warning here. */ if (s->seq.len >= TRACE_MAX_PRINT) s->seq.len = TRACE_MAX_PRINT; @@ -9870,7 +9902,9 @@ trace_printk_seq(struct trace_seq *s) /* should be zero ended, but we are paranoid. */ s->buffer[s->seq.len] = 0; - printk(KERN_TRACE "%s", s->buffer); + trace_android_vh_ftrace_dump_buffer(s, &dump_printk); + if (dump_printk) + printk(KERN_TRACE "%s", s->buffer); trace_seq_init(s); } @@ -9909,6 +9943,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; + bool ftrace_check = false; + unsigned long size; /* Only allow one dump user at a time.
*/ if (atomic_inc_return(&dump_running) != 1) { @@ -9933,6 +9969,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); + size = ring_buffer_size(iter.array_buffer->buffer, cpu); + trace_android_vh_ftrace_size_check(size, &ftrace_check); } old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -9940,6 +9978,9 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + if (ftrace_check) + goto out_enable; + switch (oops_dump_mode) { case DUMP_ALL: iter.cpu_file = RING_BUFFER_ALL_CPUS; @@ -9970,6 +10011,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) */ while (!trace_empty(&iter)) { + ftrace_check = true; if (!cnt) printk(KERN_TRACE "---------------------------------\n"); @@ -9977,7 +10019,9 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) cnt++; trace_iterator_reset(&iter); - iter.iter_flags |= TRACE_FILE_LAT_FMT; + trace_android_vh_ftrace_format_check(&ftrace_check); + if (ftrace_check) + iter.iter_flags |= TRACE_FILE_LAT_FMT; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index aaf779ee68a6..6b811ffbfbe2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -66,7 +66,10 @@ C(EMPTY_SORT_FIELD, "Empty sort field"), \ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ - C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), + C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ + C(EXPECT_NUMBER, "Expecting numeric literal"), \ + C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ + C(DIVISION_BY_ZERO, "Division by zero"), #undef C #define C(a, b) HIST_ERR_##a @@ -89,12 +92,16 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field, #define HIST_FIELD_OPERANDS_MAX 2 #define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) #define HIST_ACTIONS_MAX 8 +#define HIST_CONST_DIGITS_MAX 21 +#define HIST_DIV_SHIFT 20 /* For optimizing division by constants */ enum field_op_id { FIELD_OP_NONE, FIELD_OP_PLUS, FIELD_OP_MINUS, FIELD_OP_UNARY_MINUS, + FIELD_OP_DIV, + FIELD_OP_MULT, }; /* @@ -152,6 +159,11 @@ struct hist_field { bool read_once; unsigned int var_str_idx; + + /* Numeric literals are represented as u64 */ + u64 constant; + /* Used to optimize division by constants */ + u64 div_multiplier; }; static u64 hist_field_none(struct hist_field *field, @@ -163,6 +175,15 @@ static u64 hist_field_none(struct hist_field *field, return 0; } +static u64 hist_field_const(struct hist_field *field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return field->constant; +} + static u64 hist_field_counter(struct hist_field *field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -271,6 +292,106 @@ static u64 hist_field_minus(struct hist_field *hist_field, return val1 - val2; } +static u64 hist_field_div(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = 
operand2->fn(operand2, elt, buffer, rbe, event); + + /* Return -1 for the undefined case */ + if (!val2) + return -1; + + /* Use shift if the divisor is a power of 2 */ + if (!(val2 & (val2 - 1))) + return val1 >> __ffs64(val2); + + return div64_u64(val1, val2); +} + +static u64 div_by_power_of_two(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + + return val1 >> __ffs64(operand2->constant); +} + +static u64 div_by_not_power_of_two(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + + return div64_u64(val1, operand2->constant); +} + +static u64 div_by_mult_and_shift(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + + /* + * If the divisor is a constant, do a multiplication and shift instead. + * + * Choose Z = some power of 2. If Y <= Z, then: + * X / Y = (X * (Z / Y)) / Z + * + * (Z / Y) is a constant (mult) which is calculated at parse time, so: + * X / Y = (X * mult) / Z + * + * The division by Z can be replaced by a shift since Z is a power of 2: + * X / Y = (X * mult) >> HIST_DIV_SHIFT + * + * As long as X < Z the results will not be off by more than 1. + */ + if (val1 < (1 << HIST_DIV_SHIFT)) { + u64 mult = operand2->div_multiplier; + + return (val1 * mult + ((1 << HIST_DIV_SHIFT) - 1)) >> HIST_DIV_SHIFT; + } + + return div64_u64(val1, operand2->constant); +} + +static u64 hist_field_mult(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + + return val1 * val2; +} + static u64 hist_field_unary_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -341,6 +462,7 @@ enum hist_field_flags { HIST_FIELD_FL_CPU = 1 << 15, HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, + HIST_FIELD_FL_CONST = 1 << 18, }; struct var_defs { @@ -517,6 +639,25 @@ struct snapshot_context { void *key; }; + +/* + * Returns the specific division function to use if the divisor + * is constant. This avoids extra branches when the trigger is hit.
+ */ +static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor) +{ + u64 div = divisor->constant; + + if (!(div & (div - 1))) + return div_by_power_of_two; + + /* If the divisor is too large, do a regular division */ + if (div > (1 << HIST_DIV_SHIFT)) + return div_by_not_power_of_two; + + divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div); + return div_by_mult_and_shift; +} + static void track_data_free(struct track_data *track_data) { struct hist_elt_data *elt_data; @@ -1516,6 +1657,12 @@ static void expr_field_str(struct hist_field *field, char *expr) { if (field->flags & HIST_FIELD_FL_VAR_REF) strcat(expr, "$"); + else if (field->flags & HIST_FIELD_FL_CONST) { + char str[HIST_CONST_DIGITS_MAX]; + + snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant); + strcat(expr, str); + } strcat(expr, hist_field_name(field, 0)); @@ -1571,6 +1718,12 @@ static char *expr_str(struct hist_field *field, unsigned int level) case FIELD_OP_PLUS: strcat(expr, "+"); break; + case FIELD_OP_DIV: + strcat(expr, "/"); + break; + case FIELD_OP_MULT: + strcat(expr, "*"); + break; default: kfree(expr); return NULL; @@ -1581,34 +1734,92 @@ static char *expr_str(struct hist_field *field, unsigned int level) return expr; } -static int contains_operator(char *str) +/* + * If field_op != FIELD_OP_NONE, *sep points to the root operator + * of the expression tree to be evaluated. + */ +static int contains_operator(char *str, char **sep) { enum field_op_id field_op = FIELD_OP_NONE; - char *op; + char *minus_op, *plus_op, *div_op, *mult_op; - op = strpbrk(str, "+-"); - if (!op) - return FIELD_OP_NONE; - switch (*op) { - case '-': + /* + * Report the last occurrence of the operators first, so that the + * expression is evaluated left to right. This is important since + * subtraction and division are not associative. + * + * e.g + * 64/8/4/2 is 1, i.e 64/8/4/2 = ((64/8)/4)/2 + * 14-7-5-2 is 0, i.e 14-7-5-2 = ((14-7)-5)-2 + */ + + /* + * First, find lower precedence addition and subtraction + * since the expression will be evaluated recursively. + */ + minus_op = strrchr(str, '-'); + if (minus_op) { /* - * Unfortunately, the modifier ".sym-offset" - * can confuse things. + * Unary minus is not supported in sub-expressions. If + * present, it is always the next root operator. */ - if (op - str >= 4 && !strncmp(op - 4, ".sym-offset", 11)) - return FIELD_OP_NONE; - - if (*str == '-') + if (minus_op == str) { field_op = FIELD_OP_UNARY_MINUS; - else - field_op = FIELD_OP_MINUS; - break; - case '+': - field_op = FIELD_OP_PLUS; - break; - default: - break; + goto out; + } + + field_op = FIELD_OP_MINUS; + } + + plus_op = strrchr(str, '+'); + if (plus_op || minus_op) { + /* + * For operators of the same precedence use the rightmost as the + * root, so that the expression is evaluated left to right. + */ + if (plus_op > minus_op) + field_op = FIELD_OP_PLUS; + goto out; + } + + /* + * Multiplication and division have higher precedence than addition and + * subtraction. + */ + div_op = strrchr(str, '/'); + if (div_op) + field_op = FIELD_OP_DIV; + + mult_op = strrchr(str, '*'); + /* + * For operators of the same precedence use the rightmost as the + * root, so that the expression is evaluated left to right.
+ */ + if (mult_op > div_op) + field_op = FIELD_OP_MULT; + +out: + if (sep) { + switch (field_op) { + case FIELD_OP_UNARY_MINUS: + case FIELD_OP_MINUS: + *sep = minus_op; + break; + case FIELD_OP_PLUS: + *sep = plus_op; + break; + case FIELD_OP_DIV: + *sep = div_op; + break; + case FIELD_OP_MULT: + *sep = mult_op; + break; + case FIELD_OP_NONE: + default: + *sep = NULL; + break; + } } return field_op; @@ -1689,6 +1900,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_CONST) { + hist_field->fn = hist_field_const; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + if (flags & HIST_FIELD_FL_STACKTRACE) { hist_field->fn = hist_field_none; goto out; @@ -1933,7 +2153,7 @@ static char *field_name_from_var(struct hist_trigger_data *hist_data, if (strcmp(var_name, name) == 0) { field = hist_data->attrs->var_defs.expr[i]; - if (contains_operator(field) || is_var_ref(field)) + if (contains_operator(field, NULL) || is_var_ref(field)) continue; return field; } @@ -2010,7 +2230,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_HEX; else if (strcmp(modifier, "sym") == 0) *flags |= HIST_FIELD_FL_SYM; - else if (strcmp(modifier, "sym-offset") == 0) + /* + * 'sym-offset' occurrences in the trigger string are modified + * to 'symXoffset' to simplify arithmetic expression parsing. + */ + else if (strcmp(modifier, "symXoffset") == 0) *flags |= HIST_FIELD_FL_SYM_OFFSET; else if ((strcmp(modifier, "execname") == 0) && (strcmp(field_name, "common_pid") == 0)) @@ -2098,6 +2322,29 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, return alias; } +static struct hist_field *parse_const(struct hist_trigger_data *hist_data, + char *str, char *var_name, + unsigned long *flags) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *field = NULL; + u64 constant; + + if (kstrtoull(str, 0, &constant)) { + hist_err(tr, HIST_ERR_EXPECT_NUMBER, errpos(str)); + return NULL; + } + + *flags |= HIST_FIELD_FL_CONST; + field = create_hist_field(hist_data, NULL, *flags, var_name); + if (!field) + return NULL; + + field->constant = constant; + + return field; +} + static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long *flags, char *var_name) @@ -2108,6 +2355,15 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, unsigned long buckets = 0; int ret = 0; + if (isdigit(str[0])) { + hist_field = parse_const(hist_data, str, var_name, flags); + if (!hist_field) { + ret = -EINVAL; + goto out; + } + return hist_field; + } + s = strchr(str, '.'); if (s) { s = strchr(++s, '.'); @@ -2164,21 +2420,24 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level); + char *var_name, unsigned int *n_subexprs); static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level) + char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1, *expr = NULL; unsigned long operand_flags; int ret = 0; char *s; + /* Unary minus operator, increment n_subexprs */ + ++*n_subexprs; + /* we 
support only -(xxx) i.e. explicit parens required */ - if (level > 3) { + if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; @@ -2195,8 +2454,16 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, } s = strrchr(str, ')'); - if (s) + if (s) { + /* unary minus not supported in sub-expressions */ + if (*(s+1) != '\0') { + hist_err(file->tr, HIST_ERR_UNARY_MINUS_SUBEXPR, + errpos(str)); + ret = -EINVAL; + goto free; + } *s = '\0'; + } else { ret = -EINVAL; /* no closing ')' */ goto free; @@ -2210,7 +2477,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, } operand_flags = 0; - operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand1)) { ret = PTR_ERR(operand1); goto free; @@ -2243,9 +2510,15 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } +/* + * If the operands are var refs, return pointers to the + * variable(s) referenced in var1 and var2, else NULL. + */ static int check_expr_operands(struct trace_array *tr, struct hist_field *operand1, - struct hist_field *operand2) + struct hist_field *operand2, + struct hist_field **var1, + struct hist_field **var2) { unsigned long operand1_flags = operand1->flags; unsigned long operand2_flags = operand2->flags; @@ -2258,6 +2531,7 @@ static int check_expr_operands(struct trace_array *tr, if (!var) return -EINVAL; operand1_flags = var->flags; + *var1 = var; } if ((operand2_flags & HIST_FIELD_FL_VAR_REF) || @@ -2268,6 +2542,7 @@ static int check_expr_operands(struct trace_array *tr, if (!var) return -EINVAL; operand2_flags = var->flags; + *var2 = var; } if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != @@ -2282,74 +2557,102 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level) + char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; - unsigned long operand_flags; + struct hist_field *var1 = NULL, *var2 = NULL; + unsigned long operand_flags, operand2_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; + hist_field_fn_t op_fn; + bool combine_consts; - if (level > 3) { + if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } - field_op = contains_operator(str); + field_op = contains_operator(str, &sep); if (field_op == FIELD_OP_NONE) return parse_atom(hist_data, file, str, &flags, var_name); if (field_op == FIELD_OP_UNARY_MINUS) - return parse_unary(hist_data, file, str, flags, var_name, ++level); + return parse_unary(hist_data, file, str, flags, var_name, n_subexprs); - switch (field_op) { - case FIELD_OP_MINUS: - sep = "-"; - break; - case FIELD_OP_PLUS: - sep = "+"; - break; - default: - goto free; - } + /* Binary operator found, increment n_subexprs */ + ++*n_subexprs; - operand1_str = strsep(&str, sep); - if (!operand1_str || !str) - goto free; + /* Split the expression string at the root operator */ + if (!sep) + return ERR_PTR(-EINVAL); + + *sep = '\0'; + operand1_str = str; + str = sep+1; + + /* Binary operator requires both operands */ + if (*operand1_str == '\0' || *str == '\0') + return ERR_PTR(-EINVAL); operand_flags = 0; - operand1 = parse_atom(hist_data, file, 
operand1_str, - &operand_flags, NULL); - if (IS_ERR(operand1)) { - ret = PTR_ERR(operand1); - operand1 = NULL; - goto free; - } + + /* LHS of string is an expression e.g. a+b in a+b+c */ + operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs); + if (IS_ERR(operand1)) + return ERR_CAST(operand1); + if (operand1->flags & HIST_FIELD_FL_STRING) { hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str)); ret = -EINVAL; - goto free; + goto free_op1; } - /* rest of string could be another expression e.g. b+c in a+b+c */ + /* RHS of string is another expression e.g. c in a+b+c */ operand_flags = 0; - operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand2)) { ret = PTR_ERR(operand2); - operand2 = NULL; - goto free; + goto free_op1; } if (operand2->flags & HIST_FIELD_FL_STRING) { hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str)); ret = -EINVAL; - goto free; + goto free_operands; } - ret = check_expr_operands(file->tr, operand1, operand2); - if (ret) - goto free; + switch (field_op) { + case FIELD_OP_MINUS: + op_fn = hist_field_minus; + break; + case FIELD_OP_PLUS: + op_fn = hist_field_plus; + break; + case FIELD_OP_DIV: + op_fn = hist_field_div; + break; + case FIELD_OP_MULT: + op_fn = hist_field_mult; + break; + default: + ret = -EINVAL; + goto free_operands; + } - flags |= HIST_FIELD_FL_EXPR; + ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2); + if (ret) + goto free_operands; + + operand_flags = var1 ? var1->flags : operand1->flags; + operand2_flags = var2 ? var2->flags : operand2->flags; + + /* + * If both operands are constant, the expression can be + * collapsed to a single constant. + */ + combine_consts = operand_flags & operand2_flags & HIST_FIELD_FL_CONST; + + flags |= combine_consts ? HIST_FIELD_FL_CONST : HIST_FIELD_FL_EXPR; flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); @@ -2357,45 +2660,80 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr = create_hist_field(hist_data, NULL, flags, var_name); if (!expr) { ret = -ENOMEM; - goto free; + goto free_operands; } operand1->read_once = true; operand2->read_once = true; + /* The operands are now owned and free'd by 'expr' */ expr->operands[0] = operand1; expr->operands[1] = operand2; - /* The operand sizes should be the same, so just pick one */ - expr->size = operand1->size; - expr->is_signed = operand1->is_signed; + if (field_op == FIELD_OP_DIV && + operand2_flags & HIST_FIELD_FL_CONST) { + u64 divisor = var2 ? 
var2->constant : operand2->constant; - expr->operator = field_op; - expr->name = expr_str(expr, 0); - expr->type = kstrdup_const(operand1->type, GFP_KERNEL); - if (!expr->type) { - ret = -ENOMEM; - goto free; + if (!divisor) { + hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str)); + ret = -EDOM; + goto free_expr; + } + + /* + * Copy the divisor here so we don't have to look it up + * later if this is a var ref + */ + operand2->constant = divisor; + op_fn = hist_field_get_div_fn(operand2); } - switch (field_op) { - case FIELD_OP_MINUS: - expr->fn = hist_field_minus; - break; - case FIELD_OP_PLUS: - expr->fn = hist_field_plus; - break; - default: - ret = -EINVAL; - goto free; + if (combine_consts) { + if (var1) + expr->operands[0] = var1; + if (var2) + expr->operands[1] = var2; + + expr->constant = op_fn(expr, NULL, NULL, NULL, NULL); + + expr->operands[0] = NULL; + expr->operands[1] = NULL; + + /* + * var refs won't be destroyed immediately + * See: destroy_hist_field() + */ + destroy_hist_field(operand2, 0); + destroy_hist_field(operand1, 0); + + expr->name = expr_str(expr, 0); + } else { + expr->fn = op_fn; + + /* The operand sizes should be the same, so just pick one */ + expr->size = operand1->size; + expr->is_signed = operand1->is_signed; + + expr->operator = field_op; + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free_expr; + } + + expr->name = expr_str(expr, 0); } return expr; - free: - destroy_hist_field(operand1, 0); - destroy_hist_field(operand2, 0); - destroy_hist_field(expr, 0); +free_operands: + destroy_hist_field(operand2, 0); +free_op1: + destroy_hist_field(operand1, 0); + return ERR_PTR(ret); + +free_expr: + destroy_hist_field(expr, 0); return ERR_PTR(ret); } @@ -3770,9 +4108,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, unsigned long flags) { struct hist_field *hist_field; - int ret = 0; + int ret = 0, n_subexprs = 0; - hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0); + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, &n_subexprs); if (IS_ERR(hist_field)) { ret = PTR_ERR(hist_field); goto out; @@ -3913,7 +4251,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct hist_field *hist_field = NULL; unsigned long flags = 0; unsigned int key_size; - int ret = 0; + int ret = 0, n_subexprs = 0; if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) return -EINVAL; @@ -3926,7 +4264,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, hist_field = create_hist_field(hist_data, NULL, flags, NULL); } else { hist_field = parse_expr(hist_data, file, field_str, flags, - NULL, 0); + NULL, &n_subexprs); if (IS_ERR(hist_field)) { ret = PTR_ERR(hist_field); goto out; @@ -4977,6 +5315,8 @@ static void hist_field_debug_show_flags(struct seq_file *m, if (flags & HIST_FIELD_FL_ALIAS) seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); + else if (flags & HIST_FIELD_FL_CONST) + seq_puts(m, " HIST_FIELD_FL_CONST\n"); } static int hist_field_debug_show(struct seq_file *m, @@ -4998,6 +5338,9 @@ static int hist_field_debug_show(struct seq_file *m, field->var.idx); } + if (field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, " constant: %llu\n", field->constant); + if (field->flags & HIST_FIELD_FL_ALIAS) seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", field->var_ref_idx); @@ -5240,6 +5583,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + 
else if (hist_field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, "%llu", hist_field->constant); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) @@ -5822,7 +6167,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, struct synth_event *se; const char *se_name; bool remove = false; - char *trigger, *p; + char *trigger, *p, *start; int ret = 0; lockdep_assert_held(&event_mutex); @@ -5870,6 +6215,16 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } + /* + * To simplify arithmetic expression parsing, replace occurrences of + * '.sym-offset' modifier with '.symXoffset' + */ + start = strstr(trigger, ".sym-offset"); + while (start) { + *(start + 4) = 'X'; + start = strstr(start + 11, ".sym-offset"); + } + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 1e130da1b742..070befb7ad79 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -14,6 +14,8 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -28,8 +30,11 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - if (!in_nmi()) + if (!in_nmi()) { trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + trace_android_rvh_irqs_enable(CALLER_ADDR0, + CALLER_ADDR1); + } tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -40,8 +45,11 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - if (!in_nmi()) + if (!in_nmi()) { trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + trace_android_rvh_irqs_enable(CALLER_ADDR0, + CALLER_ADDR1); + } tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -63,8 +71,11 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - if (!in_nmi()) + if (!in_nmi()) { trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + trace_android_rvh_irqs_disable(CALLER_ADDR0, + CALLER_ADDR1); + } } } @@ -78,8 +89,11 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - if (!in_nmi()) + if (!in_nmi()) { trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + trace_android_rvh_irqs_disable(CALLER_ADDR0, + CALLER_ADDR1); + } } } EXPORT_SYMBOL(trace_hardirqs_off); @@ -88,8 +102,11 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (this_cpu_read(tracing_irq_cpu)) { - if (!in_nmi()) + if (!in_nmi()) { trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + trace_android_rvh_irqs_enable(CALLER_ADDR0, + caller_addr); + } tracer_hardirqs_on(CALLER_ADDR0, caller_addr); this_cpu_write(tracing_irq_cpu, 0); } @@ -107,8 +124,11 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); - if (!in_nmi()) + if (!in_nmi()) { trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); + trace_android_rvh_irqs_disable(CALLER_ADDR0, + caller_addr); + } 
} } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -119,15 +139,19 @@ NOKPROBE_SYMBOL(trace_hardirqs_off_caller); void trace_preempt_on(unsigned long a0, unsigned long a1) { - if (!in_nmi()) + if (!in_nmi()) { trace_preempt_enable_rcuidle(a0, a1); + trace_android_rvh_preempt_enable(a0, a1); + } tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - if (!in_nmi()) + if (!in_nmi()) { trace_preempt_disable_rcuidle(a0, a1); + trace_android_rvh_preempt_disable(a0, a1); + } tracer_preempt_off(a0, a1); } #endif diff --git a/kernel/trace/trace_readwrite.c b/kernel/trace/trace_readwrite.c new file mode 100644 index 000000000000..10ebe3c9687a --- /dev/null +++ b/kernel/trace/trace_readwrite.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Register read and write tracepoints + * + * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#ifdef CONFIG_TRACE_MMIO_ACCESS +void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, + unsigned long caller_addr) +{ + trace_rwmmio_write(caller_addr, val, width, addr); +} +EXPORT_SYMBOL_GPL(log_write_mmio); +EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_write); + +void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, + unsigned long caller_addr) +{ + trace_rwmmio_post_write(caller_addr, val, width, addr); +} +EXPORT_SYMBOL_GPL(log_post_write_mmio); +EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_post_write); + +void log_read_mmio(u8 width, const volatile void __iomem *addr, + unsigned long caller_addr) +{ + trace_rwmmio_read(caller_addr, width, addr); +} +EXPORT_SYMBOL_GPL(log_read_mmio); +EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_read); + +void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, + unsigned long caller_addr) +{ + trace_rwmmio_post_read(caller_addr, val, width, addr); +} +EXPORT_SYMBOL_GPL(log_post_read_mmio); +EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_post_read); +#endif /* CONFIG_TRACE_MMIO_ACCESS */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 64ea283f2f86..d05628d1936e 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -786,3 +786,82 @@ void syscall_unregfunc(void) } } #endif + +#ifdef CONFIG_ANDROID_VENDOR_HOOKS + +static void *rvh_zalloc_funcs(int count) +{ + return kzalloc(sizeof(struct tracepoint_func) * count, GFP_KERNEL); +} + +#define ANDROID_RVH_NR_PROBES_MAX 2 +static int rvh_func_add(struct tracepoint *tp, struct tracepoint_func *func) +{ + int i; + + if (!static_key_enabled(&tp->key)) { + /* '+ 1' for the last NULL element */ + tp->funcs = rvh_zalloc_funcs(ANDROID_RVH_NR_PROBES_MAX + 1); + if (!tp->funcs) + return -ENOMEM; + } + + for (i = 0; i < ANDROID_RVH_NR_PROBES_MAX; i++) { + if (!tp->funcs[i].func) { + if (!static_key_enabled(&tp->key)) + tp->funcs[i].data = func->data; + WRITE_ONCE(tp->funcs[i].func, func->func); + + return 0; + } + } + + return -EBUSY; +} + +static int android_rvh_add_func(struct tracepoint *tp, struct tracepoint_func *func) +{ + int ret; + + if (tp->regfunc && !static_key_enabled(&tp->key)) { + ret = tp->regfunc(); + if (ret < 0) + return ret; + } + + ret = rvh_func_add(tp, func); + if (ret) + return ret; + tracepoint_update_call(tp, tp->funcs); + static_key_enable(&tp->key); + + return 0; +} + +int android_rvh_probe_register(struct tracepoint *tp, void *probe, void *data) +{ + struct tracepoint_func tp_func; + int ret; + + /* + * Once the static key has been flipped, the array may be read + * concurrently. 
Although __traceiter_*() always checks .func first, + * it doesn't enforce read->read dependencies, and we can't strongly + * guarantee it will see the correct .data for the second element + * without adding smp_load_acquire() in the fast path. But this is a + * corner case which is unlikely to be needed by anybody in practice, + * so let's just forbid it and keep the fast path clean. + */ + if (WARN_ON(static_key_enabled(&tp->key) && data)) + return -EINVAL; + + mutex_lock(&tracepoints_mutex); + tp_func.func = probe; + tp_func.data = data; + ret = android_rvh_add_func(tp, &tp_func); + mutex_unlock(&tracepoints_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(android_rvh_probe_register); +#endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1cfa269bd448..2b1c9d202f92 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -27,6 +27,8 @@ #include #include +#include + static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) @@ -439,6 +441,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) clear_bit_unlock(0, &soft_lockup_nmi_warn); } + trace_android_vh_watchdog_timer_softlockup(duration, regs, !!softlockup_panic); add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f5fa7be8d17e..3570cf5c347e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -54,6 +54,10 @@ #include "workqueue_internal.h" +#include +/* events/workqueue.h uses default TRACE_INCLUDE_PATH */ +#undef TRACE_INCLUDE_PATH + enum { /* * worker_pool flags @@ -379,6 +383,9 @@ static void show_pwq(struct pool_workqueue *pwq); #define CREATE_TRACE_POINTS #include +EXPORT_TRACEPOINT_SYMBOL_GPL(workqueue_execute_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(workqueue_execute_end); + #define assert_rcu_or_pool_mutex() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq_pool_mutex), \ @@ -1359,7 +1366,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct worker_pool *pool = pwq->pool; /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + kasan_record_aux_stack_noalloc(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); @@ -5877,6 +5884,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", jiffies_to_msecs(now - pool_ts) / 1000); + trace_android_vh_wq_lockup_pool(pool->cpu, pool_ts); } } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f71db0cc3bf1..c0a0ec4dc067 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -336,6 +336,16 @@ config DEBUG_INFO_BTF_MODULES help Generate compact split BTF type information for kernel modules. +config MODULE_ALLOW_BTF_MISMATCH + bool "Allow loading modules with non-matching BTF type info" + depends on DEBUG_INFO_BTF_MODULES + help + For modules whose split BTF does not match vmlinux, load without + BTF rather than refusing to load. The default behavior with + module BTF enabled is to reject modules with such mismatches; + this option will still load module BTF where possible but ignore + it when a mismatch is found. 
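The android_rvh_probe_register() helper added to kernel/tracepoint.c above is the entry point that restricted vendor hook (android_rvh_*) wrappers resolve to. As a hedged sketch only, here is what a vendor module attaching to the report_bug hook wired up in lib/bug.c later in this patch might look like; the register_trace_android_rvh_*() wrapper name and the trace/hooks/bug.h header follow the usual vendor-hooks macro conventions and are assumptions here, not part of the patch:

#include <linux/module.h>
#include <trace/hooks/bug.h>    /* assumed header declaring the hook */

/* Probe signature assumed from the trace_android_rvh_report_bug() call site. */
static void my_report_bug(void *unused, const char *file, unsigned int line,
                          unsigned long bugaddr)
{
        pr_err("vendor: BUG at %s:%u (%pS)\n", file, line, (void *)bugaddr);
}

static int __init my_hooks_init(void)
{
        /*
         * Restricted hooks can never be unregistered, and rvh_func_add()
         * above caps each hook at ANDROID_RVH_NR_PROBES_MAX (2) probes;
         * a third registration fails with -EBUSY.
         */
        return register_trace_android_rvh_report_bug(my_report_bug, NULL);
}
module_init(my_hooks_init);
MODULE_LICENSE("GPL");

Note the WARN_ON in android_rvh_probe_register() above: once a hook's static key has been flipped, later registrations may only pass NULL data, since the fast path cannot safely publish a new .data for an already-live probe slot.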
+ config GDB_SCRIPTS bool "Provide GDB scripts for kernel debugging" help diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index cdc842d090db..f4c0c37c6ea3 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only + # This config refers to the generic KASAN mode. config HAVE_ARCH_KASAN bool @@ -15,9 +16,8 @@ config HAVE_ARCH_KASAN_VMALLOC config ARCH_DISABLE_KASAN_INLINE bool help - An architecture might not support inline instrumentation. - When this option is selected, inline and stack instrumentation are - disabled. + Disables both inline and stack instrumentation. Selected by + architectures that do not support these instrumentation types. config CC_HAS_KASAN_GENERIC def_bool $(cc-option, -fsanitize=kernel-address) @@ -26,13 +26,13 @@ config CC_HAS_KASAN_SW_TAGS def_bool $(cc-option, -fsanitize=kernel-hwaddress) # This option is only required for software KASAN modes. -# Old GCC versions don't have proper support for no_sanitize_address. +# Old GCC versions do not have proper support for no_sanitize_address. # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89124 for details. config CC_HAS_WORKING_NOSANITIZE_ADDRESS def_bool !CC_IS_GCC || GCC_VERSION >= 80300 menuconfig KASAN - bool "KASAN: runtime memory debugger" + bool "KASAN: dynamic memory safety error detector" depends on (((HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \ (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)) && \ CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \ @@ -40,10 +40,13 @@ menuconfig KASAN depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) select STACKDEPOT help - Enables KASAN (KernelAddressSANitizer) - runtime memory debugger, - designed to find out-of-bounds accesses and use-after-free bugs. + Enables KASAN (Kernel Address Sanitizer) - a dynamic memory safety + error detector designed to find out-of-bounds and use-after-free bugs. + See Documentation/dev-tools/kasan.rst for details. + For better error reports, also enable CONFIG_STACKTRACE. + if KASAN choice @@ -51,75 +54,71 @@ choice default KASAN_GENERIC help KASAN has three modes: - 1. generic KASAN (similar to userspace ASan, - x86_64/arm64/xtensa, enabled with CONFIG_KASAN_GENERIC), - 2. software tag-based KASAN (arm64 only, based on software - memory tagging (similar to userspace HWASan), enabled with - CONFIG_KASAN_SW_TAGS), and - 3. hardware tag-based KASAN (arm64 only, based on hardware - memory tagging, enabled with CONFIG_KASAN_HW_TAGS). - All KASAN modes are strictly debugging features. + 1. Generic KASAN (supported by many architectures, enabled with + CONFIG_KASAN_GENERIC, similar to userspace ASan), + 2. Software Tag-Based KASAN (arm64 only, based on software memory + tagging, enabled with CONFIG_KASAN_SW_TAGS, similar to userspace + HWASan), and + 3. Hardware Tag-Based KASAN (arm64 only, based on hardware memory + tagging, enabled with CONFIG_KASAN_HW_TAGS). - For better error reports enable CONFIG_STACKTRACE. + See Documentation/dev-tools/kasan.rst for details about each mode. config KASAN_GENERIC - bool "Generic mode" + bool "Generic KASAN" depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS select SLUB_DEBUG if SLUB select CONSTRUCTORS help - Enables generic KASAN mode. + Enables Generic KASAN. - This mode is supported in both GCC and Clang. With GCC it requires - version 8.3.0 or later. Any supported Clang version is compatible, - but detection of out-of-bounds accesses for global variables is - supported only since Clang 11. 
+ Requires GCC 8.3.0+ or Clang. - This mode consumes about 1/8th of available memory at kernel start - and introduces an overhead of ~x1.5 for the rest of the allocations. + Consumes about 1/8th of available memory at kernel start and adds an + overhead of ~50% for dynamic allocations. The performance slowdown is ~x3. - Currently CONFIG_KASAN_GENERIC doesn't work with CONFIG_DEBUG_SLAB - (the resulting kernel does not boot). + (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) config KASAN_SW_TAGS - bool "Software tag-based mode" + bool "Software Tag-Based KASAN" depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS select SLUB_DEBUG if SLUB select CONSTRUCTORS help - Enables software tag-based KASAN mode. + Enables Software Tag-Based KASAN. - This mode require software memory tagging support in the form of - HWASan-like compiler instrumentation. + Requires GCC 11+ or Clang. - Currently this mode is only implemented for arm64 CPUs and relies on - Top Byte Ignore. This mode requires Clang. + Supported only on arm64 CPUs and relies on Top Byte Ignore. - This mode consumes about 1/16th of available memory at kernel start - and introduces an overhead of ~20% for the rest of the allocations. - This mode may potentially introduce problems relating to pointer - casting and comparison, as it embeds tags into the top byte of each - pointer. + Consumes about 1/16th of available memory at kernel start and + adds an overhead of ~20% for dynamic allocations. - Currently CONFIG_KASAN_SW_TAGS doesn't work with CONFIG_DEBUG_SLAB - (the resulting kernel does not boot). + May potentially introduce problems related to pointer casting and + comparison, as it embeds a tag into the top byte of each pointer. + + (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) config KASAN_HW_TAGS - bool "Hardware tag-based mode" + bool "Hardware Tag-Based KASAN" depends on HAVE_ARCH_KASAN_HW_TAGS depends on SLUB help - Enables hardware tag-based KASAN mode. + Enables Hardware Tag-Based KASAN. - This mode requires hardware memory tagging support, and can be used - by any architecture that provides it. + Requires GCC 10+ or Clang 12+. - Currently this mode is only implemented for arm64 CPUs starting from - ARMv8.5 and relies on Memory Tagging Extension and Top Byte Ignore. + Supported only on arm64 CPUs starting from ARMv8.5 and relies on + Memory Tagging Extension and Top Byte Ignore. + + Consumes about 1/32nd of available memory. + + May potentially introduce problems related to pointer casting and + comparison, as it embeds a tag into the top byte of each pointer. endchoice @@ -131,83 +130,72 @@ choice config KASAN_OUTLINE bool "Outline instrumentation" help - Before every memory access compiler insert function call - __asan_load*/__asan_store*. These functions performs check - of shadow memory. This is slower than inline instrumentation, - however it doesn't bloat size of kernel's .text section so - much as inline does. + Makes the compiler insert function calls that check whether the memory + is accessible before each memory access. Slower than KASAN_INLINE, but + does not bloat the size of the kernel's .text section so much. config KASAN_INLINE bool "Inline instrumentation" depends on !ARCH_DISABLE_KASAN_INLINE help - Compiler directly inserts code checking shadow memory before - memory accesses. This is faster than outline (in some workloads - it gives about x2 boost over outline instrumentation), but - make kernel's .text size much bigger. 
+ Makes the compiler directly insert memory accessibility checks before + each memory access. Faster than KASAN_OUTLINE (gives ~x2 boost for + some workloads), but makes the kernel's .text size much bigger. endchoice config KASAN_STACK - bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST + bool "Stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST depends on KASAN_GENERIC || KASAN_SW_TAGS depends on !ARCH_DISABLE_KASAN_INLINE default y if CC_IS_GCC help - The LLVM stack address sanitizer has a know problem that - causes excessive stack usage in a lot of functions, see - https://bugs.llvm.org/show_bug.cgi?id=38809 - Disabling asan-stack makes it safe to run kernels build - with clang-8 with KASAN enabled, though it loses some of - the functionality. - This feature is always disabled when compile-testing with clang - to avoid cluttering the output in stack overflow warnings, - but clang users can still enable it for builds without - CONFIG_COMPILE_TEST. On gcc it is assumed to always be safe - to use and enabled by default. - If the architecture disables inline instrumentation, stack - instrumentation is also disabled as it adds inline-style - instrumentation that is run unconditionally. + Disables stack instrumentation and thus KASAN's ability to detect + out-of-bounds bugs in stack variables. -config KASAN_TAGS_IDENTIFY - bool "Enable memory corruption identification" - depends on KASAN_SW_TAGS || KASAN_HW_TAGS - help - This option enables best-effort identification of bug type - (use-after-free or out-of-bounds) at the cost of increased - memory consumption. + With Clang, stack instrumentation has a problem that causes excessive + stack usage, see https://bugs.llvm.org/show_bug.cgi?id=38809. Thus, + with Clang, this option is deemed unsafe. + + This option is always disabled when compile-testing with Clang to + avoid cluttering the log with stack overflow warnings. + + With GCC, enabling stack instrumentation is assumed to be safe. + + If the architecture disables inline instrumentation via + ARCH_DISABLE_KASAN_INLINE, stack instrumentation gets disabled + as well, as it adds inline-style instrumentation that is run + unconditionally. config KASAN_VMALLOC - bool "Back mappings in vmalloc space with real shadow memory" - depends on KASAN_GENERIC && HAVE_ARCH_KASAN_VMALLOC + bool "Check accesses to vmalloc allocations" + depends on HAVE_ARCH_KASAN_VMALLOC help - By default, the shadow region for vmalloc space is the read-only - zero page. This means that KASAN cannot detect errors involving - vmalloc space. + Makes KASAN check the validity of accesses to vmalloc allocations. - Enabling this option will hook in to vmap/vmalloc and back those - mappings with real shadow memory allocated on demand. This allows - for KASAN to detect more sorts of errors (and to support vmapped - stacks), but at the cost of higher memory usage. + With software KASAN modes, all types of vmalloc allocations are + checked. Enabling this option leads to higher memory usage. + + With Hardware Tag-Based KASAN, only non-executable VM_ALLOC mappings + are checked. There is no additional memory usage. config KASAN_KUNIT_TEST tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS depends on KASAN && KUNIT default KUNIT_ALL_TESTS help - This is a KUnit test suite doing various nasty things like - out of bounds and use after free accesses. It is useful for testing - kernel debugging features like KASAN. + A KUnit-based KASAN test suite. 
Triggers different kinds of + out-of-bounds and use-after-free accesses. Useful for testing whether + KASAN can detect certain bug types. For more information on KUnit and unit tests in general, please refer - to the KUnit documentation in Documentation/dev-tools/kunit. + to the KUnit documentation in Documentation/dev-tools/kunit/. config KASAN_MODULE_TEST tristate "KUnit-incompatible tests of KASAN bug detection capabilities" depends on m && KASAN && !KASAN_HW_TAGS help - This is a part of the KASAN test suite that is incompatible with - KUnit. Currently includes tests that do bad copy_from/to_user - accesses. + A part of the KASAN test suite that is not integrated with KUnit. + Incompatible with Hardware Tag-Based KASAN. endif # KASAN diff --git a/lib/Makefile b/lib/Makefile index 0868cb67e5b0..94e4bdaef54a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -63,11 +63,6 @@ CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_IDA) += test_ida.o -obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o -CFLAGS_test_kasan.o += -fno-builtin -CFLAGS_test_kasan.o += $(call cc-disable-warning, vla) -obj-$(CONFIG_KASAN_MODULE_TEST) += test_kasan_module.o -CFLAGS_test_kasan_module.o += -fno-builtin obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) UBSAN_SANITIZE_test_ubsan.o := y diff --git a/lib/bug.c b/lib/bug.c index 45a0584f6541..b42b6c712304 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -49,6 +49,8 @@ #include #include +#include + extern struct bug_entry __start___bug_table[], __stop___bug_table[]; static inline unsigned long bug_addr(const struct bug_entry *bug) @@ -207,6 +209,8 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", (void *)bugaddr); + trace_android_rvh_report_bug(file, line, bugaddr); + return BUG_TRAP_TYPE_BUG; } diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c index 827fe89922ff..5fc78e50d693 100644 --- a/lib/crypto/aes.c +++ b/lib/crypto/aes.c @@ -7,6 +7,7 @@ #include #include #include +#include /* * Emit the sbox as volatile const to prevent the compiler from doing @@ -189,6 +190,13 @@ int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, u32 rc, i, j; int err; +#ifndef __DISABLE_EXPORTS + err = -(MAX_ERRNO + 1); + trace_android_vh_aes_expandkey(ctx, in_key, key_len, &err); + if (err != -(MAX_ERRNO + 1)) + return err; +#endif + err = aes_check_keylen(key_len); if (err) return err; @@ -261,6 +269,13 @@ void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in) int rounds = 6 + ctx->key_length / 4; u32 st0[4], st1[4]; int round; +#ifndef __DISABLE_EXPORTS + int hook_inuse = 0; + + trace_android_vh_aes_encrypt(ctx, out, in, &hook_inuse); + if (hook_inuse) + return; +#endif st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in); st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4); @@ -312,6 +327,13 @@ void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in) int rounds = 6 + ctx->key_length / 4; u32 st0[4], st1[4]; int round; +#ifndef __DISABLE_EXPORTS + int hook_inuse = 0; + + trace_android_vh_aes_decrypt(ctx, out, in, &hook_inuse); + if (hook_inuse) + return; +#endif st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in); st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 72a4b0b1df28..3f0475055059 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -17,6 
+17,7 @@ #include #include #include +#include static const u32 SHA256_K[] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, @@ -200,6 +201,14 @@ void sha256(const u8 *data, unsigned int len, u8 *out) { struct sha256_state sctx; +#ifndef __DISABLE_EXPORTS + int hook_inuse = 0; + + trace_android_vh_sha256(data, len, out, &hook_inuse); + if (hook_inuse) + return; +#endif + sha256_init(&sctx); sha256_update(&sctx, data, len); sha256_final(&sctx, out); diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index f7a3dc13316a..c1b89024f698 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -20,8 +20,8 @@ * * The worst case for in-place decompression is that the beginning of * the file is compressed extremely well, and the rest of the file is - * uncompressible. Thus, we must look for worst-case expansion when the - * compressor is encoding uncompressible data. + * incompressible. Thus, we must look for worst-case expansion when the + * compressor is encoding incompressible data. * * The structure of the .xz file in case of a compressed kernel is as follows. * Sizes (as bytes) of the fields are in parenthesis. @@ -58,7 +58,7 @@ * uncompressed size of the payload is in practice never less than the * payload size itself. The LZMA2 format would allow uncompressed size * to be less than the payload size, but no sane compressor creates such - * files. LZMA2 supports storing uncompressible data in uncompressed form, + * files. LZMA2 supports storing incompressible data in uncompressed form, * so there's never a need to create payloads whose uncompressed size is * smaller than the compressed size. * diff --git a/lib/show_mem.c b/lib/show_mem.c index 1c26c14ffbb9..3d8d609bbf54 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -7,7 +7,7 @@ #include #include - +#include void show_mem(unsigned int filter, nodemask_t *nodemask) { pg_data_t *pgdat; @@ -41,4 +41,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif + trace_android_vh_show_mem(filter, nodemask); } +EXPORT_SYMBOL_GPL(show_mem); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index e90f0f19e77f..2a5836f8e7f2 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -101,8 +101,8 @@ static bool init_stack_slab(void **prealloc) } /* Allocation of a new stack in raw storage */ -static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, - u32 hash, void **prealloc, gfp_t alloc_flags) +static struct stack_record * +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { struct stack_record *stack; size_t required_size = struct_size(stack, entries, size); @@ -213,6 +213,24 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, return NULL; } +/** + * stack_depot_print - print stack entries from a depot + * + * @stack: Stack depot handle which was returned from + * stack_depot_save(). 
+ * + */ +void stack_depot_print(depot_stack_handle_t stack) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(stack, &entries); + if (nr_entries > 0) + stack_trace_print(entries, nr_entries, 0); +} +EXPORT_SYMBOL_GPL(stack_depot_print); + /** * stack_depot_fetch - Fetch stack entries from a depot * @@ -231,6 +249,9 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, struct stack_record *stack; *entries = NULL; + if (!handle) + return 0; + if (parts.slabindex > depot_index) { WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", parts.slabindex, depot_index, handle); @@ -247,17 +268,31 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, EXPORT_SYMBOL_GPL(stack_depot_fetch); /** - * stack_depot_save - Save a stack trace from an array + * __stack_depot_save - Save a stack trace from an array * * @entries: Pointer to storage array * @nr_entries: Size of the storage array * @alloc_flags: Allocation gfp flags + * @can_alloc: Allocate stack slabs (increased chance of failure if false) * - * Return: The handle of the stack struct stored in depot + * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is + * %true, is allowed to replenish the stack slab pool in case no space is left + * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids + * any allocations and will fail if no space is left to store the stack trace. + * + * If the stack trace in @entries is from an interrupt, only the portion up to + * interrupt entry is saved. + * + * Context: Any context, but setting @can_alloc to %false is required if + * alloc_pages() cannot be used from the current context. Currently + * this is the case from contexts where neither %GFP_ATOMIC nor + * %GFP_NOWAIT can be used (NMI, raw_spin_lock). + * + * Return: The handle of the stack struct stored in depot, 0 on failure. */ -depot_stack_handle_t stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - gfp_t alloc_flags) +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; depot_stack_handle_t retval = 0; @@ -266,6 +301,16 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned long flags; u32 hash; + /* + * If this stack trace is from an interrupt, including anything before + * interrupt entry usually leads to unbounded stackdepot growth. + * + * Because use of filter_irq_stacks() is a requirement to ensure + * stackdepot can efficiently deduplicate interrupt stacks, always + * filter_irq_stacks() to simplify all callers' use of stackdepot. + */ + nr_entries = filter_irq_stacks(entries, nr_entries); + if (unlikely(nr_entries == 0) || stack_depot_disable) goto fast_exit; @@ -290,7 +335,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, * The smp_load_acquire() here pairs with smp_store_release() to * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). */ - if (unlikely(!smp_load_acquire(&next_slab_inited))) { + if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. 
Keep the flags related to allocation in atomic @@ -308,9 +353,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { - struct stack_record *new = - depot_alloc_stack(entries, nr_entries, - hash, &prealloc, alloc_flags); + struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); + if (new) { new->next = *bucket; /* @@ -339,4 +383,24 @@ exit: fast_exit: return retval; } +EXPORT_SYMBOL_GPL(__stack_depot_save); + +/** + * stack_depot_save - Save a stack trace from an array + * + * @entries: Pointer to storage array + * @nr_entries: Size of the storage array + * @alloc_flags: Allocation gfp flags + * + * Context: Contexts where allocations via alloc_pages() are allowed. + * See __stack_depot_save() for more details. + * + * Return: The handle of the stack struct stored in depot, 0 on failure. + */ +depot_stack_handle_t stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t alloc_flags) +{ + return __stack_depot_save(entries, nr_entries, alloc_flags, true); +} EXPORT_SYMBOL_GPL(stack_depot_save); diff --git a/lib/test_lockup.c b/lib/test_lockup.c index c3fd87d6c2dd..07ff86deecd4 100644 --- a/lib/test_lockup.c +++ b/lib/test_lockup.c @@ -616,5 +616,6 @@ static int __init test_lockup_init(void) module_init(test_lockup_init); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); MODULE_AUTHOR("Konstantin Khlebnikov "); MODULE_DESCRIPTION("Test module to generate lockups"); diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index 5cb50245a878..adce22ac18d6 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig @@ -39,6 +39,19 @@ config XZ_DEC_SPARC default y select XZ_DEC_BCJ +config XZ_DEC_MICROLZMA + bool "MicroLZMA decoder" + default n + help + MicroLZMA is a header format variant where the first byte + of a raw LZMA stream (without the end of stream marker) has + been replaced with a bitwise-negation of the lc/lp/pb + properties byte. MicroLZMA was created to be used in EROFS + but can be used by other things too where wasting minimal + amount of space for headers is important. + + Unless you know that you need this, say N. + endif config XZ_DEC_BCJ diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index d548cf0e59fe..27ce34520e78 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -248,6 +248,10 @@ struct lzma2_dec { * before the first LZMA chunk. */ bool need_props; + +#ifdef XZ_DEC_MICROLZMA + bool pedantic_microlzma; +#endif }; struct xz_dec_lzma2 { @@ -419,6 +423,12 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, } } +#ifdef XZ_DEC_MICROLZMA +# define DICT_FLUSH_SUPPORTS_SKIPPING true +#else +# define DICT_FLUSH_SUPPORTS_SKIPPING false +#endif + /* * Flush pending data from dictionary to b->out. It is assumed that there is * enough space in b->out. This is guaranteed because caller uses dict_limit() @@ -437,9 +447,14 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) * decompression because in multi-call mode dict->buf * has been allocated by us in this file; it's not * provided by the caller like in single-call mode. + * + * With MicroLZMA, b->out can be NULL to skip bytes that + * the caller doesn't need. This cannot be done with XZ + * because it would break BCJ filters. 
*/ - memcpy(b->out + b->out_pos, dict->buf + dict->start, - copy_size); + if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL) + memcpy(b->out + b->out_pos, dict->buf + dict->start, + copy_size); } dict->start = dict->pos; @@ -505,7 +520,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc) * functions so that the compiler is supposed to be able to more easily avoid * an extra branch. In this particular version of the LZMA decoder, this * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 - * on x86). Using a non-splitted version results in nicer looking code too. + * on x86). Using a non-split version results in nicer looking code too. * * NOTE: This must return an int. Do not make it return a bool or the speed * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, @@ -791,6 +806,7 @@ static void lzma_reset(struct xz_dec_lzma2 *s) s->lzma.rep1 = 0; s->lzma.rep2 = 0; s->lzma.rep3 = 0; + s->lzma.len = 0; /* * All probabilities are initialized to the same value. This hack @@ -1174,8 +1190,6 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) } } - s->lzma.len = 0; - s->lzma2.sequence = SEQ_CONTROL; s->lzma2.need_dict_reset = true; @@ -1191,3 +1205,140 @@ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) kfree(s); } + +#ifdef XZ_DEC_MICROLZMA +/* This is a wrapper struct to have a nice struct name in the public API. */ +struct xz_dec_microlzma { + struct xz_dec_lzma2 s; +}; + +enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr, + struct xz_buf *b) +{ + struct xz_dec_lzma2 *s = &s_ptr->s; + + /* + * sequence is SEQ_PROPERTIES before the first input byte, + * SEQ_LZMA_PREPARE until a total of five bytes have been read, + * and SEQ_LZMA_RUN for the rest of the input stream. + */ + if (s->lzma2.sequence != SEQ_LZMA_RUN) { + if (s->lzma2.sequence == SEQ_PROPERTIES) { + /* One byte is needed for the props. */ + if (b->in_pos >= b->in_size) + return XZ_OK; + + /* + * Don't increment b->in_pos here. The same byte is + * also passed to rc_read_init() which will ignore it. + */ + if (!lzma_props(s, ~b->in[b->in_pos])) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_LZMA_PREPARE; + } + + /* + * xz_dec_microlzma_reset() doesn't validate the compressed + * size so we do it here. We have to limit the maximum size + * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice + * round number and much more than users of this code should + * ever need. + */ + if (s->lzma2.compressed < RC_INIT_BYTES + || s->lzma2.compressed > (3U << 30)) + return XZ_DATA_ERROR; + + if (!rc_read_init(&s->rc, b)) + return XZ_OK; + + s->lzma2.compressed -= RC_INIT_BYTES; + s->lzma2.sequence = SEQ_LZMA_RUN; + + dict_reset(&s->dict, b); + } + + /* This is to allow increasing b->out_size between calls. 
*/ + if (DEC_IS_SINGLE(s->dict.mode)) + s->dict.end = b->out_size - b->out_pos; + + while (true) { + dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos, + s->lzma2.uncompressed)); + + if (!lzma2_lzma(s, b)) + return XZ_DATA_ERROR; + + s->lzma2.uncompressed -= dict_flush(&s->dict, b); + + if (s->lzma2.uncompressed == 0) { + if (s->lzma2.pedantic_microlzma) { + if (s->lzma2.compressed > 0 || s->lzma.len > 0 + || !rc_is_finished(&s->rc)) + return XZ_DATA_ERROR; + } + + return XZ_STREAM_END; + } + + if (b->out_pos == b->out_size) + return XZ_OK; + + if (b->in_pos == b->in_size + && s->temp.size < s->lzma2.compressed) + return XZ_OK; + } +} + +struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size) +{ + struct xz_dec_microlzma *s; + + /* Restrict dict_size to the same range as in the LZMA2 code. */ + if (dict_size < 4096 || dict_size > (3U << 30)) + return NULL; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->s.dict.mode = mode; + s->s.dict.size = dict_size; + + if (DEC_IS_MULTI(mode)) { + s->s.dict.end = dict_size; + + s->s.dict.buf = vmalloc(dict_size); + if (s->s.dict.buf == NULL) { + kfree(s); + return NULL; + } + } + + return s; +} + +void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size, + uint32_t uncomp_size, int uncomp_size_is_exact) +{ + /* + * comp_size is validated in xz_dec_microlzma_run(). + * uncomp_size can safely be anything. + */ + s->s.lzma2.compressed = comp_size; + s->s.lzma2.uncompressed = uncomp_size; + s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact; + + s->s.lzma2.sequence = SEQ_PROPERTIES; + s->s.temp.size = 0; +} + +void xz_dec_microlzma_end(struct xz_dec_microlzma *s) +{ + if (DEC_IS_MULTI(s->s.dict.mode)) + vfree(s->s.dict.buf); + + kfree(s); +} +#endif diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c index 32eb3c03aede..61098c67a413 100644 --- a/lib/xz/xz_dec_syms.c +++ b/lib/xz/xz_dec_syms.c @@ -15,8 +15,15 @@ EXPORT_SYMBOL(xz_dec_reset); EXPORT_SYMBOL(xz_dec_run); EXPORT_SYMBOL(xz_dec_end); +#ifdef CONFIG_XZ_DEC_MICROLZMA +EXPORT_SYMBOL(xz_dec_microlzma_alloc); +EXPORT_SYMBOL(xz_dec_microlzma_reset); +EXPORT_SYMBOL(xz_dec_microlzma_run); +EXPORT_SYMBOL(xz_dec_microlzma_end); +#endif + MODULE_DESCRIPTION("XZ decompressor"); -MODULE_VERSION("1.0"); +MODULE_VERSION("1.1"); MODULE_AUTHOR("Lasse Collin and Igor Pavlov"); /* diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 09360ebb510e..bf1e94ec7873 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -37,6 +37,9 @@ # ifdef CONFIG_XZ_DEC_SPARC # define XZ_DEC_SPARC # endif +# ifdef CONFIG_XZ_DEC_MICROLZMA +# define XZ_DEC_MICROLZMA +# endif # define memeq(a, b, size) (memcmp(a, b, size) == 0) # define memzero(buf, size) memset(buf, 0, size) # endif diff --git a/mm/Kconfig b/mm/Kconfig index c048dea7e342..0d4a75cdc6ac 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -188,6 +188,13 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config MEMORY_BALLOON bool +# +# support for memory relinquish +config MEMORY_RELINQUISH + def_bool y + depends on ARCH_HAS_MEM_RELINQUISH + depends on MEMORY_BALLOON || PAGE_REPORTING + # # support for memory balloon compaction config BALLOON_COMPACTION @@ -894,9 +901,76 @@ config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY config IO_MAPPING bool +# Some architectures want callbacks for all IO mappings in order to +# track the physical addresses that get used as devices. 
+config ARCH_HAS_IOREMAP_PHYS_HOOKS + bool + config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identify individual anonymous memory areas. + Assigning a name to an anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their names. + +# multi-gen LRU { +config LRU_GEN + bool "Multi-Gen LRU" + depends on MMU + # make sure page->flags has enough spare bits + depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) + help + A high performance LRU implementation to overcommit memory. See + Documentation/admin-guide/mm/multigen_lru.rst for details. + +config LRU_GEN_ENABLED + bool "Enable by default" + depends on LRU_GEN + help + This option enables the multi-gen LRU by default. + +config LRU_GEN_STATS + bool "Full stats for debugging" + depends on LRU_GEN + help + Do not enable this option unless you plan to look at historical stats + from evicted generations for debugging purposes. + + This option has a per-memcg and per-node memory overhead. +# } + source "mm/damon/Kconfig" +config ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT + def_bool n + +config SPECULATIVE_PAGE_FAULT + bool "Speculative page faults" + default y + depends on ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT && MMU && SMP && !NUMA + help + Try to handle user space page faults without holding the mmap lock. + + Instead of blocking writers through the use of the mmap lock, + the page fault handler merely verifies, at the end of the page + fault, that no writers have been running concurrently with it. + + In high concurrency situations, the speculative fault handler + gains a throughput advantage by avoiding having to update the + mmap lock reader count. + + If the check fails due to a concurrent writer, or due to hitting + an unsupported case, the fault handler falls back to classical + processing using the mmap read lock. + endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1e73717802f8..32a5839af244 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -62,6 +62,22 @@ config PAGE_OWNER If unsure, say N. +config PAGE_PINNER + bool "Track page pinner" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select DEBUG_FS + select STACKTRACE + select STACKDEPOT + select PAGE_EXTENSION + help + This keeps track of which call chain is the pinner of a page, which may + help to find page migration failures. Even if you include this + feature in your build, it is disabled by default. You should pass + "page_pinner=on" as a boot parameter in order to enable it. Eats + a fair amount of memory if enabled. + + If unsure, say N. + config PAGE_POISONING bool "Poison pages after freeing" help @@ -150,3 +166,10 @@ config PTDUMP_DEBUGFS kernel. If in doubt, say N. + +config SPECULATIVE_PAGE_FAULT_STATS + bool "Additional statistics for speculative page faults" + depends on SPECULATIVE_PAGE_FAULT + help + Additional statistics for speculative page faults. + If in doubt, say N. 
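As a usage illustration for the ANON_VMA_NAME option added to mm/Kconfig above, a hedged userspace sketch follows; the PR_SET_VMA / PR_SET_VMA_ANON_NAME constants match the prctl() interface this feature is exposed through (the values bionic uses), and are defined locally in case the system headers lack them:

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA              0x53564d41      /* "SVMA" */
#define PR_SET_VMA_ANON_NAME    0
#endif

int main(void)
{
        size_t len = 1 << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        /* On success the region shows up as "[anon:my-heap]" in /proc/self/maps. */
        if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                  (unsigned long)p, len, (unsigned long)"my-heap"))
                perror("prctl");
        return 0;
}

As the help text warns, the name also keeps the area from merging with differently named neighbours; the finer-grained accounting costs some VMA-merge opportunities.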
diff --git a/mm/Makefile b/mm/Makefile index fc60a40ce954..8a9954121e4d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -57,6 +57,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o +CFLAGS_page_alloc.o += -DDYNAMIC_DEBUG_MODULE # Give 'memory_hotplug' its own module-parameter namespace memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o @@ -102,6 +103,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o +obj-$(CONFIG_PAGE_PINNER) += page_pinner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o diff --git a/mm/OWNERS b/mm/OWNERS new file mode 100644 index 000000000000..02326c9a00a6 --- /dev/null +++ b/mm/OWNERS @@ -0,0 +1,4 @@ +hridya@google.com +kaleshsingh@google.com +surenb@google.com +minchan@google.com diff --git a/mm/cma.c b/mm/cma.c index a972c3440c40..9a23246a8a1c 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -31,8 +32,13 @@ #include #include #include +#include +#include #include +#undef CREATE_TRACE_POINTS +#include + #include "cma.h" struct cma cma_areas[MAX_CMA_AREAS]; @@ -48,11 +54,13 @@ unsigned long cma_get_size(const struct cma *cma) { return cma->count << PAGE_SHIFT; } +EXPORT_SYMBOL_GPL(cma_get_size); const char *cma_get_name(const struct cma *cma) { return cma->name; } +EXPORT_SYMBOL_GPL(cma_get_name); static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, unsigned int align_order) @@ -419,13 +427,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @no_warn: Avoid printing message about failed allocation + * @gfp_mask: GFP mask to use during the cma allocation. * * This function allocates part of contiguous memory on specific * contiguous memory area. 
*/ struct page *cma_alloc(struct cma *cma, unsigned long count, - unsigned int align, bool no_warn) + unsigned int align, gfp_t gfp_mask) { unsigned long mask, offset; unsigned long pfn = -1; @@ -434,12 +442,18 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned long i; struct page *page = NULL; int ret = -ENOMEM; + int num_attempts = 0; + int max_retries = 5; + s64 ts; + struct cma_alloc_info cma_info = {0}; + + trace_android_vh_cma_alloc_start(&ts); if (!cma || !cma->count || !cma->bitmap) goto out; - pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, - count, align); + pr_debug("%s(cma %p, count %lu, align %d gfp_mask 0x%x)\n", __func__, + (void *)cma, count, align, gfp_mask); if (!count) goto out; @@ -455,13 +469,36 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, goto out; for (;;) { + struct acr_info info = {0}; + spin_lock_irq(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { - spin_unlock_irq(&cma->lock); - break; + if ((num_attempts < max_retries) && (ret == -EBUSY)) { + spin_unlock_irq(&cma->lock); + + if (fatal_signal_pending(current) || + (gfp_mask & __GFP_NORETRY)) + break; + + /* + * Page may be momentarily pinned by some other + * process which has been scheduled out, e.g. + * in exit path, during unmap call, or process + * fork and so cannot be freed there. Sleep + * for 100ms and retry the allocation. + */ + start = 0; + ret = -ENOMEM; + schedule_timeout_killable(msecs_to_jiffies(100)); + num_attempts++; + continue; + } else { + spin_unlock_irq(&cma->lock); + break; + } } bitmap_set(cma->bitmap, bitmap_no, bitmap_count); /* @@ -473,8 +510,18 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - GFP_KERNEL | (no_warn ? 
__GFP_NOWARN : 0)); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp_mask, &info); + cma_info.nr_migrated += info.nr_migrated; + cma_info.nr_reclaimed += info.nr_reclaimed; + cma_info.nr_mapped += info.nr_mapped; + if (info.err) { + if (info.err & ACR_ERR_ISOLATE) + cma_info.nr_isolate_fail++; + if (info.err & ACR_ERR_MIGRATE) + cma_info.nr_migrate_fail++; + if (info.err & ACR_ERR_TEST) + cma_info.nr_test_fail++; + } mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -490,11 +537,20 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), count, align); - /* try again with a bit different memory target */ - start = bitmap_no + mask + 1; + + if (info.failed_pfn && gfp_mask & __GFP_NORETRY) { + /* try again from following failed page */ + start = (pfn_max_align_up(info.failed_pfn + 1) - + cma->base_pfn) >> cma->order_per_bit; + + } else { + /* try again with a bit different memory target */ + start = bitmap_no + mask + 1; + } } trace_cma_alloc_finish(cma->name, pfn, page, count, align); + trace_cma_alloc_info(cma->name, page, count, align, &cma_info); /* * CMA can allocate multiple page blocks, which results in different @@ -506,7 +562,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, page_kasan_tag_reset(page + i); } - if (ret && !no_warn) { + if (ret && !(gfp_mask & __GFP_NOWARN)) { pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", __func__, cma->name, count, ret); cma_debug_show_areas(cma); @@ -514,10 +570,11 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, pr_debug("%s(): returned %p\n", __func__, page); out: + trace_android_vh_cma_alloc_finish(cma, page, count, align, gfp_mask, ts); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); - } else { + } else if (!(gfp_mask & __GFP_NORETRY)) { count_vm_event(CMA_ALLOC_FAIL); if (cma) cma_sysfs_account_fail_pages(cma, count); @@ -525,6 +582,7 @@ out: return page; } +EXPORT_SYMBOL_GPL(cma_alloc); /** * cma_release() - release allocated pages @@ -559,6 +617,7 @@ bool cma_release(struct cma *cma, const struct page *pages, return true; } +EXPORT_SYMBOL_GPL(cma_release); int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) { @@ -573,3 +632,4 @@ int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) return 0; } +EXPORT_SYMBOL_GPL(cma_for_each_area); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 2e7704955f4f..59ec44c955dc 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -137,7 +137,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0, false); + p = cma_alloc(cma, count, 0, GFP_KERNEL); if (!p) { kfree(mem); return -ENOMEM; diff --git a/mm/compaction.c b/mm/compaction.c index 89517ad5d6a0..66190f7f9a58 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -44,6 +44,8 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include #define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) #define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) @@ -758,6 +760,31 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } +#ifdef CONFIG_COMPACTION +unsigned long isolate_and_split_free_page(struct page *page, + struct list_head *list) +{ + unsigned long isolated; + unsigned int order; + + if (!PageBuddy(page)) + return 0; + + order = 
diff --git a/mm/compaction.c b/mm/compaction.c index 89517ad5d6a0..66190f7f9a58 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -44,6 +44,8 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> +#undef CREATE_TRACE_POINTS +#include <trace/hooks/mm.h> #define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) #define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) @@ -758,6 +760,31 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } +#ifdef CONFIG_COMPACTION +unsigned long isolate_and_split_free_page(struct page *page, + struct list_head *list) +{ + unsigned long isolated; + unsigned int order; + + if (!PageBuddy(page)) + return 0; + + order = buddy_order(page); + isolated = __isolate_free_page(page, order); + if (!isolated) + return 0; + + set_page_private(page, order); + list_add(&page->lru, list); + + split_map_pages(list); + + return isolated; +} +EXPORT_SYMBOL_GPL(isolate_and_split_free_page); +#endif + /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(pg_data_t *pgdat) { @@ -2305,6 +2332,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + long vendor_ret; /* * These counters track activities during zone compaction. Initialize @@ -2376,6 +2404,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); + trace_android_vh_mm_compaction_begin(cc, &vendor_ret); /* lru_add_drain_all could be expensive with involving other CPUs */ lru_add_drain(); @@ -2502,6 +2531,7 @@ out: count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned); + trace_android_vh_mm_compaction_end(cc, vendor_ret); trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 37024798a97c..5bcf05851ad0 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -30,7 +30,15 @@ config DAMON_VADDR select PAGE_IDLE_FLAG help This builds the default data access monitoring primitives for DAMON - that works for virtual address spaces. + that work for virtual address spaces. + +config DAMON_PADDR + bool "Data access monitoring primitives for the physical address space" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring primitives for DAMON + that work for the physical address space. config DAMON_VADDR_KUNIT_TEST bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS @@ -46,7 +54,7 @@ config DAMON_VADDR_KUNIT_TEST config DAMON_DBGFS bool "DAMON debugfs interface" - depends on DAMON_VADDR && DEBUG_FS + depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help This builds the debugfs interface for DAMON. The user space admins can use the interface for arbitrary data access monitoring. @@ -65,4 +73,16 @@ config DAMON_DBGFS_KUNIT_TEST If unsure, say N. +config DAMON_RECLAIM + bool "Build DAMON-based reclaim (DAMON_RECLAIM)" + depends on DAMON_PADDR + help + This builds the DAMON-based reclamation subsystem. It finds pages + that have not been accessed for a long time (cold) using DAMON and + reclaims those. + + This is suggested to be used as a proactive and lightweight + reclamation under light memory pressure, while the traditional page + scanning-based reclamation is used for heavy pressure.
+ endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index fed4be3bace3..f7d5ac377a2b 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o -obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o +obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o diff --git a/mm/damon/core.c b/mm/damon/core.c index 7a4912d6e65f..fa503da9d620 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,8 +10,9 @@ #include <linux/damon.h> #include <linux/delay.h> #include <linux/kthread.h> -#include <linux/random.h> +#include <linux/mm.h> #include <linux/slab.h> +#include <linux/string.h> #define CREATE_TRACE_POINTS #include <trace/events/damon.h> @@ -21,9 +22,6 @@ #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; @@ -45,18 +43,10 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) region->nr_accesses = 0; INIT_LIST_HEAD(&region->list); - return region; -} + region->age = 0; + region->last_nr_accesses = 0; -/* - * Add a region between two other regions - */ -inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; + return region; } void damon_add_region(struct damon_region *r, struct damon_target *t) @@ -82,6 +72,73 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) +{ + struct damos *scheme; + + scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); + if (!scheme) + return NULL; + scheme->min_sz_region = min_sz_region; + scheme->max_sz_region = max_sz_region; + scheme->min_nr_accesses = min_nr_accesses; + scheme->max_nr_accesses = max_nr_accesses; + scheme->min_age_region = min_age_region; + scheme->max_age_region = max_age_region; + scheme->action = action; + scheme->stat = (struct damos_stat){}; + INIT_LIST_HEAD(&scheme->list); + + scheme->quota.ms = quota->ms; + scheme->quota.sz = quota->sz; + scheme->quota.reset_interval = quota->reset_interval; + scheme->quota.weight_sz = quota->weight_sz; + scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; + scheme->quota.weight_age = quota->weight_age; + scheme->quota.total_charged_sz = 0; + scheme->quota.total_charged_ns = 0; + scheme->quota.esz = 0; + scheme->quota.charged_sz = 0; + scheme->quota.charged_from = 0; + scheme->quota.charge_target_from = NULL; + scheme->quota.charge_addr_from = 0; + + scheme->wmarks.metric = wmarks->metric; + scheme->wmarks.interval = wmarks->interval; + scheme->wmarks.high = wmarks->high; + scheme->wmarks.mid = wmarks->mid; + scheme->wmarks.low = wmarks->low; + scheme->wmarks.activated = true; + + return scheme; +} + +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s) +{ + list_add_tail(&s->list, &ctx->schemes); +} + +static void damon_del_scheme(struct damos *s) +{ + list_del(&s->list); +} + +static void damon_free_scheme(struct damos *s) +{ + kfree(s); +} + +void damon_destroy_scheme(struct damos *s) +{ + damon_del_scheme(s); + damon_free_scheme(s); +} + /* * Construct a damon_target struct
* @@ -107,6 +164,11 @@ void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) list_add_tail(&t->list, &ctx->adaptive_targets); } +bool damon_targets_empty(struct damon_ctx *ctx) +{ + return list_empty(&ctx->adaptive_targets); +} + static void damon_del_target(struct damon_target *t) { list_del(&t->list); @@ -153,6 +215,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); + INIT_LIST_HEAD(&ctx->schemes); return ctx; } @@ -172,7 +235,13 @@ static void damon_destroy_targets(struct damon_ctx *ctx) void damon_destroy_ctx(struct damon_ctx *ctx) { + struct damos *s, *next_s; + damon_destroy_targets(ctx); + + damon_for_each_scheme_safe(s, next_s, ctx) + damon_destroy_scheme(s); + kfree(ctx); } @@ -197,7 +266,6 @@ int damon_set_targets(struct damon_ctx *ctx, for (i = 0; i < nr_ids; i++) { t = damon_new_target(ids[i]); if (!t) { - pr_err("Failed to alloc damon_target\n"); /* The caller should do cleanup of the ids itself */ damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); @@ -227,16 +295,10 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg) { - if (min_nr_reg < 3) { - pr_err("min_nr_regions (%lu) must be at least 3\n", - min_nr_reg); + if (min_nr_reg < 3) return -EINVAL; - } - if (min_nr_reg > max_nr_reg) { - pr_err("invalid nr_regions. min (%lu) > max (%lu)\n", - min_nr_reg, max_nr_reg); + if (min_nr_reg > max_nr_reg) return -EINVAL; - } ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; @@ -247,6 +309,30 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, return 0; } +/** + * damon_set_schemes() - Set data access monitoring based operation schemes. + * @ctx: monitoring context + * @schemes: array of the schemes + * @nr_schemes: number of entries in @schemes + * + * This function should not be called while the kdamond of the context is + * running. + * + * Return: 0 if success, or negative error code otherwise. + */ +int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, + ssize_t nr_schemes) +{ + struct damos *s, *next; + ssize_t i; + + damon_for_each_scheme_safe(s, next, ctx) + damon_destroy_scheme(s); + for (i = 0; i < nr_schemes; i++) + damon_add_scheme(ctx, schemes[i]); + return 0; +} + /** * damon_nr_running_ctxs() - Return number of currently running contexts. 
*/ @@ -281,17 +367,6 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) return sz; } -static bool damon_kdamond_running(struct damon_ctx *ctx) -{ - bool running; - - mutex_lock(&ctx->kdamond_lock); - running = ctx->kdamond != NULL; - mutex_unlock(&ctx->kdamond_lock); - - return running; -} - static int kdamond_fn(void *data); /* @@ -309,12 +384,11 @@ static int __damon_start(struct damon_ctx *ctx) mutex_lock(&ctx->kdamond_lock); if (!ctx->kdamond) { err = 0; - ctx->kdamond_stop = false; ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", nr_running_ctxs); if (IS_ERR(ctx->kdamond)) { err = PTR_ERR(ctx->kdamond); - ctx->kdamond = 0; + ctx->kdamond = NULL; } } mutex_unlock(&ctx->kdamond_lock); @@ -360,7 +434,7 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs) static void kdamond_usleep(unsigned long usecs) { /* See Documentation/timers/timers-howto.rst for the thresholds */ - if (usecs > 20 * 1000) + if (usecs > 20 * USEC_PER_MSEC) schedule_timeout_idle(usecs_to_jiffies(usecs)); else usleep_idle_range(usecs, usecs + 1); @@ -374,12 +448,15 @@ static void kdamond_usleep(unsigned long usecs) */ static int __damon_stop(struct damon_ctx *ctx) { + struct task_struct *tsk; + mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ctx->kdamond_stop = true; + tsk = ctx->kdamond; + if (tsk) { + get_task_struct(tsk); mutex_unlock(&ctx->kdamond_lock); - while (damon_kdamond_running(ctx)) - kdamond_usleep(ctx->sample_interval); + kthread_stop(tsk); + put_task_struct(tsk); return 0; } mutex_unlock(&ctx->kdamond_lock); @@ -446,18 +523,221 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; + unsigned int ti = 0; /* target's index */ damon_for_each_target(t, c) { struct damon_region *r; damon_for_each_region(r, t) { - trace_damon_aggregated(t, r, damon_nr_regions(t)); + trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); + r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } + ti++; } } -#define sz_damon_region(r) (r->ar.end - r->ar.start) +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r); + +static bool __damos_valid_target(struct damon_region *r, struct damos *s) +{ + unsigned long sz; + + sz = r->ar.end - r->ar.start; + return s->min_sz_region <= sz && sz <= s->max_sz_region && + s->min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->max_nr_accesses && + s->min_age_region <= r->age && r->age <= s->max_age_region; +} + +static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + bool ret = __damos_valid_target(r, s); + + if (!ret || !s->quota.esz || !c->primitive.get_scheme_score) + return ret; + + return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score; +} + +static void damon_do_apply_schemes(struct damon_ctx *c, + struct damon_target *t, + struct damon_region *r) +{ + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + unsigned long sz = r->ar.end - r->ar.start; + struct timespec64 begin, end; + unsigned long sz_applied = 0; + + if (!s->wmarks.activated) + continue; + + /* Check the quota */ + if (quota->esz && quota->charged_sz >= quota->esz) + continue; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + continue; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + 
quota->charge_addr_from = 0; + continue; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + continue; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz) { + if (r->ar.end - r->ar.start <= + DAMON_MIN_REGION) + continue; + sz = DAMON_MIN_REGION; + } + damon_split_region_at(c, t, r, sz); + r = damon_next_region(r); + sz = r->ar.end - r->ar.start; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + + if (!damos_valid_target(c, t, r, s)) + continue; + + /* Apply the scheme */ + if (c->primitive.apply_scheme) { + if (quota->esz && + quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(c, t, r, sz); + } + ktime_get_coarse_ts64(&begin); + sz_applied = c->primitive.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; + } +} + +/* Shouldn't be called if quota->ms and quota->sz are zero */ +static void damos_set_effective_quota(struct damos_quota *quota) +{ + unsigned long throughput; + unsigned long esz; + + if (!quota->ms) { + quota->esz = quota->sz; + return; + } + + if (quota->total_charged_ns) + throughput = quota->total_charged_sz * 1000000 / + quota->total_charged_ns; + else + throughput = PAGE_SIZE * 1024; + esz = throughput * quota->ms; + + if (quota->sz && quota->sz < esz) + esz = quota->sz; + quota->esz = esz; +} + +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r, *next_r; + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; + + if (!s->wmarks.activated) + continue; + + if (!quota->ms && !quota->sz) + continue; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies( + quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } + + if (!c->primitive.get_scheme_score) + continue; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->primitive.get_scheme_score( + c, t, r, s); + quota->histogram[score] += + r->ar.end - r->ar.start; + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; + } + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next_r, t) + damon_do_apply_schemes(c, t, r); + } +} + +static inline unsigned long sz_damon_region(struct damon_region *r) +{ + return r->ar.end - 
r->ar.start; +} /* * Merge two adjacent regions into one region @@ -469,12 +749,11 @@ static void damon_merge_two_regions(struct damon_target *t, l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); + l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; damon_destroy_region(r, t); } -#define diff_of(a, b) (a > b ? a - b : b - a) - /* * Merge adjacent regions having similar access frequencies * @@ -488,8 +767,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) + r->age = 0; + else + r->age++; + if (prev && prev->ar.end == r->ar.start && - diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + abs(prev->nr_accesses - r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else @@ -535,6 +819,9 @@ static void damon_split_region_at(struct damon_ctx *ctx, r->ar.end = new->ar.start; + new->age = r->age; + new->last_nr_accesses = r->last_nr_accesses; + damon_insert_region(new, r, damon_next_region(r), t); } @@ -623,12 +910,8 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx) static bool kdamond_need_stop(struct damon_ctx *ctx) { struct damon_target *t; - bool stop; - mutex_lock(&ctx->kdamond_lock); - stop = ctx->kdamond_stop; - mutex_unlock(&ctx->kdamond_lock); - if (stop) + if (kthread_should_stop()) return true; if (!ctx->primitive.target_valid) @@ -642,11 +925,73 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) return true; } -static void set_kdamond_stop(struct damon_ctx *ctx) +static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) { - mutex_lock(&ctx->kdamond_lock); - ctx->kdamond_stop = true; - mutex_unlock(&ctx->kdamond_lock); + struct sysinfo i; + + switch (metric) { + case DAMOS_WMARK_FREE_MEM_RATE: + si_meminfo(&i); + return i.freeram * 1000 / i.totalram; + default: + break; + } + return -EINVAL; +} + +/* + * Returns zero if the scheme is active. Else, returns time to wait for next + * watermark check in micro-seconds. + */ +static unsigned long damos_wmark_wait_us(struct damos *scheme) +{ + unsigned long metric; + + if (scheme->wmarks.metric == DAMOS_WMARK_NONE) + return 0; + + metric = damos_wmark_metric_value(scheme->wmarks.metric); + /* higher than high watermark or lower than low watermark */ + if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { + if (scheme->wmarks.activated) + pr_debug("deactivate a scheme (%d) for %s wmark\n", + scheme->action, + metric > scheme->wmarks.high ? 
+ "high" : "low"); + scheme->wmarks.activated = false; + return scheme->wmarks.interval; + } + + /* inactive and higher than middle watermark */ + if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) && + !scheme->wmarks.activated) + return scheme->wmarks.interval; + + if (!scheme->wmarks.activated) + pr_debug("activate a scheme (%d)\n", scheme->action); + scheme->wmarks.activated = true; + return 0; +} + +/* Returns negative error code if it's not activated but should return */ +static int kdamond_wait_activation(struct damon_ctx *ctx) +{ + struct damos *s; + unsigned long wait_time; + unsigned long min_wait_time = 0; + + while (!kdamond_need_stop(ctx)) { + damon_for_each_scheme(s, ctx) { + wait_time = damos_wmark_wait_us(s); + if (!min_wait_time || wait_time < min_wait_time) + min_wait_time = wait_time; + } + if (!min_wait_time) + return 0; + + kdamond_usleep(min_wait_time); + } + return -EBUSY; } /* @@ -659,24 +1004,26 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; + bool done = false; - mutex_lock(&ctx->kdamond_lock); - pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); - mutex_unlock(&ctx->kdamond_lock); + pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->primitive.init) ctx->primitive.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - set_kdamond_stop(ctx); + done = true; sz_limit = damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx)) { + while (!kdamond_need_stop(ctx) && !done) { + if (kdamond_wait_activation(ctx)) + continue; + if (ctx->primitive.prepare_access_checks) ctx->primitive.prepare_access_checks(ctx); if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) - set_kdamond_stop(ctx); + done = true; kdamond_usleep(ctx->sample_interval); @@ -689,7 +1036,8 @@ static int kdamond_fn(void *data) sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) - set_kdamond_stop(ctx); + done = true; + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->primitive.reset_aggregated) @@ -707,13 +1055,12 @@ static int kdamond_fn(void *data) damon_destroy_region(r, t); } - if (ctx->callback.before_terminate && - ctx->callback.before_terminate(ctx)) - set_kdamond_stop(ctx); + if (ctx->callback.before_terminate) + ctx->callback.before_terminate(ctx); if (ctx->primitive.cleanup) ctx->primitive.cleanup(ctx); - pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); @@ -722,7 +1069,7 @@ static int kdamond_fn(void *data) nr_running_ctxs--; mutex_unlock(&damon_lock); - do_exit(0); + return 0; } #include "core-test.h" diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 4eddcfa73996..86b9f9528231 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -109,9 +109,63 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) dbgfs_destroy_ctx(ctx); } +static void damon_dbgfs_test_set_init_regions(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long ids[] = {1, 2, 3}; + /* Each line represents one region in `` `` */ + char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", + "2 10 20\n", + "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + ""}; + /* Reading the file again will show sorted, clean output */ + char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", + "2 10 20\n", 
+ "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + ""}; + char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ + "2 10 20\n 2 14 26\n", /* regions overlap */ + "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char *input, *expect; + int i, rc; + char buf[256]; + + damon_set_targets(ctx, ids, 3); + + /* Put valid inputs and check the results */ + for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { + input = valid_inputs[i]; + expect = valid_expects[i]; + + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, 0); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, expect); + } + /* Put invalid inputs and check the return error code */ + for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { + input = invalid_inputs[i]; + pr_info("input: %s\n", input); + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, -EINVAL); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, ""); + } + + damon_set_targets(ctx, NULL, 0); + damon_destroy_ctx(ctx); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), KUNIT_CASE(damon_dbgfs_test_set_targets), + KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, }; diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index b039fd1f8a1d..e81bd97a6162 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -69,8 +69,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, struct damon_ctx *ctx = file->private_data; unsigned long s, a, r, minr, maxr; char *kbuf; - ssize_t ret = count; - int err; + ssize_t ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -88,9 +87,9 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - err = damon_set_attrs(ctx, s, a, r, minr, maxr); - if (err) - ret = err; + ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + if (!ret) + ret = count; unlock_out: mutex_unlock(&ctx->kdamond_lock); out: @@ -98,6 +97,184 @@ out: return ret; } +static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damos *s; + int written = 0; + int rc; + + damon_for_each_scheme(s, c) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + s->min_sz_region, s->max_sz_region, + s->min_nr_accesses, s->max_nr_accesses, + s->min_age_region, s->max_age_region, + s->action, + s->quota.ms, s->quota.sz, + s->quota.reset_interval, + s->quota.weight_sz, + s->quota.weight_nr_accesses, + s->quota.weight_age, + s->wmarks.metric, s->wmarks.interval, + s->wmarks.high, s->wmarks.mid, s->wmarks.low, + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); + if (!rc) + return -ENOMEM; + + written += rc; + } + return written; +} + +static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_schemes(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) +{ + ssize_t i; + + for (i = 0; i < nr_schemes; i++) + kfree(schemes[i]); + kfree(schemes); 
+} + +static bool damos_action_valid(int action) +{ + switch (action) { + case DAMOS_WILLNEED: + case DAMOS_COLD: + case DAMOS_PAGEOUT: + case DAMOS_HUGEPAGE: + case DAMOS_NOHUGEPAGE: + case DAMOS_STAT: + return true; + default: + return false; + } +} + +/* + * Converts a string into an array of struct damos pointers + * + * Returns an array of struct damos pointers converted from the string if the + * conversion succeeds, or NULL otherwise. + */ +static struct damos **str_to_schemes(const char *str, ssize_t len, + ssize_t *nr_schemes) +{ + struct damos *scheme, **schemes; + const int max_nr_schemes = 256; + int pos = 0, parsed, ret; + unsigned long min_sz, max_sz; + unsigned int min_nr_a, max_nr_a, min_age, max_age; + unsigned int action; + + schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), + GFP_KERNEL); + if (!schemes) + return NULL; + + *nr_schemes = 0; + while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_quota quota = {}; + struct damos_watermarks wmarks; + + ret = sscanf(&str[pos], + "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", + &min_sz, &max_sz, &min_nr_a, &max_nr_a, + &min_age, &max_age, &action, &quota.ms, + &quota.sz, &quota.reset_interval, + &quota.weight_sz, &quota.weight_nr_accesses, + &quota.weight_age, &wmarks.metric, + &wmarks.interval, &wmarks.high, &wmarks.mid, + &wmarks.low, &parsed); + if (ret != 18) + break; + if (!damos_action_valid(action)) + goto fail; + + if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + + pos += parsed; + scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, + min_age, max_age, action, &quota, &wmarks); + if (!scheme) + goto fail; + + schemes[*nr_schemes] = scheme; + *nr_schemes += 1; + } + return schemes; +fail: + free_schemes_arr(schemes, *nr_schemes); + return NULL; +} + +static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + struct damos **schemes; + ssize_t nr_schemes = 0, ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + schemes = str_to_schemes(kbuf, count, &nr_schemes); + if (!schemes) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + ret = damon_set_schemes(ctx, schemes, nr_schemes); + if (!ret) { + ret = count; + nr_schemes = 0; + } + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + free_schemes_arr(schemes, nr_schemes); +out: + kfree(kbuf); + return ret; +} + static inline bool targetid_is_pid(const struct damon_ctx *ctx) { return ctx->primitive.target_valid == damon_va_target_valid;
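For reference, str_to_schemes() consumes one scheme per line with eighteen fields in the order of the sscanf() above: the three min/max ranges (size, access frequency, age; six numbers), the action, the time/size quota and its reset interval, the three prioritization weights, and the five watermark fields. A hypothetical user-space sketch (not part of the patch) that installs a page-out scheme through the new file; the debugfs path assumes the default context, and the numeric action/metric values (DAMOS_PAGEOUT == 2, DAMOS_WMARK_NONE == 0) are assumed from the enum order suggested by damos_action_valid() above, so check the damon.h of the tree you run this against:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Regions of 4KiB or more, never accessed, at least 60 aggregation
	 * intervals old: page them out.  No quota, no watermarks. */
	const char *line =
		"4096 18446744073709551615 0 0 60 4294967295 2 "
		"0 0 0 0 0 1 0 0 0 0 0\n";
	int fd = open("/sys/kernel/debug/damon/schemes", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, line, strlen(line)) != (ssize_t)strlen(line))
		perror("write");
	close(fd);
	return 0;
}

Reading the same file back returns the installed schemes plus their statistics fields, per sprint_schemes() above.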
@@ -186,26 +363,30 @@ static ssize_t dbgfs_target_ids_write(struct file *file, { struct damon_ctx *ctx = file->private_data; struct damon_target *t, *next_t; - char *kbuf, *nrs; + bool id_is_pid = true; + char *kbuf; unsigned long *targets; ssize_t nr_targets; - ssize_t ret = count; + ssize_t ret; int i; - int err; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - nrs = kbuf; + if (!strncmp(kbuf, "paddr\n", count)) { + id_is_pid = false; + /* target id is meaningless here, but we set it just for fun */ + scnprintf(kbuf, count, "42 "); + } - targets = str_to_target_ids(nrs, ret, &nr_targets); + targets = str_to_target_ids(kbuf, count, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; } - if (targetid_is_pid(ctx)) { + if (id_is_pid) { for (i = 0; i < nr_targets; i++) { targets[i] = (unsigned long)find_get_pid( (int)targets[i]); @@ -219,7 +400,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { - if (targetid_is_pid(ctx)) + if (id_is_pid) dbgfs_put_pids(targets, nr_targets); ret = -EBUSY; goto unlock_out; @@ -232,11 +413,21 @@ static ssize_t dbgfs_target_ids_write(struct file *file, damon_destroy_target(t); } - err = damon_set_targets(ctx, targets, nr_targets); - if (err) { - if (targetid_is_pid(ctx)) + /* remove targets with previously-set primitive */ + damon_set_targets(ctx, NULL, 0); + + /* Configure the context for the address space type */ + if (id_is_pid) + damon_va_set_primitives(ctx); + else + damon_pa_set_primitives(ctx); + + ret = damon_set_targets(ctx, targets, nr_targets); + if (ret) { + if (id_is_pid) dbgfs_put_pids(targets, nr_targets); - ret = err; + } else { + ret = count; } unlock_out: @@ -248,6 +439,152 @@ out: return ret; } +static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r; + int written = 0; + int rc; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %lu\n", + t->id, r->ar.start, r->ar.end); + if (!rc) + return -ENOMEM; + written += rc; + } + } + return written; +} + +static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + mutex_unlock(&ctx->kdamond_lock); + len = -EBUSY; + goto out; + } + + len = sprint_init_regions(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static int add_init_region(struct damon_ctx *c, + unsigned long target_id, struct damon_addr_range *ar) +{ + struct damon_target *t; + struct damon_region *r, *prev; + unsigned long id; + int rc = -EINVAL; + + if (ar->start >= ar->end) + return -EINVAL; + + damon_for_each_target(t, c) { + id = t->id; + if (targetid_is_pid(c)) + id = (unsigned long)pid_vnr((struct pid *)id); + if (id == target_id) { + r = damon_new_region(ar->start, ar->end); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + if (damon_nr_regions(t) > 1) { + prev = damon_prev_region(r); + if (prev->ar.end > r->ar.start) { + damon_destroy_region(r, t); + return -EINVAL; + } + } + rc = 0; + } + } + return rc; +} + +static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r, *next; + int pos = 0, parsed, ret; + unsigned long target_id; + struct damon_addr_range ar; + int err; + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + + while (pos < len) { + ret = sscanf(&str[pos], "%lu %lu %lu%n", + &target_id, &ar.start, &ar.end, &parsed); + if (ret != 3) + break; + err = add_init_region(c, target_id, &ar); + if (err) + goto fail; + pos += parsed; + } + + return 0; + +fail: + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + return err; +} + +static ssize_t dbgfs_init_regions_write(struct file *file, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct
damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = set_init_regions(ctx, kbuf, ret); + if (err) + ret = err; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + kfree(kbuf); + return ret; +} + static ssize_t dbgfs_kdamond_pid_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -287,12 +624,24 @@ static const struct file_operations attrs_fops = { .write = dbgfs_attrs_write, }; +static const struct file_operations schemes_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_schemes_read, + .write = dbgfs_schemes_write, +}; + static const struct file_operations target_ids_fops = { .open = damon_dbgfs_open, .read = dbgfs_target_ids_read, .write = dbgfs_target_ids_write, }; +static const struct file_operations init_regions_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_init_regions_read, + .write = dbgfs_init_regions_write, +}; + static const struct file_operations kdamond_pid_fops = { .open = damon_dbgfs_open, .read = dbgfs_kdamond_pid_read, @@ -300,22 +649,22 @@ static const struct file_operations kdamond_pid_fops = { static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { - const char * const file_names[] = {"attrs", "target_ids", - "kdamond_pid"}; - const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops, - &kdamond_pid_fops}; + const char * const file_names[] = {"attrs", "schemes", "target_ids", + "init_regions", "kdamond_pid"}; + const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, + &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; int i; for (i = 0; i < ARRAY_SIZE(file_names); i++) debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); } -static int dbgfs_before_terminate(struct damon_ctx *ctx) +static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; if (!targetid_is_pid(ctx)) - return 0; + return; mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { @@ -323,7 +672,6 @@ static int dbgfs_before_terminate(struct damon_ctx *ctx) damon_destroy_target(t); } mutex_unlock(&ctx->kdamond_lock); - return 0; } static struct damon_ctx *dbgfs_new_ctx(void) @@ -401,8 +749,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file, { char *kbuf; char *ctx_name; - ssize_t ret = count; - int err; + ssize_t ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -420,9 +767,9 @@ static ssize_t dbgfs_mk_context_write(struct file *file, } mutex_lock(&damon_dbgfs_lock); - err = dbgfs_mk_context(ctx_name); - if (err) - ret = err; + ret = dbgfs_mk_context(ctx_name); + if (!ret) + ret = count; mutex_unlock(&damon_dbgfs_lock); out: @@ -507,8 +854,7 @@ static ssize_t dbgfs_rm_context_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *kbuf; - ssize_t ret = count; - int err; + ssize_t ret; char *ctx_name; kbuf = user_input_str(buf, count, ppos); @@ -527,9 +873,9 @@ static ssize_t dbgfs_rm_context_write(struct file *file, } mutex_lock(&damon_dbgfs_lock); - err = dbgfs_rm_context(ctx_name); - if (err) - ret = err; + ret = dbgfs_rm_context(ctx_name); + if (!ret) + ret = count; mutex_unlock(&damon_dbgfs_lock); out: @@ -553,9 +899,8 @@ static ssize_t dbgfs_monitor_on_read(struct file *file, static ssize_t dbgfs_monitor_on_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { 
- ssize_t ret = count; + ssize_t ret; char *kbuf; - int err; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -568,16 +913,26 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, } mutex_lock(&damon_dbgfs_lock); - if (!strncmp(kbuf, "on", count)) - err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); - else if (!strncmp(kbuf, "off", count)) - err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); - else - err = -EINVAL; + if (!strncmp(kbuf, "on", count)) { + int i; + + for (i = 0; i < dbgfs_nr_ctxs; i++) { + if (damon_targets_empty(dbgfs_ctxs[i])) { + kfree(kbuf); + mutex_unlock(&damon_dbgfs_lock); + return -EINVAL; + } + } + ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + } else if (!strncmp(kbuf, "off", count)) { + ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + } else { + ret = -EINVAL; + } mutex_unlock(&damon_dbgfs_lock); - if (err) - ret = err; + if (!ret) + ret = count; kfree(kbuf); return ret; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c new file mode 100644 index 000000000000..5e8244f65a1a --- /dev/null +++ b/mm/damon/paddr.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for The Physical Address Space + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-pa: " fmt + +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/swap.h> + +#include "../internal.h" +#include "prmtv-common.h" + +static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) + damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr); + else + damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr); + } + return true; +} + +static void damon_pa_mkold(unsigned long paddr) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct rmap_walk_control rwc = { + .rmap_one = __damon_pa_mkold, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return; + + if (!page_mapped(page) || !page_rmapping(page)) { + set_page_idle(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) + goto out; + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + +out: + put_page(page); +} + +static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, + struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_pa_mkold(r->sampling_addr); +} + +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + __damon_pa_prepare_access_check(ctx, r); + } +} + +struct damon_pa_access_chk_result { + unsigned long page_sz; + bool accessed; +}; + +static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct damon_pa_access_chk_result *result = arg; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + result->accessed = false; + result->page_sz = PAGE_SIZE; + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + result->accessed = pte_young(*pvmw.pte) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + result->accessed = pmd_young(*pvmw.pmd) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + result->page_sz = ((1UL) <<
HPAGE_PMD_SHIFT); +#else + WARN_ON_ONCE(1); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + } + if (result->accessed) { + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* If accessed, stop walking */ + return !result->accessed; +} + +static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct damon_pa_access_chk_result result = { + .page_sz = PAGE_SIZE, + .accessed = false, + }; + struct rmap_walk_control rwc = { + .arg = &result, + .rmap_one = __damon_pa_young, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return false; + + if (!page_mapped(page) || !page_rmapping(page)) { + if (page_is_idle(page)) + result.accessed = false; + else + result.accessed = true; + put_page(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) { + put_page(page); + return false; + } + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + put_page(page); + +out: + *page_sz = result.page_sz; + return result.accessed; +} + +static void __damon_pa_check_access(struct damon_ctx *ctx, + struct damon_region *r) +{ + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz)) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_addr = r->sampling_addr; +} + +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) { + __damon_pa_check_access(ctx, r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + } + + return max_nr_accesses; +} + +bool damon_pa_target_valid(void *t) +{ + return true; +} + +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + unsigned long addr, applied; + LIST_HEAD(page_list); + + if (scheme->action != DAMOS_PAGEOUT) + return 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + + ClearPageReferenced(page); + test_and_clear_page_young(page); + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + if (PageUnevictable(page)) { + putback_lru_page(page); + } else { + list_add(&page->lru, &page_list); + put_page(page); + } + } + applied = reclaim_pages(&page_list); + cond_resched(); + return applied * PAGE_SIZE; +} + +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pageout_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + +void damon_pa_set_primitives(struct damon_ctx *ctx) +{ + ctx->primitive.init = NULL; + ctx->primitive.update = NULL; + ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks; + ctx->primitive.check_accesses = damon_pa_check_accesses; + ctx->primitive.reset_aggregated = NULL; + ctx->primitive.target_valid = damon_pa_target_valid; + ctx->primitive.cleanup = NULL; + ctx->primitive.apply_scheme = damon_pa_apply_scheme; + ctx->primitive.get_scheme_score = damon_pa_scheme_score; +}
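With damon_pa_set_primitives() in place, physical-address monitoring plugs into a context exactly like the virtual-address variant. A minimal kernel-side sketch, mirroring what damon_reclaim_init()/damon_reclaim_turn() in mm/damon/reclaim.c below do; the function name and the target id 42 are illustrative, and error unwinding is trimmed:

#include <linux/damon.h>

static struct damon_ctx *example_ctx;

static int example_monitor_paddr(unsigned long start, unsigned long end)
{
	struct damon_target *t;
	struct damon_region *r;

	example_ctx = damon_new_ctx();
	if (!example_ctx)
		return -ENOMEM;
	/* Wire in the physical address space primitives defined above. */
	damon_pa_set_primitives(example_ctx);

	/* The physical address space has no natural target id; any value
	 * works, since damon_pa_target_valid() above always says true. */
	t = damon_new_target(42);
	if (!t)
		return -ENOMEM;
	damon_add_target(example_ctx, t);

	/* Monitor one physical address range. */
	r = damon_new_region(start, end);
	if (!r)
		return -ENOMEM;
	damon_add_region(r, t);

	return damon_start(&example_ctx, 1);
}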
diff --git a/mm/damon/prmtv-common.c b/mm/damon/prmtv-common.c new file mode 100644 index 000000000000..92a04f5831d6 --- /dev/null +++ b/mm/damon/prmtv-common.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> + +#include "prmtv-common.h" + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from 'page_idle_get_page()'. We steal + * rather than reuse it because the code is quite simple. + */ +struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +#define DAMON_MAX_SUBSCORE (100) +#define DAMON_MAX_AGE_IN_LOG (32) + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->aggr_interval / c->sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. + */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; +}
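damon_pageout_score() is pure integer arithmetic, so its behaviour is easy to check outside the kernel. A user-space replica for a never-accessed region (nr_accesses == 0) under the DAMON_RECLAIM defaults that appear later in this patch (aggr_interval 100ms, weight_nr_accesses 0, weight_age 1); DAMOS_MAX_SCORE is assumed to be 99 here:

#include <stdio.h>

/* Replica of the age half of damon_pageout_score() for freq_subscore == 0. */
static int coldness(unsigned int age_in_aggr_intervals)
{
	const unsigned long aggr_interval = 100000;	/* 100ms, in us */
	unsigned long age_in_sec =
		(unsigned long)age_in_aggr_intervals * aggr_interval / 1000000;
	int age_in_log, age_subscore, hotness;

	for (age_in_log = 0; age_in_log < 32 && age_in_sec;
			age_in_log++, age_in_sec >>= 1)
		;
	age_in_log *= -1;		/* frequency is zero, so negate */
	age_in_log += 32;		/* now in [0, 32] */
	age_subscore = age_in_log * 100 / 32 / 2;
	hotness = age_subscore;		/* weights: 0 * freq + 1 * age */
	hotness = hotness * 99 / 100;	/* scale to [0, DAMOS_MAX_SCORE] */
	return 99 - hotness;		/* coldness */
}

int main(void)
{
	/* A 120s-old idle region scores colder (61) than a 1s-old one (52),
	 * so DAMOS pages it out first under a tight quota. */
	printf("%d %d\n", coldness(1200), coldness(12));
	return 0;
}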
diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h new file mode 100644 index 000000000000..e790cb5f8fe0 --- /dev/null +++ b/mm/damon/prmtv-common.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include <linux/damon.h> + +struct page *damon_get_page(unsigned long pfn); + +void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); +void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c new file mode 100644 index 000000000000..183d4f3b4fde --- /dev/null +++ b/mm/damon/reclaim.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based page reclamation + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-reclaim: " fmt + +#include <linux/damon.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/workqueue.h> + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_reclaim." + +/* + * Enable or disable DAMON_RECLAIM. + * + * You can enable DAMON_RECLAIM by setting the value of this parameter as ``Y``. + * Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could + * do no real monitoring and reclamation due to the watermarks-based activation + * condition. Refer to the descriptions of the watermarks parameters below for + * this. + */ +static bool enabled __read_mostly; + +/* + * Time threshold for cold memory region identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_RECLAIM + * identifies the region as cold, and reclaims it. 120 seconds by default. + */ +static unsigned long min_age __read_mostly = 120000000; +module_param(min_age, ulong, 0600); + +/* + * Limit of time for trying the reclamation in milliseconds. + * + * DAMON_RECLAIM tries to use only up to this time within a time window + * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be + * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, + * the limit is disabled. + * + * 10 ms by default. + */ +static unsigned long quota_ms __read_mostly = 10; +module_param(quota_ms, ulong, 0600); + +/* + * Limit of size of memory for the reclamation in bytes. + * + * DAMON_RECLAIM charges the amount of memory which it tried to reclaim within + * a time window (quota_reset_interval_ms) and makes sure no more than this + * limit is tried. This can be used for limiting consumption of CPU and IO. + * If this value is zero, the limit is disabled. + * + * 128 MiB by default. + */ +static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; +module_param(quota_sz, ulong, 0600); + +/* + * The time/size quota charge reset interval in milliseconds. + * + * The charge reset interval for the quota of time (quota_ms) and size + * (quota_sz).
That is, DAMON_RECLAIM does not try reclamation for more than + * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms + * milliseconds. + * + * 1 second by default. + */ +static unsigned long quota_reset_interval_ms __read_mostly = 1000; +module_param(quota_reset_interval_ms, ulong, 0600); + +/* + * The watermarks check time interval in microseconds. + * + * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is + * enabled but inactive due to its watermarks rule. 5 seconds by default. + */ +static unsigned long wmarks_interval __read_mostly = 5000000; +module_param(wmarks_interval, ulong, 0600); + +/* + * Free memory rate (per thousand) for the high watermark. + * + * If free memory of the system in bytes per thousand bytes is higher than + * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically + * checks the watermarks. 500 (50%) by default. + */ +static unsigned long wmarks_high __read_mostly = 500; +module_param(wmarks_high, ulong, 0600); + +/* + * Free memory rate (per thousand) for the middle watermark. + * + * If free memory of the system in bytes per thousand bytes is between this and + * the low watermark, DAMON_RECLAIM becomes active, so it starts the monitoring + * and the reclaiming. 400 (40%) by default. + */ +static unsigned long wmarks_mid __read_mostly = 400; +module_param(wmarks_mid, ulong, 0600); + +/* + * Free memory rate (per thousand) for the low watermark. + * + * If free memory of the system in bytes per thousand bytes is lower than this, + * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks + * the watermarks. In that case, the system falls back to the LRU-based page + * granularity reclamation logic. 200 (20%) by default. + */ +static unsigned long wmarks_low __read_mostly = 200; +module_param(wmarks_low, ulong, 0600); + +/* + * Sampling interval for the monitoring in microseconds. + * + * The sampling interval of DAMON for the cold memory monitoring. Please refer + * to the DAMON documentation for more detail. 5 ms by default. + */ +static unsigned long sample_interval __read_mostly = 5000; +module_param(sample_interval, ulong, 0600); + +/* + * Aggregation interval for the monitoring in microseconds. + * + * The aggregation interval of DAMON for the cold memory monitoring. Please + * refer to the DAMON documentation for more detail. 100 ms by default. + */ +static unsigned long aggr_interval __read_mostly = 100000; +module_param(aggr_interval, ulong, 0600); + +/* + * Minimum number of monitoring regions. + * + * The minimal number of monitoring regions of DAMON for the cold memory + * monitoring. This can be used to set a lower bound on the monitoring + * quality. But, setting this too high could result in increased monitoring + * overhead. Please refer to the DAMON documentation for more detail. 10 by + * default. + */ +static unsigned long min_nr_regions __read_mostly = 10; +module_param(min_nr_regions, ulong, 0600); + +/* + * Maximum number of monitoring regions. + * + * The maximum number of monitoring regions of DAMON for the cold memory + * monitoring. This can be used to set an upper bound on the monitoring + * overhead. However, setting this too low could result in bad monitoring + * quality. Please refer to the DAMON documentation for more detail. 1000 by + * default. + */ +static unsigned long max_nr_regions __read_mostly = 1000; +module_param(max_nr_regions, ulong, 0600);
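Together, the three watermarks implement a hysteresis band over the free-memory metric computed by damos_wmark_metric_value() in core.c above: the scheme activates only once free memory falls below the mid watermark, then stays active until it rises above high (or drops below low, where LRU-based reclaim is the better fit). A user-space replica of that decision with the defaults above; the memory figures are placeholders for what si_meminfo() would report:

#include <stdio.h>

static const unsigned long high = 500, mid = 400, low = 200;	/* defaults */

/* Mirror of the DAMOS_WMARK_FREE_MEM_RATE checks in damos_wmark_wait_us(). */
static const char *wmark_state(unsigned long freeram, unsigned long totalram,
			       int was_active)
{
	unsigned long metric = freeram * 1000 / totalram;

	if (metric > high || metric < low)
		return "inactive";	/* plenty free, or critically low */
	if (metric >= mid && !was_active)
		return "inactive";	/* in [mid, high]: wait for mid */
	return "active";		/* below mid, or already running */
}

int main(void)
{
	const unsigned long gib = 1UL << 30, total = 8 * gib;

	printf("%s\n", wmark_state(5 * gib, total, 0));	/* 625: inactive */
	printf("%s\n", wmark_state(3 * gib, total, 0));	/* 375: active */
	printf("%s\n", wmark_state(4 * gib, total, 1));	/* 500: stays active */
	printf("%s\n", wmark_state(1 * gib, total, 1));	/* 125: inactive */
	return 0;
}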
+ +/* + * Start of the target memory region in physical address. + * + * The start physical address of the memory region that DAMON_RECLAIM will do + * work against. By default, the biggest System RAM region is used as the + * target. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of the memory region that DAMON_RECLAIM will do + * work against. By default, the biggest System RAM region is used as the + * target. + */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +/* + * Number of memory regions that DAMON_RECLAIM tried to reclaim. + */ +static unsigned long nr_reclaim_tried_regions __read_mostly; +module_param(nr_reclaim_tried_regions, ulong, 0400); + +/* + * Total bytes of the memory regions that DAMON_RECLAIM tried to reclaim. + */ +static unsigned long bytes_reclaim_tried_regions __read_mostly; +module_param(bytes_reclaim_tried_regions, ulong, 0400); + +/* + * Number of memory regions that were successfully reclaimed. + */ +static unsigned long nr_reclaimed_regions __read_mostly; +module_param(nr_reclaimed_regions, ulong, 0400); + +/* + * Total bytes of the memory regions that were successfully reclaimed. + */ +static unsigned long bytes_reclaimed_regions __read_mostly; +module_param(bytes_reclaimed_regions, ulong, 0400); + +/* + * Number of times that the time/space quota limits have been exceeded. + */ +static unsigned long nr_quota_exceeds __read_mostly; +module_param(nr_quota_exceeds, ulong, 0400); + +static struct damon_ctx *ctx; +static struct damon_target *target; + +struct damon_reclaim_ram_walk_arg { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_reclaim_ram_walk_arg *a = arg; + + if (a->end - a->start < res->end - res->start) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find the biggest 'System RAM' resource and store its start and end address + * in @start and @end, respectively. If no System RAM is found, returns false. + */ +static bool get_monitoring_region(unsigned long *start, unsigned long *end) +{ + struct damon_reclaim_ram_walk_arg arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +static struct damos *damon_reclaim_new_scheme(void) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try reclamation for more than quota_ms milliseconds + * or quota_sz bytes within quota_reset_interval_ms. + */ + .ms = quota_ms, + .sz = quota_sz, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, page out older regions first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and not accessed at all */ + 0, 0, + /* for min_age or more micro-seconds, and */ + min_age / aggr_interval, UINT_MAX, + /* page out those, as soon as found */ + DAMOS_PAGEOUT, + /* under the quota.
*/ + "a, + /* (De)activate this according to the watermarks. */ + &wmarks); + + return scheme; +} + +static int damon_reclaim_turn(bool on) +{ + struct damon_region *region; + struct damos *scheme; + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, + min_nr_regions, max_nr_regions); + if (err) + return err; + + if (monitor_region_start > monitor_region_end) + return -EINVAL; + if (!monitor_region_start && !monitor_region_end && + !get_monitoring_region(&monitor_region_start, + &monitor_region_end)) + return -EINVAL; + /* DAMON will free this on its own when finish monitoring */ + region = damon_new_region(monitor_region_start, monitor_region_end); + if (!region) + return -ENOMEM; + damon_add_region(region, target); + + /* Will be freed by 'damon_set_schemes()' below */ + scheme = damon_reclaim_new_scheme(); + if (!scheme) { + err = -ENOMEM; + goto free_region_out; + } + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + goto free_scheme_out; + + err = damon_start(&ctx, 1); + if (!err) { + kdamond_pid = ctx->kdamond->pid; + return 0; + } + +free_scheme_out: + damon_destroy_scheme(scheme); +free_region_out: + damon_destroy_region(region, target); + return err; +} + +#define ENABLE_CHECK_INTERVAL_MS 1000 +static struct delayed_work damon_reclaim_timer; +static void damon_reclaim_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if (last_enabled != now_enabled) { + if (!damon_reclaim_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } + + if (enabled) + schedule_delayed_work(&damon_reclaim_timer, + msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); +} +static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); + +static int enabled_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + + if (enabled) + schedule_delayed_work(&damon_reclaim_timer, 0); + + return 0; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_RECLAIM (default: disabled)"); + +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + nr_reclaim_tried_regions = s->stat.nr_tried; + bytes_reclaim_tried_regions = s->stat.sz_tried; + nr_reclaimed_regions = s->stat.nr_applied; + bytes_reclaimed_regions = s->stat.sz_applied; + nr_quota_exceeds = s->stat.qt_exceeds; + } + return 0; +} + +static int __init damon_reclaim_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + damon_pa_set_primitives(ctx); + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; + + /* 4242 means nothing but fun */ + target = damon_new_target(4242); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + schedule_delayed_work(&damon_reclaim_timer, 0); + return 0; +} + +module_init(damon_reclaim_init); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 1f5c13257dba..6a1b9272ea12 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -135,7 +135,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test, struct damon_addr_range *three_regions, unsigned long *expected, 
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index 1f5c13257dba..6a1b9272ea12 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -135,7 +135,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
 		struct damon_addr_range *three_regions,
 		unsigned long *expected, int nr_expected)
 {
-	struct damon_ctx *ctx = damon_new_ctx();
 	struct damon_target *t;
 	struct damon_region *r;
 	int i;
@@ -145,7 +144,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
 		r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
 		damon_add_region(r, t);
 	}
-	damon_add_target(ctx, t);
 
 	damon_va_apply_three_regions(t, three_regions);
 
@@ -154,8 +152,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
 		KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]);
 		KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
 	}
-
-	damon_destroy_ctx(ctx);
 }
 
 /*
@@ -233,7 +229,7 @@ static void damon_test_apply_three_regions3(struct kunit *test)
  * and 70-100) has totally freed and mapped to different area (30-32 and
  * 65-68). The target regions which were in the old second and third big
  * regions should now be removed and new target regions covering the new second
- * and third big regions should be crated.
+ * and third big regions should be created.
  */
 static void damon_test_apply_three_regions4(struct kunit *test)
 {
@@ -252,60 +248,59 @@ static void damon_test_apply_three_regions4(struct kunit *test)
 			new_three_regions, expected, ARRAY_SIZE(expected));
 }
 
-static void damon_test_split_evenly(struct kunit *test)
+static void damon_test_split_evenly_fail(struct kunit *test,
+		unsigned long start, unsigned long end, unsigned int nr_pieces)
 {
-	struct damon_ctx *c = damon_new_ctx();
-	struct damon_target *t;
-	struct damon_region *r;
-	unsigned long i;
-
-	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5),
-			-EINVAL);
-
-	t = damon_new_target(42);
-	r = damon_new_region(0, 100);
-	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL);
+	struct damon_target *t = damon_new_target(42);
+	struct damon_region *r = damon_new_region(start, end);
 
 	damon_add_region(r, t);
-	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0);
-	KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u);
-
-	i = 0;
-	damon_for_each_region(r, t) {
-		KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10);
-		KUNIT_EXPECT_EQ(test, r->ar.end, i * 10);
-	}
-	damon_free_target(t);
-
-	t = damon_new_target(42);
-	r = damon_new_region(5, 59);
-	damon_add_region(r, t);
-	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0);
-	KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u);
-
-	i = 0;
-	damon_for_each_region(r, t) {
-		if (i == 4)
-			break;
-		KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++);
-		KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i);
-	}
-	KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i);
-	KUNIT_EXPECT_EQ(test, r->ar.end, 59ul);
-	damon_free_target(t);
-
-	t = damon_new_target(42);
-	r = damon_new_region(5, 6);
-	damon_add_region(r, t);
-	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL);
+	KUNIT_EXPECT_EQ(test,
+			damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL);
 	KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u);
 	damon_for_each_region(r, t) {
-		KUNIT_EXPECT_EQ(test, r->ar.start, 5ul);
-		KUNIT_EXPECT_EQ(test, r->ar.end, 6ul);
+		KUNIT_EXPECT_EQ(test, r->ar.start, start);
+		KUNIT_EXPECT_EQ(test, r->ar.end, end);
 	}
+
 	damon_free_target(t);
-
-	damon_destroy_ctx(c);
+}
+
+static void damon_test_split_evenly_succ(struct kunit *test,
+		unsigned long start, unsigned long end, unsigned int nr_pieces)
+{
+	struct damon_target *t = damon_new_target(42);
+	struct damon_region *r = damon_new_region(start, end);
+	unsigned long expected_width = (end - start) / nr_pieces;
+	unsigned long i = 0;
+
+	damon_add_region(r, t);
+	KUNIT_EXPECT_EQ(test,
+
damon_va_evenly_split_region(t, r, nr_pieces), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); + + damon_for_each_region(r, t) { + if (i == nr_pieces - 1) + break; + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i++ * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); + } + KUNIT_EXPECT_EQ(test, r->ar.start, start + i * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, end); + damon_free_target(t); +} + +static void damon_test_split_evenly(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + damon_test_split_evenly_fail(test, 0, 100, 0); + damon_test_split_evenly_succ(test, 0, 100, 10); + damon_test_split_evenly_succ(test, 5, 59, 5); + damon_test_split_evenly_fail(test, 5, 6, 2); } static struct kunit_case damon_test_cases[] = { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 1945196fd743..c0dec53b2330 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -7,31 +7,29 @@ #define pr_fmt(fmt) "damon-va: " fmt -#include -#include -#include -#include +#include #include +#include +#include #include #include -#include #include -#include + +#include "prmtv-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - /* * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ -#define damon_get_task_struct(t) \ - (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task((struct pid *)t->id, PIDTYPE_PID); +} /* * Get the mm_struct of the given target @@ -102,16 +100,6 @@ static unsigned long sz_range(struct damon_addr_range *r) return r->end - r->start; } -static void swap_ranges(struct damon_addr_range *r1, - struct damon_addr_range *r2) -{ - struct damon_addr_range tmp; - - tmp = *r1; - *r1 = *r2; - *r2 = tmp; -} - /* * Find three regions separated by two biggest unmapped regions * @@ -150,9 +138,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma, gap.start = last_vma->vm_end; gap.end = vma->vm_start; if (sz_range(&gap) > sz_range(&second_gap)) { - swap_ranges(&gap, &second_gap); + swap(gap, second_gap); if (sz_range(&second_gap) > sz_range(&first_gap)) - swap_ranges(&second_gap, &first_gap); + swap(second_gap, first_gap); } next: last_vma = vma; @@ -163,7 +151,7 @@ next: /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) - swap_ranges(&first_gap, &second_gap); + swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); @@ -244,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t, static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { + struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; - int i; + int i, tidx = 0; if (damon_va_three_regions(t, regions)) { - pr_err("Failed to get three regions of target %lu\n", t->id); + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + tidx++; + } + pr_debug("Failed to get three regions of %dth target\n", tidx); return; } @@ -276,7 +270,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, } /* Initialize '->regions_list' of every target (task) */ -void damon_va_init(struct damon_ctx *ctx) +static void 
damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; @@ -296,7 +290,8 @@ void damon_va_init(struct damon_ctx *ctx) * * Returns true if it is. */ -static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) { return !(r->ar.end <= re->start || re->end <= r->ar.start); } @@ -311,7 +306,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, struct damon_addr_range bregions[3]) { struct damon_region *r, *next; - unsigned int i = 0; + unsigned int i; /* Remove regions which are not in the three big regions now */ damon_for_each_region_safe(r, next, t) { @@ -360,7 +355,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, /* * Update regions for current memory mappings */ -void damon_va_update(struct damon_ctx *ctx) +static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; @@ -372,82 +367,6 @@ void damon_va_update(struct damon_ctx *ctx) } } -/* - * Get an online page for a pfn if it's in the LRU list. Otherwise, returns - * NULL. - * - * The body of this function is stolen from the 'page_idle_get_page()'. We - * steal rather than reuse it because the code is quite simple. - */ -static struct page *damon_get_page(unsigned long pfn) -{ - struct page *page = pfn_to_online_page(pfn); - - if (!page || !PageLRU(page) || !get_page_unless_zero(page)) - return NULL; - - if (unlikely(!PageLRU(page))) { - put_page(page); - page = NULL; - } - return page; -} - -static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, - unsigned long addr) -{ - bool referenced = false; - struct page *page = damon_get_page(pte_pfn(*pte)); - - if (!page) - return; - - if (pte_young(*pte)) { - referenced = true; - *pte = pte_mkold(*pte); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) - set_page_young(page); - - set_page_idle(page); - put_page(page); -} - -static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, - unsigned long addr) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool referenced = false; - struct page *page = damon_get_page(pmd_pfn(*pmd)); - - if (!page) - return; - - if (pmd_young(*pmd)) { - referenced = true; - *pmd = pmd_mkold(*pmd); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, - addr + ((1UL) << HPAGE_PMD_SHIFT))) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) - set_page_young(page); - - set_page_idle(page); - put_page(page); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -} - static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -480,8 +399,64 @@ out: return 0; } -static struct mm_walk_ops damon_mkold_ops = { +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + if (!page) + return; + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + set_huge_pte_at(mm, addr, pte, entry); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); 
+} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + +static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) @@ -495,7 +470,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void damon_va_prepare_access_check(struct damon_ctx *ctx, +static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -503,7 +478,7 @@ static void damon_va_prepare_access_check(struct damon_ctx *ctx, damon_va_mkold(mm, r->sampling_addr); } -void damon_va_prepare_access_checks(struct damon_ctx *ctx) +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -514,7 +489,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(ctx, mm, r); mmput(mm); } } @@ -581,8 +556,47 @@ out: return 0; } -static struct mm_walk_ops damon_young_ops = { +#ifdef CONFIG_HUGETLB_PAGE +static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct damon_young_walk_private *priv = walk->private; + struct hstate *h = hstate_vma(walk->vma); + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + page = pte_page(entry); + if (!page) + goto out; + + get_page(page); + + if (pte_young(entry) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = huge_page_size(h); + priv->young = true; + } + + put_page(page); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_young_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + +static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, @@ -605,7 +619,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void damon_va_check_access(struct damon_ctx *ctx, +static void __damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { static struct mm_struct *last_mm; @@ -629,7 +643,7 @@ static void damon_va_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -641,7 +655,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - 
damon_va_check_access(ctx, mm, r); + __damon_va_check_access(ctx, mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); @@ -668,6 +682,78 @@ bool damon_va_target_valid(void *target) return false; } +#ifndef CONFIG_ADVISE_SYSCALLS +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) +{ + return 0; +} +#else +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) +{ + struct mm_struct *mm; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long applied; + + mm = damon_get_mm(target); + if (!mm) + return 0; + + applied = do_madvise(mm, start, len, behavior) ? 0 : len; + mmput(mm); + + return applied; +} +#endif /* CONFIG_ADVISE_SYSCALLS */ + +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + int madv_action; + + switch (scheme->action) { + case DAMOS_WILLNEED: + madv_action = MADV_WILLNEED; + break; + case DAMOS_COLD: + madv_action = MADV_COLD; + break; + case DAMOS_PAGEOUT: + madv_action = MADV_PAGEOUT; + break; + case DAMOS_HUGEPAGE: + madv_action = MADV_HUGEPAGE; + break; + case DAMOS_NOHUGEPAGE: + madv_action = MADV_NOHUGEPAGE; + break; + case DAMOS_STAT: + return 0; + default: + return 0; + } + + return damos_madvise(t, r, madv_action); +} + +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pageout_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + void damon_va_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = damon_va_init; @@ -677,6 +763,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.reset_aggregated = NULL; ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; + ctx->primitive.apply_scheme = damon_va_apply_scheme; + ctx->primitive.get_scheme_score = damon_va_scheme_score; } #include "vaddr-test.h" diff --git a/mm/debug.c b/mm/debug.c index fae0f81ad831..e0c3f6327571 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -266,6 +266,7 @@ void dump_mm(const struct mm_struct *mm) mm->def_flags, &mm->def_flags ); } +EXPORT_SYMBOL(dump_mm); static bool page_init_poisoning __read_mostly = true; diff --git a/mm/filemap.c b/mm/filemap.c index 30c9c2a63746..d497061e54fa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1272,7 +1272,7 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, /* How many times do we accept lock stealing from under a waiter? */ int sysctl_page_lock_unfairness = 5; -static inline int wait_on_page_bit_common(wait_queue_head_t *q, +static inline __sched int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, enum behavior behavior) { int unfairness = sysctl_page_lock_unfairness; @@ -1411,14 +1411,14 @@ repeat: return wait->flags & WQ_FLAG_WOKEN ? 
0 : -EINTR; } -void wait_on_page_bit(struct page *page, int bit_nr) +__sched void wait_on_page_bit(struct page *page, int bit_nr) { wait_queue_head_t *q = page_waitqueue(page); wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } EXPORT_SYMBOL(wait_on_page_bit); -int wait_on_page_bit_killable(struct page *page, int bit_nr) +__sched int wait_on_page_bit_killable(struct page *page, int bit_nr) { wait_queue_head_t *q = page_waitqueue(page); return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED); @@ -1641,7 +1641,7 @@ EXPORT_SYMBOL_GPL(page_endio); * __lock_page - get a lock on the page, assuming we need to sleep to get it * @__page: the page to lock */ -void __lock_page(struct page *__page) +__sched void __lock_page(struct page *__page) { struct page *page = compound_head(__page); wait_queue_head_t *q = page_waitqueue(page); @@ -1650,7 +1650,7 @@ void __lock_page(struct page *__page) } EXPORT_SYMBOL(__lock_page); -int __lock_page_killable(struct page *__page) +__sched int __lock_page_killable(struct page *__page) { struct page *page = compound_head(__page); wait_queue_head_t *q = page_waitqueue(page); @@ -1659,7 +1659,7 @@ int __lock_page_killable(struct page *__page) } EXPORT_SYMBOL_GPL(__lock_page_killable); -int __lock_page_async(struct page *page, struct wait_page_queue *wait) +__sched int __lock_page_async(struct page *page, struct wait_page_queue *wait) { struct wait_queue_head *q = page_waitqueue(page); int ret = 0; @@ -1696,7 +1696,7 @@ int __lock_page_async(struct page *page, struct wait_page_queue *wait) * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 * with the page locked and the mmap_lock unperturbed. */ -int __lock_page_or_retry(struct page *page, struct mm_struct *mm, +__sched int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { if (fault_flag_allow_retry_first(flags)) { @@ -3027,7 +3027,9 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * - * vma->vm_mm->mmap_lock must be held on entry. + * If FAULT_FLAG_SPECULATIVE is set, this function runs within an rcu + * read locked section and with mmap lock not held. + * Otherwise, vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). @@ -3045,6 +3047,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; + struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; pgoff_t max_off; @@ -3052,6 +3055,52 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) vm_fault_t ret = 0; bool mapping_locked = false; + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + page = find_get_page(mapping, offset); + if (unlikely(!page)) + return VM_FAULT_RETRY; + + if (unlikely(PageReadahead(page))) + goto page_put; + + if (!trylock_page(page)) + goto page_put; + + if (unlikely(compound_head(page)->mapping != mapping)) + goto page_unlock; + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + if (unlikely(!PageUptodate(page))) + goto page_unlock; + + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) + goto page_unlock; + + /* + * Update readahead mmap_miss statistic. 
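+		 * The mmap_miss counter feeds the readahead heuristics in
+		 * do_sync_mmap_readahead() and do_async_mmap_readahead();
+		 * decrementing it here records that this fault was served
+		 * from the page cache.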
+ * + * Note that we are not sure if finish_fault() will + * manage to complete the transaction. If it fails, + * we'll come back to filemap_fault() non-speculative + * case which will update mmap_miss a second time. + * This is not ideal, we would prefer to guarantee the + * update will happen exactly once. + */ + if (!(vmf->vma->vm_flags & VM_RAND_READ) && ra->ra_pages) { + unsigned int mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss) + WRITE_ONCE(ra->mmap_miss, --mmap_miss); + } + + vmf->page = page; + return VM_FAULT_LOCKED; +page_unlock: + unlock_page(page); +page_put: + put_page(page); + return VM_FAULT_RETRY; + } + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) return VM_FAULT_SIGBUS; @@ -3293,25 +3342,31 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - pgoff_t last_pgoff = start_pgoff; + pgoff_t last_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); - vm_fault_t ret = 0; + vm_fault_t ret = (vmf->flags & FAULT_FLAG_SPECULATIVE) ? + VM_FAULT_RETRY : 0; - rcu_read_lock(); + /* filemap_map_pages() is called within an rcu read lock already. */ head = first_map_page(mapping, &xas, end_pgoff); if (!head) - goto out; + return ret; - if (filemap_map_pmd(vmf, head)) { - ret = VM_FAULT_NOPAGE; - goto out; + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && + filemap_map_pmd(vmf, head)) + return VM_FAULT_NOPAGE; + + if (!pte_map_lock(vmf)) { + unlock_page(head); + put_page(head); + return VM_FAULT_RETRY; } + addr = vmf->address; + last_pgoff = vmf->pgoff; - addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); do { page = find_subpage(head, xas.xa_index); if (PageHWPoison(page)) @@ -3341,8 +3396,7 @@ unlock: put_page(head); } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); pte_unmap_unlock(vmf->pte, vmf->ptl); -out: - rcu_read_unlock(); + vmf->pte = NULL; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; } @@ -3378,6 +3432,7 @@ const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, + .speculative = true, }; /* This is used for a general mmap of a disk file */ diff --git a/mm/gup.c b/mm/gup.c index 2370565a81dc..b2094b55a6f0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1730,6 +1730,35 @@ out: } EXPORT_SYMBOL(fault_in_writeable); +/** + * fault_in_subpage_writeable - fault in an address range for writing + * @uaddr: start of address range + * @size: size of address range + * + * Fault in a user address range for writing while checking for permissions at + * sub-page granularity (e.g. arm64 MTE). This function should be used when + * the caller cannot guarantee forward progress of a copy_to_user() loop. + * + * Returns the number of bytes not faulted in (like copy_to_user() and + * copy_from_user()). + */ +size_t fault_in_subpage_writeable(char __user *uaddr, size_t size) +{ + size_t faulted_in; + + /* + * Attempt faulting in at page granularity first for page table + * permission checking. The arch-specific probe_subpage_writeable() + * functions may not check for this. 
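+	 * For example, on arm64 with MTE, a page can be writable at the PTE
+	 * level while an individual 16-byte granule within it still carries
+	 * a mismatching tag.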
+ */ + faulted_in = size - fault_in_writeable(uaddr, size); + if (faulted_in) + faulted_in -= probe_subpage_writeable(uaddr, faulted_in); + + return size - faulted_in; +} +EXPORT_SYMBOL(fault_in_subpage_writeable); + /* * fault_in_safe_writeable - fault in an address range for writing * @uaddr: start of address range diff --git a/mm/highmem.c b/mm/highmem.c index 4f942678e9da..0695b972973c 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -564,7 +564,7 @@ void *__kmap_local_page_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(__kmap_local_page_prot); -void kunmap_local_indexed(void *vaddr) +void kunmap_local_indexed(const void *vaddr) { unsigned long addr = (unsigned long) vaddr & PAGE_MASK; pte_t *kmap_pte; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 07941a1540cb..7f14b4121fd3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struct page *head, int tail, #ifdef CONFIG_64BIT (1L << PG_arch_2) | #endif - (1L << PG_dirty))); + (1L << PG_dirty) | + LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first tail page is compound_mapcount */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8599f16d4aa4..241ad2070bbf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1312,7 +1312,8 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, if (hugetlb_cma[nid]) { page = cma_alloc(hugetlb_cma[nid], nr_pages, - huge_page_order(h), true); + huge_page_order(h), + GFP_KERNEL | __GFP_NOWARN); if (page) return page; } @@ -1323,7 +1324,8 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, continue; page = cma_alloc(hugetlb_cma[node], nr_pages, - huge_page_order(h), true); + huge_page_order(h), + GFP_KERNEL | __GFP_NOWARN); if (page) return page; } diff --git a/mm/internal.h b/mm/internal.h index cf3cb933eba3..5c73246a092e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -35,6 +35,7 @@ void page_writeback_init(void); vm_fault_t do_swap_page(struct vm_fault *vmf); +void activate_page(struct page *page); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index adcd9acaef61..d4837bff3b60 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,15 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) -obj-$(CONFIG_KASAN) := common.o report.o +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla) + +CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST) +CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST) + +obj-y := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o + +obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o +obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2baf121fb8c5..b37c79530923 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -30,20 +30,19 @@ #include "kasan.h" #include "../slab.h" -depot_stack_handle_t kasan_save_stack(gfp_t flags) +depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) { unsigned long entries[KASAN_STACK_DEPTH]; unsigned int nr_entries; nr_entries = 
stack_trace_save(entries, ARRAY_SIZE(entries), 0); - nr_entries = filter_irq_stacks(entries, nr_entries); - return stack_depot_save(entries, nr_entries, flags); + return __stack_depot_save(entries, nr_entries, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) { track->pid = current->pid; - track->stack = kasan_save_stack(flags); + track->stack = kasan_save_stack(flags, true); } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) @@ -89,128 +88,31 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -/* - * Only allow cache merging when stack collection is disabled and no metadata - * is present. - */ -slab_flags_t __kasan_never_merge(void) -{ - if (kasan_stack_collection_enabled()) - return SLAB_KASAN; - return 0; -} - -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; if (unlikely(PageHighMem(page))) - return; + return false; + + if (!kasan_sample_page_alloc(order)) + return false; tag = kasan_random_tag(); + kasan_unpoison(set_tag(page_address(page), tag), + PAGE_SIZE << order, init); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison(page_address(page), PAGE_SIZE << order, init); + + return true; } void __kasan_poison_pages(struct page *page, unsigned int order, bool init) { if (likely(!PageHighMem(page))) kasan_poison(page_address(page), PAGE_SIZE << order, - KASAN_FREE_PAGE, init); -} - -/* - * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. - * For larger allocations larger redzones are used. - */ -static inline unsigned int optimal_redzone(unsigned int object_size) -{ - return - object_size <= 64 - 16 ? 16 : - object_size <= 128 - 32 ? 32 : - object_size <= 512 - 64 ? 64 : - object_size <= 4096 - 128 ? 128 : - object_size <= (1 << 14) - 256 ? 256 : - object_size <= (1 << 15) - 512 ? 512 : - object_size <= (1 << 16) - 1024 ? 1024 : 2048; -} - -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags) -{ - unsigned int ok_size; - unsigned int optimal_size; - - /* - * SLAB_KASAN is used to mark caches as ones that are sanitized by - * KASAN. Currently this flag is used in two places: - * 1. In slab_ksize() when calculating the size of the accessible - * memory within the object. - * 2. In slab_common.c to prevent merging of sanitized caches. - */ - *flags |= SLAB_KASAN; - - if (!kasan_stack_collection_enabled()) - return; - - ok_size = *size; - - /* Add alloc meta into redzone. */ - cache->kasan_info.alloc_meta_offset = *size; - *size += sizeof(struct kasan_alloc_meta); - - /* - * If alloc meta doesn't fit, don't add it. - * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal - * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for - * larger sizes. - */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.alloc_meta_offset = 0; - *size = ok_size; - /* Continue, since free meta might still fit. */ - } - - /* Only the generic mode uses free meta or flexible redzones. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - return; - } - - /* - * Add free meta into redzone when it's not possible to store - * it in the object. This is the case when: - * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can - * be touched after it was freed, or - * 2. 
Object has a constructor, which means it's expected to - * retain its content until the next allocation, or - * 3. Object is too small. - * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. - */ - if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta)) { - ok_size = *size; - - cache->kasan_info.free_meta_offset = *size; - *size += sizeof(struct kasan_free_meta); - - /* If free meta doesn't fit, don't add it. */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - *size = ok_size; - } - } - - /* Calculate size with optimal redzone. */ - optimal_size = cache->object_size + optimal_redzone(cache->object_size); - /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ - if (optimal_size > KMALLOC_MAX_SIZE) - optimal_size = KMALLOC_MAX_SIZE; - /* Use optimal size if the size with added metas is not large enough. */ - if (*size < optimal_size) - *size = optimal_size; + KASAN_PAGE_FREE, init); } void __kasan_cache_create_kmalloc(struct kmem_cache *cache) @@ -218,35 +120,6 @@ void __kasan_cache_create_kmalloc(struct kmem_cache *cache) cache->kasan_info.is_kmalloc = true; } -size_t __kasan_metadata_size(struct kmem_cache *cache) -{ - if (!kasan_stack_collection_enabled()) - return 0; - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - (cache->kasan_info.free_meta_offset ? - sizeof(struct kasan_free_meta) : 0); -} - -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object) -{ - if (!cache->kasan_info.alloc_meta_offset) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset; -} - -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object) -{ - BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; -} -#endif - void __kasan_poison_slab(struct page *page) { unsigned long i; @@ -254,7 +127,7 @@ void __kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); kasan_poison(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE, false); + KASAN_SLAB_REDZONE, false); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) @@ -265,7 +138,7 @@ void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_REDZONE, false); + KASAN_SLAB_REDZONE, false); } /* @@ -298,7 +171,7 @@ static inline u8 assign_tag(struct kmem_cache *cache, /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ #ifdef CONFIG_SLAB /* For SLAB assign tags based on the object index in the freelist. 
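 	 * (virt_to_head_page() below makes sure a pointer into a compound
 	 * slab page resolves to the head page, so the computed object index
 	 * stays correct.)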
*/ - return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); + return (u8)obj_to_index(cache, virt_to_head_page(object), (void *)object); #else /* * For SLUB assign a random tag during slab creation, otherwise reuse @@ -311,13 +184,9 @@ static inline u8 assign_tag(struct kmem_cache *cache, void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_meta; - - if (kasan_stack_collection_enabled()) { - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - __memset(alloc_meta, 0, sizeof(*alloc_meta)); - } + /* Initialize per-object metadata if it is present. */ + if (kasan_requires_meta()) + kasan_init_object_meta(cache, object); /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ object = set_tag(object, assign_tag(cache, object, true)); @@ -328,13 +197,11 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine, bool init) { - u8 tag; void *tagged_object; if (!kasan_arch_is_ready()) return false; - tag = get_tag(object); tagged_object = object; object = kasan_reset_tag(object); @@ -343,7 +210,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != object)) { - kasan_report_invalid_free(tagged_object, ip); + kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE); return true; } @@ -352,18 +219,18 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (!kasan_byte_accessible(tagged_object)) { - kasan_report_invalid_free(tagged_object, ip); + kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE); return true; } kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_FREE, init); + KASAN_SLAB_FREE, init); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; if (kasan_stack_collection_enabled()) - kasan_set_free_info(cache, object, tag); + kasan_save_free_info(cache, tagged_object); return kasan_quarantine_put(cache, object); } @@ -377,17 +244,17 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) { - kasan_report_invalid_free(ptr, ip); + kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; } if (!kasan_byte_accessible(ptr)) { - kasan_report_invalid_free(ptr, ip); + kasan_report_invalid_free(ptr, ip, KASAN_REPORT_DOUBLE_FREE); return true; } /* - * The object will be poisoned by kasan_free_pages() or + * The object will be poisoned by kasan_poison_pages() or * kasan_slab_free_mempool(). */ @@ -414,26 +281,12 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) if (unlikely(!PageSlab(page))) { if (____kasan_kfree_large(ptr, ip)) return; - kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false); + kasan_poison(ptr, page_size(page), KASAN_PAGE_FREE, false); } else { ____kasan_slab_free(page->slab_cache, ptr, ip, false, false); } } -static void set_alloc_info(struct kmem_cache *cache, void *object, - gfp_t flags, bool is_kmalloc) -{ - struct kasan_alloc_meta *alloc_meta; - - /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). 
*/ - if (cache->kasan_info.is_kmalloc && !is_kmalloc) - return; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags, bool init) { @@ -463,8 +316,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ - if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, false); + if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) + kasan_save_alloc_info(cache, tagged_object, flags); return tagged_object; } @@ -503,14 +356,14 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, redzone_end = round_up((unsigned long)(object + cache->object_size), KASAN_GRANULE_SIZE); kasan_poison((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE, false); + KASAN_SLAB_REDZONE, false); /* * Save alloc info (if possible) for kmalloc() allocations. * This also rewrites the alloc info when called from kasan_krealloc(). */ - if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, true); + if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) + kasan_save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; @@ -536,7 +389,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, return NULL; /* - * The object has already been unpoisoned by kasan_alloc_pages() for + * The object has already been unpoisoned by kasan_unpoison_pages() for * alloc_pages() or by kasan_krealloc() for krealloc(). */ diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index c3f5ba7a294a..cf274d7f37b7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,7 +328,140 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); -void kasan_record_aux_stack(void *addr) +/* Only allow cache merging when no per-object metadata is present. */ +slab_flags_t kasan_never_merge(void) +{ + if (!kasan_requires_meta()) + return 0; + return SLAB_KASAN; +} + +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static inline unsigned int optimal_redzone(unsigned int object_size) +{ + return + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; +} + +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) +{ + unsigned int ok_size; + unsigned int optimal_size; + + if (!kasan_requires_meta()) + return; + + /* + * SLAB_KASAN is used to mark caches that are sanitized by KASAN + * and that thus have per-object metadata. + * Currently this flag is used in two places: + * 1. In slab_ksize() to account for per-object metadata when + * calculating the size of the accessible memory within the object. + * 2. In slab_common.c via kasan_never_merge() to prevent merging of + * caches with per-object metadata. + */ + *flags |= SLAB_KASAN; + + ok_size = *size; + + /* Add alloc meta into redzone. 
*/ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* + * If alloc meta doesn't fit, don't add it. + * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal + * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for + * larger sizes. + */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.alloc_meta_offset = 0; + *size = ok_size; + /* Continue, since free meta might still fit. */ + } + + /* + * Add free meta into redzone when it's not possible to store + * it in the object. This is the case when: + * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can + * be touched after it was freed, or + * 2. Object has a constructor, which means it's expected to + * retain its content until the next allocation, or + * 3. Object is too small. + * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. + */ + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + ok_size = *size; + + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + + /* If free meta doesn't fit, don't add it. */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + *size = ok_size; + } + } + + /* Calculate size with optimal redzone. */ + optimal_size = cache->object_size + optimal_redzone(cache->object_size); + /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ + if (optimal_size > KMALLOC_MAX_SIZE) + optimal_size = KMALLOC_MAX_SIZE; + /* Use optimal size if the size with added metas is not large enough. */ + if (*size < optimal_size) + *size = optimal_size; +} + +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object) +{ + if (!cache->kasan_info.alloc_meta_offset) + return NULL; + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) + return NULL; + return (void *)object + cache->kasan_info.free_meta_offset; +} + +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + +size_t kasan_metadata_size(struct kmem_cache *cache) +{ + if (!kasan_requires_meta()) + return 0; + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + ((cache->kasan_info.free_meta_offset && + cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? 
+ sizeof(struct kasan_free_meta) : 0); +} + +static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct page *page = kasan_addr_to_page(addr); struct kmem_cache *cache; @@ -345,11 +478,29 @@ void kasan_record_aux_stack(void *addr) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); + alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc); } -void kasan_set_free_info(struct kmem_cache *cache, - void *object, u8 tag) +void kasan_record_aux_stack(void *addr) +{ + return __kasan_record_aux_stack(addr, true); +} + +void kasan_record_aux_stack_noalloc(void *addr) +{ + return __kasan_record_aux_stack(addr, false); +} + +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + +void kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; @@ -359,14 +510,5 @@ void kasan_set_free_info(struct kmem_cache *cache, kasan_set_track(&free_meta->free_track, GFP_NOWAIT); /* The object was freed and has free track set. */ - *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK) - return NULL; - /* Free meta must be present with KASAN_KMALLOC_FREETRACK. */ - return &kasan_get_free_meta(cache, object)->free_track; + *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 05d1e9460e2e..d1bcb0205327 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -29,28 +29,53 @@ enum kasan_arg_mode { KASAN_ARG_MODE_DEFAULT, KASAN_ARG_MODE_SYNC, KASAN_ARG_MODE_ASYNC, + KASAN_ARG_MODE_ASYMM, }; -enum kasan_arg_stacktrace { - KASAN_ARG_STACKTRACE_DEFAULT, - KASAN_ARG_STACKTRACE_OFF, - KASAN_ARG_STACKTRACE_ON, +enum kasan_arg_vmalloc { + KASAN_ARG_VMALLOC_DEFAULT, + KASAN_ARG_VMALLOC_OFF, + KASAN_ARG_VMALLOC_ON, }; static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; +static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; -/* Whether KASAN is enabled at all. */ +/* + * Whether KASAN is enabled at all. + * The value remains false until KASAN is initialized by kasan_init_hw_tags(). + */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); -/* Whether the asynchronous mode is enabled. */ -bool kasan_flag_async __ro_after_init; -EXPORT_SYMBOL_GPL(kasan_flag_async); +/* + * Whether the selected mode is synchronous, asynchronous, or asymmetric. + * Defaults to KASAN_MODE_SYNC. + */ +enum kasan_mode kasan_mode __ro_after_init; +EXPORT_SYMBOL_GPL(kasan_mode); -/* Whether to collect alloc/free stack traces. */ -DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +/* Whether to enable vmalloc tagging. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + +#define PAGE_ALLOC_SAMPLE_DEFAULT 1 +#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3 + +/* + * Sampling interval of page_alloc allocation (un)poisoning. + * Defaults to no sampling. + */ +unsigned long kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT; + +/* + * Minimum order of page_alloc allocations to be affected by sampling. 
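+ * For example, with kasan.page_alloc.sample=10 and this set to 3, only
+ * roughly every 10th allocation of order 3 or higher on each CPU gets
+ * poisoned and checked, while lower-order allocations are always covered.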
+ * The default value is chosen to match both + * PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. + */ +unsigned int kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT; + +DEFINE_PER_CPU(long, kasan_page_alloc_skip); /* kasan=off/on */ static int __init early_kasan_flag(char *arg) @@ -69,7 +94,7 @@ static int __init early_kasan_flag(char *arg) } early_param("kasan", early_kasan_flag); -/* kasan.mode=sync/async */ +/* kasan.mode=sync/async/asymm */ static int __init early_kasan_mode(char *arg) { if (!arg) @@ -79,6 +104,8 @@ static int __init early_kasan_mode(char *arg) kasan_arg_mode = KASAN_ARG_MODE_SYNC; else if (!strcmp(arg, "async")) kasan_arg_mode = KASAN_ARG_MODE_ASYNC; + else if (!strcmp(arg, "asymm")) + kasan_arg_mode = KASAN_ARG_MODE_ASYMM; else return -EINVAL; @@ -86,24 +113,79 @@ static int __init early_kasan_mode(char *arg) } early_param("kasan.mode", early_kasan_mode); -/* kasan.stacktrace=off/on */ -static int __init early_kasan_flag_stacktrace(char *arg) +/* kasan.vmalloc=off/on */ +static int __init early_kasan_flag_vmalloc(char *arg) { if (!arg) return -EINVAL; if (!strcmp(arg, "off")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF; else if (!strcmp(arg, "on")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON; else return -EINVAL; return 0; } -early_param("kasan.stacktrace", early_kasan_flag_stacktrace); +early_param("kasan.vmalloc", early_kasan_flag_vmalloc); -/* kasan_init_hw_tags_cpu() is called for each CPU. */ +static inline const char *kasan_mode_info(void) +{ + if (kasan_mode == KASAN_MODE_ASYNC) + return "async"; + else if (kasan_mode == KASAN_MODE_ASYMM) + return "asymm"; + else + return "sync"; +} + +/* kasan.page_alloc.sample= */ +static int __init early_kasan_flag_page_alloc_sample(char *arg) +{ + int rv; + + if (!arg) + return -EINVAL; + + rv = kstrtoul(arg, 0, &kasan_page_alloc_sample); + if (rv) + return rv; + + if (!kasan_page_alloc_sample || kasan_page_alloc_sample > LONG_MAX) { + kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT; + return -EINVAL; + } + + return 0; +} +early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample); + +/* kasan.page_alloc.sample.order= */ +static int __init early_kasan_flag_page_alloc_sample_order(char *arg) +{ + int rv; + + if (!arg) + return -EINVAL; + + rv = kstrtouint(arg, 0, &kasan_page_alloc_sample_order); + if (rv) + return rv; + + if (kasan_page_alloc_sample_order > INT_MAX) { + kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT; + return -EINVAL; + } + + return 0; +} +early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order); + +/* + * kasan_init_hw_tags_cpu() is called for each CPU. + * Not marked as __init as a CPU can be hot-plugged after boot. + */ void kasan_init_hw_tags_cpu(void) { /* @@ -111,18 +193,19 @@ void kasan_init_hw_tags_cpu(void) * as this function is only called for MTE-capable hardware. */ - /* If KASAN is disabled via command line, don't initialize it. */ + /* + * If KASAN is disabled via command line, don't initialize it. + * When this function is called, kasan_flag_enabled is not yet + * set by kasan_init_hw_tags(). Thus, check kasan_arg instead. + */ if (kasan_arg == KASAN_ARG_OFF) return; /* - * Enable async mode only when explicitly requested through - * the command line. + * Enable async or asymm modes only when explicitly requested + * through the command line. 
*/ - if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) - hw_enable_tagging_async(); - else - hw_enable_tagging_sync(); + kasan_enable_tagging(); } /* kasan_init_hw_tags() is called once on boot CPU. */ @@ -136,80 +219,173 @@ void __init kasan_init_hw_tags(void) if (kasan_arg == KASAN_ARG_OFF) return; - /* Enable KASAN. */ - static_branch_enable(&kasan_flag_enabled); - switch (kasan_arg_mode) { case KASAN_ARG_MODE_DEFAULT: - /* - * Default to sync mode. - * Do nothing, kasan_flag_async keeps its default value. - */ + /* Default is specified by kasan_mode definition. */ break; case KASAN_ARG_MODE_SYNC: - /* Do nothing, kasan_flag_async keeps its default value. */ + kasan_mode = KASAN_MODE_SYNC; break; case KASAN_ARG_MODE_ASYNC: - /* Async mode enabled. */ - kasan_flag_async = true; + kasan_mode = KASAN_MODE_ASYNC; + break; + case KASAN_ARG_MODE_ASYMM: + kasan_mode = KASAN_MODE_ASYMM; break; } - switch (kasan_arg_stacktrace) { - case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default to enabling stack trace collection. */ - static_branch_enable(&kasan_flag_stacktrace); + switch (kasan_arg_vmalloc) { + case KASAN_ARG_VMALLOC_DEFAULT: + /* Default is specified by kasan_flag_vmalloc definition. */ break; - case KASAN_ARG_STACKTRACE_OFF: - /* Do nothing, kasan_flag_stacktrace keeps its default value. */ + case KASAN_ARG_VMALLOC_OFF: + static_branch_disable(&kasan_flag_vmalloc); break; - case KASAN_ARG_STACKTRACE_ON: - static_branch_enable(&kasan_flag_stacktrace); + case KASAN_ARG_VMALLOC_ON: + static_branch_enable(&kasan_flag_vmalloc); break; } - pr_info("KernelAddressSanitizer initialized\n"); + kasan_init_tags(); + + /* KASAN is now initialized, enable it. */ + static_branch_enable(&kasan_flag_enabled); + + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", + kasan_mode_info(), + kasan_vmalloc_enabled() ? "on" : "off", + kasan_stack_collection_enabled() ? "on" : "off"); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) +#ifdef CONFIG_KASAN_VMALLOC + +static void unpoison_vmalloc_pages(const void *addr, u8 tag) { + struct vm_struct *area; + int i; + /* - * This condition should match the one in post_alloc_hook() in - * page_alloc.c. + * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations + * (see the comment in __kasan_unpoison_vmalloc), all of the pages + * should belong to a single area. 
*/ - bool init = !want_init_on_free() && want_init_on_alloc(flags); + area = find_vm_area((void *)addr); + if (WARN_ON(!area)) + return; - if (flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; - if (flags & __GFP_ZEROTAGS) { - int i; - - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { - kasan_unpoison_pages(page, order, init); + page_kasan_tag_set(page, tag); } } -void kasan_free_pages(struct page *page, unsigned int order) +static void init_vmalloc_pages(const void *start, unsigned long size) +{ + const void *addr; + + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + struct page *page = virt_to_page(addr); + + clear_highpage_kasan_tagged(page); + } +} + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + u8 tag; + unsigned long redzone_start, redzone_size; + + if (!kasan_vmalloc_enabled() || !is_vmalloc_or_module_addr(start)) { + if (flags & KASAN_VMALLOC_INIT) + init_vmalloc_pages(start, size); + return (void *)start; + } + + /* + * Don't tag non-VM_ALLOC mappings, as: + * + * 1. Unlike the software KASAN modes, hardware tag-based KASAN only + * supports tagging physical memory. Therefore, it can only tag a + * single mapping of normal physical pages. + * 2. Hardware tag-based KASAN can only tag memory mapped with special + * mapping protection bits, see arch_vmap_pgprot_tagged(). + * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, + * providing these bits would require tracking all non-VM_ALLOC + * mappers. + * + * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags + * the first virtual mapping, which is created by vmalloc(). + * Tagging the page_alloc memory backing that vmalloc() allocation is + * skipped, see ___GFP_SKIP_KASAN_UNPOISON. + * + * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. + */ + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) { + WARN_ON(flags & KASAN_VMALLOC_INIT); + return (void *)start; + } + + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) { + WARN_ON(flags & KASAN_VMALLOC_INIT); + return (void *)start; + } + + tag = kasan_random_tag(); + start = set_tag(start, tag); + + /* Unpoison and initialize memory up to size. */ + kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT); + + /* + * Explicitly poison and initialize the in-page vmalloc() redzone. + * Unlike software KASAN modes, hardware tag-based KASAN doesn't + * unpoison memory when populating shadow for vmalloc() space. + */ + redzone_start = round_up((unsigned long)start + size, + KASAN_GRANULE_SIZE); + redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start; + kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID, + flags & KASAN_VMALLOC_INIT); + + /* + * Set per-page tag flags to allow accessing physical memory for the + * vmalloc() mapping through page_address(vmalloc_to_page()). + */ + unpoison_vmalloc_pages(start, tag); + + return (void *)start; +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size) { /* - * This condition should match the one in free_pages_prepare() in - * page_alloc.c. + * No tagging here. + * The physical pages backing the vmalloc() allocation are poisoned + * through the usual page_alloc paths. 
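+	 * (Concretely: kasan_poison_pages() covers them when the backing
+	 * pages are eventually freed back to page_alloc.)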
*/ - bool init = want_init_on_free(); +} - kasan_poison_pages(page, order, init); +#endif + +void kasan_enable_tagging(void) +{ + if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) + hw_enable_tagging_async(); + else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM) + hw_enable_tagging_asymm(); + else + hw_enable_tagging_sync(); } #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_enable_tagging_sync(void) -{ - hw_enable_tagging_sync(); -} -EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync); +EXPORT_SYMBOL_GPL(kasan_enable_tagging); void kasan_force_async_fault(void) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8bf568a80eb8..708ca4941fbf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,43 +2,131 @@ #ifndef __MM_KASAN_KASAN_H #define __MM_KASAN_KASAN_H +#include #include #include #include #include -#ifdef CONFIG_KASAN_HW_TAGS +/* Used in KUnit-compatible KASAN tests. */ +struct kunit_kasan_status { + bool report_found; + bool sync_fault; +}; + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #include -#include "../slab.h" -DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); -extern bool kasan_flag_async __ro_after_init; +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); } -static inline bool kasan_async_mode_enabled(void) -{ - return kasan_flag_async; -} -#else +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline bool kasan_stack_collection_enabled(void) { return true; } -static inline bool kasan_async_mode_enabled(void) +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS + +#include "../slab.h" + +DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + +enum kasan_mode { + KASAN_MODE_SYNC, + KASAN_MODE_ASYNC, + KASAN_MODE_ASYMM, +}; + +extern enum kasan_mode kasan_mode __ro_after_init; + +extern unsigned long kasan_page_alloc_sample; +extern unsigned int kasan_page_alloc_sample_order; +DECLARE_PER_CPU(long, kasan_page_alloc_skip); + +static inline bool kasan_vmalloc_enabled(void) +{ + return static_branch_likely(&kasan_flag_vmalloc); +} + +static inline bool kasan_async_fault_possible(void) +{ + return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM; +} + +static inline bool kasan_sync_fault_possible(void) +{ + return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; +} + +static inline bool kasan_sample_page_alloc(unsigned int order) +{ + /* Fast-path for when sampling is disabled. */ + if (kasan_page_alloc_sample == 1) + return true; + + if (order < kasan_page_alloc_sample_order) + return true; + + if (this_cpu_dec_return(kasan_page_alloc_skip) < 0) { + this_cpu_write(kasan_page_alloc_skip, + kasan_page_alloc_sample - 1); + return true; + } + + return false; +} + +#else /* CONFIG_KASAN_HW_TAGS */ + +static inline bool kasan_async_fault_possible(void) { return false; } -#endif +static inline bool kasan_sync_fault_possible(void) +{ + return true; +} -extern bool kasan_flag_async __ro_after_init; +static inline bool kasan_sample_page_alloc(unsigned int order) +{ + return true; +} + +#endif /* CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_GENERIC + +/* Generic KASAN uses per-object metadata to store stack traces. */ +static inline bool kasan_requires_meta(void) +{ + /* + * Technically, Generic KASAN always collects stack traces right now. 
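
kasan_sample_page_alloc() above is what makes page_alloc poisoning sampled rather than unconditional: with the kasan.page_alloc.sample=<N> and kasan.page_alloc.sample.order=<order> boot parameters, allocations below the given order are always poisoned, while larger ones are poisoned only once every N allocations per CPU. A short trace of the per-CPU counter, assuming N = 3 (illustrative):

	/* kasan_page_alloc_skip starts at 0 on each CPU. With
	 * kasan_page_alloc_sample == 3, consecutive high-order
	 * allocations see:
	 *
	 *   dec -> -1 : poison, counter reset to sample - 1 = 2
	 *   dec ->  1 : skip
	 *   dec ->  0 : skip
	 *   dec -> -1 : poison, reset to 2
	 *
	 * i.e. one in every three such allocations is checked.
	 */
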
+ * However, let's use kasan_stack_collection_enabled() in case the + * kasan.stacktrace command-line argument is changed to affect + * Generic KASAN. + */ + return kasan_stack_collection_enabled(); +} + +#else /* CONFIG_KASAN_GENERIC */ + +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline bool kasan_requires_meta(void) +{ + return false; +} + +#endif /* CONFIG_KASAN_GENERIC */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) @@ -52,49 +140,47 @@ extern bool kasan_flag_async __ro_after_init; #define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT) #ifdef CONFIG_KASAN_GENERIC -#define KASAN_FREE_PAGE 0xFF /* page was freed */ -#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ -#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ -#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ -#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ +#define KASAN_PAGE_FREE 0xFF /* freed page */ +#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocation */ +#define KASAN_SLAB_REDZONE 0xFC /* redzone for slab object */ +#define KASAN_SLAB_FREE 0xFB /* freed slab object */ +#define KASAN_VMALLOC_INVALID 0xF8 /* inaccessible space in vmap area */ #else -#define KASAN_FREE_PAGE KASAN_TAG_INVALID -#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID -#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID +#define KASAN_PAGE_FREE KASAN_TAG_INVALID +#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID +#define KASAN_SLAB_REDZONE KASAN_TAG_INVALID +#define KASAN_SLAB_FREE KASAN_TAG_INVALID +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only used for SW_TAGS */ #endif -#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ -#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ +#ifdef CONFIG_KASAN_GENERIC -/* - * Stack redzone shadow values - * (Those are compiler's ABI, don't change them) - */ -#define KASAN_STACK_LEFT 0xF1 -#define KASAN_STACK_MID 0xF2 -#define KASAN_STACK_RIGHT 0xF3 -#define KASAN_STACK_PARTIAL 0xF4 +#define KASAN_SLAB_FREETRACK 0xFA /* freed slab object with free track */ +#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ -/* - * alloca redzone shadow values - */ +/* Stack redzone shadow values. Compiler ABI, do not change. */ +#define KASAN_STACK_LEFT 0xF1 +#define KASAN_STACK_MID 0xF2 +#define KASAN_STACK_RIGHT 0xF3 +#define KASAN_STACK_PARTIAL 0xF4 + +/* alloca redzone shadow values. */ #define KASAN_ALLOCA_LEFT 0xCA #define KASAN_ALLOCA_RIGHT 0xCB +/* alloca redzone size. Compiler ABI, do not change. */ #define KASAN_ALLOCA_REDZONE_SIZE 32 -/* - * Stack frame marker (compiler ABI). - */ +/* Stack frame marker. Compiler ABI, do not change. */ #define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3 -/* Don't break randconfig/all*config builds */ +/* Dummy value to avoid breaking randconfig/all*config builds. */ #ifndef KASAN_ABI_VERSION #define KASAN_ABI_VERSION 1 #endif +#endif /* CONFIG_KASAN_GENERIC */ + /* Metadata layout customization. 
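
As background for the shadow values above: generic KASAN encodes each 8-byte granule in one shadow byte, where 0 means the whole granule is accessible, 1..7 means only the first N bytes are valid, and the negative 0xF* values are the poison markers listed here. A simplified sketch of the resulting check (cf. the memory_is_poisoned_*() helpers in mm/kasan/generic.c; the function name below is illustrative):

	static bool granule_byte_accessible(const void *addr)
	{
		s8 shadow = *(s8 *)kasan_mem_to_shadow(addr);

		if (shadow == 0)
			return true;	/* whole granule accessible */
		if (shadow < 0)
			return false;	/* KASAN_PAGE_FREE, redzones, ... */
		/* Partial granule: only the first 'shadow' bytes are valid. */
		return ((unsigned long)addr & KASAN_GRANULE_MASK) < shadow;
	}
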
*/ #define META_BYTES_PER_BLOCK 1 #define META_BLOCKS_PER_ROW 16 @@ -102,29 +188,53 @@ extern bool kasan_flag_async __ro_after_init; #define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) #define META_ROWS_AROUND_ADDR 2 -struct kasan_access_info { - const void *access_addr; - const void *first_bad_addr; +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + +enum kasan_report_type { + KASAN_REPORT_ACCESS, + KASAN_REPORT_INVALID_FREE, + KASAN_REPORT_DOUBLE_FREE, +}; + +struct kasan_report_info { + /* Filled in by kasan_report_*(). */ + enum kasan_report_type type; + void *access_addr; size_t access_size; bool is_write; unsigned long ip; + + /* Filled in by the common reporting code. */ + void *first_bad_addr; + struct kmem_cache *cache; + void *object; + + /* Filled in by the mode-specific reporting code. */ + const char *bug_type; + struct kasan_track alloc_track; + struct kasan_track free_track; }; -/* The layout of struct dictated by compiler */ +/* Do not change the struct layout: compiler ABI. */ struct kasan_source_location { const char *filename; int line_no; int column_no; }; -/* The layout of struct dictated by compiler */ +/* Do not change the struct layout: compiler ABI. */ struct kasan_global { const void *beg; /* Address of the beginning of the global variable. */ size_t size; /* Size of the global variable. */ - size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */ + size_t size_with_redzone; /* Size of the variable + size of the redzone. 32 bytes aligned. */ const void *name; const void *module_name; /* Name of the module where the global variable is declared. */ - unsigned long has_dynamic_init; /* This needed for C++ */ + unsigned long has_dynamic_init; /* This is needed for C++. */ #if KASAN_ABI_VERSION >= 4 struct kasan_source_location *location; #endif @@ -133,38 +243,14 @@ struct kasan_global { #endif }; -/** - * Structures to keep alloc and free tracks * - */ +/* Structures for keeping alloc and free meta. */ -#define KASAN_STACK_DEPTH 64 - -struct kasan_track { - u32 pid; - depot_stack_handle_t stack; -}; - -#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS) -#define KASAN_NR_FREE_STACKS 5 -#else -#define KASAN_NR_FREE_STACKS 1 -#endif +#ifdef CONFIG_KASAN_GENERIC struct kasan_alloc_meta { struct kasan_track alloc_track; -#ifdef CONFIG_KASAN_GENERIC - /* - * The auxiliary stack is stored into struct kasan_alloc_meta. - * The free stack is stored into struct kasan_free_meta. - */ + /* Free track is stored in kasan_free_meta. */ depot_stack_handle_t aux_stack[2]; -#else - struct kasan_track free_track[KASAN_NR_FREE_STACKS]; -#endif -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; - u8 free_track_idx; -#endif }; struct qlist_node { @@ -172,29 +258,41 @@ struct qlist_node { }; /* - * Generic mode either stores free meta in the object itself or in the redzone - * after the object. In the former case free meta offset is 0, in the latter - * case it has some sane value smaller than INT_MAX. Use INT_MAX as free meta - * offset when free meta isn't present. + * Free meta is stored either in the object itself or in the redzone after the + * object. In the former case, free meta offset is 0. In the latter case, the + * offset is between 0 and INT_MAX. INT_MAX marks that free meta is not present. */ #define KASAN_NO_FREE_META INT_MAX +/* + * Free meta is only used by Generic mode while the object is in quarantine. 
+ * After that, slab allocator stores the freelist pointer in the object. + */ struct kasan_free_meta { -#ifdef CONFIG_KASAN_GENERIC - /* This field is used while the object is in the quarantine. - * Otherwise it might be used for the allocator freelist. - */ struct qlist_node quarantine_link; struct kasan_track free_track; -#endif }; -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object); -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object); -#endif +#endif /* CONFIG_KASAN_GENERIC */ + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +struct kasan_stack_ring_entry { + void *ptr; + size_t size; + u32 pid; + depot_stack_handle_t stack; + bool is_free; +}; + +struct kasan_stack_ring { + rwlock_t lock; + size_t size; + atomic64_t pos; + struct kasan_stack_ring_entry *entries; +}; + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) @@ -206,7 +304,8 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) static inline bool addr_has_metadata(const void *addr) { - return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); + return (kasan_reset_tag(addr) >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } /** @@ -229,33 +328,50 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ +void *kasan_find_first_bad_addr(void *addr, size_t size); +void kasan_complete_mode_report_info(struct kasan_report_info *info); +void kasan_metadata_fetch_row(char *buffer, void *row); + #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void kasan_print_tags(u8 addr_tag, const void *addr); #else static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif -void *kasan_find_first_bad_addr(void *addr, size_t size); -const char *kasan_get_bug_type(struct kasan_access_info *info); -void kasan_metadata_fetch_row(char *buffer, void *row); - -#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK) +#if defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else static inline void kasan_print_address_stack_frame(const void *addr) { } #endif +#ifdef CONFIG_KASAN_GENERIC +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } +#endif + bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -void kasan_report_invalid_free(void *object, unsigned long ip); +void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); struct page *kasan_addr_to_page(const void *addr); -depot_stack_handle_t kasan_save_stack(gfp_t flags); +#ifdef CONFIG_KASAN_GENERIC +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object); +#else +static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) { } +static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } +#endif + +depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track 
*track, gfp_t flags); -void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag); +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); +void kasan_save_free_info(struct kmem_cache *cache, void *object); #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) @@ -289,6 +405,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifndef arch_enable_tagging_async #define arch_enable_tagging_async() #endif +#ifndef arch_enable_tagging_asymm +#define arch_enable_tagging_asymm() +#endif #ifndef arch_force_async_tag_fault #define arch_force_async_tag_fault() #endif @@ -304,30 +423,38 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_enable_tagging_sync() arch_enable_tagging_sync() #define hw_enable_tagging_async() arch_enable_tagging_async() +#define hw_enable_tagging_asymm() arch_enable_tagging_asymm() #define hw_force_async_tag_fault() arch_force_async_tag_fault() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) #define hw_set_mem_tag_range(addr, size, tag, init) \ arch_set_mem_tag_range((addr), (size), (tag), (init)) +void kasan_enable_tagging(void); + #else /* CONFIG_KASAN_HW_TAGS */ #define hw_enable_tagging_sync() #define hw_enable_tagging_async() +#define hw_enable_tagging_asymm() + +static inline void kasan_enable_tagging(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +void __init kasan_init_tags(void); +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_enable_tagging_sync(void); void kasan_force_async_fault(void); -#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ +#else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ -static inline void kasan_enable_tagging_sync(void) { } static inline void kasan_force_async_fault(void) { } -#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ +#endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ #ifdef CONFIG_KASAN_SW_TAGS u8 kasan_random_tag(void); @@ -369,9 +496,10 @@ static inline void kasan_unpoison(const void *addr, size_t size, bool init) return; /* * Explicitly initialize the memory with the precise object size to - * avoid overwriting the SLAB redzone. This disables initialization in - * the arch code and may thus lead to performance penalty. The penalty - * is accepted since SLAB redzones aren't enabled in production builds. + * avoid overwriting the slab redzone. This disables initialization in + * the arch code and may thus lead to performance penalty. This penalty + * does not affect production builds, as slab redzones are not enabled + * there. */ if (__slub_debug_enabled() && init && ((unsigned long)size & KASAN_GRANULE_MASK)) { @@ -446,10 +574,18 @@ static inline bool kasan_arch_is_ready(void) { return true; } #error kasan_arch_is_ready only works in KASAN generic outline mode! #endif +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) + +bool kasan_save_enable_multi_shot(void); +void kasan_restore_multi_shot(bool enabled); + +#endif + /* * Exported functions for interfaces called from assembly or from generated - * code. Declarations here to avoid warning about missing declarations. + * code. 
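
The arch_enable_tagging_*() fallbacks above default to no-ops so this header builds on architectures without memory tagging; a tagging-capable architecture overrides them first. On arm64 they resolve to the MTE control helpers (as defined in arch/arm64/include/asm/memory.h), and asymmetric mode means reads are checked synchronously while writes fault asynchronously, which is why both kasan_async_fault_possible() and kasan_sync_fault_possible() return true for KASAN_MODE_ASYMM:

	#define arch_enable_tagging_sync()	mte_enable_kernel_sync()
	#define arch_enable_tagging_async()	mte_enable_kernel_async()
	#define arch_enable_tagging_asymm()	mte_enable_kernel_asymm()
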
Declared here to avoid warnings about missing declarations. */ + asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); void __asan_register_globals(struct kasan_global *globals, size_t size); void __asan_unregister_globals(struct kasan_global *globals, size_t size); @@ -518,4 +654,4 @@ void __hwasan_storeN_noabort(unsigned long addr, size_t size); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); -#endif +#endif /* __MM_KASAN_KASAN_H */ diff --git a/lib/test_kasan.c b/mm/kasan/kasan_test.c similarity index 76% rename from lib/test_kasan.c rename to mm/kasan/kasan_test.c index 89f444cabd4a..5fa3fd924ed8 100644 --- a/lib/test_kasan.c +++ b/mm/kasan/kasan_test.c @@ -19,12 +19,13 @@ #include #include #include +#include #include #include -#include "../mm/kasan/kasan.h" +#include "kasan.h" #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) @@ -36,7 +37,7 @@ void *kasan_ptr_result; int kasan_int_result; static struct kunit_resource resource; -static struct kunit_kasan_expectation fail_data; +static struct kunit_kasan_status test_status; static bool multishot; /* @@ -53,58 +54,63 @@ static int kasan_test_init(struct kunit *test) } multishot = kasan_save_enable_multi_shot(); - fail_data.report_found = false; + test_status.report_found = false; + test_status.sync_fault = false; kunit_add_named_resource(test, NULL, NULL, &resource, - "kasan_data", &fail_data); + "kasan_status", &test_status); return 0; } static void kasan_test_exit(struct kunit *test) { kasan_restore_multi_shot(multishot); - KUNIT_EXPECT_FALSE(test, fail_data.report_found); + KUNIT_EXPECT_FALSE(test, test_status.report_found); } /** * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a * KASAN report; causes a test failure otherwise. This relies on a KUnit - * resource named "kasan_data". Do not use this name for KUnit resources + * resource named "kasan_status". Do not use this name for KUnit resources * outside of KASAN tests. * - * For hardware tag-based KASAN in sync mode, when a tag fault happens, tag + * For hardware tag-based KASAN, when a synchronous tag fault happens, tag * checking is auto-disabled. When this happens, this test handler reenables * tag checking. As tag checking can be only disabled or enabled per CPU, * this handler disables migration (preemption). * - * Since the compiler doesn't see that the expression can change the fail_data + * Since the compiler doesn't see that the expression can change the test_status * fields, it can reorder or optimize away the accesses to those fields. * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the * expression to prevent that. * - * In between KUNIT_EXPECT_KASAN_FAIL checks, fail_data.report_found is kept as - * false. This allows detecting KASAN reports that happen outside of the checks - * by asserting !fail_data.report_found at the start of KUNIT_EXPECT_KASAN_FAIL - * and in kasan_test_exit. + * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept + * as false. This allows detecting KASAN reports that happen outside of the + * checks by asserting !test_status.report_found at the start of + * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit. 
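
In practice, KUNIT_EXPECT_KASAN_FAIL() wraps a single expression that must produce exactly one report. A minimal illustrative test in the same shape as the cases below (hypothetical, not part of this file):

	static void example_oob_right(struct kunit *test)
	{
		char *ptr;

		ptr = kmalloc(64, GFP_KERNEL);
		KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);

		/* One byte past the 64-byte object: must produce a report. */
		KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[64] = 'x');

		kfree(ptr);
	}
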
*/ #define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ - !kasan_async_mode_enabled()) \ + kasan_sync_fault_possible()) \ migrate_disable(); \ - KUNIT_EXPECT_FALSE(test, READ_ONCE(fail_data.report_found)); \ + KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \ barrier(); \ expression; \ barrier(); \ - if (!READ_ONCE(fail_data.report_found)) { \ + if (kasan_async_fault_possible()) \ + kasan_force_async_fault(); \ + if (!READ_ONCE(test_status.report_found)) { \ KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \ "expected in \"" #expression \ "\", but none occurred"); \ } \ - if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \ - if (READ_ONCE(fail_data.report_found)) \ - kasan_enable_tagging_sync(); \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ + kasan_sync_fault_possible()) { \ + if (READ_ONCE(test_status.report_found) && \ + READ_ONCE(test_status.sync_fault)) \ + kasan_enable_tagging(); \ migrate_enable(); \ } \ - WRITE_ONCE(fail_data.report_found, false); \ + WRITE_ONCE(test_status.report_found, false); \ } while (0) #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ @@ -289,6 +295,9 @@ static void krealloc_more_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* All offsets up to size2 must be accessible. */ ptr2[size1 - 1] = 'x'; ptr2[size1] = 'x'; @@ -321,6 +330,9 @@ static void krealloc_less_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* Must be accessible for all modes. */ ptr2[size2 - 1] = 'x'; @@ -447,6 +459,7 @@ static void kmalloc_oob_memset_2(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2)); kfree(ptr); } @@ -459,6 +472,7 @@ static void kmalloc_oob_memset_4(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4)); kfree(ptr); } @@ -471,6 +485,7 @@ static void kmalloc_oob_memset_8(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8)); kfree(ptr); } @@ -483,6 +498,7 @@ static void kmalloc_oob_memset_16(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16)); kfree(ptr); } @@ -495,16 +511,18 @@ static void kmalloc_oob_in_memset(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + KASAN_GRANULE_SIZE)); kfree(ptr); } -static void kmalloc_memmove_invalid_size(struct kunit *test) +static void kmalloc_memmove_negative_size(struct kunit *test) { char *ptr; size_t size = 64; - volatile size_t invalid_size = -2; + size_t invalid_size = -2; /* * Hardware tag-based mode doesn't check memmove for negative size. 
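
The OPTIMIZER_HIDE_VAR() calls added throughout these tests launder a value through an empty asm statement so the compiler can neither prove an access out of bounds at build time (tripping -Warray-bounds) nor optimize the deliberately buggy access away. Its definition in include/linux/compiler.h is essentially:

	#define OPTIMIZER_HIDE_VAR(var)					\
		__asm__ ("" : "=r" (var) : "0" (var))
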
@@ -517,6 +535,25 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); + KUNIT_EXPECT_KASAN_FAIL(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + kfree(ptr); +} + +static void kmalloc_memmove_invalid_size(struct kunit *test) +{ + char *ptr; + size_t size = 64; + size_t invalid_size = size; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); @@ -582,6 +619,29 @@ again: kfree(ptr2); } +/* + * Check that KASAN detects use-after-free when another object was allocated in + * the same slot. Relevant for the tag-based modes, which do not use quarantine. + */ +static void kmalloc_uaf3(struct kunit *test) +{ + char *ptr1, *ptr2; + size_t size = 100; + + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr1 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); +} + static void kfree_via_page(struct kunit *test) { char *ptr; @@ -686,7 +746,7 @@ static void kmem_cache_bulk(struct kunit *test) static char global_array[10]; -static void kasan_global_oob(struct kunit *test) +static void kasan_global_oob_right(struct kunit *test) { /* * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS @@ -709,6 +769,20 @@ static void kasan_global_oob(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } +static void kasan_global_oob_left(struct kunit *test) +{ + char *volatile array = global_array; + char *p = array - 3; + + /* + * GCC is known to fail this test, skip it. + * See https://bugzilla.kernel.org/show_bug.cgi?id=215051. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_CC_IS_CLANG); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + /* Check that ksize() makes the whole object accessible. */ static void ksize_unpoisons_memory(struct kunit *test) { @@ -752,7 +826,7 @@ static void ksize_uaf(struct kunit *test) static void kasan_stack_oob(struct kunit *test) { char stack_array[10]; - /* See comment in kasan_global_oob. */ + /* See comment in kasan_global_oob_right. */ char *volatile array = stack_array; char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF]; @@ -765,7 +839,7 @@ static void kasan_alloca_oob_left(struct kunit *test) { volatile int i = 10; char alloca_array[i]; - /* See comment in kasan_global_oob. */ + /* See comment in kasan_global_oob_right. */ char *volatile array = alloca_array; char *p = array - 1; @@ -780,7 +854,7 @@ static void kasan_alloca_oob_right(struct kunit *test) { volatile int i = 10; char alloca_array[i]; - /* See comment in kasan_global_oob. */ + /* See comment in kasan_global_oob_right. */ char *volatile array = alloca_array; char *p = array + i; @@ -841,6 +915,19 @@ static void kmem_cache_invalid_free(struct kunit *test) kmem_cache_destroy(cache); } +static void empty_cache_ctor(void *object) { } + +static void kmem_cache_double_destroy(struct kunit *test) +{ + struct kmem_cache *cache; + + /* Provide a constructor to prevent cache merging. 
*/ + cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + kmem_cache_destroy(cache); + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); +} + static void kasan_memchr(struct kunit *test) { char *ptr; @@ -858,6 +945,8 @@ static void kasan_memchr(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = memchr(ptr, '1', size + 1)); @@ -883,6 +972,8 @@ static void kasan_memcmp(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset(arr, 0, sizeof(arr)); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = memcmp(ptr, arr, size+1)); kfree(ptr); @@ -1012,21 +1103,186 @@ static void kmalloc_double_kzfree(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } -static void vmalloc_oob(struct kunit *test) +static void vmalloc_helpers_tags(struct kunit *test) { - void *area; + void *ptr; + + /* This test is intended for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + ptr = vmalloc(PAGE_SIZE); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Check that the returned pointer is tagged. */ + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure exported vmalloc helpers handle tagged pointers. */ + KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr)); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr)); + +#if !IS_MODULE(CONFIG_KASAN_KUNIT_TEST) + { + int rv; + + /* Make sure vmalloc'ed memory permissions can be changed. */ + rv = set_memory_ro((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + rv = set_memory_rw((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + } +#endif + + vfree(ptr); +} + +static void vmalloc_oob(struct kunit *test) +{ + char *v_ptr, *p_ptr; + struct page *page; + size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + v_ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + OPTIMIZER_HIDE_VAR(v_ptr); + /* - * We have to be careful not to hit the guard page. + * We have to be careful not to hit the guard page in vmalloc tests. * The MMU will catch that and crash us. */ - area = vmalloc(3000); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]); - vfree(area); + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. */ + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + + /* Check that in-bounds accesses to the physical page are valid. */ + page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + p_ptr[0] = 0; + + vfree(v_ptr); + + /* + * We can't check for use-after-unmap bugs in this nor in the following + * vmalloc tests, as the page might be fully unmapped and accessing it + * will crash the kernel. 
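
The size chosen in vmalloc_oob() is worth unpacking with concrete numbers (assuming PAGE_SIZE = 4096 and KASAN_GRANULE_SIZE = 16): size = 2048 - 16 - 5 = 2027, so offsets [0, 2027) are in bounds; v_ptr[2027] falls in a granule that is only partially valid, which byte-precise generic KASAN flags but the granule-precise tag-based modes cannot; and v_ptr[2032] (size + 5) lands in the first fully poisoned granule, so every enabled mode reports it while all accesses stay well clear of the guard page after the allocation.
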
+ */ +} + +static void vmap_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *p_page, *v_page; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vmap mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + p_page = alloc_pages(GFP_KERNEL, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page); + p_ptr = page_address(p_page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + /* + * We can't check for out-of-bounds bugs in this nor in the following + * vmalloc tests, as allocations have page granularity and accessing + * the guard page will crash the kernel. + */ + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + /* Make sure vmalloc_to_page() correctly recovers the page pointer. */ + v_page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page); + KUNIT_EXPECT_PTR_EQ(test, p_page, v_page); + + vunmap(v_ptr); + free_pages((unsigned long)p_ptr, 1); +} + +static void vm_map_ram_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *page; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vm_map_ram mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + page = alloc_pages(GFP_KERNEL, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vm_map_ram(&page, 1, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + vm_unmap_ram(v_ptr, 1); + free_pages((unsigned long)p_ptr, 1); +} + +static void vmalloc_percpu(struct kunit *test) +{ + char __percpu *ptr; + int cpu; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons percpu mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + + for_each_possible_cpu(cpu) { + char *c_ptr = per_cpu_ptr(ptr, cpu); + + KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses don't crash the kernel. */ + *c_ptr = 0; + } + + free_percpu(ptr); } /* @@ -1060,6 +1316,18 @@ static void match_all_not_assigned(struct kunit *test) KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); free_pages((unsigned long)ptr, order); } + + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) + return; + + for (i = 0; i < 256; i++) { + size = (get_random_int() % 1024) + 1; + ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + vfree(ptr); + } } /* Check that 0xff works as a match-all pointer tag for tag-based modes. 
*/ @@ -1139,16 +1407,19 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_oob_memset_4), KUNIT_CASE(kmalloc_oob_memset_8), KUNIT_CASE(kmalloc_oob_memset_16), + KUNIT_CASE(kmalloc_memmove_negative_size), KUNIT_CASE(kmalloc_memmove_invalid_size), KUNIT_CASE(kmalloc_uaf), KUNIT_CASE(kmalloc_uaf_memset), KUNIT_CASE(kmalloc_uaf2), + KUNIT_CASE(kmalloc_uaf3), KUNIT_CASE(kfree_via_page), KUNIT_CASE(kfree_via_phys), KUNIT_CASE(kmem_cache_oob), KUNIT_CASE(kmem_cache_accounted), KUNIT_CASE(kmem_cache_bulk), - KUNIT_CASE(kasan_global_oob), + KUNIT_CASE(kasan_global_oob_right), + KUNIT_CASE(kasan_global_oob_left), KUNIT_CASE(kasan_stack_oob), KUNIT_CASE(kasan_alloca_oob_left), KUNIT_CASE(kasan_alloca_oob_right), @@ -1156,13 +1427,18 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(ksize_uaf), KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), + KUNIT_CASE(kmem_cache_double_destroy), KUNIT_CASE(kasan_memchr), KUNIT_CASE(kasan_memcmp), KUNIT_CASE(kasan_strings), KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(vmap_tags), + KUNIT_CASE(vm_map_ram_tags), + KUNIT_CASE(vmalloc_percpu), KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), diff --git a/lib/test_kasan_module.c b/mm/kasan/kasan_test_module.c similarity index 98% rename from lib/test_kasan_module.c rename to mm/kasan/kasan_test_module.c index 7ebf433edef3..e4ca82dc2c16 100644 --- a/lib/test_kasan_module.c +++ b/mm/kasan/kasan_test_module.c @@ -13,7 +13,7 @@ #include #include -#include "../mm/kasan/kasan.h" +#include "kasan.h" static noinline void __init copy_user_test(void) { @@ -35,6 +35,8 @@ static noinline void __init copy_user_test(void) return; } + OPTIMIZER_HIDE_VAR(size); + pr_info("out-of-bounds in copy_from_user()\n"); unused = copy_from_user(kmem, usermem, size + 1); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 1bd6a3f13467..933a45f0878a 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -99,6 +99,17 @@ static unsigned long quarantine_size; static DEFINE_RAW_SPINLOCK(quarantine_lock); DEFINE_STATIC_SRCU(remove_cache_srcu); +#ifdef CONFIG_PREEMPT_RT +struct cpu_shrink_qlist { + raw_spinlock_t lock; + struct qlist_head qlist; +}; + +static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock), +}; +#endif + /* Maximum size of the global queue. */ static unsigned long quarantine_max_size; @@ -152,7 +163,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. 
*/ - *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE; + *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; ___cache_free(cache, object, _THIS_IP_); @@ -308,10 +319,31 @@ static void qlist_move_cache(struct qlist_head *from, } } -static void per_cpu_remove_cache(void *arg) +#ifndef CONFIG_PREEMPT_RT +static void __per_cpu_remove_cache(struct qlist_head *q, void *arg) { struct kmem_cache *cache = arg; struct qlist_head to_free = QLIST_INIT; + + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} +#else +static void __per_cpu_remove_cache(struct qlist_head *q, void *arg) +{ + struct kmem_cache *cache = arg; + unsigned long flags; + struct cpu_shrink_qlist *sq; + + sq = this_cpu_ptr(&shrink_qlist); + raw_spin_lock_irqsave(&sq->lock, flags); + qlist_move_cache(q, &sq->qlist, cache); + raw_spin_unlock_irqrestore(&sq->lock, flags); +} +#endif + +static void per_cpu_remove_cache(void *arg) +{ struct qlist_head *q; q = this_cpu_ptr(&cpu_quarantine); @@ -322,8 +354,7 @@ static void per_cpu_remove_cache(void *arg) */ if (READ_ONCE(q->offline)) return; - qlist_move_cache(q, &to_free, cache); - qlist_free_all(&to_free, cache); + __per_cpu_remove_cache(q, arg); } /* Free all quarantined objects belonging to cache. */ @@ -341,6 +372,21 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache) */ on_each_cpu(per_cpu_remove_cache, cache, 1); +#ifdef CONFIG_PREEMPT_RT + { + int cpu; + struct cpu_shrink_qlist *sq; + + for_each_online_cpu(cpu) { + sq = per_cpu_ptr(&shrink_qlist, cpu); + raw_spin_lock_irqsave(&sq->lock, flags); + qlist_move_cache(&sq->qlist, &to_free, cache); + raw_spin_unlock_irqrestore(&sq->lock, flags); + } + qlist_free_all(&to_free, cache); + } +#endif + raw_spin_lock_irqsave(&quarantine_lock, flags); for (i = 0; i < QUARANTINE_BATCHES; i++) { if (qlist_empty(&global_quarantine[i])) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 887af873733b..075562b0aeb5 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,40 @@ static int __init early_kasan_fault(char *arg) } early_param("kasan.fault", early_kasan_fault); +static int __init kasan_set_multi_shot(char *str) +{ + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; +} +__setup("kasan_multi_shot", kasan_set_multi_shot); + +/* + * Used to suppress reports within kasan_disable/enable_current() critical + * sections, which are used for marking accesses to slab metadata. + */ +static bool report_suppressed(void) +{ +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + if (current->kasan_depth) + return true; +#endif + return false; +} + +/* + * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot + * is enabled. Note that KASAN tests effectively enable kasan_multi_shot + * for their duration. 
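
By default KASAN prints only the first report and suppresses the rest via KASAN_BIT_REPORTED; booting with kasan_multi_shot lifts that limit, and the save/restore pair declared above lets the KUnit tests do so temporarily:

	/* Pattern used by kasan_test_init()/kasan_test_exit() in kasan_test.c: */
	bool old = kasan_save_enable_multi_shot();
	/* ... run code that may legitimately trigger several reports ... */
	kasan_restore_multi_shot(old);
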
+ */ +static bool report_enabled(void) +{ + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); +} + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) + bool kasan_save_enable_multi_shot(void) { return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); @@ -77,17 +112,73 @@ void kasan_restore_multi_shot(bool enabled) } EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); -static int __init kasan_set_multi_shot(char *str) -{ - set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); - return 1; -} -__setup("kasan_multi_shot", kasan_set_multi_shot); +#endif -static void print_error_description(struct kasan_access_info *info) +static void update_kunit_status(bool sync) { - pr_err("BUG: KASAN: %s in %pS\n", - kasan_get_bug_type(info), (void *)info->ip); + struct kunit *test; + struct kunit_resource *resource; + struct kunit_kasan_status *status; + + test = current->kunit_test; + if (!test) + return; + + resource = kunit_find_named_resource(test, "kasan_status"); + if (!resource) { + kunit_set_failure(test); + return; + } + + status = (struct kunit_kasan_status *)resource->data; + WRITE_ONCE(status->report_found, true); + WRITE_ONCE(status->sync_fault, sync); + + kunit_put_resource(resource); +} + +static DEFINE_SPINLOCK(report_lock); + +static void start_report(unsigned long *flags, bool sync) +{ + /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ + disable_trace_on_warning(); + /* Update status of the currently running KASAN test. */ + update_kunit_status(sync); + /* Do not allow LOCKDEP mangling KASAN reports. */ + lockdep_off(); + /* Make sure we don't end up in loop. */ + kasan_disable_current(); + spin_lock_irqsave(&report_lock, *flags); + pr_err("==================================================================\n"); +} + +static void end_report(unsigned long *flags, void *addr) +{ + if (addr) + trace_error_report_end(ERROR_DETECTOR_KASAN, + (unsigned long)addr); + pr_err("==================================================================\n"); + spin_unlock_irqrestore(&report_lock, *flags); + if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + check_panic_on_warn("KASAN"); + if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) + panic("kasan.fault=panic set ...\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + lockdep_on(); + kasan_enable_current(); +} + +static void print_error_description(struct kasan_report_info *info) +{ + pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); + + if (info->type != KASAN_REPORT_ACCESS) { + pr_err("Free of addr %px by task %s/%d\n", + info->access_addr, current->comm, task_pid_nr(current)); + return; + } + if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -98,61 +189,24 @@ static void print_error_description(struct kasan_access_info *info) info->access_addr, current->comm, task_pid_nr(current)); } -static DEFINE_SPINLOCK(report_lock); - -static void start_report(unsigned long *flags) -{ - /* - * Make sure we don't end up in loop. 
- */ - kasan_disable_current(); - spin_lock_irqsave(&report_lock, *flags); - pr_err("==================================================================\n"); -} - -static void end_report(unsigned long *flags, unsigned long addr) -{ - if (!kasan_async_mode_enabled()) - trace_error_report_end(ERROR_DETECTOR_KASAN, addr); - pr_err("==================================================================\n"); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - spin_unlock_irqrestore(&report_lock, *flags); - if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - check_panic_on_warn("KASAN"); - if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) - panic("kasan.fault=panic set ...\n"); - kasan_enable_current(); -} - -static void print_stack(depot_stack_handle_t stack) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(stack, &entries); - stack_trace_print(entries, nr_entries, 0); -} - static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); - if (track->stack) { - print_stack(track->stack); - } else { + if (track->stack) + stack_depot_print(track->stack); + else pr_err("(stack is not available)\n"); - } } struct page *kasan_addr_to_page(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) + if (virt_addr_valid(addr)) return virt_to_head_page(addr); return NULL; } -static void describe_object_addr(struct kmem_cache *cache, void *object, - const void *addr) +static void describe_object_addr(const void *addr, struct kmem_cache *cache, + void *object) { unsigned long access_addr = (unsigned long)addr; unsigned long object_addr = (unsigned long)object; @@ -163,9 +217,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, " which belongs to the cache %s of size %d\n", object, cache->name, cache->object_size); - if (!addr) - return; - if (access_addr < object_addr) { rel_type = "to the left"; rel_bytes = object_addr - access_addr; @@ -183,51 +234,31 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } -static void describe_object_stacks(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object_stacks(struct kasan_report_info *info) { - struct kasan_alloc_meta *alloc_meta; - struct kasan_track *free_track; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) { - print_track(&alloc_meta->alloc_track, "Allocated"); + if (info->alloc_track.stack) { + print_track(&info->alloc_track, "Allocated"); pr_err("\n"); } - free_track = kasan_get_free_track(cache, object, tag); - if (free_track) { - print_track(free_track, "Freed"); + if (info->free_track.stack) { + print_track(&info->free_track, "Freed"); pr_err("\n"); } -#ifdef CONFIG_KASAN_GENERIC - if (!alloc_meta) - return; - if (alloc_meta->aux_stack[0]) { - pr_err("Last potentially related work creation:\n"); - print_stack(alloc_meta->aux_stack[0]); - pr_err("\n"); - } - if (alloc_meta->aux_stack[1]) { - pr_err("Second to last potentially related work creation:\n"); - print_stack(alloc_meta->aux_stack[1]); - pr_err("\n"); - } -#endif + kasan_print_aux_stacks(info->cache, info->object); } -static void describe_object(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) - describe_object_stacks(cache, object, addr, tag); - describe_object_addr(cache, object, 
addr); + describe_object_stacks(info); + describe_object_addr(addr, info->cache, info->object); } static inline bool kernel_or_module_addr(const void *addr) { - if (addr >= (void *)_stext && addr < (void *)_end) + if (is_kernel((unsigned long)addr)) return true; if (is_module_address((unsigned long)addr)) return true; @@ -241,31 +272,53 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -static void print_address_description(void *addr, u8 tag) +static void print_address_description(void *addr, u8 tag, + struct kasan_report_info *info) { struct page *page = kasan_addr_to_page(addr); dump_stack_lvl(KERN_ERR); pr_err("\n"); - if (page && PageSlab(page)) { - struct kmem_cache *cache = page->slab_cache; - void *object = nearest_obj(cache, page, addr); - - describe_object(cache, object, addr, tag); + if (info->cache && info->object) { + describe_object(addr, info); + pr_err("\n"); } if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { pr_err("The buggy address belongs to the variable:\n"); pr_err(" %pS\n", addr); + pr_err("\n"); + } + + if (object_is_on_stack(addr)) { + /* + * Currently, KASAN supports printing frame information only + * for accesses to the task's own stack. + */ + kasan_print_address_stack_frame(addr); + pr_err("\n"); + } + + if (is_vmalloc_addr(addr)) { + struct vm_struct *va = find_vm_area(addr); + + if (va) { + pr_err("The buggy address belongs to the virtual mapping at\n" + " [%px, %px) created by:\n" + " %pS\n", + va->addr, va->addr + va->size, va->caller); + pr_err("\n"); + + page = vmalloc_to_page(addr); + } } if (page) { - pr_err("The buggy address belongs to the page:\n"); + pr_err("The buggy address belongs to the physical page:\n"); dump_page(page, "kasan: bad access detected"); + pr_err("\n"); } - - kasan_print_address_stack_frame(addr); } static bool meta_row_is_guilty(const void *row, const void *addr) @@ -324,56 +377,125 @@ static void print_memory_metadata(const void *addr) } } -static bool report_enabled(void) +static void print_report(struct kasan_report_info *info) { -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - if (current->kasan_depth) - return false; -#endif - if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - return true; - return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); + void *addr = kasan_reset_tag(info->access_addr); + u8 tag = get_tag(info->access_addr); + + print_error_description(info); + if (addr_has_metadata(addr)) + kasan_print_tags(tag, info->first_bad_addr); + pr_err("\n"); + + if (addr_has_metadata(addr)) { + print_address_description(addr, tag, info); + print_memory_metadata(info->first_bad_addr); + } else { + dump_stack_lvl(KERN_ERR); + } } -#if IS_ENABLED(CONFIG_KUNIT) -static void kasan_update_kunit_status(struct kunit *cur_test) +static void complete_report_info(struct kasan_report_info *info) { - struct kunit_resource *resource; - struct kunit_kasan_expectation *kasan_data; + void *addr = kasan_reset_tag(info->access_addr); + struct page *page; - resource = kunit_find_named_resource(cur_test, "kasan_data"); + if (info->type == KASAN_REPORT_ACCESS) + info->first_bad_addr = kasan_find_first_bad_addr( + info->access_addr, info->access_size); + else + info->first_bad_addr = addr; - if (!resource) { - kunit_set_failure(cur_test); - return; + page = kasan_addr_to_page(addr); + if (page && PageSlab(page) && page->slab_cache) { + info->cache = page->slab_cache; + info->object = nearest_obj(info->cache, page, addr); + } else + info->cache = 
info->object = NULL; + + switch (info->type) { + case KASAN_REPORT_INVALID_FREE: + info->bug_type = "invalid-free"; + break; + case KASAN_REPORT_DOUBLE_FREE: + info->bug_type = "double-free"; + break; + default: + /* bug_type filled in by kasan_complete_mode_report_info. */ + break; } - kasan_data = (struct kunit_kasan_expectation *)resource->data; - WRITE_ONCE(kasan_data->report_found, true); - kunit_put_resource(resource); + /* Fill in mode-specific report info fields. */ + kasan_complete_mode_report_info(info); } -#endif /* IS_ENABLED(CONFIG_KUNIT) */ -void kasan_report_invalid_free(void *object, unsigned long ip) +void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) { unsigned long flags; - u8 tag = get_tag(object); + struct kasan_report_info info; - object = kasan_reset_tag(object); + /* + * Do not check report_suppressed(), as an invalid-free cannot be + * caused by accessing slab metadata and thus should not be + * suppressed by kasan_disable/enable_current() critical sections. + */ + if (unlikely(!report_enabled())) + return; -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ + start_report(&flags, true); - start_report(&flags); - pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); - kasan_print_tags(tag, object); - pr_err("\n"); - print_address_description(object, tag); - pr_err("\n"); - print_memory_metadata(object); - end_report(&flags, (unsigned long)object); + memset(&info, 0, sizeof(info)); + info.type = type; + info.access_addr = ptr; + info.access_size = 0; + info.is_write = false; + info.ip = ip; + + complete_report_info(&info); + + print_report(&info); + + end_report(&flags, ptr); +} + +/* + * kasan_report() is the only reporting function that uses + * user_access_save/restore(): kasan_report_invalid_free() cannot be called + * from a UACCESS region, and kasan_report_async() is not used on x86. + */ +bool kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) +{ + bool ret = true; + void *ptr = (void *)addr; + unsigned long ua_flags = user_access_save(); + unsigned long irq_flags; + struct kasan_report_info info; + + if (unlikely(report_suppressed()) || unlikely(!report_enabled())) { + ret = false; + goto out; + } + + start_report(&irq_flags, true); + + memset(&info, 0, sizeof(info)); + info.type = KASAN_REPORT_ACCESS; + info.access_addr = ptr; + info.access_size = size; + info.is_write = is_write; + info.ip = ip; + + complete_report_info(&info); + + print_report(&info); + + end_report(&irq_flags, ptr); + +out: + user_access_restore(ua_flags); + + return ret; } #ifdef CONFIG_KASAN_HW_TAGS @@ -381,82 +503,22 @@ void kasan_report_async(void) { unsigned long flags; -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ + /* + * Do not check report_suppressed(), as kasan_disable/enable_current() + * critical sections do not affect Hardware Tag-Based KASAN. 
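
Taken together, the reporting path now splits into distinct stages; a summary sketch of the flow as it appears in the code above:

	/* kasan_report() / kasan_report_invalid_free()
	 *     -> fill type, access_addr, access_size, is_write, ip
	 * complete_report_info()
	 *     -> derive first_bad_addr, cache, object (common code)
	 * kasan_complete_mode_report_info()
	 *     -> mode-specific: bug_type, alloc_track, free_track
	 * print_report()
	 *     -> error description, address info, memory metadata
	 */
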
+ */ + if (unlikely(!report_enabled())) + return; - start_report(&flags); + start_report(&flags, false); pr_err("BUG: KASAN: invalid-access\n"); - pr_err("Asynchronous mode enabled: no access details available\n"); + pr_err("Asynchronous fault: no details available\n"); pr_err("\n"); dump_stack_lvl(KERN_ERR); - end_report(&flags, 0); + end_report(&flags, NULL); } #endif /* CONFIG_KASAN_HW_TAGS */ -static void __kasan_report(unsigned long addr, size_t size, bool is_write, - unsigned long ip) -{ - struct kasan_access_info info; - void *tagged_addr; - void *untagged_addr; - unsigned long flags; - -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ - - disable_trace_on_warning(); - - tagged_addr = (void *)addr; - untagged_addr = kasan_reset_tag(tagged_addr); - - info.access_addr = tagged_addr; - if (addr_has_metadata(untagged_addr)) - info.first_bad_addr = - kasan_find_first_bad_addr(tagged_addr, size); - else - info.first_bad_addr = untagged_addr; - info.access_size = size; - info.is_write = is_write; - info.ip = ip; - - start_report(&flags); - - print_error_description(&info); - if (addr_has_metadata(untagged_addr)) - kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr); - pr_err("\n"); - - if (addr_has_metadata(untagged_addr)) { - print_address_description(untagged_addr, get_tag(tagged_addr)); - pr_err("\n"); - print_memory_metadata(info.first_bad_addr); - } else { - dump_stack_lvl(KERN_ERR); - } - - end_report(&flags, addr); -} - -bool kasan_report(unsigned long addr, size_t size, bool is_write, - unsigned long ip) -{ - unsigned long flags = user_access_save(); - bool ret = false; - - if (likely(report_enabled())) { - __kasan_report(addr, size, is_write, ip); - ret = true; - } - - user_access_restore(flags); - - return ret; -} - #ifdef CONFIG_KASAN_INLINE /* * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 139615ef326b..043c94b04605 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -34,12 +34,16 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { void *p = addr; + if (!addr_has_metadata(p)) + return p; + while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) p += KASAN_GRANULE_SIZE; + return p; } -static const char *get_shadow_bug_type(struct kasan_access_info *info) +static const char *get_shadow_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -62,7 +66,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: - case KASAN_KMALLOC_REDZONE: + case KASAN_SLAB_REDZONE: bug_type = "slab-out-of-bounds"; break; case KASAN_GLOBAL_REDZONE: @@ -74,9 +78,9 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) case KASAN_STACK_PARTIAL: bug_type = "stack-out-of-bounds"; break; - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - case KASAN_KMALLOC_FREETRACK: + case KASAN_PAGE_FREE: + case KASAN_SLAB_FREE: + case KASAN_SLAB_FREETRACK: bug_type = "use-after-free"; break; case KASAN_ALLOCA_LEFT: @@ -91,7 +95,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) return bug_type; } -static const char *get_wild_bug_type(struct kasan_access_info *info) +static const char *get_wild_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; @@ -105,7 +109,7 @@ static const char 
*get_wild_bug_type(struct kasan_access_info *info) return bug_type; } -const char *kasan_get_bug_type(struct kasan_access_info *info) +static const char *get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -123,11 +127,55 @@ const char *kasan_get_bug_type(struct kasan_access_info *info) return get_wild_bug_type(info); } +void kasan_complete_mode_report_info(struct kasan_report_info *info) +{ + struct kasan_alloc_meta *alloc_meta; + struct kasan_free_meta *free_meta; + + if (!info->bug_type) + info->bug_type = get_bug_type(info); + + if (!info->cache || !info->object) + return; + + alloc_meta = kasan_get_alloc_meta(info->cache, info->object); + if (alloc_meta) + memcpy(&info->alloc_track, &alloc_meta->alloc_track, + sizeof(info->alloc_track)); + + if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) { + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + free_meta = kasan_get_free_meta(info->cache, info->object); + memcpy(&info->free_track, &free_meta->free_track, + sizeof(info->free_track)); + } +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; + + if (alloc_meta->aux_stack[0]) { + pr_err("Last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[0]); + pr_err("\n"); + } + if (alloc_meta->aux_stack[1]) { + pr_err("Second to last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[1]); + pr_err("\n"); + } +} + #ifdef CONFIG_KASAN_STACK static bool __must_check tokenize_frame_descr(const char **frame_descr, char *token, size_t max_tok_len, @@ -180,7 +228,7 @@ static void print_decoded_frame_descr(const char *frame_descr) return; pr_err("\n"); - pr_err("this frame has %lu %s:\n", num_objects, + pr_err("This frame has %lu %s:\n", num_objects, num_objects == 1 ? "object" : "objects"); while (num_objects--) { @@ -211,6 +259,7 @@ static void print_decoded_frame_descr(const char *frame_descr) } } +/* Returns true only if the address is on the current task's stack. */ static bool __must_check get_address_stack_frame_info(const void *addr, unsigned long *offset, const char **frame_descr, @@ -224,13 +273,6 @@ static bool __must_check get_address_stack_frame_info(const void *addr, BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP)); - /* - * NOTE: We currently only support printing frame information for - * accesses to the task's own stack. - */ - if (!object_is_on_stack(addr)) - return false; - aligned_addr = round_down((unsigned long)addr, sizeof(long)); mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE); shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr); @@ -269,17 +311,17 @@ void kasan_print_address_stack_frame(const void *addr) const char *frame_descr; const void *frame_pc; + if (WARN_ON(!object_is_on_stack(addr))) + return; + + pr_err("The buggy address belongs to stack of task %s/%d\n", + current->comm, task_pid_nr(current)); + if (!get_address_stack_frame_info(addr, &offset, &frame_descr, &frame_pc)) return; - /* - * get_address_stack_frame_info only returns true if the given addr is - * on the current task's stack. 
- */ - pr_err("\n"); - pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n", - addr, current->comm, task_pid_nr(current), offset); + pr_err(" and is located at offset %lu in frame:\n", offset); pr_err(" %pS\n", frame_pc); if (!frame_descr) diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 5dbbbb930e7a..f3d3be614e4b 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -17,6 +17,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { + /* Return the same value regardless of whether addr_has_metadata(). */ return kasan_reset_tag(addr); } diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index d2298c357834..7a26397297ed 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -35,8 +36,12 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) void *p = kasan_reset_tag(addr); void *end = p + size; + if (!addr_has_metadata(p)) + return p; + while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) p += KASAN_GRANULE_SIZE; + return p; } @@ -51,3 +56,14 @@ void kasan_print_tags(u8 addr_tag, const void *addr) pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow); } + +#ifdef CONFIG_KASAN_STACK +void kasan_print_address_stack_frame(const void *addr) +{ + if (WARN_ON(!object_is_on_stack(addr))) + return; + + pr_err("The buggy address belongs to stack of task %s/%d\n", + current->comm, task_pid_nr(current)); +} +#endif diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 8a319fc16dab..aee2153e09df 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -4,38 +4,14 @@ * Copyright (c) 2020 Google, Inc. */ +#include + #include "kasan.h" -#include "../slab.h" -const char *kasan_get_bug_type(struct kasan_access_info *info) +extern struct kasan_stack_ring stack_ring; + +static const char *get_common_bug_type(struct kasan_report_info *info) { -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - struct kasan_alloc_meta *alloc_meta; - struct kmem_cache *cache; - struct page *page; - const void *addr; - void *object; - u8 tag; - int i; - - tag = get_tag(info->access_addr); - addr = kasan_reset_tag(info->access_addr); - page = kasan_addr_to_page(addr); - if (page && PageSlab(page)) { - cache = page->slab_cache; - object = nearest_obj(cache, page, (void *)addr); - alloc_meta = kasan_get_alloc_meta(cache, object); - - if (alloc_meta) { - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - return "use-after-free"; - } - } - return "out-of-bounds"; - } -#endif - /* * If access_size is a negative number, then it has reason to be * defined as out-of-bounds bug type. @@ -49,3 +25,92 @@ const char *kasan_get_bug_type(struct kasan_access_info *info) return "invalid-access"; } + +void kasan_complete_mode_report_info(struct kasan_report_info *info) +{ + unsigned long flags; + u64 pos, i; + struct kasan_stack_ring_entry *entry; + void *ptr; + u32 pid; + depot_stack_handle_t stack; + bool is_free; + bool alloc_found = false, free_found = false; + + if ((!info->cache || !info->object) && !info->bug_type) { + info->bug_type = get_common_bug_type(info); + return; + } + + write_lock_irqsave(&stack_ring.lock, flags); + + pos = atomic64_read(&stack_ring.pos); + + /* + * The loop below tries to find stack ring entries relevant to the + * buggy object. This is a best-effort process. 
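Note: the stack-ring walk that follows matches entries by both the untagged object address and the pointer tag. The tag helpers it relies on are top-byte arithmetic; a compilable sketch, assuming an arm64 TBI-style layout with the tag in bits 56-63 (constants illustrative):

#include <stdint.h>
#include <stdio.h>

#define TAG_SHIFT 56
#define TAG_MASK  (0xffULL << TAG_SHIFT)

static uint64_t set_tag(uint64_t ptr, uint8_t tag)
{
        return (ptr & ~TAG_MASK) | ((uint64_t)tag << TAG_SHIFT);
}

static uint8_t get_tag(uint64_t ptr)
{
        return ptr >> TAG_SHIFT;
}

static uint64_t reset_tag(uint64_t ptr)       /* kasan_reset_tag() analogue */
{
        return ptr & ~TAG_MASK;
}

int main(void)
{
        uint64_t p = set_tag(0x4000, 0xab);

        printf("tag %02x, untagged %#llx\n",
               get_tag(p), (unsigned long long)reset_tag(p));
        return 0;
}
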
+ * + * First, another object with the same tag can be allocated in place of + * the buggy object. Also, since the number of entries is limited, the + * entries relevant to the buggy object can be overwritten. + */ + + for (i = pos - 1; i != pos - 1 - stack_ring.size; i--) { + if (alloc_found && free_found) + break; + + entry = &stack_ring.entries[i % stack_ring.size]; + + /* Paired with smp_store_release() in save_stack_info(). */ + ptr = (void *)smp_load_acquire(&entry->ptr); + + if (kasan_reset_tag(ptr) != info->object || + get_tag(ptr) != get_tag(info->access_addr)) + continue; + + pid = READ_ONCE(entry->pid); + stack = READ_ONCE(entry->stack); + is_free = READ_ONCE(entry->is_free); + + if (is_free) { + /* + * Second free of the same object. + * Give up on trying to find the alloc entry. + */ + if (free_found) + break; + + info->free_track.pid = pid; + info->free_track.stack = stack; + free_found = true; + + /* + * If a free entry is found first, the bug is likely + * a use-after-free. + */ + if (!info->bug_type) + info->bug_type = "use-after-free"; + } else { + /* Second alloc of the same object. Give up. */ + if (alloc_found) + break; + + info->alloc_track.pid = pid; + info->alloc_track.stack = stack; + alloc_found = true; + + /* + * If an alloc entry is found first, the bug is likely + * an out-of-bounds. + */ + if (!info->bug_type) + info->bug_type = "slab-out-of-bounds"; + } + } + + write_unlock_irqrestore(&stack_ring.lock, flags); + + /* Assign the common bug type if no entries were found. */ + if (!info->bug_type) + info->bug_type = get_common_bug_type(info); +} diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index dd79840e6096..0e3648b603a6 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -254,6 +254,11 @@ core_initcall(kasan_memhotplug_init); #ifdef CONFIG_KASAN_VMALLOC +void __init __weak kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ +} + static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -290,9 +295,22 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) return 0; shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr); - shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size); - shadow_end = ALIGN(shadow_end, PAGE_SIZE); + + /* + * User Mode Linux maps enough shadow memory for all of virtual memory + * at boot, so doesn't need to allocate more on vmalloc, just clear it. + * + * The remaining CONFIG_UML checks in this file exist for the same + * reason. + */ + if (IS_ENABLED(CONFIG_UML)) { + __memset((void *)shadow_start, KASAN_VMALLOC_INVALID, shadow_end - shadow_start); + return 0; + } + + shadow_start = PAGE_ALIGN_DOWN(shadow_start); + shadow_end = PAGE_ALIGN(shadow_end); ret = apply_to_page_range(&init_mm, shadow_start, shadow_end - shadow_start, @@ -340,27 +358,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) return 0; } -/* - * Poison the shadow for a vmalloc region. Called as part of the - * freeing process at the time the region is freed. 
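Note: the kasan_complete_mode_report_info() loop above scans the ring newest to oldest by walking an always-incrementing 64-bit position backwards, letting unsigned wraparound terminate the loop after one full pass. A self-contained sketch of that traversal (toy entry layout, not the kernel structures):

#include <stdbool.h>
#include <stdint.h>

#define RING 8

struct entry {
        void *ptr;
        bool is_free;
};

static struct entry ring[RING];
static uint64_t pos;                  /* total number of records ever written */

/* Scan newest to oldest; stop once both events for @obj were seen. */
static void scan(void *obj, bool *alloc_found, bool *free_found)
{
        uint64_t i;

        for (i = pos - 1; i != pos - 1 - RING; i--) {
                struct entry *e = &ring[i % RING];

                if (e->ptr != obj)
                        continue;
                if (e->is_free)
                        *free_found = true;
                else
                        *alloc_found = true;
                if (*alloc_found && *free_found)
                        break;
        }
}
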
- */ -void kasan_poison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); -} - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - kasan_unpoison(start, size, false); -} - static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -482,6 +479,10 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, if (shadow_end > shadow_start) { size = shadow_end - shadow_start; + if (IS_ENABLED(CONFIG_UML)) { + __memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start); + return; + } apply_to_existing_page_range(&init_mm, (unsigned long)shadow_start, size, kasan_depopulate_vmalloc_pte, @@ -491,9 +492,48 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + /* + * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC + * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored. + * Software KASAN modes can't optimize zeroing memory by combining it + * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. + */ + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Don't tag executable memory with the tag-based mode. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) && + !(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + + start = set_tag(start, kasan_random_tag()); + kasan_unpoison(start, size, false); + return (void *)start; +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. + */ +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + size = round_up(size, KASAN_GRANULE_SIZE); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); +} + #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -508,6 +548,11 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) if (WARN_ON(!PAGE_ALIGNED(shadow_start))) return -EINVAL; + if (IS_ENABLED(CONFIG_UML)) { + __memset((void *)shadow_start, KASAN_SHADOW_INIT, shadow_size); + return 0; + } + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, shadow_start + shadow_size, GFP_KERNEL, @@ -529,8 +574,11 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) return -ENOMEM; } -void kasan_free_shadow(const struct vm_struct *vm) +void kasan_free_module_shadow(const struct vm_struct *vm) { + if (IS_ENABLED(CONFIG_UML)) + return; + if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); } diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index bd3f540feb47..a3afaf2ad1b1 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -42,7 +42,10 @@ void __init kasan_init_sw_tags(void) for_each_possible_cpu(cpu) per_cpu(prng_state, cpu) = (u32)get_cycles(); - pr_info("KernelAddressSanitizer initialized\n"); + kasan_init_tags(); + + pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", + kasan_stack_collection_enabled() ? 
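Note: as in kasan_populate_vmalloc() above, shadow ranges are derived by scaling the address down and are page-aligned outward before being handed to apply_to_page_range(). A sketch of that address math, with an illustrative (not real) shadow offset:

#include <stdio.h>

#define PAGE_SIZE           4096UL
#define SHADOW_SCALE_SHIFT  3          /* one shadow byte covers 8 bytes */
#define SHADOW_OFFSET       0x100000UL /* illustrative; arch-specific in reality */

static unsigned long mem_to_shadow(unsigned long addr)
{
        return (addr >> SHADOW_SCALE_SHIFT) + SHADOW_OFFSET;
}

static unsigned long page_align_down(unsigned long x)
{
        return x & ~(PAGE_SIZE - 1);
}

static unsigned long page_align_up(unsigned long x)
{
        return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
        unsigned long start = mem_to_shadow(0x12345678UL);
        unsigned long end   = mem_to_shadow(0x12345678UL + 8192);

        printf("populate shadow [%#lx, %#lx)\n",
               page_align_down(start), page_align_up(end));
        return 0;
}
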
"on" : "off"); } /* diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 8f48b9502a17..67a222586846 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -6,9 +6,11 @@ * Copyright (c) 2020 Google, Inc. */ +#include #include #include #include +#include #include #include #include @@ -16,44 +18,127 @@ #include #include "kasan.h" +#include "../slab.h" -void kasan_set_free_info(struct kmem_cache *cache, - void *object, u8 tag) +#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10) + +enum kasan_arg_stacktrace { + KASAN_ARG_STACKTRACE_DEFAULT, + KASAN_ARG_STACKTRACE_OFF, + KASAN_ARG_STACKTRACE_ON, +}; + +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; + +/* Whether to collect alloc/free stack traces. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + +/* Non-zero, as initial pointer values are 0. */ +#define STACK_RING_BUSY_PTR ((void *)1) + +struct kasan_stack_ring stack_ring = { + .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) +}; + +/* kasan.stacktrace=off/on */ +static int __init early_kasan_flag_stacktrace(char *arg) { - struct kasan_alloc_meta *alloc_meta; - u8 idx = 0; + if (!arg) + return -EINVAL; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return; + if (!strcmp(arg, "off")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; + else + return -EINVAL; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - idx = alloc_meta->free_track_idx; - alloc_meta->free_pointer_tag[idx] = tag; - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; -#endif - - kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); + return 0; } +early_param("kasan.stacktrace", early_kasan_flag_stacktrace); -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) +/* kasan.stack_ring_size= */ +static int __init early_kasan_flag_stack_ring_size(char *arg) { - struct kasan_alloc_meta *alloc_meta; - int i = 0; + if (!arg) + return -EINVAL; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; + return kstrtoul(arg, 0, &stack_ring.size); +} +early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size); -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - break; +void __init kasan_init_tags(void) +{ + switch (kasan_arg_stacktrace) { + case KASAN_ARG_STACKTRACE_DEFAULT: + /* Default is specified by kasan_flag_stacktrace definition. */ + break; + case KASAN_ARG_STACKTRACE_OFF: + static_branch_disable(&kasan_flag_stacktrace); + break; + case KASAN_ARG_STACKTRACE_ON: + static_branch_enable(&kasan_flag_stacktrace); + break; } - if (i == KASAN_NR_FREE_STACKS) - i = alloc_meta->free_track_idx; -#endif - return &alloc_meta->free_track[i]; + if (kasan_stack_collection_enabled()) { + if (!stack_ring.size) + stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT; + stack_ring.entries = memblock_alloc( + sizeof(stack_ring.entries[0]) * stack_ring.size, + SMP_CACHE_BYTES); + if (WARN_ON(!stack_ring.entries)) + static_branch_disable(&kasan_flag_stacktrace); + } +} + +static void save_stack_info(struct kmem_cache *cache, void *object, + gfp_t gfp_flags, bool is_free) +{ + unsigned long flags; + depot_stack_handle_t stack; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *old_ptr; + + stack = kasan_save_stack(gfp_flags, true); + + /* + * Prevent save_stack_info() from modifying stack ring + * when kasan_complete_mode_report_info() is walking it. 
+ */ + read_lock_irqsave(&stack_ring.lock, flags); + +next: + pos = atomic64_fetch_add(1, &stack_ring.pos); + entry = &stack_ring.entries[pos % stack_ring.size]; + + /* Detect stack ring entry slots that are being written to. */ + old_ptr = READ_ONCE(entry->ptr); + if (old_ptr == STACK_RING_BUSY_PTR) + goto next; /* Busy slot. */ + if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR)) + goto next; /* Busy slot. */ + + WRITE_ONCE(entry->size, cache->object_size); + WRITE_ONCE(entry->pid, current->pid); + WRITE_ONCE(entry->stack, stack); + WRITE_ONCE(entry->is_free, is_free); + + /* + * Paired with smp_load_acquire() in kasan_complete_mode_report_info(). + */ + smp_store_release(&entry->ptr, (s64)object); + + read_unlock_irqrestore(&stack_ring.lock, flags); +} + +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + save_stack_info(cache, object, flags, false); +} + +void kasan_save_free_info(struct kmem_cache *cache, void *object) +{ + save_stack_info(cache, object, GFP_NOWAIT, true); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3afcb1466ec5..9eae1bfdca19 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2336,6 +2336,11 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; + if (!khugepaged_enabled()) { + calculate_min_free_kbytes(); + goto update_wmarks; + } + for_each_populated_zone(zone) { /* * We don't need to worry about fragmentation of @@ -2371,6 +2376,8 @@ static void set_recommended_min_free_kbytes(void) min_free_kbytes = recommended_min; } + +update_wmarks: setup_per_zone_wmarks(); } @@ -2392,12 +2399,11 @@ int start_stop_khugepaged(void) if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); - - set_recommended_min_free_kbytes(); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } + set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b78861b8e013..acd7cbb82e16 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -381,15 +381,20 @@ static void dump_object_info(struct kmemleak_object *object) static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { struct rb_node *rb = object_tree_root.rb_node; + unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); while (rb) { - struct kmemleak_object *object = - rb_entry(rb, struct kmemleak_object, rb_node); - if (ptr < object->pointer) + struct kmemleak_object *object; + unsigned long untagged_objp; + + object = rb_entry(rb, struct kmemleak_object, rb_node); + untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); + + if (untagged_ptr < untagged_objp) rb = object->rb_node.rb_left; - else if (object->pointer + object->size <= ptr) + else if (untagged_objp + object->size <= untagged_ptr) rb = object->rb_node.rb_right; - else if (object->pointer == ptr || alias) + else if (untagged_objp == untagged_ptr || alias) return object; else { kmemleak_warn("Found object by alias at 0x%08lx\n", @@ -576,6 +581,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object, *parent; struct rb_node **link, *rb_parent; unsigned long untagged_ptr; + unsigned long untagged_objp; object = mem_pool_alloc(gfp); if (!object) { @@ -629,9 +635,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, while (*link) { rb_parent = *link; parent = rb_entry(rb_parent, struct kmemleak_object, 
rb_node); - if (ptr + size <= parent->pointer) + untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer); + if (untagged_ptr + size <= untagged_objp) link = &parent->rb_node.rb_left; - else if (parent->pointer + parent->size <= ptr) + else if (untagged_objp + parent->size <= untagged_ptr) link = &parent->rb_node.rb_right; else { kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", diff --git a/mm/ksm.c b/mm/ksm.c index a5716fdec1aa..255bd4888d60 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2617,7 +2617,13 @@ again: struct vm_area_struct *vma; cond_resched(); - anon_vma_lock_read(anon_vma); + if (!anon_vma_trylock_read(anon_vma)) { + if (rwc->try_lock) { + rwc->contended = true; + return; + } + anon_vma_lock_read(anon_vma); + } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; diff --git a/mm/madvise.c b/mm/madvise.c index 6c099f8bb8e6..539ee3502dbf 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -29,6 +31,7 @@ #include #include #include +#include #include @@ -62,83 +65,94 @@ static int madvise_need_mmap_write(int behavior) } } -/* - * We can potentially split a vm area into separate - * areas, each area with its own behavior. - */ -static long madvise_behavior(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) +#ifdef CONFIG_ANON_VMA_NAME +struct anon_vma_name *anon_vma_name_alloc(const char *name) { - struct mm_struct *mm = vma->vm_mm; - int error = 0; - pgoff_t pgoff; - unsigned long new_flags = vma->vm_flags; + struct anon_vma_name *anon_name; + size_t count; - switch (behavior) { - case MADV_NORMAL: - new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; - break; - case MADV_SEQUENTIAL: - new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; - break; - case MADV_RANDOM: - new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; - break; - case MADV_DONTFORK: - new_flags |= VM_DONTCOPY; - break; - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTCOPY; - break; - case MADV_WIPEONFORK: - /* MADV_WIPEONFORK is only supported on anonymous memory. 
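Note: the mm/ksm.c change above follows the try-lock-with-fallback pattern now used by rmap walkers: attempt a non-blocking lock first, and only if the caller opted in via try_lock report contention instead of sleeping. A userspace sketch of the same shape with pthreads (struct and field names illustrative):

#include <pthread.h>
#include <stdbool.h>

struct walk_control {
        bool try_lock;                /* caller prefers giving up over blocking */
        bool contended;
};

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static void walk(struct walk_control *wc)
{
        if (pthread_rwlock_tryrdlock(&lock) != 0) {
                if (wc->try_lock) {
                        wc->contended = true;
                        return;       /* report contention, do not sleep */
                }
                pthread_rwlock_rdlock(&lock);
        }

        /* ... visit items under the read lock ... */

        pthread_rwlock_unlock(&lock);
}
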
*/ - if (vma->vm_file || vma->vm_flags & VM_SHARED) { - error = -EINVAL; - goto out; - } - new_flags |= VM_WIPEONFORK; - break; - case MADV_KEEPONFORK: - new_flags &= ~VM_WIPEONFORK; - break; - case MADV_DONTDUMP: - new_flags |= VM_DONTDUMP; - break; - case MADV_DODUMP: - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTDUMP; - break; - case MADV_MERGEABLE: - case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) - goto out_convert_errno; - break; - case MADV_HUGEPAGE: - case MADV_NOHUGEPAGE: - error = hugepage_madvise(vma, &new_flags, behavior); - if (error) - goto out_convert_errno; - break; + /* Add 1 for NUL terminator at the end of the anon_name->name */ + count = strlen(name) + 1; + anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); + if (anon_name) { + kref_init(&anon_name->kref); + memcpy(anon_name->name, name, count); } - if (new_flags == vma->vm_flags) { + return anon_name; +} + +void anon_vma_name_free(struct kref *kref) +{ + struct anon_vma_name *anon_name = + container_of(kref, struct anon_vma_name, kref); + kfree(anon_name); +} + +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); + + if (vma->vm_file) + return NULL; + + return vma->anon_name; +} + +/* mmap_lock should be write-locked */ +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) +{ + struct anon_vma_name *orig_name = anon_vma_name(vma); + + if (!anon_name) { + vma->anon_name = NULL; + anon_vma_name_put(orig_name); + return 0; + } + + if (anon_vma_name_eq(orig_name, anon_name)) + return 0; + + vma->anon_name = anon_vma_name_reuse(anon_name); + anon_vma_name_put(orig_name); + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) +{ + if (anon_name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ +/* + * Update the vm_flags on region of a vma, splitting it or merging it as + * necessary. Must be called with mmap_sem held for writing; + * Caller should ensure anon_name stability by raising its refcount even when + * anon_name belongs to a valid vma because this function might free that vma. 
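Note: anon_vma_name_alloc() above combines a flexible array member, struct_size() sizing, and kref-based reference counting. A single-threaded userspace stand-in for that lifecycle (plain int in place of kref, names illustrative):

#include <stdlib.h>
#include <string.h>

struct vma_name {
        int refs;
        char name[];                  /* NUL-terminated, allocated inline */
};

static struct vma_name *name_alloc(const char *s)
{
        size_t count = strlen(s) + 1; /* +1 for the NUL terminator */
        struct vma_name *n = malloc(sizeof(*n) + count);

        if (n) {
                n->refs = 1;
                memcpy(n->name, s, count);
        }
        return n;
}

static void name_get(struct vma_name *n)
{
        if (n)
                n->refs++;
}

static void name_put(struct vma_name *n)
{
        if (n && --n->refs == 0)
                free(n);
}
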
+ */ +static int madvise_update_vma(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long new_flags, + struct anon_vma_name *anon_name) +{ + struct mm_struct *mm = vma->vm_mm; + int error; + pgoff_t pgoff; + + if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; - goto out; + return 0; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; goto success; @@ -147,23 +161,19 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, start, 1); if (error) - goto out_convert_errno; + return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, end, 0); if (error) - goto out_convert_errno; + return error; } success: @@ -171,16 +181,13 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_anon_vma_name(vma, anon_name); + if (error) + return error; + } -out_convert_errno: - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; -out: - return error; + return 0; } #ifdef CONFIG_SWAP @@ -933,6 +940,99 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +/* + * Apply an madvise behavior to a region of a vma. madvise_update_vma + * will handle splitting a vm area into separate areas, each area with its own + * behavior. + */ +static int madvise_vma_behavior(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long behavior) +{ + int error; + struct anon_vma_name *anon_name; + unsigned long new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); + case MADV_FREE: + case MADV_DONTNEED: + return madvise_dontneed_free(vma, prev, start, end, behavior); + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return madvise_populate(vma, prev, start, end, behavior); + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) + return -EINVAL; + new_flags &= ~VM_DONTCOPY; + break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return -EINVAL; + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; + case MADV_DONTDUMP: + new_flags |= VM_DONTDUMP; + break; + case MADV_DODUMP: + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + return -EINVAL; + new_flags &= ~VM_DONTDUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + } + + anon_name = anon_vma_name(vma); + anon_vma_name_get(anon_name); + error = madvise_update_vma(vma, prev, start, end, new_flags, + anon_name); + anon_vma_name_put(anon_name); + +out: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. @@ -983,30 +1083,6 @@ static int madvise_inject_error(int behavior, } #endif -static long -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - return madvise_remove(vma, prev, start, end); - case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); - case MADV_COLD: - return madvise_cold(vma, prev, start, end); - case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); - case MADV_FREE: - case MADV_DONTNEED: - return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); - default: - return madvise_behavior(vma, prev, start, end, behavior); - } -} - static bool madvise_behavior_valid(int behavior) { @@ -1060,6 +1136,122 @@ process_madvise_behavior_valid(int behavior) } } +/* + * Walk the vmas in range [start,end), and call the visit function on each one. + * The visit function will get start and end parameters that cover the overlap + * between the current vma and the original range. Any unmapped regions in the + * original range will result in this function returning -ENOMEM while still + * calling the visit function on all of the existing vmas in the range. + * Must be called with the mmap_lock held for reading or writing. + */ +static +int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long arg, + int (*visit)(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long arg)) +{ + struct vm_area_struct *vma; + struct vm_area_struct *prev; + unsigned long tmp; + int unmapped_error = 0; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. + */ + vma = find_vma_prev(mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + int error; + + /* Still start < end. */ + if (!vma) + return -ENOMEM; + + /* Here start < (end|vma->vm_end). 
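Note: the walker being assembled here generalizes to any segmented range: clamp the visit window to each segment, note holes without aborting, and return the hole error only at the end. A self-contained version over a toy segment list (-1 standing in for -ENOMEM):

#include <stddef.h>

struct seg {
        unsigned long start, end;     /* [start, end) */
        struct seg *next;
};

typedef int (*visit_fn)(struct seg *s, unsigned long start, unsigned long end);

static int walk_segs(struct seg *s, unsigned long start, unsigned long end,
                     visit_fn visit)
{
        int unmapped_error = 0;

        for (;;) {
                unsigned long tmp;
                int error;

                if (!s)
                        return -1;    /* ran past the last segment: hole */

                if (start < s->start) {
                        unmapped_error = -1;   /* hole: note it, keep going */
                        start = s->start;
                        if (start >= end)
                                break;
                }

                tmp = s->end < end ? s->end : end;
                error = visit(s, start, tmp);
                if (error)
                        return error;

                start = tmp;
                if (start >= end)
                        break;
                s = s->next;
        }
        return unmapped_error;
}
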
*/ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + break; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = visit(vma, &prev, start, tmp, arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + if (start >= end) + break; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_lock */ + vma = find_vma(mm, start); + } + + return unmapped_error; +} + +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long anon_name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (struct anon_vma_name *)anon_name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, struct anon_vma_name *anon_name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * @@ -1132,34 +1324,32 @@ process_madvise_behavior_valid(int behavior) */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { - unsigned long end, tmp; - struct vm_area_struct *vma, *prev; - int unmapped_error = 0; - int error = -EINVAL; + unsigned long end; + int error; int write; size_t len; struct blk_plug plug; + bool do_plug = true; start = untagged_addr(start); if (!madvise_behavior_valid(behavior)) - return error; + return -EINVAL; if (!PAGE_ALIGNED(start)) - return error; + return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - return error; + return -EINVAL; end = start + len; if (end < start) - return error; + return -EINVAL; - error = 0; if (end == start) - return error; + return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) @@ -1174,52 +1364,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh mmap_read_lock(mm); } - /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - * - different from the way of handling in mlock etc. - */ - vma = find_vma_prev(mm, start, &prev); - if (vma && start > vma->vm_start) - prev = vma; - - blk_start_plug(&plug); - for (;;) { - /* Still start < end. */ - error = -ENOMEM; - if (!vma) - goto out; - - /* Here start < (end|vma->vm_end). 
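Note: madvise_set_anon_name() above repeats the argument validation that do_madvise() performs: page-aligned start, length rounded up a page at a time, and two wraparound checks. The arithmetic in sketch form (-1 standing in for -EINVAL):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Returns 0 on success, -1 on a bad range, mirroring the checks above. */
static int validate_range(unsigned long start, unsigned long len_in,
                          unsigned long *endp)
{
        unsigned long len, end;

        if (start & ~PAGE_MASK)
                return -1;            /* start must be page aligned */

        len = (len_in + ~PAGE_MASK) & PAGE_MASK;   /* round len up */

        /* rounding a small nonzero len can wrap to zero */
        if (len_in && !len)
                return -1;

        end = start + len;
        if (end < start)
                return -1;            /* address space wraparound */

        *endp = end;
        return 0;
}
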
*/ - if (start < vma->vm_start) { - unmapped_error = -ENOMEM; - start = vma->vm_start; - if (start >= end) - goto out; - } - - /* Here vma->vm_start <= start < (end|vma->vm_end) */ - tmp = vma->vm_end; - if (end < tmp) - tmp = end; - - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = madvise_vma(vma, &prev, start, tmp, behavior); - if (error) - goto out; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; - error = unmapped_error; - if (start >= end) - goto out; - if (prev) - vma = prev->vm_next; - else /* madvise_remove dropped mmap_lock */ - vma = find_vma(mm, start); - } -out: - blk_finish_plug(&plug); + trace_android_vh_do_madvise_blk_plug(behavior, &do_plug); + if (do_plug) + blk_start_plug(&plug); + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); + if (do_plug) + blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); else diff --git a/mm/memblock.c b/mm/memblock.c index 838d59a74c65..c5783822bfa6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -287,7 +287,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, { /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE || - end == MEMBLOCK_ALLOC_KASAN) + end == MEMBLOCK_ALLOC_NOLEAKTRACE) end = memblock.current_limit; /* avoid allocating the first page */ @@ -833,6 +833,9 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) kmemleak_free_part_phys(base, size); return memblock_remove_range(&memblock.reserved, base, size); } +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK +EXPORT_SYMBOL_GPL(memblock_free); +#endif int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { @@ -1388,8 +1391,11 @@ again: return 0; done: - /* Skip kmemleak for kasan_init() due to high volume. */ - if (end != MEMBLOCK_ALLOC_KASAN) + /* + * Skip kmemleak for those places like kasan_init() and + * early_pgtable_alloc() due to high volume. + */ + if (end != MEMBLOCK_ALLOC_NOLEAKTRACE) /* * The min_count is set to 0 so that memblock allocated * blocks are never reported as leaks. 
This is because many @@ -1652,6 +1658,7 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } +EXPORT_SYMBOL_GPL(memblock_end_of_DRAM); static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d3364cd4ff1..16d96ca71052 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -70,6 +70,9 @@ #include #include +#include + +#include struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); @@ -1400,6 +1403,7 @@ static const struct memory_stat memory_stats[] = { { "file", NR_FILE_PAGES }, { "kernel_stack", NR_KERNEL_STACK_KB }, { "pagetables", NR_PAGETABLE }, + { "sec_pagetables", NR_SECONDARY_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, { "shmem", NR_SHMEM }, @@ -2798,6 +2802,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() */ page->memcg_data = (unsigned long)memcg; } @@ -5079,6 +5084,7 @@ static DEFINE_IDR(mem_cgroup_idr); static void mem_cgroup_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { + trace_android_vh_mem_cgroup_id_remove(memcg); idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; } @@ -5116,6 +5122,7 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) WARN_ON_ONCE(!rcu_read_lock_held()); return idr_find(&mem_cgroup_idr, id); } +EXPORT_SYMBOL_GPL(mem_cgroup_from_id); static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { @@ -5166,6 +5173,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + trace_android_vh_mem_cgroup_free(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); @@ -5174,6 +5182,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { + lru_gen_exit_memcg(memcg); memcg_wb_domain_exit(memcg); __mem_cgroup_free(memcg); } @@ -5221,6 +5230,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); memcg->socket_pressure = jiffies; + trace_android_rvh_memcgv2_init(memcg); #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; INIT_LIST_HEAD(&memcg->objcg_list); @@ -5237,6 +5247,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + lru_gen_init_memcg(memcg); + trace_android_vh_mem_cgroup_alloc(memcg); return memcg; fail: mem_cgroup_id_remove(memcg); @@ -5314,6 +5326,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + trace_android_vh_mem_cgroup_css_online(css, memcg); return 0; } @@ -5322,6 +5335,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; + trace_android_vh_mem_cgroup_css_offline(css, memcg); /* * Unregister events and notify userspace. 
* Notify userspace about cgroup removing only after rmdir of cgroup @@ -6205,6 +6219,30 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_LRU_GEN +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + + /* find the first leader if there is any */ + cgroup_taskset_for_each_leader(task, css, tset) + break; + + if (!task) + return; + + task_lock(task); + if (task->mm && READ_ONCE(task->mm->owner) == task) + lru_gen_migrate_mm(task->mm); + task_unlock(task); +} +#else +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ +} +#endif /* CONFIG_LRU_GEN */ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6548,6 +6586,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, .dfl_cftypes = memory_files, @@ -6710,6 +6749,8 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, if (!parent) return; + trace_android_rvh_memcgv2_calc_decayed_watermark(memcg); + if (parent == root) { memcg->memory.emin = READ_ONCE(memcg->memory.min); memcg->memory.elow = READ_ONCE(memcg->memory.low); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2ad0f4580091..bd816f936f4a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -491,7 +491,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma_read(page); + av = page_lock_anon_vma_read(page, NULL); if (av == NULL) /* Not actually mapped anymore */ return; diff --git a/mm/memory.c b/mm/memory.c index a4d0f744a458..9c3f59c77503 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -73,6 +73,7 @@ #include #include #include +#include #include @@ -121,18 +122,6 @@ int randomize_va_space __read_mostly = 2; #endif -#ifndef arch_faults_on_old_pte -static inline bool arch_faults_on_old_pte(void) -{ - /* - * Those arches which don't have hw access flag feature need to - * implement their own helper. By default, "true" means pagefault - * will be hit on old pte. - */ - return true; -} -#endif - #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { @@ -171,6 +160,7 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) { trace_rss_stat(mm, member, count); } +EXPORT_SYMBOL_GPL(mm_trace_rss_stat); #if defined(SPLIT_RSS_COUNTING) @@ -219,6 +209,35 @@ static void check_sync_rss_stat(struct task_struct *task) #endif /* SPLIT_RSS_COUNTING */ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + +struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + rcu_read_lock(); + vma = find_vma_from_tree(mm, addr); + if (vma) { + if (vma->vm_start > addr || + !atomic_inc_unless_negative(&vma->file_ref_count)) + vma = NULL; + } + rcu_read_unlock(); + + return vma; +} + +void put_vma(struct vm_area_struct *vma) +{ + int new_ref_count; + + new_ref_count = atomic_dec_return(&vma->file_ref_count); + if (new_ref_count < 0) + vm_area_free_no_check(vma); +} + +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
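Note: get_vma()/put_vma() above hinge on an inc-unless-negative refcount: a negative count marks a VMA already headed for freeing, so lookups must fail rather than resurrect it, and the dropper that takes the count below zero frees the object. Roughly, in C11 atomics (toy helpers, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only while the count is non-negative; a negative
 * count means the object is already on its way to being freed. */
static bool get_ref(_Atomic int *refs)
{
        int v = atomic_load(refs);

        while (v >= 0) {
                if (atomic_compare_exchange_weak(refs, &v, v + 1))
                        return true;  /* reference taken */
                /* v was reloaded by the failed CAS; retry */
        }
        return false;
}

/* Returns true when the caller must free the object. */
static bool put_ref(_Atomic int *refs)
{
        return atomic_fetch_sub(refs, 1) - 1 < 0;
}
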
@@ -227,6 +246,16 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + /* + * Ensure page table destruction is blocked if __pte_map_lock managed + * to take this lock. Without this barrier tlb_remove_table_rcu can + * destroy ptl after __pte_map_lock locked it and during unlock would + * cause a use-after-free. + */ + spinlock_t *ptl = pmd_lock(tlb->mm, pmd); + spin_unlock(ptl); +#endif pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); @@ -1324,15 +1353,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; + int v_ret = 0; tlb_change_page_size(tlb, PAGE_SIZE); again: + trace_android_vh_zap_pte_range_tlb_start(&v_ret); init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { + bool flush = false; pte_t ptent = *pte; if (pte_none(ptent)) continue; @@ -1373,7 +1405,8 @@ again: page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - if (unlikely(__tlb_remove_page(tlb, page))) { + trace_android_vh_zap_pte_range_tlb_force_flush(page, &flush); + if (unlikely(__tlb_remove_page(tlb, page)) || flush) { force_flush = 1; addr += PAGE_SIZE; break; @@ -1445,6 +1478,7 @@ again: tlb_flush_mmu(tlb); } + trace_android_vh_zap_pte_range_tlb_end(&v_ret); if (addr != end) { cond_resched(); goto again; @@ -2729,6 +2763,99 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL_GPL(apply_to_existing_page_range); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + +/* + * speculative_page_walk_begin() ... speculative_page_walk_end() protects + * against races with page table reclamation. + * + * This is similar to what fast GUP does, but fast GUP also needs to + * protect against races with THP page splitting, so it always needs + * to disable interrupts. + * Speculative page faults need to protect against page table reclamation, + * even with MMU_GATHER_RCU_TABLE_FREE case page table removal slow-path is + * not RCU-safe (see comment inside tlb_remove_table_sync_one), therefore + * we still have to disable IRQs. + */ +#define speculative_page_walk_begin() local_irq_disable() +#define speculative_page_walk_end() local_irq_enable() + +bool __pte_map_lock(struct vm_fault *vmf) +{ + pmd_t pmdval; + pte_t *pte = vmf->pte; + spinlock_t *ptl; + + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) { + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + if (!pte) + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + spin_lock(vmf->ptl); + return true; + } + + speculative_page_walk_begin(); + if (!mmap_seq_read_check(vmf->vma->vm_mm, vmf->seq, + SPF_ABORT_PTE_MAP_LOCK_SEQ1)) + goto fail; + /* + * The mmap sequence count check guarantees that the page + * tables are still valid at that point, and + * speculative_page_walk_begin() ensures that they stay around. + */ + /* + * We check if the pmd value is still the same to ensure that there + * is not a huge collapse operation in progress in our back. + * It also ensures that pmd was not cleared by pmd_clear in + * free_pte_range and ptl is still valid. 
+ */ + pmdval = READ_ONCE(*vmf->pmd); + if (!pmd_same(pmdval, vmf->orig_pmd)) { + count_vm_spf_event(SPF_ABORT_PTE_MAP_LOCK_PMD); + goto fail; + } + ptl = pte_lockptr(vmf->vma->vm_mm, &pmdval); + if (!pte) + pte = pte_offset_map(&pmdval, vmf->address); + /* + * Try locking the page table. + * + * Note that we might race against zap_pte_range() which + * invalidates TLBs while holding the page table lock. + * We are still under the speculative_page_walk_begin() section, + * and zap_pte_range() could thus deadlock with us if we tried + * using spin_lock() here. + * + * We also don't want to retry until spin_trylock() succeeds, + * because of the starvation potential against a stream of lockers. + */ + if (unlikely(!spin_trylock(ptl))) { + count_vm_spf_event(SPF_ABORT_PTE_MAP_LOCK_PTL); + goto fail; + } + /* + * The check below will fail if __pte_map_lock passed its ptl barrier + * before we took the ptl lock. + */ + if (!mmap_seq_read_check(vmf->vma->vm_mm, vmf->seq, + SPF_ABORT_PTE_MAP_LOCK_SEQ2)) + goto unlock_fail; + speculative_page_walk_end(); + vmf->pte = pte; + vmf->ptl = ptl; + return true; + +unlock_fail: + spin_unlock(ptl); +fail: + if (pte) + pte_unmap(pte); + speculative_page_walk_end(); + return false; +} + +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. Before making any commitment, on those architectures @@ -2782,7 +2909,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ - if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { + if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); @@ -3003,20 +3130,28 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; + vm_fault_t ret = VM_FAULT_OOM; - if (unlikely(anon_vma_prepare(vma))) - goto oom; + if (unlikely(!vma->anon_vma)) { + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + count_vm_spf_event(SPF_ABORT_ANON_VMA); + ret = VM_FAULT_RETRY; + goto out; + } + if (__anon_vma_prepare(vma)) + goto out; + } if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!new_page) - goto oom; + goto out; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) - goto oom; + goto out; if (!cow_user_page(new_page, old_page, vmf)) { /* @@ -3033,11 +3168,16 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) - goto oom_free_new; + goto out_free_new; cgroup_throttle_swaprate(new_page, GFP_KERNEL); __SetPageUptodate(new_page); + if ((vmf->flags & FAULT_FLAG_SPECULATIVE) && + !mmu_notifier_trylock(mm)) { + ret = VM_FAULT_RETRY; + goto out_free_new; + } mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); @@ -3046,7 +3186,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) /* * Re-check the pte - we dropped the lock */ - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!pte_map_lock(vmf)) { + ret = VM_FAULT_RETRY; + /* put_page() will uncharge the page */ + goto out_notify; + } if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { @@ -3121,6 +3265,8 @@ static vm_fault_t 
wp_page_copy(struct vm_fault *vmf) * the above ptep_clear_flush_notify() did already call it. */ mmu_notifier_invalidate_range_only_end(&range); + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + mmu_notifier_unlock(mm); if (old_page) { /* * Don't let another task, with possibly unlocked vma, @@ -3137,12 +3283,16 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) put_page(old_page); } return page_copied ? VM_FAULT_WRITE : 0; -oom_free_new: +out_notify: + mmu_notifier_invalidate_range_only_end(&range); + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + mmu_notifier_unlock(mm); +out_free_new: put_page(new_page); -oom: +out: if (old_page) put_page(old_page); - return VM_FAULT_OOM; + return ret; } /** @@ -3187,6 +3337,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + VM_BUG_ON(vmf->flags & FAULT_FLAG_SPECULATIVE); if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { vm_fault_t ret; @@ -3207,6 +3358,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = VM_FAULT_WRITE; + VM_BUG_ON(vmf->flags & FAULT_FLAG_SPECULATIVE); + get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { @@ -3258,8 +3411,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + count_vm_spf_event(SPF_ATTEMPT_WP); + if (userfaultfd_pte_wp(vma, *vmf->pte)) { pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + count_vm_spf_event(SPF_ABORT_USERFAULTFD); + return VM_FAULT_RETRY; + } return handle_userfault(vmf, VM_UFFD_WP); } @@ -3285,6 +3445,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) return wp_pfn_shared(vmf); pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->pte = NULL; return wp_page_copy(vmf); } @@ -3323,6 +3484,7 @@ copy: get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->pte = NULL; return wp_page_copy(vmf); } @@ -3501,6 +3663,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vm_fault_t ret = 0; void *shadow = NULL; + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + pte_unmap(vmf->pte); + count_vm_spf_event(SPF_ABORT_SWAP); + return VM_FAULT_RETRY; + } + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; @@ -3537,8 +3705,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); + gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_CMA; + + trace_android_rvh_set_skip_swapcache_flags(&flags); + page = alloc_page_vma(flags, vma, vmf->address); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); @@ -3562,7 +3732,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) set_page_private(page, 0); } } else { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE | __GFP_CMA, vmf); swapcache = page; } @@ -3728,14 +3899,21 @@ out_release: static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page; + struct page *page = NULL; vm_fault_t ret = 0; pte_t entry; + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + count_vm_spf_event(SPF_ATTEMPT_ANON); + /* File mapping without ->vm_ops ? 
*/ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; + /* Do not check unstable pmd, if it's changed will retry later */ + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + goto skip_pmd_checks; + /* * Use pte_alloc() instead of pte_alloc_map(). We can't run * pte_offset_map() on pmds where a huge pmd might be created @@ -3749,87 +3927,90 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; - /* See comment in handle_pte_fault() */ + /* See comment in __handle_mm_fault() */ if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; +skip_pmd_checks: /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) { - update_mmu_tlb(vma, vmf->address, vmf->pte); - goto unlock; + } else { + /* Allocate our own private page. */ + if (unlikely(!vma->anon_vma)) { + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + count_vm_spf_event(SPF_ABORT_ANON_VMA); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + goto oom; } - ret = check_stable_address_space(vma->vm_mm); - if (ret) - goto unlock; - /* Deliver the page fault to userland, check inside PT lock */ - if (userfaultfd_missing(vma)) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return handle_userfault(vmf, VM_UFFD_MISSING); - } - goto setpte; + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); + if (!page) + goto oom; + + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + goto oom_free_page; + cgroup_throttle_swaprate(page, GFP_KERNEL); + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); } - /* Allocate our own private page. */ - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_zeroed_user_highpage_movable(vma, vmf->address); - if (!page) - goto oom; - - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) - goto oom_free_page; - cgroup_throttle_swaprate(page, GFP_KERNEL); - - /* - * The memory barrier inside __SetPageUptodate makes sure that - * preceding stores to the page contents become visible before - * the set_pte_at() write. 
- */ - __SetPageUptodate(page); - - entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); - - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); - if (!pte_none(*vmf->pte)) { - update_mmu_cache(vma, vmf->address, vmf->pte); + if (!pte_map_lock(vmf)) { + ret = VM_FAULT_RETRY; goto release; } + if (!pte_none(*vmf->pte)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); + goto unlock; + } ret = check_stable_address_space(vma->vm_mm); if (ret) - goto release; + goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - put_page(page); + if (page) + put_page(page); + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + count_vm_spf_event(SPF_ABORT_USERFAULTFD); + return VM_FAULT_RETRY; + } return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); - lru_cache_add_inactive_or_unevictable(page, vma); -setpte: + if (page) { + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, vmf->address, false); + lru_cache_add_inactive_or_unevictable(page, vma); + } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); - return ret; release: - put_page(page); - goto unlock; + if (page) + put_page(page); + return ret; oom_free_page: put_page(page); oom: @@ -3846,29 +4027,51 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - /* - * Preallocate pte before we take page_lock because this might lead to - * deadlocks for memcg reclaim which waits for pages under writeback: - * lock_page(A) - * SetPageWriteback(A) - * unlock_page(A) - * lock_page(B) - * lock_page(B) - * pte_alloc_one - * shrink_page_list - * wait_on_page_writeback(A) - * SetPageWriteback(B) - * unlock_page(B) - * # flush A, B to clear the writeback - */ - if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { - vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); - if (!vmf->prealloc_pte) - return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + rcu_read_lock(); + if (!mmap_seq_read_check(vmf->vma->vm_mm, vmf->seq, + SPF_ABORT_FAULT)) { + ret = VM_FAULT_RETRY; + } else { + /* + * The mmap sequence count check guarantees that the + * vma we fetched at the start of the fault was still + * current at that point in time. The rcu read lock + * ensures vmf->vma->vm_file stays valid. 
+ */ + ret = vma->vm_ops->fault(vmf); + } + rcu_read_unlock(); + } else +#endif + { + /* + * Preallocate pte before we take page_lock because + * this might lead to deadlocks for memcg reclaim + * which waits for pages under writeback: + * lock_page(A) + * SetPageWriteback(A) + * unlock_page(A) + * lock_page(B) + * lock_page(B) + * pte_alloc_one + * shrink_page_list + * wait_on_page_writeback(A) + * SetPageWriteback(B) + * unlock_page(B) + * # flush A, B to clear writeback + */ + if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); + if (!vmf->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + + ret = vma->vm_ops->fault(vmf); } - ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; @@ -4051,35 +4254,37 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } - if (pmd_none(*vmf->pmd)) { - if (PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); - if (ret != VM_FAULT_FALLBACK) - return ret; + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) { + if (pmd_none(*vmf->pmd)) { + if (PageTransCompound(page)) { + ret = do_set_pmd(vmf, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } + + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (likely(pmd_none(*vmf->pmd))) { + mm_inc_nr_ptes(vma->vm_mm); + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + spin_unlock(vmf->ptl); + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { + return VM_FAULT_OOM; + } } - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (likely(pmd_none(*vmf->pmd))) { - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - vmf->prealloc_pte = NULL; - } - spin_unlock(vmf->ptl); - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { - return VM_FAULT_OOM; - } + /* + * See comment in handle_pte_fault() for how this scenario happens, we + * need to return NOPAGE so that we drop this page. + */ + if (pmd_devmap_trans_unstable(vmf->pmd)) + return VM_FAULT_NOPAGE; } - /* - * See comment in handle_pte_fault() for how this scenario happens, we - * need to return NOPAGE so that we drop this page. 
- */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return VM_FAULT_NOPAGE; - - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); + if (!pte_map_lock(vmf)) + return VM_FAULT_RETRY; ret = 0; /* Re-check under ptl */ if (likely(pte_none(*vmf->pte))) @@ -4158,6 +4363,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off; + vm_fault_t ret; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; @@ -4176,14 +4382,32 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); - if (pmd_none(*vmf->pmd)) { + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && + pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); + rcu_read_lock(); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + if (!mmap_seq_read_check(vmf->vma->vm_mm, vmf->seq, + SPF_ABORT_FAULT)) { + rcu_read_unlock(); + return VM_FAULT_RETRY; + } + /* + * the mmap sequence check verified that vmf->vma was still + * current at that point in time. + * The rcu read lock ensures vmf->vma->vm_file stays valid. + */ + } +#endif + ret = vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); + rcu_read_unlock(); + return ret; } static vm_fault_t do_read_fault(struct vm_fault *vmf) @@ -4220,8 +4444,14 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + if (unlikely(!vma->anon_vma)) { + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { + count_vm_spf_event(SPF_ABORT_ANON_VMA); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + return VM_FAULT_OOM; + } vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) @@ -4258,6 +4488,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; + VM_BUG_ON(vmf->flags & FAULT_FLAG_SPECULATIVE); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -4302,10 +4534,15 @@ static vm_fault_t do_fault(struct vm_fault *vmf) struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + count_vm_spf_event(SPF_ATTEMPT_FILE); + /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { + VM_BUG_ON(vmf->flags & FAULT_FLAG_SPECULATIVE); + /* * If we find a migration pmd entry or a none pmd entry, which * should never happen, return SIGBUS @@ -4371,13 +4608,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + count_vm_spf_event(SPF_ATTEMPT_NUMA); + /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. 
*/ - vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); - spin_lock(vmf->ptl); + if (!pte_spinlock(vmf)) + return VM_FAULT_RETRY; if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; @@ -4540,53 +4780,6 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if (unlikely(pmd_none(*vmf->pmd))) { - /* - * Leave __pte_alloc() until later: because vm_ops->fault may - * want to allocate huge page, and if we expose page table - * for an instant, it will be difficult to retract from - * concurrent faults and from rmap lookups. - */ - vmf->pte = NULL; - } else { - /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead - * of pmd_trans_huge() to ensure the pmd didn't become - * pmd_trans_huge under us and then back to pmd_none, as a - * result of MADV_DONTNEED running immediately after a huge pmd - * fault in a different thread of this mm, in turn leading to a - * misleading pmd_trans_huge() retval. All we have to ensure is - * that it is a regular pmd that we can walk with - * pte_offset_map() and we can do that through an atomic read - * in C, which is what pmd_trans_unstable() provides. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return 0; - /* - * A regular pmd is established and it can't morph into a huge - * pmd from under us anymore at this point because we hold the - * mmap_lock read mode and khugepaged takes it in write mode. - * So now it's safe to run pte_offset_map(). - */ - vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - vmf->orig_pte = *vmf->pte; - - /* - * some architectures can have larger ptes than wordsize, - * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and - * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic - * accesses. The code below just needs a consistent view - * for the ifs and we later double check anyway with the - * ptl lock held. So here a barrier will do. - */ - barrier(); - if (pte_none(vmf->orig_pte)) { - pte_unmap(vmf->pte); - vmf->pte = NULL; - } - } - if (!vmf->pte) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); @@ -4600,8 +4793,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); - vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); - spin_lock(vmf->ptl); + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + count_vm_spf_event(SPF_ATTEMPT_PTE); + + if (!pte_spinlock(vmf)) + return VM_FAULT_RETRY; entry = vmf->orig_pte; if (unlikely(!pte_same(*vmf->pte, entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); @@ -4641,7 +4837,7 @@ unlock: * return value. See filemap_fault() and __lock_page_or_retry(). 
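Both call sites above depend on pte_spinlock() (and its sibling pte_map_lock(), used earlier in do_anonymous_page() and finish_fault()), which this series introduces outside this excerpt. A minimal sketch of the contract the callers assume follows; pte_map_lock() would additionally map the pte via pte_offset_map() before locking, and the reuse of SPF_ABORT_FAULT here is illustrative only -- the series may count a dedicated abort event instead.

/*
 * Sketch only -- the real pte_spinlock()/pte_map_lock() live elsewhere in
 * this series. On the speculative path the PTE lock may only be taken if
 * no mmap_lock writer ran since vmf->seq was sampled; otherwise the fault
 * is retried under the classic mmap_lock protocol.
 */
static bool pte_spinlock(struct vm_fault *vmf)
{
	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
		spin_lock(vmf->ptl);
		return true;
	}
	if (!spin_trylock(vmf->ptl))
		return false;	/* caller returns VM_FAULT_RETRY */
	if (!mmap_seq_read_check(vmf->vma->vm_mm, vmf->seq,
				 SPF_ABORT_FAULT)) {
		spin_unlock(vmf->ptl);
		return false;
	}
	return true;
}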
 */
 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags)
+		unsigned long address, unsigned int flags, unsigned long seq)
 {
 	struct vm_fault vmf = {
 		.vma = vma,
@@ -4656,6 +4852,98 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 	p4d_t *p4d;
 	vm_fault_t ret;
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	if (flags & FAULT_FLAG_SPECULATIVE) {
+		pgd_t pgdval;
+		p4d_t p4dval;
+		pud_t pudval;
+
+		vmf.seq = seq;
+
+		speculative_page_walk_begin();
+		pgd = pgd_offset(mm, address);
+		pgdval = READ_ONCE(*pgd);
+		if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval))) {
+			count_vm_spf_event(SPF_ABORT_PUD);
+			goto spf_fail;
+		}
+
+		p4d = p4d_offset(pgd, address);
+		if (pgd_val(READ_ONCE(*pgd)) != pgd_val(pgdval))
+			goto spf_fail;
+		p4dval = READ_ONCE(*p4d);
+		if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval))) {
+			count_vm_spf_event(SPF_ABORT_PUD);
+			goto spf_fail;
+		}
+
+		vmf.pud = pud_offset(p4d, address);
+		if (p4d_val(READ_ONCE(*p4d)) != p4d_val(p4dval))
+			goto spf_fail;
+		pudval = READ_ONCE(*vmf.pud);
+		if (pud_none(pudval) || unlikely(pud_bad(pudval)) ||
+		    unlikely(pud_trans_huge(pudval)) ||
+		    unlikely(pud_devmap(pudval))) {
+			count_vm_spf_event(SPF_ABORT_PUD);
+			goto spf_fail;
+		}
+
+		vmf.pmd = pmd_offset(vmf.pud, address);
+		if (pud_val(READ_ONCE(*vmf.pud)) != pud_val(pudval))
+			goto spf_fail;
+		vmf.orig_pmd = READ_ONCE(*vmf.pmd);
+
+		/*
+		 * pmd_none could mean that a hugepage collapse is in
+		 * progress behind our back, as collapse_huge_page() marks
+		 * it before invalidating the pte (which is done once the
+		 * IPI has been caught by all CPUs and interrupts are
+		 * disabled). For this reason we cannot handle THP in a
+		 * speculative way, since we can't safely identify an
+		 * in-progress collapse operation done behind our back on
+		 * that PMD.
+		 */
+		if (unlikely(pmd_none(vmf.orig_pmd) ||
+			     is_swap_pmd(vmf.orig_pmd) ||
+			     pmd_trans_huge(vmf.orig_pmd) ||
+			     pmd_devmap(vmf.orig_pmd))) {
+			count_vm_spf_event(SPF_ABORT_PMD);
+			goto spf_fail;
+		}
+
+		/*
+		 * The above does not allocate/instantiate page-tables because
+		 * doing so would lead to the possibility of instantiating
+		 * page-tables after free_pgtables() -- and consequently
+		 * leaking them.
+		 *
+		 * The result is that we take at least one non-speculative
+		 * fault per PMD in order to instantiate it.
+		 */
+
+		vmf.pte = pte_offset_map(vmf.pmd, address);
+		if (pmd_val(READ_ONCE(*vmf.pmd)) != pmd_val(vmf.orig_pmd)) {
+			pte_unmap(vmf.pte);
+			vmf.pte = NULL;
+			goto spf_fail;
+		}
+		vmf.orig_pte = READ_ONCE(*vmf.pte);
+		barrier();
+		if (pte_none(vmf.orig_pte)) {
+			pte_unmap(vmf.pte);
+			vmf.pte = NULL;
+		}
+
+		speculative_page_walk_end();
+
+		return handle_pte_fault(&vmf);
+
+	spf_fail:
+		speculative_page_walk_end();
+		return VM_FAULT_RETRY;
+	}
+#endif	/* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 	pgd = pgd_offset(mm, address);
 	p4d = p4d_alloc(mm, pgd, address);
 	if (!p4d)
@@ -4726,6 +5014,53 @@ retry_pud:
 		}
 	}
 
+	if (unlikely(pmd_none(*vmf.pmd))) {
+		/*
+		 * Leave __pte_alloc() until later: because vm_ops->fault may
+		 * want to allocate huge page, and if we expose page table
+		 * for an instant, it will be difficult to retract from
+		 * concurrent faults and from rmap lookups.
+		 */
+		vmf.pte = NULL;
+	} else {
+		/*
+		 * If a huge pmd materialized under us just retry later.
Use + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead + * of pmd_trans_huge() to ensure the pmd didn't become + * pmd_trans_huge under us and then back to pmd_none, as a + * result of MADV_DONTNEED running immediately after a huge pmd + * fault in a different thread of this mm, in turn leading to a + * misleading pmd_trans_huge() retval. All we have to ensure is + * that it is a regular pmd that we can walk with + * pte_offset_map() and we can do that through an atomic read + * in C, which is what pmd_trans_unstable() provides. + */ + if (pmd_devmap_trans_unstable(vmf.pmd)) + return 0; + /* + * A regular pmd is established and it can't morph into a huge + * pmd from under us anymore at this point because we hold the + * mmap_lock read mode and khugepaged takes it in write mode. + * So now it's safe to run pte_offset_map(). + */ + vmf.pte = pte_offset_map(vmf.pmd, vmf.address); + vmf.orig_pte = *vmf.pte; + + /* + * some architectures can have larger ptes than wordsize, + * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and + * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic + * accesses. The code below just needs a consistent view + * for the ifs and we later double check anyway with the + * ptl lock held. So here a barrier will do. + */ + barrier(); + if (pte_none(vmf.orig_pte)) { + pte_unmap(vmf.pte); + vmf.pte = NULL; + } + } + return handle_pte_fault(&vmf); } @@ -4790,17 +5125,42 @@ static inline void mm_account_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } +#ifdef CONFIG_LRU_GEN +static void lru_gen_enter_fault(struct vm_area_struct *vma) +{ + /* the LRU algorithm doesn't apply to sequential or random reads */ + current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); +} + +static void lru_gen_exit_fault(void) +{ + current->in_lru_fault = false; +} +#else +static void lru_gen_enter_fault(struct vm_area_struct *vma) +{ +} + +static void lru_gen_exit_fault(void) +{ +} +#endif /* CONFIG_LRU_GEN */ + /* * By the time we get here, we already hold the mm semaphore * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). 
 */
-vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
-			   unsigned int flags, struct pt_regs *regs)
+vm_fault_t do_handle_mm_fault(struct vm_area_struct *vma,
+			unsigned long address, unsigned int flags,
+			unsigned long seq, struct pt_regs *regs)
 {
 	vm_fault_t ret;
 
+	VM_BUG_ON((flags & FAULT_FLAG_SPECULATIVE) &&
+		  !vma_can_speculate(vma, flags));
+
 	__set_current_state(TASK_RUNNING);
 
 	count_vm_event(PGFAULT);
@@ -4821,10 +5181,16 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (flags & FAULT_FLAG_USER)
 		mem_cgroup_enter_user_fault();
 
-	if (unlikely(is_vm_hugetlb_page(vma)))
+	lru_gen_enter_fault(vma);
+
+	if (unlikely(is_vm_hugetlb_page(vma))) {
+		VM_BUG_ON(flags & FAULT_FLAG_SPECULATIVE);
 		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
-	else
-		ret = __handle_mm_fault(vma, address, flags);
+	} else {
+		ret = __handle_mm_fault(vma, address, flags, seq);
+	}
+
+	lru_gen_exit_fault();
 
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();
@@ -4842,7 +5208,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	return ret;
 }
-EXPORT_SYMBOL_GPL(handle_mm_fault);
+EXPORT_SYMBOL_GPL(do_handle_mm_fault);
 
 #ifndef __PAGETABLE_P4D_FOLDED
 /*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fd0be32a281..ddb82d1082a2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1490,6 +1490,43 @@ int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
 }
 EXPORT_SYMBOL_GPL(add_memory);
 
+int add_memory_subsection(int nid, u64 start, u64 size)
+{
+	struct mhp_params params = { .pgprot = PAGE_KERNEL };
+	struct resource *res;
+	int ret;
+
+	if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
+	    !IS_ALIGNED(size, SUBSECTION_SIZE)) {
+		pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
+		       __func__, start, size);
+		return -EINVAL;
+	}
+
+	res = register_memory_resource(start, size, "System RAM");
+	if (IS_ERR(res))
+		return PTR_ERR(res);
+
+	mem_hotplug_begin();
+
+	nid = memory_add_physaddr_to_nid(start);
+
+	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+		memblock_add_node(start, size, nid);
+
+	ret = arch_add_memory(nid, start, size, &params);
+	if (ret) {
+		if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+			memblock_remove(start, size);
+		pr_err("%s failed to add subsection start 0x%llx size 0x%llx\n",
+		       __func__, start, size);
+	}
+	mem_hotplug_done();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory_subsection);
+
 /*
 * Add special, driver-managed memory to the system as system RAM.
Such * memory is not exposed via the raw firmware-provided memmap as system @@ -1930,7 +1967,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE); + MEMORY_OFFLINE | REPORT_FAILURE, NULL); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; @@ -1984,7 +2021,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, goto failed_removal_isolated; } - ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); + ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE, NULL); } while (ret); @@ -2253,6 +2290,29 @@ int remove_memory(u64 start, u64 size) } EXPORT_SYMBOL_GPL(remove_memory); +int remove_memory_subsection(u64 start, u64 size) +{ + if (!IS_ALIGNED(start, SUBSECTION_SIZE) || + !IS_ALIGNED(size, SUBSECTION_SIZE)) { + pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n", + __func__, start, size); + return -EINVAL; + } + + mem_hotplug_begin(); + arch_remove_memory(start, size, NULL); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); + + release_mem_region_adjustable(start, size); + + mem_hotplug_done(); + + return 0; +} +EXPORT_SYMBOL_GPL(remove_memory_subsection); + static int try_offline_memory_block(struct memory_block *mem, void *arg) { uint8_t online_type = MMOP_ONLINE_KERNEL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4472be6f123d..55a806946c8d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -808,7 +808,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); if (prev) { vma = prev; goto replace; diff --git a/mm/migrate.c b/mm/migrate.c index 7da052c6cf1e..2ee48861cbbd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -106,7 +106,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) /* Driver shouldn't use PG_isolated bit of page->flags */ WARN_ON_ONCE(PageIsolated(page)); - __SetPageIsolated(page); + SetPageIsolated(page); unlock_page(page); return 0; @@ -125,7 +125,7 @@ static void putback_movable_page(struct page *page) mapping = page_mapping(page); mapping->a_ops->putback_page(page); - __ClearPageIsolated(page); + ClearPageIsolated(page); } /* @@ -158,7 +158,7 @@ void putback_movable_pages(struct list_head *l) if (PageMovable(page)) putback_movable_page(page); else - __ClearPageIsolated(page); + ClearPageIsolated(page); unlock_page(page); put_page(page); } else { @@ -168,6 +168,7 @@ void putback_movable_pages(struct list_head *l) } } } +EXPORT_SYMBOL_GPL(putback_movable_pages); /* * Restore a potential migration pte to a working pte entry @@ -915,7 +916,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!PageIsolated(page), page); if (!PageMovable(page)) { rc = MIGRATEPAGE_SUCCESS; - __ClearPageIsolated(page); + ClearPageIsolated(page); goto out; } @@ -937,7 +938,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * We clear PG_movable under page_lock so any compactor * cannot try to migrate this page. 
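For the subsection hotplug pair added in the mm/memory_hotplug.c hunks above, a hypothetical driver caller would look like this; the base address and span are made-up values, and both arguments must be SUBSECTION_SIZE aligned or the calls fail with -EINVAL:

/* Hypothetical user of add_memory_subsection()/remove_memory_subsection().
 * The nid argument is recomputed internally from the start address, so
 * NUMA_NO_NODE is fine here.
 */
static int example_plug_carveout(void)
{
	u64 base = 0x880000000ULL;		/* illustrative address */
	u64 size = 2 * SUBSECTION_SIZE;
	int ret;

	ret = add_memory_subsection(NUMA_NO_NODE, base, size);
	if (ret)
		return ret;
	/* ... use the hot-added RAM ... */
	return remove_memory_subsection(base, size);
}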
*/ - __ClearPageIsolated(page); + ClearPageIsolated(page); } /* @@ -1201,7 +1202,7 @@ static int unmap_and_move(new_page_t get_new_page, if (unlikely(__PageMovable(page))) { lock_page(page); if (!PageMovable(page)) - __ClearPageIsolated(page); + ClearPageIsolated(page); unlock_page(page); } goto out; @@ -1604,6 +1605,7 @@ out: return rc; } +EXPORT_SYMBOL_GPL(migrate_pages); struct page *alloc_migration_target(struct page *page, unsigned long private) { diff --git a/mm/mlock.c b/mm/mlock.c index 0cee3f97d3df..0cc7fe053755 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -511,7 +511,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mm_init.c b/mm/mm_init.c index 9ddaf0e1b0ab..0d7b2bd2454a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) shift = 8 * sizeof(unsigned long); width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, KASAN_TAG_WIDTH, + LRU_GEN_WIDTH, + LRU_REFS_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", diff --git a/mm/mmap.c b/mm/mmap.c index a0a4eadc8779..e3a10b3cc6be 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,8 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include #include "internal.h" @@ -182,9 +185,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) - fput(vma->vm_file); mpol_put(vma_policy(vma)); + /* fput(vma->vm_file) happens in vm_area_free after an RCU delay. 
+	 */
 	vm_area_free(vma);
 	return next;
 }
@@ -952,7 +954,8 @@ again:
 	if (remove_next) {
 		if (file) {
 			uprobe_munmap(next, next->vm_start, next->vm_end);
-			fput(file);
+			/* fput(file) happens within vm_area_free(next) */
+			VM_BUG_ON(file != next->vm_file);
 		}
 		if (next->anon_vma)
 			anon_vma_merge(vma, next);
@@ -1029,7 +1032,8 @@ again:
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 				struct file *file, unsigned long vm_flags,
-				struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+				struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+				struct anon_vma_name *anon_name)
 {
 	/*
 	 * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -1047,6 +1051,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 		return 0;
 	if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
 		return 0;
+	if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
+		return 0;
 	return 1;
 }
@@ -1079,9 +1085,10 @@
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
 		     struct anon_vma *anon_vma, struct file *file,
 		     pgoff_t vm_pgoff,
-		     struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+		     struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+		     struct anon_vma_name *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		if (vma->vm_pgoff == vm_pgoff)
 			return 1;
@@ -1100,9 +1107,10 @@
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 		    struct anon_vma *anon_vma, struct file *file,
 		    pgoff_t vm_pgoff,
-		    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+		    struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+		    struct anon_vma_name *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
 		vm_pglen = vma_pages(vma);
@@ -1113,9 +1121,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
@@ -1160,7 +1168,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			unsigned long end, unsigned long vm_flags,
 			struct anon_vma *anon_vma, struct file *file,
 			pgoff_t pgoff, struct mempolicy *policy,
-			struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+			struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+			struct anon_vma_name *anon_name)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
@@ -1190,7 +1199,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			mpol_equal(vma_policy(prev), policy) &&
 			can_vma_merge_after(prev, vm_flags,
 					    anon_vma, file, pgoff,
-					    vm_userfaultfd_ctx)) {
+					    vm_userfaultfd_ctx, anon_name)) {
 		/*
 		 * OK, it can.  Can we now merge in the successor as well?
*/ @@ -1199,7 +1208,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1222,7 +1231,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1759,7 +1768,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -1808,13 +1817,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* ->mmap() can change vma->vm_file and fput the original file. So * fput the vma->vm_file here or we would add an extra fput for file * and cause general protection fault ultimately. */ - fput(vma->vm_file); + /* fput happens within vm_area_free */ vm_area_free(vma); vma = merge; /* Update vm_flags to pick up the change. */ @@ -1874,6 +1883,8 @@ out: vma_set_page_prot(vma); + trace_android_vh_mmap_region(vma, addr); + return addr; close_and_free_vma: @@ -1888,6 +1899,7 @@ unmap_and_free_vma: if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); free_vma: + VM_BUG_ON(vma->vm_file); vm_area_free(vma); unacct_error: if (charged) @@ -2267,17 +2279,10 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_vma_from_tree(struct mm_struct *mm, unsigned long addr) { struct rb_node *rb_node; - struct vm_area_struct *vma; - - mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; + struct vm_area_struct *vma = NULL; rb_node = mm->mm_rb.rb_node; @@ -2295,12 +2300,27 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) rb_node = rb_node->rb_right; } + return vma; +} + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct *__find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + /* Check the cache first. */ + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; + + vma = find_vma_from_tree(mm, addr); + if (vma) vmacache_update(addr, vma); return vma; } -EXPORT_SYMBOL(find_vma); +EXPORT_SYMBOL(__find_vma); /* * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 
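The hunk above splits the old find_vma() into find_vma_from_tree() plus an exported __find_vma() that still consults the per-thread vmacache. The find_vma() name itself presumably survives as an inline that keeps the mmap_lock assertion for non-speculative callers; a sketch of that shape (the real definition would sit in include/linux/mm.h, outside this excerpt):

/* Sketch: the classic lookup keeps its locking assertion, while
 * speculative callers use __find_vma()/find_vma_from_tree() under their
 * own RCU/seqcount protection.
 */
static inline struct vm_area_struct *find_vma(struct mm_struct *mm,
		unsigned long addr)
{
	mmap_assert_locked(mm);
	return __find_vma(mm, addr);
}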
@@ -2770,6 +2790,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, out_free_mpol: mpol_put(vma_policy(new)); out_free_vma: + new->vm_file = NULL; /* prevents fput within vm_area_free() */ vm_area_free(new); return err; } @@ -3072,7 +3093,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3158,25 +3179,27 @@ void exit_mmap(struct mm_struct *mm) * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. * - * This needs to be done before calling munlock_vma_pages_all(), + * This needs to be done before calling unlock_range(), * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. */ (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); - mmap_write_lock(mm); - mmap_write_unlock(mm); } + mmap_write_lock(mm); if (mm->locked_vm) unlock_range(mm->mmap, ULONG_MAX); arch_exit_mmap(mm); vma = mm->mmap; - if (!vma) /* Can happen if dup_mmap() received an OOM */ + if (!vma) { + /* Can happen if dup_mmap() received an OOM */ + mmap_write_unlock(mm); return; + } lru_add_drain(); flush_cache_mm(mm); @@ -3187,16 +3210,15 @@ void exit_mmap(struct mm_struct *mm) free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. - */ + /* Walk the list again, actually closing and freeing it. */ while (vma) { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); cond_resched(); } + mm->mmap = NULL; + mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3265,7 +3287,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma @@ -3311,6 +3333,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: + new_vma->vm_file = NULL; /* Prevent fput within vm_area_free */ vm_area_free(new_vma); out: return NULL; @@ -3603,6 +3626,10 @@ int mm_take_all_locks(struct mm_struct *mm) mutex_lock(&mm_all_locks_mutex); +#if defined(CONFIG_MMU_NOTIFIER) && defined(CONFIG_SPECULATIVE_PAGE_FAULT) + percpu_down_write(mm->mmu_notifier_lock); +#endif + for (vma = mm->mmap; vma; vma = vma->vm_next) { if (signal_pending(current)) goto out_unlock; @@ -3690,6 +3717,10 @@ void mm_drop_all_locks(struct mm_struct *mm) vm_unlock_mapping(vma->vm_file->f_mapping); } +#if defined(CONFIG_MMU_NOTIFIER) && defined(CONFIG_SPECULATIVE_PAGE_FAULT) + percpu_up_write(mm->mmu_notifier_lock); +#endif + mutex_unlock(&mm_all_locks_mutex); } diff --git a/mm/mmzone.c b/mm/mmzone.c index eb89d6e018e2..bccbc5aef647 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -9,6 +9,7 @@ #include #include #include +#include struct pglist_data *first_online_pgdat(void) { @@ -81,6 +82,8 @@ void lruvec_init(struct lruvec *lruvec) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); + + lru_gen_init_lruvec(lruvec); } #if defined(CONFIG_NUMA_BALANCING) && 
!defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) @@ -100,3 +103,19 @@ int page_cpupid_xchg_last(struct page *page, int cpupid) return last_cpupid; } #endif + +enum zone_type gfp_zone(gfp_t flags) +{ + enum zone_type z; + gfp_t local_flags = flags; + int bit; + + trace_android_rvh_set_gfp_zone_flags(&local_flags); + + bit = (__force int) ((local_flags) & GFP_ZONEMASK); + + z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & + ((1 << GFP_ZONES_SHIFT) - 1); + VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + return z; +} diff --git a/mm/mprotect.c b/mm/mprotect.c index ed18dc49533f..ba53529cdd5e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); diff --git a/mm/mremap.c b/mm/mremap.c index 3a3cf4cc2c63..6c87fc859cdc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -219,7 +219,11 @@ static inline bool arch_supports_page_table_move(void) } #endif -#ifdef CONFIG_HAVE_MOVE_PMD +/* + * Speculative page fault handlers will not detect page table changes done + * without ptl locking. + */ +#if defined(CONFIG_HAVE_MOVE_PMD) && !defined(CONFIG_SPECULATIVE_PAGE_FAULT) static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) { @@ -287,7 +291,12 @@ static inline bool move_normal_pmd(struct vm_area_struct *vma, } #endif -#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD) +/* + * Speculative page fault handlers will not detect page table changes done + * without ptl locking. + */ +#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD) && \ + !defined(CONFIG_SPECULATIVE_PAGE_FAULT) static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) { @@ -674,6 +683,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; +#ifndef CONFIG_SPECULATIVE_PAGE_FAULT /* * anon_vma links of the old vma is no longer needed after its page * table has been moved. 
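Stepping back to the mm/mmzone.c hunk above: gfp_zone() is moved out of line purely so the restricted vendor hook can rewrite the zone-selection bits before the GFP_ZONE_TABLE lookup. A vendor module would attach roughly as follows; the handler body is illustrative only, and the register_trace_android_rvh_* name follows the usual Android vendor-hook convention rather than anything shown in this excerpt:

/* Illustrative vendor-module registration: the probe takes an unused
 * context pointer plus the arguments of the trace declaration.
 */
static void example_set_gfp_zone_flags(void *unused, gfp_t *flags)
{
	/* Purely illustrative policy: steer these allocations out of
	 * ZONE_MOVABLE by clearing __GFP_MOVABLE before the table lookup.
	 */
	*flags &= ~__GFP_MOVABLE;
}

static int __init example_init(void)
{
	return register_trace_android_rvh_set_gfp_zone_flags(
			example_set_gfp_zone_flags, NULL);
}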
@@ -681,6 +691,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (new_vma != vma && vma->vm_start == old_addr && vma->vm_end == (old_addr + old_len)) unlink_anon_vmas(vma); +#endif /* Because we won't unmap we don't need to touch locked_vm */ return new_addr; diff --git a/mm/nommu.c b/mm/nommu.c index 02d2427b8f9e..f90afb1c8578 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -654,39 +654,47 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) - fput(vma->vm_file); put_nommu_region(vma->vm_region); + /* fput(vma->vm_file) happens within vm_area_free() */ vm_area_free(vma); } -/* - * look up the first VMA in which addr resides, NULL if none - * - should be called with mm->mmap_lock at least held readlocked - */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_vma_from_tree(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; - /* check the cache first */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - /* trawl the list (there may be multiple mappings in which addr * resides) */ for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->vm_start > addr) return NULL; - if (vma->vm_end > addr) { - vmacache_update(addr, vma); + if (vma->vm_end > addr) return vma; - } } return NULL; } -EXPORT_SYMBOL(find_vma); + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_lock at least held readlocked + */ +struct vm_area_struct *__find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + /* Check the cache first. */ + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; + + vma = find_vma_from_tree(mm, addr); + + if (vma) + vmacache_update(addr, vma); + return vma; +} +EXPORT_SYMBOL(__find_vma); /* * find a VMA @@ -1254,8 +1262,7 @@ error: if (region->vm_file) fput(region->vm_file); kmem_cache_free(vm_region_jar, region); - if (vma->vm_file) - fput(vma->vm_file); + /* fput(vma->vm_file) happens within vm_area_free() */ vm_area_free(vma); return ret; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 262f752d3d51..67946e2f50ea 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -52,6 +52,9 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include + int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; @@ -1129,6 +1132,12 @@ bool out_of_memory(struct oom_control *oc) select_bad_process(oc); /* Found nothing?!?! 
 */
 	if (!oc->chosen) {
+		int ret = false;
+
+		trace_android_vh_oom_check_panic(oc, &ret);
+		if (ret)
+			return true;
+
 		dump_header(oc, NULL);
 		pr_warn("Out of memory and no killable processes...\n");
 		/*
@@ -1200,15 +1209,21 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 		goto put_task;
 	}
 
-	if (mmget_not_zero(p->mm)) {
-		mm = p->mm;
-		if (task_will_free_mem(p))
-			reap = true;
-		else {
-			/* Error only if the work has not been done already */
-			if (!test_bit(MMF_OOM_SKIP, &mm->flags))
-				ret = -EINVAL;
-		}
+	mm = p->mm;
+	mmgrab(mm);
+
+	/*
+	 * If we are too late and exit_mmap has already checked
+	 * mm_is_oom_victim, then we will block on mmap_read_lock until
+	 * exit_mmap releases mmap_lock.
+	 */
+	set_bit(MMF_OOM_VICTIM, &mm->flags);
+
+	if (task_will_free_mem(p))
+		reap = true;
+	else {
+		/* Error only if the work has not been done already */
+		if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+			ret = -EINVAL;
 	}
 	task_unlock(p);
@@ -1219,13 +1234,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 		ret = -EINTR;
 		goto drop_mm;
 	}
-	if (!__oom_reap_task_mm(mm))
+	/*
+	 * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
+	 * a possible change in exit_mmap is seen.
+	 */
+	if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
 		ret = -EAGAIN;
 	mmap_read_unlock(mm);
 
 drop_mm:
-	if (mm)
-		mmput(mm);
+	mmdrop(mm);
 put_task:
 	put_task_struct(task);
 put_pid:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 53a201aafe18..5453d0d59088 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -63,6 +63,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -72,6 +73,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -337,10 +339,10 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
 	"Unmovable",
 	"Movable",
 	"Reclaimable",
-	"HighAtomic",
 #ifdef CONFIG_CMA
 	"CMA",
 #endif
+	"HighAtomic",
 #ifdef CONFIG_MEMORY_ISOLATION
 	"Isolate",
 #endif
@@ -396,25 +398,9 @@ int page_group_by_mobility_disabled __read_mostly;
 */
 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
 
-/*
- * Calling kasan_poison_pages() only after deferred memory initialization
- * has completed. Poisoning pages during deferred memory init will greatly
- * lengthen the process and cause problem in large memory systems as the
- * deferred pages initialization is done with interrupt disabled.
- *
- * Assuming that there will be no reference to those newly initialized
- * pages before they are ever allocated, this should have no effect on
- * KASAN memory tracking as the poison will be properly inserted at page
- * allocation time. The only corner case is when pages are allocated by
- * on-demand allocation and then freed again before the deferred pages
- * initialization is done, but this is not likely to happen.
- */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return static_branch_unlikely(&deferred_pages) || - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return static_branch_unlikely(&deferred_pages); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -465,11 +451,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return false; } static inline bool early_page_uninitialised(unsigned long pfn) @@ -517,8 +501,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page, bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; + /* + * This races, without locks, with set_pfnblock_flags_mask(). Ensure + * a consistent read of the memory array, so that results, even though + * racy, are not corrupted. + */ + word = READ_ONCE(bitmap[word_bitidx]); return (word >> bitidx) & mask; } @@ -535,6 +523,24 @@ unsigned long get_pfnblock_flags_mask(const struct page *page, { return __get_pfnblock_flags_mask(page, pfn, mask); } +EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask); + +int isolate_anon_lru_page(struct page *page) +{ + int ret; + + if (!PageLRU(page) || !PageAnon(page)) + return -EINVAL; + + if (!get_page_unless_zero(page)) + return -EINVAL; + + ret = isolate_lru_page(page); + put_page(page); + + return ret; +} +EXPORT_SYMBOL_GPL(isolate_anon_lru_page); static __always_inline int get_pfnblock_migratetype(const struct page *page, unsigned long pfn) @@ -1281,24 +1287,45 @@ out: return ret; } -static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) +/* + * Skip KASAN memory poisoning when either: + * + * 1. Deferred memory initialization has not yet completed, + * see the explanation below. + * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, + * see the comment next to it. + * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, + * see the comment next to it. + * 4. The allocation is excluded from being checked due to sampling, + * see the call to kasan_unpoison_pages. + * + * Poisoning pages during deferred memory init will greatly lengthen the + * process and cause problem in large memory systems as the deferred pages + * initialization is done with interrupt disabled. + * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen. 
+ */ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +{ + return deferred_pages_enabled() || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} + +static void kernel_init_pages(struct page *page, int numpages) { int i; - if (zero_tags) { - for (i = 0; i < numpages; i++) - tag_clear_highpage(page + i); - return; - } - /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) { - u8 tag = page_kasan_tag(page + i); - page_kasan_tag_reset(page + i); - clear_highpage(page + i); - page_kasan_tag_set(page + i, tag); - } + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); kasan_enable_current(); } @@ -1306,7 +1333,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1320,6 +1347,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); + free_page_pinner(page, order); return false; } @@ -1359,6 +1387,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + free_page_pinner(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -1371,23 +1400,21 @@ static __always_inline bool free_pages_prepare(struct page *page, /* * As memory initialization might be integrated into KASAN, - * kasan_free_pages and kernel_init_free_pages must be + * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (kasan_has_integrated_init()) { - if (!skip_kasan_poison) - kasan_free_pages(page, order); - } else { - bool init = want_init_on_free(); + if (!should_skip_kasan_poison(page, fpi_flags)) { + kasan_poison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, false); - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); + /* Memory is already initialized if KASAN did it internally. */ + if (kasan_has_integrated_init()) + init = false; } + if (init) + kernel_init_pages(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -2406,9 +2433,43 @@ static bool check_new_pages(struct page *page, unsigned int order) return false; } +static inline bool should_skip_kasan_unpoison(gfp_t flags) +{ + /* Don't skip if a software KASAN mode is enabled. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return false; + + /* Skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return true; + + /* + * With hardware tag-based KASAN enabled, skip if this has been + * requested via __GFP_SKIP_KASAN_UNPOISON. + */ + return flags & __GFP_SKIP_KASAN_UNPOISON; +} + +static inline bool should_skip_init(gfp_t flags) +{ + /* Don't skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return false; + + /* For hardware tag-based KASAN, skip if requested. 
+ */
+	return (flags & __GFP_SKIP_ZERO);
+}
+
 inline void post_alloc_hook(struct page *page, unsigned int order,
 				gfp_t gfp_flags)
 {
+	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
+			!should_skip_init(gfp_flags);
+	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+	bool reset_tags = true;
+	int i;
+
 	set_page_private(page, 0);
 	set_page_refcounted(page);
 
@@ -2424,19 +2485,53 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	/*
 	 * As memory initialization might be integrated into KASAN,
-	 * kasan_alloc_pages and kernel_init_free_pages must be
+	 * KASAN unpoisoning and memory initialization code must be
 	 * kept together to avoid discrepancies in behavior.
 	 */
-	if (kasan_has_integrated_init()) {
-		kasan_alloc_pages(page, order, gfp_flags);
-	} else {
-		bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
-
-		kasan_unpoison_pages(page, order, init);
-		if (init)
-			kernel_init_free_pages(page, 1 << order,
-					       gfp_flags & __GFP_ZEROTAGS);
+	/*
+	 * If memory tags should be zeroed
+	 * (which happens only when memory should be initialized as well).
+	 */
+	if (zero_tags) {
+		/* Initialize both memory and memory tags. */
+		for (i = 0; i != 1 << order; ++i)
+			tag_clear_highpage(page + i);
+
+		/* Take note that memory was initialized by the loop above. */
+		init = false;
 	}
+	if (!should_skip_kasan_unpoison(gfp_flags)) {
+		/* Try unpoisoning (or setting tags) and initializing memory. */
+		if (kasan_unpoison_pages(page, order, init)) {
+			/* Take note that memory was initialized by KASAN. */
+			if (kasan_has_integrated_init())
+				init = false;
+			/* Take note that memory tags were set by KASAN. */
+			reset_tags = false;
+		} else {
+			/*
+			 * KASAN decided to exclude this allocation from being
+			 * (un)poisoned due to sampling. Make KASAN skip
+			 * poisoning when the allocation is freed.
+			 */
+			SetPageSkipKASanPoison(page);
+		}
+	}
+	/*
+	 * If memory tags have not been set by KASAN, reset the page tags to
+	 * ensure page_address() dereferencing does not fault.
+	 */
+	if (reset_tags) {
+		for (i = 0; i != 1 << order; ++i)
+			page_kasan_tag_reset(page + i);
+	}
+	/* If memory is still not initialized, initialize it now. */
+	if (init)
+		kernel_init_pages(page, 1 << order);
+	/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
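The interplay of init, zero_tags and reset_tags above is easiest to see on concrete flag combinations. The cases below are worked through from the code just shown (assuming hardware tag-based KASAN is enabled); they are an annotation, not text from the patch:

/*
 * GFP_KERNEL | __GFP_ZERO | __GFP_ZEROTAGS:
 *   zero_tags is true, so tag_clear_highpage() initializes data and tags
 *   together and init is cleared; kasan_unpoison_pages() may still run.
 *
 * GFP_KERNEL | __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO:
 *   unpoisoning and zeroing are both skipped; reset_tags stays true, so
 *   page_kasan_tag_reset() keeps page_address() safely dereferenceable.
 *
 * Plain GFP_KERNEL with init_on_alloc=1:
 *   kasan_unpoison_pages() runs; with integrated init it also zeroes the
 *   memory and init is cleared, otherwise kernel_init_pages() finishes.
 */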
+	if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
+		SetPageSkipKASanPoison(page);
 
 	set_page_owner(page, order, gfp_flags);
 }
@@ -3015,20 +3110,36 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
 	}
 retry:
 	page = __rmqueue_smallest(zone, order, migratetype);
-	if (unlikely(!page)) {
-		if (alloc_flags & ALLOC_CMA)
-			page = __rmqueue_cma_fallback(zone, order);
-		if (!page && __rmqueue_fallback(zone, order, migratetype,
-						alloc_flags))
-			goto retry;
-	}
+	if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
+						  alloc_flags))
+		goto retry;
 out:
 	if (page)
 		trace_mm_page_alloc_zone_locked(page, order, migratetype);
 	return page;
 }
 
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
+				  int migratetype,
+				  unsigned int alloc_flags)
+{
+	struct page *page = __rmqueue_cma_fallback(zone, order);
+
+	if (page)
+		trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
+	return page;
+}
+#else
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
+					 int migratetype,
+					 unsigned int alloc_flags)
+{
+	return NULL;
+}
+#endif
+
 /*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
@@ -3046,8 +3157,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	 */
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order, migratetype,
-								alloc_flags);
+		struct page *page = NULL;
+
+		if (is_migrate_cma(migratetype)) {
+			bool is_cma_alloc = true;
+
+			trace_android_vh_cma_alloc_adjust(zone, &is_cma_alloc);
+			if (is_cma_alloc)
+				page = __rmqueue_cma(zone, order, migratetype,
+						     alloc_flags);
+		} else
+			page = __rmqueue(zone, order, migratetype, alloc_flags);
+
 		if (unlikely(page == NULL))
 			break;
 
@@ -3082,6 +3203,39 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	return allocated;
 }
 
+/*
+ * Return the pcp list that corresponds to the migrate type if that list isn't
+ * empty.
+ * If the list is empty return NULL.
+ */
+static struct list_head *get_populated_pcp_list(struct zone *zone,
+			unsigned int order, struct per_cpu_pages *pcp,
+			int migratetype, unsigned int alloc_flags)
+{
+	struct list_head *list = &pcp->lists[order_to_pindex(migratetype, order)];
+
+	if (list_empty(list)) {
+		int batch = READ_ONCE(pcp->batch);
+		int alloced;
+
+		/*
+		 * Scale batch relative to order if batch implies
+		 * free pages can be stored on the PCP. Batch can
+		 * be 1 for small zones or for boot pagesets which
+		 * should never store free pages as the pages may
+		 * belong to arbitrary zones.
+		 */
+		if (batch > 1)
+			batch = max(batch >> order, 2);
+		alloced = rmqueue_bulk(zone, order, batch, list, migratetype, alloc_flags);
+
+		pcp->count += alloced << order;
+		if (list_empty(list))
+			list = NULL;
+	}
+	return list;
+}
+
 #ifdef CONFIG_NUMA
 /*
 * Called from the vmstat counter updater to drain pagesets of this
@@ -3414,24 +3568,30 @@ void free_unref_page(struct page *page, unsigned int order)
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
+	bool pcp_skip_cma_pages = false;
 
 	if (!free_unref_page_prepare(page, pfn, order))
 		return;
 
 	/*
-	 * We only track unmovable, reclaimable and movable on pcp lists.
+	 * We only track unmovable, reclaimable, movable, and CMA on pcp lists.
* Place ISOLATE pages on the isolated list because they are being * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ migratetype = get_pcppage_migratetype(page); - if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { - if (unlikely(is_migrate_isolate(migratetype))) { + if (unlikely(migratetype > MIGRATE_RECLAIMABLE)) { + trace_android_vh_pcplist_add_cma_pages_bypass(migratetype, + &pcp_skip_cma_pages); + if (unlikely(is_migrate_isolate(migratetype)) || + pcp_skip_cma_pages) { free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; + if (migratetype == MIGRATE_HIGHATOMIC) + migratetype = MIGRATE_MOVABLE; } local_lock_irqsave(&pagesets.lock, flags); @@ -3448,6 +3608,7 @@ void free_unref_page_list(struct list_head *list) unsigned long flags, pfn; int batch_count = 0; int migratetype; + bool pcp_skip_cma_pages = false; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { @@ -3462,7 +3623,10 @@ void free_unref_page_list(struct list_head *list) * comment in free_unref_page. */ migratetype = get_pcppage_migratetype(page); - if (unlikely(is_migrate_isolate(migratetype))) { + trace_android_vh_pcplist_add_cma_pages_bypass(migratetype, + &pcp_skip_cma_pages); + if (unlikely(is_migrate_isolate(migratetype)) || + pcp_skip_cma_pages) { list_del(&page->lru); free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); continue; @@ -3625,30 +3789,23 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, - struct list_head *list) + gfp_t gfp_flags) { - struct page *page; + struct page *page = NULL; + struct list_head *list = NULL; do { - if (list_empty(list)) { - int batch = READ_ONCE(pcp->batch); - int alloced; - + /* First try to get CMA pages */ + if (migratetype == MIGRATE_MOVABLE && alloc_flags & ALLOC_CMA) + list = get_populated_pcp_list(zone, order, pcp, get_cma_migrate_type(), + alloc_flags); + if (list == NULL) { /* - * Scale batch relative to order if batch implies - * free pages can be stored on the PCP. Batch can - * be 1 for small zones or for boot pagesets which - * should never store free pages as the pages may - * belong to arbitrary zones. + * Either CMA is not suitable or there are no + * free CMA pages. 
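__rmqueue_pcplist() above consults get_cma_migrate_type(), which is defined elsewhere in this series. Judging by how it is used, it reduces to something like the following sketch; the CONFIG_CMA-off fallback is an assumption:

/* Presumed shape of the helper used above: pick the CMA pcp list when
 * CMA is compiled in, otherwise degrade to the plain movable list.
 */
static inline int get_cma_migrate_type(void)
{
#ifdef CONFIG_CMA
	return MIGRATE_CMA;
#else
	return MIGRATE_MOVABLE;
#endif
}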
*/ - if (batch > 1) - batch = max(batch >> order, 2); - alloced = rmqueue_bulk(zone, order, - batch, list, - migratetype, alloc_flags); - - pcp->count += alloced << order; - if (unlikely(list_empty(list))) + list = get_populated_pcp_list(zone, order, pcp, migratetype, alloc_flags); + if (unlikely(list == NULL) || unlikely(list_empty(list))) return NULL; } @@ -3667,7 +3824,6 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, unsigned int alloc_flags) { struct per_cpu_pages *pcp; - struct list_head *list; struct page *page; unsigned long flags; @@ -3680,8 +3836,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, */ pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; - list = &pcp->lists[order_to_pindex(migratetype, order)]; - page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); + page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, gfp_flags); local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); @@ -3703,16 +3858,9 @@ struct page *rmqueue(struct zone *preferred_zone, struct page *page; if (likely(pcp_allowed_order(order))) { - /* - * MIGRATE_MOVABLE pcplist could have the pages on CMA area and - * we need to skip it when CMA area isn't allowed. - */ - if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || - migratetype != MIGRATE_MOVABLE) { - page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); - goto out; - } + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype, alloc_flags); + goto out; } /* @@ -3735,8 +3883,14 @@ struct page *rmqueue(struct zone *preferred_zone, if (page) trace_mm_page_alloc_zone_locked(page, order, migratetype); } - if (!page) - page = __rmqueue(zone, order, migratetype, alloc_flags); + if (!page) { + if (alloc_flags & ALLOC_CMA && migratetype == MIGRATE_MOVABLE) + page = __rmqueue_cma(zone, order, migratetype, + alloc_flags); + if (!page) + page = __rmqueue(zone, order, migratetype, + alloc_flags); + } } while (page && check_new_pages(page, order)); if (!page) goto failed; @@ -3914,6 +4068,14 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, continue; for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { +#ifdef CONFIG_CMA + /* + * Note that this check is needed only + * when MIGRATE_CMA < MIGRATE_PCPTYPES. 
+ */ + if (mt == MIGRATE_CMA) + continue; +#endif if (!free_area_empty(area, mt)) return true; } @@ -4051,7 +4213,7 @@ static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, unsigned int alloc_flags) { #ifdef CONFIG_CMA - if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE && gfp_mask & __GFP_CMA) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -4633,13 +4795,12 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { unsigned int noreclaim_flag; - unsigned long pflags, progress; + unsigned long progress; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); @@ -4648,7 +4809,6 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); - psi_memstall_leave(&pflags); cond_resched(); @@ -4662,11 +4822,14 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, unsigned long *did_some_progress) { struct page *page = NULL; + unsigned long pflags; bool drained = false; + bool skip_pcp_drain = false; + psi_memstall_enter(&pflags); *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) - return NULL; + goto out; retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); @@ -4678,10 +4841,15 @@ retry: */ if (!page && !drained) { unreserve_highatomic_pageblock(ac, false); - drain_all_pages(NULL); + trace_android_vh_drain_all_pages_bypass(gfp_mask, order, + alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain); + if (!skip_pcp_drain) + drain_all_pages(NULL); drained = true; goto retry; } +out: + psi_memstall_leave(&pflags); return page; } @@ -4936,7 +5104,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; - + unsigned long alloc_start = jiffies; /* * We also sanity check to catch abuse of atomic reserves being used by * callers that are not in atomic context. 
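With the gfp_to_alloc_flags_cma() change above, movable allocations no longer receive ALLOC_CMA implicitly; callers must opt in. An illustrative opted-in call site, using the Android-specific __GFP_CMA flag that this series adds elsewhere:

/* Illustrative only: a movable allocation explicitly allowed to be
 * satisfied from the CMA region now that ALLOC_CMA is opt-in.
 */
static struct page *example_alloc_movable_cma(void)
{
	return alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_CMA);
}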
@@ -5187,6 +5355,7 @@ fail: warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: + trace_android_vh_alloc_pages_slowpath(gfp_mask, order, alloc_start); return page; } @@ -5266,7 +5435,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; - struct list_head *pcp_list; struct alloc_context ac; gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -5346,7 +5514,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); - pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; while (nr_populated < nr_pages) { @@ -5357,7 +5524,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, } page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, - pcp, pcp_list); + pcp, alloc_gfp); if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) @@ -5967,7 +6134,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu\n" + " sec_pagetables:%lu bounce:%lu\n" " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), @@ -5984,6 +6152,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), + global_node_page_state(NR_SECONDARY_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), @@ -6017,6 +6186,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " shadow_call_stack:%lukB" #endif " pagetables:%lukB" + " sec_pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -6042,6 +6212,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif K(node_page_state(pgdat, NR_PAGETABLE)), + K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -8538,7 +8709,7 @@ void setup_per_zone_wmarks(void) * 8192MB: 11584k * 16384MB: 16384k */ -int __meminit init_per_zone_wmark_min(void) +void calculate_min_free_kbytes(void) { unsigned long lowmem_kbytes; int new_min_free_kbytes; @@ -8556,6 +8727,11 @@ int __meminit init_per_zone_wmark_min(void) pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", new_min_free_kbytes, user_min_free_kbytes); } +} + +int __meminit init_per_zone_wmark_min(void) +{ + calculate_min_free_kbytes(); setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -8995,7 +9171,7 @@ static unsigned long pfn_max_align_down(unsigned long pfn) pageblock_nr_pages) - 1); } -static unsigned long pfn_max_align_up(unsigned long pfn) +unsigned long pfn_max_align_up(unsigned long pfn) { return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, pageblock_nr_pages)); @@ -9024,19 +9200,28 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list) /* [start, end) must belong to a single zone. 
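The hunks that follow thread a caller-supplied struct acr_info through alloc_contig_range() and __alloc_contig_migrate_range() so that CMA allocation failures can be attributed. The structure itself is defined in an AOSP header outside this diff; a plausible shape, inferred purely from the fields used below:

	/* Inferred from usage below; the real definition and the ACR_ERR_*
	 * bits live in a header that is not part of this diff. */
	struct acr_info {
		unsigned long nr_mapped;	/* mapcount sum of pages pending migration */
		unsigned long nr_migrated;	/* pages successfully migrated */
		unsigned long nr_reclaimed;	/* clean pages reclaimed instead */
		unsigned long failed_pfn;	/* first pfn that blocked the range */
		unsigned int err;		/* ACR_ERR_ISOLATE/MIGRATE/TEST */
	};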
*/ static int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct acr_info *info) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; + unsigned int max_tries = 5; int ret = 0; + bool skip = false; + struct page *page; struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - lru_cache_disable(); + if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC) + max_tries = 1; + + trace_android_vh_skip_lru_disable(&skip); + if (!skip) + lru_cache_disable(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { @@ -9051,18 +9236,25 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, break; pfn = cc->migrate_pfn; tries = 0; - } else if (++tries == 5) { + } else if (++tries == max_tries) { ret = -EBUSY; break; } nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); + info->nr_reclaimed += nr_reclaimed; cc->nr_migratepages -= nr_reclaimed; + list_for_each_entry(page, &cc->migratepages, lru) + info->nr_mapped += page_mapcount(page); + ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); + if (!ret) + info->nr_migrated += cc->nr_migratepages; + /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless * to retry again over this error, so do the same here. @@ -9071,11 +9263,28 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, break; } - lru_cache_enable(); + if (!skip) + lru_cache_enable(); if (ret < 0) { - if (ret == -EBUSY) + if (ret == -EBUSY) { + struct page *page; + alloc_contig_dump_pages(&cc->migratepages); + list_for_each_entry(page, &cc->migratepages, lru) { + /* The page will be freed by putback_movable_pages soon */ + if (page_count(page) == 1) + continue; + page_pinner_failure_detect(page); + } + } + + if (!list_empty(&cc->migratepages)) { + page = list_first_entry(&cc->migratepages, struct page , lru); + info->failed_pfn = page_to_pfn(page); + } + putback_movable_pages(&cc->migratepages); + info->err |= ACR_ERR_MIGRATE; return ret; } return 0; @@ -9103,17 +9312,19 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * need to be freed with free_contig_range(). */ int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype, gfp_t gfp_mask) + unsigned migratetype, gfp_t gfp_mask, + struct acr_info *info) { unsigned long outer_start, outer_end; unsigned int order; int ret = 0; + bool skip_drain_all_pages = false; struct compact_control cc = { .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .mode = MIGRATE_SYNC, + .mode = gfp_mask & __GFP_NORETRY ? 
MIGRATE_ASYNC : MIGRATE_SYNC, .ignore_skip_hint = true, .no_set_skip_hint = true, .gfp_mask = current_gfp_context(gfp_mask), @@ -9146,11 +9357,17 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype, 0); - if (ret) + pfn_max_align_up(end), migratetype, 0, + &info->failed_pfn); + if (ret) { + info->err |= ACR_ERR_ISOLATE; return ret; + } - drain_all_pages(cc.zone); + trace_android_vh_cma_drain_all_pages_bypass(migratetype, + &skip_drain_all_pages); + if (!skip_drain_all_pages) + drain_all_pages(cc.zone); /* * In case of -EBUSY, we'd like to know which page causes problem. @@ -9162,8 +9379,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, * allocated. So, if we fall through be sure to clear ret so that * -EBUSY is not accidentally used or returned to caller. */ - ret = __alloc_contig_migrate_range(&cc, start, end); - if (ret && ret != -EBUSY) + ret = __alloc_contig_migrate_range(&cc, start, end, info); + if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY))) goto done; ret = 0; @@ -9208,8 +9425,9 @@ int alloc_contig_range(unsigned long start, unsigned long end, } /* Make sure the range is really isolated. */ - if (test_pages_isolated(outer_start, end, 0)) { + if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) { ret = -EBUSY; + info->err |= ACR_ERR_TEST; goto done; } @@ -9236,10 +9454,11 @@ EXPORT_SYMBOL(alloc_contig_range); static int __alloc_contig_pages(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { + struct acr_info dummy; unsigned long end_pfn = start_pfn + nr_pages; return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); + gfp_mask, &dummy); } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, diff --git a/mm/page_counter.c b/mm/page_counter.c index 7d83641eb86b..036d57acebb4 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -12,6 +12,7 @@ #include #include #include +#include static void propagate_protected_usage(struct page_counter *c, unsigned long usage) @@ -83,6 +84,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) */ if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); + trace_android_rvh_update_watermark(new, c); } } @@ -137,6 +139,7 @@ bool page_counter_try_charge(struct page_counter *counter, */ if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); + trace_android_rvh_update_watermark(new, c); } return true; diff --git a/mm/page_ext.c b/mm/page_ext.c index 2a52fd9ed464..c429973b0401 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -7,8 +7,9 @@ #include #include #include +#include #include - +#include /* * struct page extension * @@ -58,6 +59,10 @@ * can utilize this callback to initialize the state of it correctly. */ +#ifdef CONFIG_SPARSEMEM +#define PAGE_EXT_INVALID (0x1) +#endif + #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { @@ -75,6 +80,9 @@ static struct page_ext_operations *page_ext_ops[] = { #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_PAGE_PINNER + &page_pinner_ops, +#endif }; unsigned long page_ext_size = sizeof(struct page_ext); @@ -121,8 +129,50 @@ static inline struct page_ext *get_entry(void *base, unsigned long index) return base + page_ext_size * index; } -#ifndef CONFIG_SPARSEMEM +/** + * page_ext_get() - Get the extended information for a page. + * @page: The page we're interested in. 
+ * + * Ensures that the page_ext will remain valid until page_ext_put() + * is called. + * + * Return: NULL if no page_ext exists for this page. + * Context: Any context. Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_get(struct page *page) +{ + struct page_ext *page_ext; + rcu_read_lock(); + page_ext = lookup_page_ext(page); + if (!page_ext) { + rcu_read_unlock(); + return NULL; + } + + return page_ext; +} + +/** + * page_ext_put() - Working with page extended information is done. + * @page_ext: Page extended information received from page_ext_get(). + * + * The page extended information of the page may not be valid after this + * function is called. + * + * Return: None. + * Context: Any context in which the corresponding page_ext_get() was called. + */ +void page_ext_put(struct page_ext *page_ext) +{ + if (unlikely(!page_ext)) + return; + + rcu_read_unlock(); +} + +#if !defined(CONFIG_SPARSEMEM) void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { @@ -135,6 +185,7 @@ struct page_ext *lookup_page_ext(const struct page *page) unsigned long index; struct page_ext *base; + WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a @@ -148,6 +199,7 @@ struct page_ext *lookup_page_ext(const struct page *page) MAX_ORDER_NR_PAGES); return get_entry(base, index); } +EXPORT_SYMBOL_NS_GPL(lookup_page_ext, MINIDUMP); static int __init alloc_node_page_ext(int nid) { @@ -202,21 +254,29 @@ fail: } #else /* CONFIG_FLATMEM */ +static bool page_ext_invalid(struct page_ext *page_ext) +{ + return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); +} struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); + struct page_ext *page_ext = READ_ONCE(section->page_ext); + + WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ - if (!section->page_ext) + if (page_ext_invalid(page_ext)) return NULL; - return get_entry(section->page_ext, pfn); + return get_entry(page_ext, pfn); } +EXPORT_SYMBOL_NS_GPL(lookup_page_ext, MINIDUMP); static void *__meminit alloc_page_ext(size_t size, int nid) { @@ -294,9 +354,30 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = get_entry(ms->page_ext, pfn); + + base = READ_ONCE(ms->page_ext); + /* + * page_ext here can be valid while doing the roll back + * operation in online_page_ext().
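The access pattern that the page_ext_get()/page_ext_put() pair above enforces looks like this (a minimal sketch, not code from the patch):

	static bool page_ext_flag_set(struct page *page, int flag)
	{
		struct page_ext *page_ext = page_ext_get(page); /* takes rcu_read_lock() */
		bool ret = false;

		if (page_ext) {
			ret = test_bit(flag, &page_ext->flags);
			page_ext_put(page_ext);	/* drops rcu_read_lock() */
		}
		return ret;
	}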
+ */ + if (page_ext_invalid(base)) + base = (void *)base - PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, NULL); + + base = get_entry(base, pfn); free_page_ext(base); - ms->page_ext = NULL; +} + +static void __invalidate_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + void *val; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + val = (void *)ms->page_ext + PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, @@ -339,6 +420,20 @@ static int __meminit offline_page_ext(unsigned long start_pfn, start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); + /* + * Freeing of page_ext is done in 3 steps to avoid + * use-after-free of it: + * 1) Traverse all the sections and mark their page_ext + * as invalid. + * 2) Wait for all the existing users of page_ext who + * started before invalidation to finish. + * 3) Free the page_ext. + */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __invalidate_page_ext(pfn); + + synchronize_rcu(); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return 0; diff --git a/mm/page_idle.c b/mm/page_idle.c index edead6a8a5f9..9f6e8b11c4bb 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -88,10 +88,10 @@ static bool page_idle_clear_pte_refs_one(struct page *page, static void page_idle_clear_pte_refs(struct page *page) { /* - * Since rwc.arg is unused, rwc is effectively immutable, so we - * can make it static const to save some cycles and stack. + * Since rwc.try_lock is unused, rwc is effectively immutable, so we + * can make it static to save some cycles and stack. */ - static const struct rmap_walk_control rwc = { + static struct rmap_walk_control rwc = { .rmap_one = page_idle_clear_pte_refs_one, .anon_lock = page_lock_anon_vma_read, }; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a95c2c6562d0..c142c4eb327b 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "internal.h" @@ -180,7 +181,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * Return: 0 on success and -EBUSY if any part of range cannot be isolated. 
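With the failed_pfn out-parameter added to start_isolate_page_range() just below, callers can report exactly which page blocked the range. A hypothetical caller sketch (the variable names and message are illustrative only):

	unsigned long failed_pfn = 0;

	if (start_isolate_page_range(start_pfn, end_pfn, MIGRATE_CMA, 0,
				     &failed_pfn))
		pr_warn("isolation failed at pfn %#lx\n", failed_pfn);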
*/ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype, int flags) + unsigned migratetype, int flags, + unsigned long *failed_pfn) { unsigned long pfn; unsigned long undo_pfn; @@ -196,6 +198,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, if (page) { if (set_migratetype_isolate(page, migratetype, flags)) { undo_pfn = pfn; + if (failed_pfn) + *failed_pfn = page_to_pfn(page); goto undo; } } @@ -277,7 +281,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, /* Caller should ensure that requested range is in a single zone */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - int isol_flags) + int isol_flags, unsigned long *failed_pfn) { unsigned long pfn, flags; struct page *page; @@ -296,6 +300,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, } page = __first_valid_page(start_pfn, end_pfn - start_pfn); if ((pfn < end_pfn) || !page) { + if (failed_pfn) + *failed_pfn = pfn; ret = -EBUSY; goto out; } @@ -310,6 +316,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, out: trace_test_pages_isolated(start_pfn, end_pfn, pfn); + if (pfn < end_pfn) + page_pinner_failure_detect(pfn_to_page(pfn)); return ret; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 62402d22539b..4f54eef6e364 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -98,6 +98,25 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) return (void *)page_ext + page_owner_ops.offset; } +depot_stack_handle_t get_page_owner_handle(struct page_ext *page_ext, unsigned long pfn) +{ + struct page_owner *page_owner; + depot_stack_handle_t handle; + + if (!page_owner_enabled) + return 0; + + page_owner = get_page_owner(page_ext); + + /* skip handle for tail pages of higher order allocations */ + if (!IS_ALIGNED(pfn, 1 << page_owner->order)) + return 0; + + handle = READ_ONCE(page_owner->handle); + return handle; +} +EXPORT_SYMBOL_NS_GPL(get_page_owner_handle, MINIDUMP); + static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; @@ -133,11 +152,12 @@ void __reset_page_owner(struct page *page, unsigned int order) struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); - page_ext = lookup_page_ext(page); + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) return; - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); for (i = 0; i < (1 << order); i++) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); @@ -145,6 +165,7 @@ void __reset_page_owner(struct page *page, unsigned int order) page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } static inline void __set_page_owner_handle(struct page_ext *page_ext, @@ -172,19 +193,22 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, noinline void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; depot_stack_handle_t handle; + handle = save_stack(gfp_mask); + + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - handle = save_stack(gfp_mask); __set_page_owner_handle(page_ext, handle, order, gfp_mask); + page_ext_put(page_ext); } void __set_page_owner_migrate_reason(struct page *page, int reason) { - struct page_ext *page_ext = 
lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -192,12 +216,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; + page_ext_put(page_ext); } void __split_page_owner(struct page *page, unsigned int nr) { int i; - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -208,17 +233,25 @@ void __split_page_owner(struct page *page, unsigned int nr) page_owner->order = 0; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __copy_page_owner(struct page *oldpage, struct page *newpage) { - struct page_ext *old_ext = lookup_page_ext(oldpage); - struct page_ext *new_ext = lookup_page_ext(newpage); + struct page_ext *old_ext; + struct page_ext *new_ext; struct page_owner *old_page_owner, *new_page_owner; - if (unlikely(!old_ext || !new_ext)) + old_ext = page_ext_get(oldpage); + if (unlikely(!old_ext)) return; + new_ext = page_ext_get(newpage); + if (unlikely(!new_ext)) { + page_ext_put(old_ext); + return; + } + old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); new_page_owner->order = old_page_owner->order; @@ -241,6 +274,8 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + page_ext_put(new_ext); + page_ext_put(old_ext); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -294,12 +329,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); @@ -310,9 +345,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, count[pageblock_mt]++; pfn = block_end_pfn; + page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; +ext_put_continue: + page_ext_put(page_ext); } } @@ -391,11 +429,9 @@ err: void __dump_page_owner(const struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get((void *)page); struct page_owner *page_owner; depot_stack_handle_t handle; - unsigned long *entries; - unsigned int nr_entries; gfp_t gfp_mask; int mt; @@ -410,6 +446,7 @@ void __dump_page_owner(const struct page *page) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); + page_ext_put(page_ext); return; } @@ -423,25 +460,23 @@ void __dump_page_owner(const struct page *page) page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); - if (!handle) { + if (!handle) pr_alert("page_owner allocation stack trace missing\n"); - } else { - nr_entries = stack_depot_fetch(handle, &entries); - stack_trace_print(entries, nr_entries, 0); - } + else + stack_depot_print(handle); handle = READ_ONCE(page_owner->free_handle); if (!handle) { pr_alert("page_owner free stack trace missing\n"); } else { - nr_entries = stack_depot_fetch(handle, &entries); pr_alert("page last free stack trace:\n"); - stack_trace_print(entries, nr_entries, 0); + stack_depot_print(handle); } if 
(page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); + page_ext_put(page_ext); } static ssize_t @@ -467,6 +502,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { + /* + * This temporary page_owner is required so + * that we can avoid the context switches while holding + * the rcu lock and copying the page owner information to + * user through copy_to_user() or GFP_KERNEL allocations. + */ + struct page_owner page_owner_tmp; + /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not @@ -485,7 +528,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; @@ -494,14 +537,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); @@ -510,7 +553,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) - continue; + goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should @@ -518,13 +561,17 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) */ handle = READ_ONCE(page_owner->handle); if (!handle) - continue; + goto ext_put_continue; /* Record the next PFN to read in the file offset */ *ppos = (pfn - min_low_pfn) + 1; + page_owner_tmp = *page_owner; + page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, - page_owner, handle); + &page_owner_tmp, handle); +ext_put_continue: + page_ext_put(page_ext); } return 0; @@ -577,18 +624,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* Found early allocated page */ __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; +ext_put_continue: + page_ext_put(page_ext); } cond_resched(); } diff --git a/mm/page_pinner.c b/mm/page_pinner.c new file mode 100644 index 000000000000..2567206e7ce8 --- /dev/null +++ b/mm/page_pinner.c @@ -0,0 +1,426 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define PAGE_PINNER_STACK_DEPTH 16 +static unsigned long pp_buf_size = 4096; + +struct page_pinner { + depot_stack_handle_t handle; + u64 ts_usec; + atomic_t count; +}; + +enum pp_state { + PP_PUT, + PP_FREE, + PP_FAIL_DETECTED, +}; + +struct captured_pinner { + depot_stack_handle_t handle; + union { + u64 ts_usec; + u64 elapsed; + }; + + /* struct page fields */ + unsigned long pfn; + int count; + int mapcount; + struct address_space *mapping; + unsigned long flags; + enum pp_state state; 
+}; + +struct page_pinner_buffer { + spinlock_t lock; + unsigned long index; + struct captured_pinner *buffer; +}; + +/* alloc_contig failed pinner */ +static struct page_pinner_buffer pp_buffer; + +static bool page_pinner_enabled; +DEFINE_STATIC_KEY_FALSE(page_pinner_inited); + +DEFINE_STATIC_KEY_TRUE(failure_tracking); +EXPORT_SYMBOL_GPL(failure_tracking); + +static depot_stack_handle_t failure_handle; + +static int __init early_page_pinner_param(char *buf) +{ + page_pinner_enabled = true; + return 0; +} +early_param("page_pinner", early_page_pinner_param); + +static bool need_page_pinner(void) +{ + return page_pinner_enabled; +} + +static noinline void register_failure_stack(void) +{ + unsigned long entries[4]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL); +} + +static void init_page_pinner(void) +{ + if (!page_pinner_enabled) + return; + + pp_buffer.buffer = kvmalloc_array(pp_buf_size, sizeof(*pp_buffer.buffer), + GFP_KERNEL); + if (!pp_buffer.buffer) { + pr_info("page_pinner disabled due to failure of buffer allocation\n"); + return; + } + + spin_lock_init(&pp_buffer.lock); + pp_buffer.index = 0; + + register_failure_stack(); + static_branch_enable(&page_pinner_inited); +} + +struct page_ext_operations page_pinner_ops = { + .size = sizeof(struct page_pinner), + .need = need_page_pinner, + .init = init_page_pinner, +}; + +static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext) +{ + return (void *)page_ext + page_pinner_ops.offset; +} + +static noinline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[PAGE_PINNER_STACK_DEPTH]; + depot_stack_handle_t handle; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); + handle = stack_depot_save(entries, nr_entries, flags); + if (!handle) + handle = failure_handle; + + return handle; +} + +static void capture_page_state(struct page *page, + struct captured_pinner *record) +{ + record->flags = page->flags; + record->mapping = page_mapping(page); + record->pfn = page_to_pfn(page); + record->count = page_count(page); + record->mapcount = page_mapcount(page); +} + +static void add_record(struct page_pinner_buffer *pp_buf, + struct captured_pinner *record) +{ + unsigned long flags; + unsigned int idx; + + spin_lock_irqsave(&pp_buf->lock, flags); + idx = pp_buf->index++; + pp_buf->index %= pp_buf_size; + pp_buf->buffer[idx] = *record; + spin_unlock_irqrestore(&pp_buf->lock, flags); +} + +void __free_page_pinner(struct page *page, unsigned int order) +{ + struct page_pinner *page_pinner; + struct page_ext *page_ext; + int i; + + /* free_page could be called before buffer is initialized */ + if (!pp_buffer.buffer) + return; + + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) + return; + + for (i = 0; i < (1 << order); i++) { + struct captured_pinner record; + + if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) + continue; + + page_pinner = get_page_pinner(page_ext); + + record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN); + record.ts_usec = (u64)ktime_to_us(ktime_get_boottime()); + record.state = PP_FREE; + capture_page_state(page, &record); + + add_record(&pp_buffer, &record); + + atomic_set(&page_pinner->count, 0); + page_pinner->ts_usec = 0; + clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags); + page_ext = page_ext_next(page_ext); + } + page_ext_put(page_ext); +} + +static ssize_t +print_page_pinner(char __user 
*buf, size_t count, struct captured_pinner *record) +{ + int ret; + unsigned long *entries; + unsigned int nr_entries; + char *kbuf; + + count = min_t(size_t, count, PAGE_SIZE); + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + if (record->state == PP_PUT) { + ret = snprintf(kbuf, count, "At least, pinned for %llu us\n", + record->elapsed); + } else { + u64 ts_usec = record->ts_usec; + unsigned long rem_usec = do_div(ts_usec, 1000000); + + ret = snprintf(kbuf, count, + "%s [%5lu.%06lu]\n", + record->state == PP_FREE ? "Freed at" : + "Failure detected at", + (unsigned long)ts_usec, rem_usec); + } + + if (ret >= count) + goto err; + + /* Print information relevant to grouping pages by mobility */ + ret += snprintf(kbuf + ret, count - ret, + "PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n", + record->pfn, + record->pfn >> pageblock_order, + record->count, record->mapcount, + record->mapping, + record->flags, &record->flags); + + if (ret >= count) + goto err; + + nr_entries = stack_depot_fetch(record->handle, &entries); + ret += stack_trace_snprint(kbuf + ret, count - ret, entries, + nr_entries, 0); + if (ret >= count) + goto err; + + ret += snprintf(kbuf + ret, count - ret, "\n"); + if (ret >= count) + goto err; + + if (copy_to_user(buf, kbuf, ret)) + ret = -EFAULT; + + kfree(kbuf); + return ret; + +err: + kfree(kbuf); + return -ENOMEM; +} + +void __page_pinner_failure_detect(struct page *page) +{ + struct page_ext *page_ext = page_ext_get(page); + struct page_pinner *page_pinner; + struct captured_pinner record; + u64 now; + + if (unlikely(!page_ext)) + return; + + if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) { + page_ext_put(page_ext); + return; + } + + now = (u64)ktime_to_us(ktime_get_boottime()); + page_pinner = get_page_pinner(page_ext); + if (!page_pinner->ts_usec) + page_pinner->ts_usec = now; + set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags); + record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN); + record.ts_usec = now; + record.state = PP_FAIL_DETECTED; + capture_page_state(page, &record); + + add_record(&pp_buffer, &record); + page_ext_put(page_ext); +} +EXPORT_SYMBOL_GPL(__page_pinner_failure_detect); + +void __page_pinner_put_page(struct page *page) +{ + struct page_ext *page_ext = page_ext_get(page); + struct page_pinner *page_pinner; + struct captured_pinner record; + u64 now, ts_usec; + + if (unlikely(!page_ext)) + return; + + if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) { + page_ext_put(page_ext); + return; + } + + page_pinner = get_page_pinner(page_ext); + record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN); + now = (u64)ktime_to_us(ktime_get_boottime()); + ts_usec = page_pinner->ts_usec; + + if (now > ts_usec) + record.elapsed = now - ts_usec; + else + record.elapsed = 0; + record.state = PP_PUT; + capture_page_state(page, &record); + + add_record(&pp_buffer, &record); + page_ext_put(page_ext); +} +EXPORT_SYMBOL_GPL(__page_pinner_put_page); + +static ssize_t read_buffer(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 tmp; + loff_t i, idx; + struct captured_pinner record; + unsigned long flags; + + if (!static_branch_unlikely(&failure_tracking)) + return -EINVAL; + + if (*ppos >= pp_buf_size) + return 0; + + i = *ppos; + *ppos = i + 1; + + /* + * reading the records in the reverse order with newest one + * being read first followed by older ones + */ + tmp = pp_buffer.index - 1 - i + pp_buf_size; + idx = do_div(tmp, pp_buf_size); + + 
spin_lock_irqsave(&pp_buffer.lock, flags); + record = pp_buffer.buffer[idx]; + spin_unlock_irqrestore(&pp_buffer.lock, flags); + if (!record.handle) + return 0; + + return print_page_pinner(buf, count, &record); +} + +static const struct file_operations proc_buffer_operations = { + .read = read_buffer, +}; + +static int failure_tracking_set(void *data, u64 val) +{ + bool on; + + on = (bool)val; + if (on) + static_branch_enable(&failure_tracking); + else + static_branch_disable(&failure_tracking); + return 0; +} + +static int failure_tracking_get(void *data, u64 *val) +{ + *val = static_branch_unlikely(&failure_tracking); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops, + failure_tracking_get, + failure_tracking_set, "%llu\n"); + +static int buffer_size_set(void *data, u64 val) +{ + unsigned long flags; + struct captured_pinner *new, *old; + + new = kvmalloc_array(val, sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + + spin_lock_irqsave(&pp_buffer.lock, flags); + old = pp_buffer.buffer; + pp_buffer.buffer = new; + pp_buffer.index = 0; + pp_buf_size = val; + spin_unlock_irqrestore(&pp_buffer.lock, flags); + kvfree(old); + + return 0; +} + +static int buffer_size_get(void *data, u64 *val) +{ + *val = pp_buf_size; + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops, + buffer_size_get, + buffer_size_set, "%llu\n"); + +static int __init page_pinner_init(void) +{ + struct dentry *pp_debugfs_root; + + if (!static_branch_unlikely(&page_pinner_inited)) + return 0; + + pr_info("page_pinner enabled\n"); + + pp_debugfs_root = debugfs_create_dir("page_pinner", NULL); + + debugfs_create_file("buffer", 0444, + pp_debugfs_root, NULL, + &proc_buffer_operations); + + debugfs_create_file("failure_tracking", 0644, + pp_debugfs_root, NULL, + &failure_tracking_fops); + + debugfs_create_file("buffer_size", 0644, + pp_debugfs_root, NULL, + &buffer_size_fops); + return 0; +} +late_initcall(page_pinner_init) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 382958eef8a9..5c4b1fb73187 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "page_reporting.h" #include "internal.h" @@ -120,7 +121,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, unsigned int page_len = PAGE_SIZE << order; struct page *page, *next; long budget; - int err = 0; + int i, err = 0; /* * Perform early check, if free area is empty there is @@ -175,6 +176,10 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, --(*offset); sg_set_page(&sgl[*offset], page, page_len, 0); + /* Notify hyp that these pages are reclaimable. 
*/ + for (i = 0; i < (1 << order); i++) + page_relinquish(page + i); + diff --git a/mm/readahead.c b/mm/readahead.c --- a/mm/readahead.c +++ b/mm/readahead.c @@ ... @@ #include #include #include +#include #include "internal.h" @@ -114,6 +115,15 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); +gfp_t readahead_gfp_mask(struct address_space *x) +{ + gfp_t mask = mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; + + trace_android_rvh_set_readahead_gfp_mask(&mask); + return mask; +} +EXPORT_SYMBOL_GPL(readahead_gfp_mask); + static void read_pages(struct readahead_control *rac, struct list_head *pages, bool skip_page) { diff --git a/mm/rmap.c b/mm/rmap.c index 330b361a460e..83b1af012b5c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -73,11 +73,14 @@ #include #include #include +#include #include #include +#include + #include "internal.h" static struct kmem_cache *anon_vma_cachep; @@ -419,11 +422,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) if (vma->anon_vma) { vma->anon_vma->num_active_vmas--; +#ifndef CONFIG_SPECULATIVE_PAGE_FAULT /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. */ vma->anon_vma = NULL; +#endif } unlock_anon_vma_root(root); @@ -527,9 +532,11 @@ out: * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a - * reference like with page_get_anon_vma() and then block on the mutex. + * reference like with page_get_anon_vma() and then block on the mutex + * in the !rwc->try_lock case. */ -struct anon_vma *page_lock_anon_vma_read(struct page *page) +struct anon_vma *page_lock_anon_vma_read(struct page *page, + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -557,6 +564,12 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) goto out; } + if (rwc && rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; @@ -793,6 +806,12 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (pvmw.pte) { + if (lru_gen_enabled() && pte_young(*pvmw.pte) && + !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { + lru_gen_look_around(&pvmw); + referenced++; + } + if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { /* @@ -852,8 +871,10 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) * @memcg: target memory cgroup * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * - * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of ptes which referenced the page. + * Quick test_and_clear_referenced for all mappings of a page. + * + * Return: The number of mappings which referenced the page. Return -1 if + * the function bailed out due to rmap lock contention. */ int page_referenced(struct page *page, int is_locked, @@ -869,6 +890,7 @@ int page_referenced(struct page *page, .rmap_one = page_referenced_one, .arg = (void *)&pra, .anon_lock = page_lock_anon_vma_read, + .try_lock = true, }; *vm_flags = 0; @@ -899,7 +921,7 @@ int page_referenced(struct page *page, if (we_locked) unlock_page(page); - return pra.referenced; + return rwc.contended ?
-1 : pra.referenced; } static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, @@ -1670,6 +1692,7 @@ discard: } mmu_notifier_invalidate_range_end(&range); + trace_android_vh_try_to_unmap_one(vma, page, address, ret); return ret; } @@ -2260,7 +2283,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, struct anon_vma *anon_vma; if (rwc->anon_lock) - return rwc->anon_lock(page); + return rwc->anon_lock(page, rwc); /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() @@ -2272,7 +2295,17 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, if (!anon_vma) return NULL; + if (anon_vma_trylock_read(anon_vma)) + goto out; + + if (rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + anon_vma_lock_read(anon_vma); +out: return anon_vma; } @@ -2363,8 +2396,18 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + thp_nr_pages(page) - 1; - if (!locked) + if (!locked) { + if (i_mmap_trylock_read(mapping)) + goto lookup; + + if (rwc->try_lock) { + rwc->contended = true; + return; + } + i_mmap_lock_read(mapping); + } +lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(page, vma); diff --git a/mm/shmem.c b/mm/shmem.c index 0c37c5f0a903..55c837973f90 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -39,6 +39,7 @@ #include #include #include +#include static struct vfsmount *shm_mnt; @@ -4248,3 +4249,41 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, #endif } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); + +int reclaim_shmem_address_space(struct address_space *mapping) +{ +#ifdef CONFIG_SHMEM + pgoff_t start = 0; + struct page *page; + LIST_HEAD(page_list); + XA_STATE(xas, &mapping->i_pages, start); + + if (!shmem_mapping(mapping)) + return -EINVAL; + + lru_add_drain(); + + rcu_read_lock(); + xas_for_each(&xas, page, ULONG_MAX) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (isolate_lru_page(page)) + continue; + + list_add(&page->lru, &page_list); + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + + return reclaim_pages(&page_list); +#else + return 0; +#endif +} +EXPORT_SYMBOL_GPL(reclaim_shmem_address_space); diff --git a/mm/slab.c b/mm/slab.c index 1bd283e98c58..5618edbccd30 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4087,6 +4087,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) sinfo->objects_per_slab = cachep->num; sinfo->cache_order = cachep->gfporder; } +EXPORT_SYMBOL_NS_GPL(get_slabinfo, MINIDUMP); void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) { diff --git a/mm/slab.h b/mm/slab.h index 1ae1bdd485c1..61d235eebc4e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -91,6 +91,24 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); gfp_t kmalloc_fix_flags(gfp_t flags); +#ifdef CONFIG_SLUB +/* + * Tracking user of a slab. 
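The reclaim_shmem_address_space() helper exported above gives modules a way to force-reclaim a shmem mapping they own. A hypothetical caller (sketch; the file name and size are illustrative):

	struct file *filp = shmem_file_setup("demo-cache", SZ_1M, 0);

	if (!IS_ERR(filp))
		reclaim_shmem_address_space(filp->f_mapping);	/* returns pages reclaimed */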
+ */ +#define TRACK_ADDRS_COUNT 16 +struct track { + unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif + int cpu; /* Was running on cpu */ + int pid; /* Pid context */ + unsigned long when; /* When did the operation occur */ +}; + +enum track_item { TRACK_ALLOC, TRACK_FREE }; +#endif + /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); @@ -216,6 +234,10 @@ DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); #endif extern void print_tracking(struct kmem_cache *s, void *object); long validate_slab_cache(struct kmem_cache *s); +extern unsigned long get_each_object_track(struct kmem_cache *s, + struct page *page, enum track_item alloc, + int (*fn)(const struct kmem_cache *, const void *, + const struct track *, void *), void *private); static inline bool __slub_debug_enabled(void) { return static_branch_unlikely(&slub_debug_enabled); @@ -224,6 +246,15 @@ static inline bool __slub_debug_enabled(void) static inline void print_tracking(struct kmem_cache *s, void *object) { } +#ifdef CONFIG_SLUB +static inline unsigned long get_each_object_track(struct kmem_cache *s, + struct page *page, enum track_item alloc, + int (*fn)(const struct kmem_cache *, const void *, + const struct track *, void *), void *private) +{ + return 0; +} +#endif static inline bool __slub_debug_enabled(void) { return false; diff --git a/mm/slab_common.c b/mm/slab_common.c index 022319e7deaf..8cf667b3065e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -27,7 +27,8 @@ #define CREATE_TRACE_POINTS #include - +#undef CREATE_TRACE_POINTS +#include #include "internal.h" #include "slab.h" @@ -499,7 +500,7 @@ void kmem_cache_destroy(struct kmem_cache *s) { int err; - if (unlikely(!s)) + if (unlikely(!s) || !kasan_check_byte(s)) return; cpus_read_lock(); @@ -1058,6 +1059,7 @@ static void print_slabinfo_header(struct seq_file *m) seq_puts(m, " : globalstat "); seq_puts(m, " : cpustat "); #endif + trace_android_vh_print_slabinfo_header(m); seq_putc(m, '\n'); } @@ -1093,6 +1095,7 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) seq_printf(m, " : slabdata %6lu %6lu %6lu", sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); slabinfo_show_stats(m, s); + trace_android_vh_cache_show(m, &sinfo, s); seq_putc(m, '\n'); } diff --git a/mm/slub.c b/mm/slub.c index f95ae136a069..361fa43129fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -258,22 +258,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Use cmpxchg_double */ #define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) -/* - * Tracking user of a slab. 
- */ -#define TRACK_ADDRS_COUNT 16 -struct track { - unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKTRACE - unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ -#endif - int cpu; /* Was running on cpu */ - int pid; /* Pid context */ - unsigned long when; /* When did the operation occur */ -}; - -enum track_item { TRACK_ALLOC, TRACK_FREE }; - #ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); @@ -551,7 +535,6 @@ static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, set_bit(__obj_to_index(s, addr, p), obj_map); } -#if IS_ENABLED(CONFIG_KUNIT) static bool slab_add_kunit_errors(void) { struct kunit_resource *resource; @@ -567,9 +550,6 @@ static bool slab_add_kunit_errors(void) kunit_put_resource(resource); return true; } -#else -static inline bool slab_add_kunit_errors(void) { return false; } -#endif /* * Determine a map of object in use on a page. @@ -702,6 +682,42 @@ static struct track *get_track(struct kmem_cache *s, void *object, return kasan_reset_tag(p + alloc); } +/* + * This function will be used to loop through all the slab objects in + * a page to give track structure for each object, the function fn will + * be using this track structure and extract required info into its private + * data, the return value will be the number of track structures that are + * processed. + */ +unsigned long get_each_object_track(struct kmem_cache *s, + struct page *page, enum track_item alloc, + int (*fn)(const struct kmem_cache *, const void *, + const struct track *, void *), void *private) +{ + void *p; + struct track *t; + int ret; + unsigned long num_track = 0; + unsigned long flags = 0; + + if (!slub_debug || !(s->flags & SLAB_STORE_USER)) + return 0; + + slab_lock(page, &flags); + for_each_object(p, s, page_address(page), page->objects) { + t = get_track(s, p, alloc); + metadata_access_enable(); + ret = fn(s, p, t, private); + metadata_access_disable(); + if (ret < 0) + break; + num_track += 1; + } + slab_unlock(page, &flags); + return num_track; +} +EXPORT_SYMBOL_NS_GPL(get_each_object_track, MINIDUMP); + static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { @@ -6254,6 +6270,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } +EXPORT_SYMBOL_NS_GPL(get_slabinfo, MINIDUMP); void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) { diff --git a/mm/swap.c b/mm/swap.c index af3cad4e5378..a27ab12d1ef6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -43,6 +43,9 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include + /* How many pages do we try to swap or page in/out together? 
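A consumer of the get_each_object_track() walker exported above might look like the following (hypothetical minidump-style consumer; only the callback signature comes from the patch):

	struct alloc_site {
		unsigned long addr;
		unsigned long hits;
	};

	static int count_alloc_site(const struct kmem_cache *s, const void *obj,
				    const struct track *t, void *private)
	{
		struct alloc_site *site = private;

		if (t->addr == site->addr)
			site->hits++;
		return 0;	/* a negative return stops the walk early */
	}

	/* ... get_each_object_track(s, page, TRACK_ALLOC, count_alloc_site, &site); */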
*/ int page_cluster; @@ -223,6 +226,7 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) lru_cache_disabled()) ret = true; + trace_android_vh_pagevec_drain(page, &ret); return ret; } @@ -325,7 +329,7 @@ static bool need_activate_page_drain(int cpu) return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } -static void activate_page(struct page *page) +void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -345,7 +349,7 @@ static inline void activate_page_drain(int cpu) { } -static void activate_page(struct page *page) +void activate_page(struct page *page) { struct lruvec *lruvec; @@ -389,6 +393,40 @@ static void __lru_cache_activate_page(struct page *page) local_unlock(&lru_pvecs.lock); } +#ifdef CONFIG_LRU_GEN +static void page_inc_refs(struct page *page) +{ + unsigned long new_flags, old_flags = READ_ONCE(page->flags); + + if (PageUnevictable(page)) + return; + + if (!PageReferenced(page)) { + SetPageReferenced(page); + return; + } + + if (!PageWorkingset(page)) { + SetPageWorkingset(page); + return; + } + + /* see the comment on MAX_NR_TIERS */ + do { + new_flags = old_flags & LRU_REFS_MASK; + if (new_flags == LRU_REFS_MASK) + break; + + new_flags += BIT(LRU_REFS_PGOFF); + new_flags |= old_flags & ~LRU_REFS_MASK; + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); +} +#else +static void page_inc_refs(struct page *page) +{ +} +#endif /* CONFIG_LRU_GEN */ + /* * Mark a page as having seen activity. * @@ -403,6 +441,11 @@ void mark_page_accessed(struct page *page) { page = compound_head(page); + if (lru_gen_enabled()) { + page_inc_refs(page); + return; + } + if (!PageReferenced(page)) { SetPageReferenced(page); } else if (PageUnevictable(page)) { @@ -446,6 +489,11 @@ void lru_cache_add(struct page *page) VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); + /* see the comment in lru_gen_add_page() */ + if (lru_gen_enabled() && !PageUnevictable(page) && + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) + SetPageActive(page); + get_page(page); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); @@ -547,7 +595,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) { - if (PageActive(page) && !PageUnevictable(page)) { + if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { int nr_pages = thp_nr_pages(page); del_page_from_lru_list(page, lruvec); @@ -661,7 +709,8 @@ void deactivate_file_page(struct page *page) */ void deactivate_page(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + if (PageLRU(page) && !PageUnevictable(page) && + (PageActive(page) || lru_gen_enabled())) { struct pagevec *pvec; local_lock(&lru_pvecs.lock); @@ -847,7 +896,18 @@ void lru_add_drain_all(void) } #endif /* CONFIG_SMP */ -atomic_t lru_disable_count = ATOMIC_INIT(0); +static atomic_t lru_disable_count = ATOMIC_INIT(0); + +bool lru_cache_disabled(void) +{ + return atomic_read(&lru_disable_count) != 0; +} + +void lru_cache_enable(void) +{ + atomic_dec(&lru_disable_count); +} +EXPORT_SYMBOL_GPL(lru_cache_enable); /* * lru_cache_disable() needs to be called before we start compiling @@ -859,7 +919,12 @@ atomic_t lru_disable_count = ATOMIC_INIT(0); */ void lru_cache_disable(void) { - atomic_inc(&lru_disable_count); + /* + * If someone is already disabled 
lru_cache, just return with + * increasing the lru_disable_count. + */ + if (atomic_inc_not_zero(&lru_disable_count)) + return; #ifdef CONFIG_SMP /* * lru_add_drain_all in the force mode will schedule draining on @@ -873,7 +938,9 @@ void lru_cache_disable(void) #else lru_add_and_bh_lrus_drain(); #endif + atomic_inc(&lru_disable_count); } +EXPORT_SYMBOL_GPL(lru_cache_disable); /** * release_pages - batched put_page() diff --git a/mm/swapfile.c b/mm/swapfile.c index 22d10f713848..b4ef01f382dc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3409,6 +3409,7 @@ void si_swapinfo(struct sysinfo *val) val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } +EXPORT_SYMBOL_NS_GPL(si_swapinfo, MINIDUMP); /* * Verify that a swap entry is valid and increment its swap map count. diff --git a/mm/util.c b/mm/util.c index ea04979f131e..8b9369596f49 100644 --- a/mm/util.c +++ b/mm/util.c @@ -27,6 +27,9 @@ #include #include "internal.h" +#ifndef __GENSYMS__ +#include +#endif /** * kfree_const - conditionally free memory @@ -555,6 +558,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, if (populate) mm_populate(ret, populate); } + trace_android_vh_check_mmap_file(file, prot, flag, ret); return ret; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8375eecc55de..8d7fb80b50cf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -72,7 +73,7 @@ static const bool vmap_allow_huge = false; bool is_vmalloc_addr(const void *x) { - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } @@ -316,9 +317,14 @@ int ioremap_page_range(unsigned long addr, unsigned long end, { int err; - err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), + prot = pgprot_nx(prot); + err = vmap_range_noflush(addr, end, phys_addr, prot, ioremap_max_page_shift); flush_cache_vmap(addr, end); + + if (IS_ENABLED(CONFIG_ARCH_HAS_IOREMAP_PHYS_HOOKS) && !err) + ioremap_phys_range_hook(phys_addr, end - addr, prot); + return err; } @@ -630,7 +636,7 @@ int is_vmalloc_or_module_addr(const void *x) * just put it in the vmalloc space. 
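The kasan_reset_tag() casts added to these address predicates matter on arm64 with hardware tag-based KASAN: a tagged pointer carries its tag in the top byte, so a plain cast to unsigned long would compare outside [VMALLOC_START, VMALLOC_END) even when the untagged address is in range. Illustration (the addresses are examples only):

	void *p   = (void *)0xf4ff800012345000;	/* KASAN tag 0xf4 in the top byte */
	void *raw = kasan_reset_tag(p);		/* 0xffff800012345000, comparable again */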
*/ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif @@ -798,12 +804,15 @@ unsigned long vmalloc_nr_pages(void) { return atomic_long_read(&nr_vmalloc_pages); } +EXPORT_SYMBOL_GPL(vmalloc_nr_pages); static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) { struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *tmp; @@ -825,6 +834,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *va; @@ -2139,7 +2150,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; - unsigned long addr = (unsigned long)mem; + unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); @@ -2200,14 +2211,19 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - kasan_unpoison_vmalloc(mem, size); - if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); + return mem; } EXPORT_SYMBOL(vm_map_ram); @@ -2281,6 +2297,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) vm->addr = (void *)addr; vm_area_add_early(vm); + kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } static void vmap_init_free_space(void) @@ -2426,10 +2443,20 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - kasan_unpoison_vmalloc((void *)va->va_start, requested_size); - setup_vmalloc_vm(area, va, flags, caller); + /* + * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a + * best-effort approach, as they can be mapped outside of vmalloc code. + * For VM_ALLOC mappings, the pages are marked as accessible after + * getting mapped in __vmalloc_node_range(). + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). 
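A caller-visible consequence of unpoisoning after the mapping step is that vm_map_ram() may now return a pointer carrying a fresh KASAN tag, so callers must use the returned value rather than an address computed beforehand. Sketch:

	void *mem = vm_map_ram(pages, count, NUMA_NO_NODE);

	if (mem) {
		memset(mem, 0, (size_t)count << PAGE_SHIFT);	/* access via 'mem' only */
		vm_unmap_ram(mem, count);
	}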
+ */ + if (!(flags & VM_ALLOC)) + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, + KASAN_VMALLOC_PROT_NORMAL); + return area; } @@ -2488,6 +2515,7 @@ struct vm_struct *find_vm_area(const void *addr) return va->vm; } +EXPORT_SYMBOL_GPL(find_vm_area); /** * remove_vm_area - find and remove a continuous kernel virtual area @@ -2513,7 +2541,7 @@ struct vm_struct *remove_vm_area(const void *addr) va->vm = NULL; spin_unlock(&vmap_area_lock); - kasan_free_shadow(vm); + kasan_free_module_shadow(vm); free_unmap_vmap_area(va); return vm; @@ -2608,6 +2636,10 @@ static void __vunmap(const void *addr, int deallocate_pages) kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); + if (IS_ENABLED(CONFIG_ARCH_HAS_IOREMAP_PHYS_HOOKS) && + area->flags & VM_IOREMAP) + iounmap_phys_range_hook(area->phys_addr, get_vm_area_size(area)); + vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { @@ -2972,7 +3004,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller) { struct vm_struct *area; - void *addr; + void *ret; + kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3020,10 +3053,50 @@ again: goto fail; } - addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); - if (!addr) + /* + * Prepare arguments for __vmalloc_area_node() and + * kasan_unpoison_vmalloc(). + */ + if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + if (kasan_hw_tags_enabled()) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping. + */ + prot = arch_vmap_pgprot_tagged(prot); + + /* + * Skip page_alloc poisoning and zeroing for physical + * pages backing VM_ALLOC mapping. Memory is instead + * poisoned and zeroed by kasan_unpoison_vmalloc(). + */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + + /* Take note that the mapping is PAGE_KERNEL. */ + kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; + } + + /* Allocate physical pages and map them into vmalloc space. */ + ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); + if (!ret) goto fail; + /* + * Mark the pages as accessible, now that they are mapped. + * The condition for setting KASAN_VMALLOC_INIT should complement the + * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check + * to make sure that memory is initialized under the same conditions. + * Tag-based KASAN modes only assign tags to normal non-executable + * allocations, see __kasan_unpoison_vmalloc(). + */ + kasan_flags |= KASAN_VMALLOC_VM_ALLOC; + if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && + (gfp_mask & __GFP_SKIP_ZERO)) + kasan_flags |= KASAN_VMALLOC_INIT; + /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ + area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. 
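The flag plumbing above condenses to a small decision table (a recap of the hunks, not new logic):

	/*
	 * KASAN_VMALLOC_PROT_NORMAL: mapping is PAGE_KERNEL, so tags may be assigned
	 * KASAN_VMALLOC_VM_ALLOC:    memory is vmalloc-owned; unpoison after mapping
	 * KASAN_VMALLOC_INIT:        page_alloc skipped zeroing (__GFP_SKIP_ZERO),
	 *                            so kasan_unpoison_vmalloc() zeroes instead
	 */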
@@ -3035,7 +3108,7 @@ again: if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); - return addr; + return area->addr; fail: if (shift > PAGE_SHIFT) { @@ -3320,6 +3393,8 @@ long vread(char *buf, char *addr, unsigned long count) unsigned long buflen = count; unsigned long n; + addr = kasan_reset_tag(addr); + /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; @@ -3705,9 +3780,6 @@ retry: for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; - - kasan_unpoison_vmalloc((void *)vas[area]->va_start, - sizes[area]); } /* insert all vm's */ @@ -3720,6 +3792,16 @@ retry: } spin_unlock(&vmap_area_lock); + /* + * Mark allocated areas as accessible. Do it now as a best-effort + * approach, as they can be mapped outside of vmalloc code. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + for (area = 0; area < nr_vms; area++) + vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, + vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); + kfree(vas); return vms; diff --git a/mm/vmscan.c b/mm/vmscan.c index 201acea81804..564a6f279e92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -50,6 +50,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -62,6 +66,15 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include + +#undef CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(mm_vmscan_direct_reclaim_begin); +EXPORT_TRACEPOINT_SYMBOL_GPL(mm_vmscan_direct_reclaim_end); + struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; @@ -125,6 +138,12 @@ struct scan_control { /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; +#ifdef CONFIG_LRU_GEN + /* help kswapd make better choices among multiple memcgs */ + unsigned int memcgs_need_aging:1; + unsigned long last_reclaimed; +#endif + /* Allocation order */ s8 order; @@ -702,6 +721,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, : SHRINK_BATCH; long scanned = 0, next_deferred; + trace_android_vh_do_shrink_slab(shrinker, shrinkctl, priority); + freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; @@ -888,12 +909,17 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * * Returns the number of reclaimed slab objects. 
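The two EXPORT_TRACEPOINT_SYMBOL_GPL() lines exist so out-of-tree (GKI vendor) modules can attach probes to these reclaim tracepoints. A sketch of such a module, assuming the (order, gfp_flags) probe prototype these tracepoints have on recent kernels; the probe and module names are invented:

#include <linux/module.h>
#include <trace/events/vmscan.h>

/* Hypothetical vendor-module probe for the tracepoint exported above. */
static void my_reclaim_begin_probe(void *data, int order, gfp_t gfp_flags)
{
	pr_debug("direct reclaim begin: order=%d gfp=%#x\n", order, gfp_flags);
}

static int __init my_probe_init(void)
{
	return register_trace_mm_vmscan_direct_reclaim_begin(my_reclaim_begin_probe,
							     NULL);
}

static void __exit my_probe_exit(void)
{
	unregister_trace_mm_vmscan_direct_reclaim_begin(my_reclaim_begin_probe,
							NULL);
	tracepoint_synchronize_unregister();
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");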
*/ -static unsigned long shrink_slab(gfp_t gfp_mask, int nid, +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { unsigned long ret, freed = 0; struct shrinker *shrinker; + bool bypass = false; + + trace_android_vh_shrink_slab_bypass(gfp_mask, nid, memcg, priority, &bypass); + if (bypass) + return 0; /* * The root memcg might be allocated even though memcg is disabled @@ -935,6 +961,7 @@ out: cond_resched(); return freed; } +EXPORT_SYMBOL_GPL(shrink_slab); void drop_slab_node(int nid) { @@ -1142,9 +1169,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; - mem_cgroup_swapout(page, swap); + + /* get a shadow entry before mem_cgroup_swapout() clears page_memcg() */ if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(page, target_memcg); + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page, swap, shadow); xa_unlock_irq(&mapping->i_pages); put_swap_page(page, swap); @@ -1244,6 +1273,10 @@ static enum page_references page_check_references(struct page *page, if (vm_flags & VM_LOCKED) return PAGEREF_RECLAIM; + /* rmap lock contention: rotate */ + if (referenced_ptes == -1) + return PAGEREF_KEEP; + if (referenced_ptes) { /* * All mapped pages start out with page table @@ -1407,6 +1440,11 @@ retry: if (!sc->may_unmap && page_mapped(page)) goto keep_locked; + /* page_update_gen() tried to promote this page? */ + if (lru_gen_enabled() && !ignore_references && + page_mapped(page) && PageReferenced(page)) + goto keep_locked; + may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); @@ -2180,6 +2218,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; + struct blk_plug plug; + bool do_plug = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { if (stalled) @@ -2213,6 +2253,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; + trace_android_vh_shrink_inactive_list_blk_plug(&do_plug); + if (do_plug) + blk_start_plug(&plug); nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); spin_lock_irq(&lruvec->lru_lock); @@ -2226,6 +2269,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); + if (do_plug) + blk_finish_plug(&plug); + lru_note_cost(lruvec, file, stat.nr_pageout); mem_cgroup_uncharge_list(&page_list); free_unref_page_list(&page_list); @@ -2291,6 +2337,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned nr_rotated = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); + bool bypass = false; lru_add_drain(); @@ -2325,8 +2372,13 @@ static void shrink_active_list(unsigned long nr_to_scan, } } + trace_android_vh_page_referenced_check_bypass(page, nr_to_scan, lru, &bypass); + if (bypass) + goto skip_page_referenced; + + /* Referenced or rmap lock contention: rotate */ if (page_referenced(page, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags) != 0) { /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. 
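trace_android_vh_shrink_slab_bypass() is an Android vendor hook: a registered handler can set *bypass and veto the shrinker walk for this call. A sketch of a handler module with a made-up policy (only allow the walk once reclaim priority indicates real pressure); the register_trace_android_vh_*() form is the one the vendor-hook DECLARE_HOOK machinery generates, and the handler signature mirrors the call site above:

#include <linux/module.h>
#include <linux/mmzone.h>	/* DEF_PRIORITY */
#include <trace/hooks/vmscan.h>

/* Hypothetical vendor policy: skip slab shrinking under light pressure
 * (priority counts down from DEF_PRIORITY as pressure rises). */
static void my_shrink_slab_bypass(void *data, gfp_t gfp_mask, int nid,
				  struct mem_cgroup *memcg, int priority,
				  bool *bypass)
{
	if (priority > DEF_PRIORITY - 2)
		*bypass = true;
}

static int __init my_hook_init(void)
{
	/* Error handling and unregistration elided; this is a sketch. */
	return register_trace_android_vh_shrink_slab_bypass(my_shrink_slab_bypass,
							    NULL);
}
module_init(my_hook_init);
MODULE_LICENSE("GPL");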
So @@ -2342,7 +2394,7 @@ static void shrink_active_list(unsigned long nr_to_scan, continue; } } - +skip_page_referenced: ClearPageActive(page); /* we are de-activating */ SetPageWorkingset(page); list_add(&page->lru, &l_inactive); @@ -2377,6 +2429,8 @@ unsigned long reclaim_pages(struct list_head *page_list) LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; struct page *page; + struct blk_plug plug; + bool do_plug = false; unsigned int noreclaim_flag; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -2388,6 +2442,10 @@ unsigned long reclaim_pages(struct list_head *page_list) noreclaim_flag = memalloc_noreclaim_save(); + trace_android_vh_reclaim_pages_plug(&do_plug); + if (do_plug) + blk_start_plug(&plug); + while (!list_empty(page_list)) { page = lru_to_page(page_list); if (nid == NUMA_NO_NODE) { @@ -2426,6 +2484,9 @@ unsigned long reclaim_pages(struct list_head *page_list) memalloc_noreclaim_restore(noreclaim_flag); + if (do_plug) + blk_finish_plug(&plug); + return nr_reclaimed; } @@ -2497,6 +2558,112 @@ enum scan_balance { SCAN_FILE, }; +static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) +{ + unsigned long file; + struct lruvec *target_lruvec; + + if (lru_gen_enabled()) + return; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* + * Flush the memory cgroup stats, so that we read accurate per-memcg + * lruvec stats for heuristics. + */ + mem_cgroup_flush_stats(); + + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&target_lruvec->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&target_lruvec->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; + + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[0] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[1] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. 
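The heuristic this comment motivates is completed in the next hunk: cache_trim_mode prefers file pages while file_is_tiny forces anon into the mix. Plugging round numbers into the same expressions makes the thresholds concrete; the page counts below are invented:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* Illustrative page counts for one node (4 KiB pages). */
	unsigned long inactive_file = 1024;      /* 4 MiB of clean cache */
	unsigned long active_file = 512;
	unsigned long inactive_anon = 262144;    /* 1 GiB of anon */
	unsigned long nr_free = 4096;
	unsigned long total_high_wmark = 16384;  /* sum of zone high marks */
	int priority = 4;                        /* heavy pressure; DEF_PRIORITY is 12 */
	bool deactivate_file = false, deactivate_anon = false;

	bool cache_trim_mode = (inactive_file >> priority) && !deactivate_file;
	bool file_is_tiny =
		active_file + inactive_file + nr_free <= total_high_wmark &&
		!deactivate_anon && (inactive_anon >> priority);

	/* 1024>>4 = 64, so the cache looks trimmable; but 1536+4096 <= 16384
	 * with plenty of anon makes the file LRU "tiny". get_scan_count()
	 * checks file_is_tiny first, so anon is scanned in this scenario. */
	printf("cache_trim_mode=%d file_is_tiny=%d\n", cache_trim_mode, file_is_tiny);
	return 0;
}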
+ */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined @@ -2518,6 +2685,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, enum scan_balance scan_balance; unsigned long ap, fp; enum lru_list lru; + bool balance_anon_file_reclaim = false; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { @@ -2555,11 +2723,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } + trace_android_rvh_set_balance_anon_file_reclaim(&balance_anon_file_reclaim); + /* * If there is enough inactive page cache, we do not reclaim - * anything from the anonymous working right now. + * anything from the anonymous working right now. But when balancing + * anon and page cache files for reclaim, allow swapping of anon pages + * even if there are a number of inactive file cache pages. */ - if (sc->cache_trim_mode) { + if (!balance_anon_file_reclaim && sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } @@ -2718,6 +2890,2753 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return can_demote(pgdat->node_id, sc); } +#ifdef CONFIG_LRU_GEN + +#ifdef CONFIG_LRU_GEN_ENABLED +DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) +#else +DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) +#endif + +/****************************************************************************** + * shorthand helpers + ******************************************************************************/ + +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + +#define DEFINE_MAX_SEQ(lruvec) \ + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) + +#define DEFINE_MIN_SEQ(lruvec) \ + unsigned long min_seq[ANON_AND_FILE] = { \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ + } + +#define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + +#ifdef CONFIG_MEMCG + if (memcg) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* for hotadd_new_pgdat() */ + if (!lruvec->pgdat) + lruvec->pgdat = pgdat; + + return lruvec; + } +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + + return pgdat ? 
&pgdat->__lruvec : NULL; +} + +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) +{ + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + if (!can_demote(pgdat->node_id, sc) && + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) + return 0; + + return mem_cgroup_swappiness(memcg); +} + +static int get_nr_gens(struct lruvec *lruvec, int type) +{ + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; +} + +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) +{ + /* see the comment on lru_gen_struct */ + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; +} + +/****************************************************************************** + * mm_struct list + ******************************************************************************/ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + static struct lru_gen_mm_list mm_list = { + .fifo = LIST_HEAD_INIT(mm_list.fifo), + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), + }; + +#ifdef CONFIG_MEMCG + if (memcg) + return &memcg->mm_list; +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + + return &mm_list; +} + +void lru_gen_add_mm(struct mm_struct *mm) +{ + int nid; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); +#ifdef CONFIG_MEMCG + VM_WARN_ON_ONCE(mm->lru_gen.memcg); + mm->lru_gen.memcg = memcg; +#endif + spin_lock(&mm_list->lock); + + for_each_node_state(nid, N_MEMORY) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + /* the first addition since the last iteration */ + if (lruvec->mm_state.tail == &mm_list->fifo) + lruvec->mm_state.tail = &mm->lru_gen.list; + } + + list_add_tail(&mm->lru_gen.list, &mm_list->fifo); + + spin_unlock(&mm_list->lock); +} + +void lru_gen_del_mm(struct mm_struct *mm) +{ + int nid; + struct lru_gen_mm_list *mm_list; + struct mem_cgroup *memcg = NULL; + + if (list_empty(&mm->lru_gen.list)) + return; + +#ifdef CONFIG_MEMCG + memcg = mm->lru_gen.memcg; +#endif + mm_list = get_mm_list(memcg); + + spin_lock(&mm_list->lock); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + /* where the last iteration ended (exclusive) */ + if (lruvec->mm_state.tail == &mm->lru_gen.list) + lruvec->mm_state.tail = lruvec->mm_state.tail->next; + + /* where the current iteration continues (inclusive) */ + if (lruvec->mm_state.head != &mm->lru_gen.list) + continue; + + lruvec->mm_state.head = lruvec->mm_state.head->next; + /* the deletion ends the current iteration */ + if (lruvec->mm_state.head == &mm_list->fifo) + WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); + } + + list_del_init(&mm->lru_gen.list); + + spin_unlock(&mm_list->lock); + +#ifdef CONFIG_MEMCG + mem_cgroup_put(mm->lru_gen.memcg); + mm->lru_gen.memcg = NULL; +#endif +} + +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + struct task_struct *task = rcu_dereference_protected(mm->owner, true); + + VM_WARN_ON_ONCE(task->mm != mm); + lockdep_assert_held(&task->alloc_lock); + + /* for mm_update_next_owner() */ + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(task); + rcu_read_unlock(); + if (memcg == 
mm->lru_gen.memcg) + return; + + VM_WARN_ON_ONCE(!mm->lru_gen.memcg); + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); + + lru_gen_del_mm(mm); + lru_gen_add_mm(mm); +} +#endif + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's. + */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +{ + int i; + int hist; + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + if (walk) { + hist = lru_hist_from_seq(walk->max_seq); + + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(lruvec->mm_state.stats[hist][i], + lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; + } + } + + if (NR_HIST_GENS > 1 && last) { + hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); + + for (i = 0; i < NR_MM_STATS; i++) + 
WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); + } +} + +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + int type; + unsigned long size = 0; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return true; + + clear_bit(key, &mm->lru_gen.bitmap); + + for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { + size += type ? get_mm_counter(mm, MM_FILEPAGES) : + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + } + + if (size < MIN_LRU_BATCH) + return true; + + return !mmget_not_zero(mm); +} + +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, + struct mm_struct **iter) +{ + bool first = false; + bool last = true; + struct mm_struct *mm = NULL; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + /* + * There are four interesting cases for this page table walker: + * 1. It tries to start a new iteration of mm_list with a stale max_seq; + * there is nothing left to do. + * 2. It's the first of the current generation, and it needs to reset + * the Bloom filter for the next generation. + * 3. It reaches the end of mm_list, and it needs to increment + * mm_state->seq; the iteration is done. + * 4. It's the last of the current generation, and it needs to reset the + * mm stats counters for the next generation. + */ + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); + VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); + VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); + + if (walk->max_seq <= mm_state->seq) { + if (!*iter) + last = false; + goto done; + } + + if (!mm_state->nr_walkers) { + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); + + mm_state->head = mm_list->fifo.next; + first = true; + } + + while (!mm && mm_state->head != &mm_list->fifo) { + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + + mm_state->head = mm_state->head->next; + + /* force scan for those added after the last iteration */ + if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { + mm_state->tail = mm_state->head; + walk->force_scan = true; + } + + if (should_skip_mm(mm, walk)) + mm = NULL; + } + + if (mm_state->head == &mm_list->fifo) + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); +done: + if (*iter && !mm) + mm_state->nr_walkers--; + if (!*iter && mm) + mm_state->nr_walkers++; + + if (mm_state->nr_walkers) + last = false; + + if (*iter || last) + reset_mm_stats(lruvec, walk, last); + + spin_unlock(&mm_list->lock); + + if (mm && first) + reset_bloom_filter(lruvec, walk->max_seq + 1); + + if (*iter) + mmput_async(*iter); + + *iter = mm; + + return last; +} + +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +{ + bool success = false; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + + if (max_seq > mm_state->seq && !mm_state->nr_walkers) { + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); + + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + reset_mm_stats(lruvec, NULL, true); + success = true; + } + + spin_unlock(&mm_list->lock); + + return success; 
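Stripped of the double buffering and RCU, the filters described above are a textbook two-hash Bloom filter: one 30-bit hash split into two 15-bit bit indices over a 1<<15-bit bitmap. A freestanding model, with hash_item() standing in for the kernel's hash_ptr():

#include <stdbool.h>
#include <stdint.h>

#define BLOOM_FILTER_SHIFT 15	/* m = 1<<15 bits, as in the comment above */

static uint64_t bloom_bits[(1UL << BLOOM_FILTER_SHIFT) / 64];

/* Any decent 32-bit mix works for the illustration. */
static uint32_t hash_item(const void *item)
{
	return (uint32_t)((uintptr_t)item * 0x9e3779b1u);
}

static void bloom_keys(const void *item, uint32_t *k0, uint32_t *k1)
{
	uint32_t hash = hash_item(item);

	*k0 = hash & ((1u << BLOOM_FILTER_SHIFT) - 1);
	*k1 = (hash >> BLOOM_FILTER_SHIFT) & ((1u << BLOOM_FILTER_SHIFT) - 1);
}

static void bloom_add(const void *item)
{
	uint32_t k0, k1;

	bloom_keys(item, &k0, &k1);
	bloom_bits[k0 / 64] |= 1ULL << (k0 % 64);
	bloom_bits[k1 / 64] |= 1ULL << (k1 % 64);
}

/* May return a false positive (~1/5 at n=10,000 per the comment above);
 * never a false negative. */
static bool bloom_test(const void *item)
{
	uint32_t k0, k1;

	bloom_keys(item, &k0, &k1);
	return ((bloom_bits[k0 / 64] >> (k0 % 64)) & 1) &&
	       ((bloom_bits[k1 / 64] >> (k1 % 64)) & 1);
}

int main(void)
{
	int x;

	bloom_add(&x);
	return bloom_test(&x) ? 0 : 1;	/* inserted items always test true */
}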
+} + +/****************************************************************************** + * refault feedback loop + ******************************************************************************/ + +/* + * A feedback loop based on Proportional-Integral-Derivative (PID) controller. + * + * The P term is refaulted/(evicted+protected) from a tier in the generation + * currently being evicted; the I term is the exponential moving average of the + * P term over the generations previously evicted, using the smoothing factor + * 1/2; the D term isn't supported. + * + * The setpoint (SP) is always the first tier of one type; the process variable + * (PV) is either any tier of the other type or any other tier of the same + * type. + * + * The error is the difference between the SP and the PV; the correction is to + * turn off protection when SP>PV or turn on protection when SP<PV. + * + * FIXME: what's the best choice for a sliding window size? + */ +struct ctrl_pos { + unsigned long refaulted; + unsigned long total; + int gain; +}; + +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + pos->total = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + pos->total += lrugen->protected[hist][type][tier - 1]; + pos->gain = gain; +} + +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) +{ + int hist, tier; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; + + lockdep_assert_held(&lruvec->lru_lock); + + if (!carryover && !clear) + return; + + hist = lru_hist_from_seq(seq); + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + sum += lrugen->protected[hist][type][tier - 1]; + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + } + + if (clear) { + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + if (tier) + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + } + } +} + +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) +{ + /* + * Return true if the PV has a limited number of refaults or a lower + * refaulted/total than the SP. + */ + return pv->refaulted < MIN_LRU_BATCH || + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= + (sp->refaulted + 1) * pv->total * pv->gain; +} + +/****************************************************************************** + * the aging + ******************************************************************************/ + +/* promote pages accessed through page tables */ +static int page_update_gen(struct page *page, int gen) +{ + unsigned long new_flags, old_flags = READ_ONCE(page->flags); + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + /* lru_gen_del_page() has isolated this page? 
*/ + if (!(old_flags & LRU_GEN_MASK)) { + /* for shrink_page_list() */ + new_flags = old_flags | BIT(PG_referenced); + continue; + } + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); + + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +/* protect pages accessed multiple times through file descriptors */ +static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming) +{ + int type = page_is_file_lru(page); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(page->flags); + + VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page); + + do { + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + /* page_update_gen() has promoted this page? */ + if (new_gen >= 0 && new_gen != old_gen) + return new_gen; + + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; + /* for end_page_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); + + lru_gen_update_size(lruvec, page, old_gen, new_gen); + + return new_gen; +} + +static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page, + int old_gen, int new_gen) +{ + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + int delta = thp_nr_pages(page); + + VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); + + walk->batched++; + + walk->nr_pages[old_gen][type][zone] -= delta; + walk->nr_pages[new_gen][type][zone] += delta; +} + +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) +{ + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + walk->batched = 0; + + for_each_gen_type_zone(gen, type, zone) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + int delta = walk->nr_pages[gen][type][zone]; + + if (!delta) + continue; + + walk->nr_pages[gen][type][zone] = 0; + WRITE_ONCE(lrugen->nr_pages[gen][type][zone], + lrugen->nr_pages[gen][type][zone] + delta); + + if (lru_gen_is_active(lruvec, gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, delta); + } +} + +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) +{ + struct address_space *mapping; + struct vm_area_struct *vma = args->vma; + struct lru_gen_mm_walk *walk = args->private; + + if (!vma_is_accessible(vma)) + return true; + + if (is_vm_hugetlb_page(vma)) + return true; + + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) + return true; + + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + if (vma_is_anonymous(vma)) + return !walk->can_swap; + + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) + return true; + + mapping = vma->vm_file->f_mapping; + if (mapping_unevictable(mapping)) + return true; + + if (shmem_mapping(mapping)) + return !walk->can_swap; + + /* to exclude special mappings like dax, etc. */ + return !mapping->a_ops->readpage; +} + +/* + * Some userspace memory allocators map many single-page VMAs. Instead of + * returning back to the PGD table for each of such VMAs, finish an entire PMD + * table to reduce zigzags and improve cache performance. 
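page_update_gen() and page_inc_gen() above both rely on the same trick: the generation is stored in page->flags as gen+1, so an all-zero field means "not on a multi-gen LRU list" and decodes to -1. A compact model of the packing; the offset and width below are invented (the kernel derives the field width from MAX_NR_GENS):

#include <assert.h>

#define LRU_GEN_PGOFF 24	/* made-up bit offset within page->flags */
#define LRU_GEN_WIDTH 3		/* enough for MAX_NR_GENS=4 plus the +1 bias */
#define LRU_GEN_MASK (((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

static int get_gen(unsigned long flags)
{
	return (int)((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}

static unsigned long set_gen(unsigned long flags, int gen)
{
	return (flags & ~LRU_GEN_MASK) | ((gen + 1UL) << LRU_GEN_PGOFF);
}

int main(void)
{
	unsigned long flags = 0;

	assert(get_gen(flags) == -1);	/* isolated / not tracked */
	flags = set_gen(flags, 2);
	assert(get_gen(flags) == 2);
	return 0;
}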
+ */ +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, + unsigned long *vm_start, unsigned long *vm_end) +{ + unsigned long start = round_up(*vm_end, size); + unsigned long end = (start | ~mask) + 1; + + VM_WARN_ON_ONCE(mask & size); + VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); + + while (args->vma) { + if (start >= args->vma->vm_end) { + args->vma = args->vma->vm_next; + continue; + } + + if (end && end <= args->vma->vm_start) + return false; + + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) { + args->vma = args->vma->vm_next; + continue; + } + + *vm_start = max(start, args->vma->vm_start); + *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; + + return true; + } + + return false; +} + +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pte_pfn(pte); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pte_present(pte) || is_zero_pfn(pfn)) + return -1; + + if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pmd_pfn(pmd); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) + return -1; + + if (WARN_ON_ONCE(pmd_devmap(pmd))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} +#endif + +static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg, + struct pglist_data *pgdat, bool can_swap) +{ + struct page *page; + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return NULL; + + page = compound_head(pfn_to_page(pfn)); + if (page_to_nid(page) != pgdat->node_id) + return NULL; + + if (page_memcg_rcu(page) != memcg) + return NULL; + + /* file VMAs can contain anon pages from COW */ + if (!page_is_file_lru(page) && !can_swap) + return NULL; + + return page; +} + +static bool suitable_to_scan(int total, int young) +{ + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); + + /* suitable if the average number of young PTEs per cacheline is >=1 */ + return young * n >= total; +} + +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pte_t *pte; + spinlock_t *ptl; + unsigned long addr; + int total = 0; + int young = 0; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + + VM_WARN_ON_ONCE(pmd_leaf(*pmd)); + + ptl = pte_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + return false; + + arch_enter_lazy_mmu_mode(); + + pte = pte_offset_map(pmd, start & PMD_MASK); +restart: + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + struct page *page; + + total++; + walk->mm_stats[MM_LEAF_TOTAL]++; + + pfn = get_pte_pfn(pte[i], args->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) { + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap); + if (!page) + continue; + + if (!ptep_test_and_clear_young(args->vma, addr, pte + 
i)) + VM_WARN_ON_ONCE(true); + + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; + + if (pte_dirty(pte[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_update_gen(page, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, page, old_gen, new_gen); + } + + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) + goto restart; + + pte_unmap(pte); + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); + + return suitable_to_scan(total, young); +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +{ + int i; + pmd_t *pmd; + spinlock_t *ptl; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ + if (*start == -1) { + *start = next; + return; + } + + i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); + if (i && i <= MIN_LRU_BATCH) { + __set_bit(i - 1, bitmap); + return; + } + + pmd = pmd_offset(pud, *start); + + ptl = pmd_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + goto done; + + arch_enter_lazy_mmu_mode(); + + do { + unsigned long pfn; + struct page *page; + unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + + pfn = get_pmd_pfn(pmd[i], vma, addr); + if (pfn == -1) + goto next; + + if (!pmd_trans_huge(pmd[i])) { + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && + get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } + + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap); + if (!page) + goto next; + + if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + goto next; + + walk->mm_stats[MM_LEAF_YOUNG]++; + + if (pmd_dirty(pmd[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_update_gen(page, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, page, old_gen, new_gen); +next: + i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; + } while (i <= MIN_LRU_BATCH); + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); +done: + *start = -1; + bitmap_zero(bitmap, MIN_LRU_BATCH); +} +#else +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +{ +} +#endif + +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pmd_t *pmd; + unsigned long next; + unsigned long addr; + struct vm_area_struct *vma; + unsigned long pos = -1; + struct lru_gen_mm_walk *walk = args->private; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* + * Finish an entire PMD in two passes: the first only reaches to PTE + * tables to avoid taking the PMD lock; the second, if necessary, takes + * the PMD lock to clear the accessed bit in PMD entries. 
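The window arithmetic in get_next_vma() is worth checking in isolation: rounding the resume address up by `size` and ORing in the inverted `mask` always yields the end of the enclosing table span, which is what lets the walker finish one whole PTE table per pass. A model assuming 4 KiB pages and 2 MiB PMDs:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_SHIFT 21	/* 2 MiB spans, typical x86-64/arm64 4K config */
#define PMD_SIZE  (1ULL << PMD_SHIFT)
#define PMD_MASK  (~(PMD_SIZE - 1))

static uint64_t round_up_to(uint64_t x, uint64_t size)	/* size: power of two */
{
	return (x + size - 1) & ~(size - 1);
}

int main(void)
{
	/* Same arithmetic as get_next_vma(mask=PMD_MASK, size=PAGE_SIZE):
	 * resume at the next page, stop at the end of the current PMD span. */
	uint64_t vm_end = 0x7f0000201000ULL;	/* arbitrary address */
	uint64_t start = round_up_to(vm_end, 0x1000);
	uint64_t end = (start | ~PMD_MASK) + 1;

	assert((end & ~PMD_MASK) == 0);			/* PMD-aligned */
	assert(end - (start & PMD_MASK) == PMD_SIZE);	/* one full span */
	printf("window: [%#llx, %#llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}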
+ */ + pmd = pmd_offset(pud, start & PUD_MASK); +restart: + /* walk_pte_range() may call get_next_vma() */ + vma = args->vma; + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { + pmd_t val = pmd_read_atomic(pmd + i); + + /* for pmd_read_atomic() */ + barrier(); + + next = pmd_addr_end(addr, end); + + if (!pmd_present(val) || is_huge_zero_pmd(val)) { + walk->mm_stats[MM_LEAF_TOTAL]++; + continue; + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(val)) { + unsigned long pfn = pmd_pfn(val); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + + walk->mm_stats[MM_LEAF_TOTAL]++; + + if (!pmd_young(val)) { + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + continue; + } +#endif + walk->mm_stats[MM_NONLEAF_TOTAL]++; + +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG + if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (!pmd_young(val)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + } +#endif + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + continue; + + walk->mm_stats[MM_NONLEAF_FOUND]++; + + if (!walk_pte_range(&val, addr, next, args)) + continue; + + walk->mm_stats[MM_NONLEAF_ADDED]++; + + /* carry over to the next generation */ + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + } + + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); + + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) + goto restart; +} + +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pud_t *pud; + unsigned long addr; + unsigned long next; + struct lru_gen_mm_walk *walk = args->private; + + VM_WARN_ON_ONCE(p4d_leaf(*p4d)); + + pud = pud_offset(p4d, start & P4D_MASK); +restart: + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { + pud_t val = READ_ONCE(pud[i]); + + next = pud_addr_end(addr, end); + + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) + continue; + + walk_pmd_range(&val, addr, next, args); + + /* a racy check to curtail the waiting time */ + if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) + return 1; + + if (need_resched() || walk->batched >= MAX_LRU_BATCH) { + end = (addr | ~PUD_MASK) + 1; + goto done; + } + } + + if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) + goto restart; + + end = round_up(end, P4D_SIZE); +done: + if (!end || !args->vma) + return 1; + + walk->next_addr = max(end, args->vma->vm_start); + + return -EAGAIN; +} + +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + static const struct mm_walk_ops mm_walk_ops = { + .test_walk = should_skip_vma, + .p4d_entry = walk_pud_range, + }; + + int err; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + walk->next_addr = FIRST_USER_ADDRESS; + + do { + err = -EBUSY; + + /* page_update_gen() requires stable page_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + break; + + /* the caller might be holding the lock for write */ + if (mmap_read_trylock(mm)) { + err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); + + mmap_read_unlock(mm); + } + + mem_cgroup_unlock_pages(); + + if (walk->batched) { + spin_lock_irq(&lruvec->lru_lock); + reset_batch_size(lruvec, walk); + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); 
+ } while (err == -EAGAIN); +} + +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + if (pgdat && current_is_kswapd()) { + VM_WARN_ON_ONCE(walk); + + walk = &pgdat->mm_walk; + } else if (!pgdat && !walk) { + VM_WARN_ON_ONCE(current_is_kswapd()); + + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + } + + current->reclaim_state->mm_walk = walk; + + return walk; +} + +static void clear_mm_walk(void) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); + VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); + + current->reclaim_state->mm_walk = NULL; + + if (!current_is_kswapd()) + kfree(walk); +} + +static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +{ + int zone; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + if (type == LRU_GEN_ANON && !can_swap) + goto done; + + /* prevent cold/hot inversion if force_scan is true */ + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + struct list_head *head = &lrugen->lists[old_gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); + + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + VM_WARN_ON_ONCE_PAGE(PageActive(page), page); + VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); + + new_gen = page_inc_gen(lruvec, page, false); + list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); + + if (!--remaining) + return false; + } + } +done: + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +} + +static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +{ + int gen, type, zone; + bool success = false; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + /* find the oldest populated generation */ + for (type = !can_swap; type < ANON_AND_FILE; type++) { + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + if (!list_empty(&lrugen->lists[gen][type][zone])) + goto next; + } + + min_seq[type]++; + } +next: + ; + } + + /* see the comment on lru_gen_struct */ + if (can_swap) { + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + if (min_seq[type] == lrugen->min_seq[type]) + continue; + + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } + + return success; +} + +static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) +{ + int prev, next; + int type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + for (type = ANON_AND_FILE - 1; type >= 0; type--) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; + + VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); + + while (!inc_min_seq(lruvec, type, can_swap)) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + 
spin_lock_irq(&lruvec->lru_lock); + } + } + + /* + * Update the active/inactive LRU sizes for compatibility. Both sides of + * the current max_seq need to be covered, since max_seq+1 can overlap + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do + * overlap, cold/hot inversion happens. + */ + prev = lru_gen_from_seq(lrugen->max_seq - 1); + next = lru_gen_from_seq(lrugen->max_seq + 1); + + for (type = 0; type < ANON_AND_FILE; type++) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + long delta = lrugen->nr_pages[prev][type][zone] - + lrugen->nr_pages[next][type][zone]; + + if (!delta) + continue; + + __update_lru_size(lruvec, lru, zone, delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); + } + } + + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, type, false); + + WRITE_ONCE(lrugen->timestamps[next], jiffies); + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); + + spin_unlock_irq(&lruvec->lru_lock); +} + +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + struct scan_control *sc, bool can_swap, bool force_scan) +{ + bool success; + struct lru_gen_mm_walk *walk; + struct mm_struct *mm = NULL; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + + /* see the comment in iterate_mm_list() */ + if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { + success = false; + goto done; + } + + /* + * If the hardware doesn't automatically set the accessed bit, fallback + * to lru_gen_look_around(), which only clears the accessed bit in a + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. 
+ */ + if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + + walk = set_mm_walk(NULL); + if (!walk) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + + walk->lruvec = lruvec; + walk->max_seq = max_seq; + walk->can_swap = can_swap; + walk->force_scan = force_scan; + + do { + success = iterate_mm_list(lruvec, walk, &mm); + if (mm) + walk_mm(lruvec, mm, walk); + + cond_resched(); + } while (mm); +done: + if (!success) { + if (sc->priority <= DEF_PRIORITY - 2) + wait_event_killable(lruvec->mm_state.wait, + max_seq < READ_ONCE(lrugen->max_seq)); + + return max_seq < READ_ONCE(lrugen->max_seq); + } + + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); + + inc_max_seq(lruvec, can_swap, force_scan); + /* either this sees any waiters or they will see updated max_seq */ + if (wq_has_sleeper(&lruvec->mm_state.wait)) + wake_up_all(&lruvec->mm_state.wait); + + return true; +} + +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + unsigned long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + total += size; + if (seq == max_seq) + young += size; + else if (seq + MIN_NR_GENS == max_seq) + old += size; + } + } + + /* try to scrape all its memory if this memcg was deleted */ + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + + /* + * The aging tries to be lazy to reduce the overhead, while the eviction + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) + return true; + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + + /* + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) + * of the total number of pages for each generation. A reasonable range + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The + * aging cares about the upper bound of hot pages, while the eviction + * cares about the lower bound of cold pages. 
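With MIN_NR_GENS at its in-tree value of 2, the two bounds this comment describes reduce to "more than 1/2 hot" and "less than 1/4 cold". A standalone restatement of that tail of should_run_aging(), with invented sizes:

#include <stdbool.h>
#include <stdio.h>

#define MIN_NR_GENS 2	/* in-tree value, assumed here for the arithmetic */

/* Mirrors the final checks of should_run_aging(); assumes the generation
 * count is already exactly MIN_NR_GENS+1, otherwise the answer is forced
 * either way by the earlier comparisons. */
static bool aging_needed(unsigned long young, unsigned long old,
			 unsigned long total)
{
	if (young * MIN_NR_GENS > total)
		return true;	/* more than 1/2 of pages are hot */
	if (old * (MIN_NR_GENS + 2) < total)
		return true;	/* fewer than 1/4 of pages are cold */
	return false;
}

int main(void)
{
	/* 12000 pages: 5000 in the youngest generation, 1000 in the one
	 * about to become evictable. 5000*2 <= 12000, but 1000*4 < 12000,
	 * so the cold tail is too thin and aging runs. */
	printf("aging_needed=%d\n", aging_needed(5000, 1000, 12000));
	return 0;
}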
+ */ + if (young * MIN_NR_GENS > total) + return true; + if (old * (MIN_NR_GENS + 2) < total) + return true; + + return false; +} + +static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) +{ + bool need_aging; + unsigned long nr_to_scan; + int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(sc->memcg_low_reclaim); + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(memcg)) + return false; + + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); + + if (min_ttl) { + int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + if (time_is_after_jiffies(birth + min_ttl)) + return false; + + /* the size is likely too small to be helpful */ + if (!nr_to_scan && sc->priority != DEF_PRIORITY) + return false; + } + + if (need_aging) + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); + + return true; +} + +/* to protect the working set of the last N jiffies */ +static unsigned long lru_gen_min_ttl __read_mostly; + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + bool success = false; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + + sc->last_reclaimed = sc->nr_reclaimed; + + /* + * To reduce the chance of going into the aging path, which can be + * costly, optimistically skip it if the flag below was cleared in the + * eviction path. This improves the overall performance when multiple + * memcgs are available. + */ + if (!sc->memcgs_need_aging) { + sc->memcgs_need_aging = true; + return; + } + + set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + if (age_lruvec(lruvec, sc, min_ttl)) + success = true; + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + clear_mm_walk(); + + /* check the order to exclude compaction-induced reclaim */ + if (success || !min_ttl || sc->order) + return; + + /* + * The main goal is to OOM kill if every generation from all memcgs is + * younger than min_ttl. However, another possibility is all memcgs are + * either below min or empty. + */ + if (mutex_trylock(&oom_lock)) { + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + }; + + out_of_memory(&oc); + + mutex_unlock(&oom_lock); + } +} + +/* + * This function exploits spatial locality when shrink_page_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If + * the scan was done cacheline efficiently, it adds the PMD entry pointing to + * the PTE table to the Bloom filter. This forms a feedback loop between the + * eviction and the aging. 
+ */ +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ + int i; + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long addr; + struct lru_gen_mm_walk *walk; + int young = 0; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + struct page *page = pvmw->page; + struct mem_cgroup *memcg = page_memcg(page); + struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + DEFINE_MAX_SEQ(lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); + + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + + if (spin_is_contended(pvmw->ptl)) + return; + + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; + + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); + end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; + else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { + start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + + rcu_read_lock(); + arch_enter_lazy_mmu_mode(); + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + + pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) + continue; + + page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap); + if (!page) + continue; + + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + young++; + + if (pte_dirty(pte[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_lru_gen(page); + if (old_gen < 0) + SetPageReferenced(page); + else if (old_gen != new_gen) + __set_bit(i, bitmap); + } + + arch_leave_lazy_mmu_mode(); + rcu_read_unlock(); + + /* feedback from rmap walkers to page table walkers */ + if (suitable_to_scan(i, young)) + update_bloom_filter(lruvec, max_seq, pvmw->pmd); + + if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + page = pte_page(pte[i]); + activate_page(page); + } + return; + } + + /* page_update_gen() requires stable page_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; + + if (!walk) { + spin_lock_irq(&lruvec->lru_lock); + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); + } + + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + page = compound_head(pte_page(pte[i])); + if (page_memcg_rcu(page) != memcg) + continue; + + old_gen = page_update_gen(page, new_gen); + if (old_gen < 0 || old_gen == new_gen) + continue; + + if (walk) + update_batch_size(walk, page, old_gen, new_gen); + else + lru_gen_update_size(lruvec, page, old_gen, new_gen); + } + + if (!walk) + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_unlock_pages(); +} + +/****************************************************************************** + * the eviction + ******************************************************************************/ + +static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) +{ + bool success; + int gen = page_lru_gen(page); + int type = page_is_file_lru(page); + int zone = 
page_zonenum(page); + int delta = thp_nr_pages(page); + int refs = page_lru_refs(page); + int tier = lru_tier_from_refs(refs); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page); + + /* unevictable */ + if (!page_evictable(page)) { + success = lru_gen_del_page(lruvec, page, true); + VM_WARN_ON_ONCE_PAGE(!success, page); + SetPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, delta); + return true; + } + + /* dirty lazyfree */ + if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) { + success = lru_gen_del_page(lruvec, page, true); + VM_WARN_ON_ONCE_PAGE(!success, page); + SetPageSwapBacked(page); + add_page_to_lru_list_tail(page, lruvec); + return true; + } + + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&page->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + /* protected */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + gen = page_inc_gen(lruvec, page, false); + list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]); + + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + return true; + } + + /* waiting for writeback */ + if (PageLocked(page) || PageWriteback(page) || + (type == LRU_GEN_FILE && PageDirty(page))) { + gen = page_inc_gen(lruvec, page, true); + list_move(&page->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + return false; +} + +static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc) +{ + bool success; + + /* unmapping inhibited */ + if (!sc->may_unmap && page_mapped(page)) + return false; + + /* swapping inhibited */ + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + (PageDirty(page) || + (PageAnon(page) && !PageSwapCache(page)))) + return false; + + /* raced with release_pages() */ + if (!get_page_unless_zero(page)) + return false; + + /* raced with another isolation */ + if (!TestClearPageLRU(page)) { + put_page(page); + return false; + } + + /* see the comment on MAX_NR_TIERS */ + if (!PageReferenced(page)) + set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + + /* for shrink_page_list() */ + ClearPageReclaim(page); + ClearPageReferenced(page); + + success = lru_gen_del_page(lruvec, page, true); + VM_WARN_ON_ONCE_PAGE(!success, page); + + return true; +} + +static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, + int type, int tier, struct list_head *list) +{ + int gen, zone; + enum vm_event_item item; + int sorted = 0; + int scanned = 0; + int isolated = 0; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); + + if (get_nr_gens(lruvec, type) == MIN_NR_GENS) + return 0; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; + struct list_head *head = &lrugen->lists[gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); + int delta = thp_nr_pages(page); + + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + VM_WARN_ON_ONCE_PAGE(PageActive(page), page); + VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); + + scanned += delta; + + if (sort_page(lruvec, 
page, tier)) + sorted += delta; + else if (isolate_page(lruvec, page, sc)) { + list_add(&page->lru, list); + isolated += delta; + } else { + list_move(&page->lru, &moved); + skipped += delta; + } + + if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) + break; + } + + if (skipped) { + list_splice(&moved, head); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + } + + if (!remaining || isolated >= MIN_LRU_BATCH) + break; + } + + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (!cgroup_reclaim(sc)) { + __count_vm_events(item, isolated); + __count_vm_events(PGREFILL, sorted); + } + __count_memcg_events(memcg, item, isolated); + __count_memcg_events(memcg, PGREFILL, sorted); + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* + * There might not be eligible pages due to reclaim_idx, may_unmap and + * may_writepage. Check the remaining to prevent livelock if it's not + * making progress. + */ + return isolated || !remaining ? scanned : 0; +} + +static int get_tier_idx(struct lruvec *lruvec, int type) +{ + int tier; + struct ctrl_pos sp, pv; + + /* + * To leave a margin for fluctuations, use a larger gain factor (1:2). + * This value is chosen because any other tier would have at least twice + * as many refaults as the first tier. + */ + read_ctrl_pos(lruvec, type, 0, 1, &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, 2, &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + return tier - 1; +} + +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +{ + int type, tier; + struct ctrl_pos sp, pv; + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + + /* + * Compare the first tier of anon with that of file to determine which + * type to scan. Also need to compare other tiers of the selected type + * with the first tier of the other type to determine the last tier (of + * the selected type) to evict. + */ + read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); + type = positive_ctrl_err(&sp, &pv); + + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, gain[type], &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + *tier_idx = tier - 1; + + return type; +} + +static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + int *type_scanned, struct list_head *list) +{ + int i; + int type; + int scanned; + int tier = -1; + DEFINE_MIN_SEQ(lruvec); + + /* + * Try to make the obvious choice first. When anon and file are both + * available from the same generation, interpret swappiness 1 as file + * first and 200 as anon first. 
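+	 * For values in between, get_type_to_scan() compares the refault rates
+	 * of the first tiers of anon and file to pick a type; the loop below
+	 * then retries the other type, with a freshly chosen tier, if scanning
+	 * the first choice yields nothing. A swappiness of 0 skips anon
+	 * entirely, since the loop starts at i = !swappiness.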
+ */ + if (!swappiness) + type = LRU_GEN_FILE; + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) + type = LRU_GEN_ANON; + else if (swappiness == 1) + type = LRU_GEN_FILE; + else if (swappiness == 200) + type = LRU_GEN_ANON; + else + type = get_type_to_scan(lruvec, swappiness, &tier); + + for (i = !swappiness; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + + scanned = scan_pages(lruvec, sc, type, tier, list); + if (scanned) + break; + + type = !type; + tier = -1; + } + + *type_scanned = type; + + return scanned; +} + +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + bool *need_swapping) +{ + int type; + int scanned; + int reclaimed; + LIST_HEAD(list); + struct page *page; + enum vm_event_item item; + struct reclaim_stat stat; + struct lru_gen_mm_walk *walk; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock_irq(&lruvec->lru_lock); + + scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); + + scanned += try_to_inc_min_seq(lruvec, swappiness); + + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + scanned = 0; + + spin_unlock_irq(&lruvec->lru_lock); + + if (list_empty(&list)) + return scanned; + + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); + + list_for_each_entry(page, &list, lru) { + /* restore LRU_REFS_FLAGS cleared by isolate_page() */ + if (PageWorkingset(page)) + SetPageReferenced(page); + + /* don't add rejected pages to the oldest generation */ + if (PageReclaim(page) && + (PageDirty(page) || PageWriteback(page))) + ClearPageActive(page); + else + SetPageActive(page); + } + + spin_lock_irq(&lruvec->lru_lock); + + move_pages_to_lru(lruvec, &list); + + walk = current->reclaim_state->mm_walk; + if (walk && walk->batched) + reset_batch_size(lruvec, walk); + + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); + __count_memcg_events(memcg, item, reclaimed); + __count_vm_events(PGSTEAL_ANON + type, reclaimed); + + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&list); + free_unref_page_list(&list); + + sc->nr_reclaimed += reclaimed; + + if (need_swapping && type == LRU_GEN_ANON) + *need_swapping = true; + + return scanned; +} + +/* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. + */ +static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, + bool can_swap, bool *need_aging) +{ + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(memcg) || + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + + *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); + if (!*need_aging) + return nr_to_scan; + + /* skip the aging path at the default priority */ + if (sc->priority == DEF_PRIORITY) + goto done; + + /* leave the work to lru_gen_age_node() */ + if (current_is_kswapd()) + return 0; + + if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) + return nr_to_scan; +done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? 
nr_to_scan : 0; +} + +static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, + struct scan_control *sc, bool need_swapping) +{ + int i; + DEFINE_MAX_SEQ(lruvec); + + if (!current_is_kswapd()) { + /* age each memcg at most once to ensure fairness */ + if (max_seq - seq > 1) + return true; + + /* over-swapping can increase allocation latency */ + if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) + return true; + + /* give this thread a chance to exit and free its memory */ + if (fatal_signal_pending(current)) { + sc->nr_reclaimed += MIN_LRU_BATCH; + return true; + } + + if (cgroup_reclaim(sc)) + return false; + } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) + return false; + + /* keep scanning at low priorities to ensure fairness */ + if (sc->priority > DEF_PRIORITY - 2) + return false; + + /* + * A minimum amount of work was done under global memory pressure. For + * kswapd, it may be overshooting. For direct reclaim, the allocation + * may succeed if all suitable zones are somewhat safe. In either case, + * it's better to stop now, and restart later if necessary. + */ + for (i = 0; i <= sc->reclaim_idx; i++) { + unsigned long wmark; + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + + if (!managed_zone(zone)) + continue; + + wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); + if (wmark > zone_page_state(zone, NR_FREE_PAGES)) + return false; + } + + sc->nr_reclaimed += MIN_LRU_BATCH; + + return true; +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + bool need_aging = false; + bool need_swapping = false; + unsigned long scanned = 0; + unsigned long reclaimed = sc->nr_reclaimed; + DEFINE_MAX_SEQ(lruvec); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(lruvec_pgdat(lruvec)); + + while (true) { + int delta; + int swappiness; + unsigned long nr_to_scan; + + if (sc->may_swap) + swappiness = get_swappiness(lruvec, sc); + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) + swappiness = 1; + else + swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); + if (!nr_to_scan) + goto done; + + delta = evict_pages(lruvec, sc, swappiness, &need_swapping); + if (!delta) + goto done; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + + if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) + break; + + cond_resched(); + } + + /* see the comment in lru_gen_age_node() */ + if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) + sc->memcgs_need_aging = false; +done: + clear_mm_walk(); + + blk_finish_plug(&plug); +} + +/****************************************************************************** + * state change + ******************************************************************************/ + +static bool __maybe_unused state_is_valid(struct lruvec *lruvec) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + if (lrugen->enabled) { + enum lru_list lru; + + for_each_evictable_lru(lru) { + if (!list_empty(&lruvec->lists[lru])) + return false; + } + } else { + int gen, type, zone; + + for_each_gen_type_zone(gen, type, zone) { + if (!list_empty(&lrugen->lists[gen][type][zone])) + return false; + } + } + + return true; +} + +static bool fill_evictable(struct lruvec *lruvec) +{ + enum lru_list lru; + int remaining = MAX_LRU_BATCH; + + for_each_evictable_lru(lru) { + int type = is_file_lru(lru); + bool active = is_active_lru(lru); + struct list_head *head = 
&lruvec->lists[lru]; + + while (!list_empty(head)) { + bool success; + struct page *page = lru_to_page(head); + + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page); + VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); + VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page); + + del_page_from_lru_list(page, lruvec); + success = lru_gen_add_page(lruvec, page, false); + VM_WARN_ON_ONCE(!success); + + if (!--remaining) + return false; + } + } + + return true; +} + +static bool drain_evictable(struct lruvec *lruvec) +{ + int gen, type, zone; + int remaining = MAX_LRU_BATCH; + + for_each_gen_type_zone(gen, type, zone) { + struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; + + while (!list_empty(head)) { + bool success; + struct page *page = lru_to_page(head); + + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + VM_WARN_ON_ONCE_PAGE(PageActive(page), page); + VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page); + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); + + success = lru_gen_del_page(lruvec, page, false); + VM_WARN_ON_ONCE(!success); + add_page_to_lru_list(page, lruvec); + + if (!--remaining) + return false; + } + } + + return true; +} + +static void lru_gen_change_state(bool enabled) +{ + static DEFINE_MUTEX(state_mutex); + + struct mem_cgroup *memcg; + + cgroup_lock(); + cpus_read_lock(); + get_online_mems(); + mutex_lock(&state_mutex); + + if (enabled == lru_gen_enabled()) + goto unlock; + + if (enabled) + static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + else + static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + VM_WARN_ON_ONCE(!state_is_valid(lruvec)); + + lruvec->lrugen.enabled = enabled; + + while (!(enabled ? 
fill_evictable(lruvec) : drain_evictable(lruvec))) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } + + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +unlock: + mutex_unlock(&state_mutex); + put_online_mems(); + cpus_read_unlock(); + cgroup_unlock(); +} + +/****************************************************************************** + * sysfs interface + ******************************************************************************/ + +static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned int msecs; + + if (kstrtouint(buf, 0, &msecs)) + return -EINVAL; + + WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); + + return len; +} + +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( + min_ttl_ms, 0644, show_min_ttl, store_min_ttl +); + +static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned int caps = 0; + + if (get_cap(LRU_GEN_CORE)) + caps |= BIT(LRU_GEN_CORE); + + if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) + caps |= BIT(LRU_GEN_MM_WALK); + + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); + + return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int i; + unsigned int caps; + + if (tolower(*buf) == 'n') + caps = 0; + else if (tolower(*buf) == 'y') + caps = -1; + else if (kstrtouint(buf, 0, &caps)) + return -EINVAL; + + for (i = 0; i < NR_LRU_GEN_CAPS; i++) { + bool enabled = caps & BIT(i); + + if (i == LRU_GEN_CORE) + lru_gen_change_state(enabled); + else if (enabled) + static_branch_enable(&lru_gen_caps[i]); + else + static_branch_disable(&lru_gen_caps[i]); + } + + return len; +} + +static struct kobj_attribute lru_gen_enabled_attr = __ATTR( + enabled, 0644, show_enabled, store_enabled +); + +static struct attribute *lru_gen_attrs[] = { + &lru_gen_min_ttl_attr.attr, + &lru_gen_enabled_attr.attr, + NULL +}; + +static struct attribute_group lru_gen_attr_group = { + .name = "lru_gen", + .attrs = lru_gen_attrs, +}; + +/****************************************************************************** + * debugfs interface + ******************************************************************************/ + +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg; + loff_t nr_to_skip = *pos; + + m->private = kvmalloc(PATH_MAX, GFP_KERNEL); + if (!m->private) + return ERR_PTR(-ENOMEM); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node_state(nid, N_MEMORY) { + if (!nr_to_skip--) + return get_lruvec(memcg, nid); + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + return NULL; +} + +static void lru_gen_seq_stop(struct seq_file *m, void *v) +{ + if (!IS_ERR_OR_NULL(v)) + mem_cgroup_iter_break(NULL, lruvec_memcg(v)); + + kvfree(m->private); + m->private = NULL; +} + +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int nid = 
lruvec_pgdat(v)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(v); + + ++*pos; + + nid = next_memory_node(nid); + if (nid == MAX_NUMNODES) { + memcg = mem_cgroup_iter(NULL, memcg, NULL); + if (!memcg) + return NULL; + + nid = first_memory_node; + } + + return get_lruvec(memcg, nid); +} + +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + unsigned long max_seq, unsigned long *min_seq, + unsigned long seq) +{ + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); + for (type = 0; type < ANON_AND_FILE; type++) { + const char *s = " "; + unsigned long n[3] = {}; + + if (seq == max_seq) { + s = "RT "; + n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); + n[1] = READ_ONCE(lrugen->avg_total[type][tier]); + } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { + s = "rep"; + n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); + n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + } + + for (i = 0; i < 3; i++) + seq_printf(m, " %10lu%c", n[i], s[i]); + } + seq_putc(m, '\n'); + } + + seq_puts(m, " "); + for (i = 0; i < NR_MM_STATS; i++) { + const char *s = " "; + unsigned long n = 0; + + if (seq == max_seq && NR_HIST_GENS == 1) { + s = "LOYNFA"; + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + } else if (seq != max_seq && NR_HIST_GENS > 1) { + s = "loynfa"; + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + } + + seq_printf(m, " %10lu%c", n, s[i]); + } + seq_putc(m, '\n'); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static int lru_gen_seq_show(struct seq_file *m, void *v) +{ + unsigned long seq; + bool full = !debugfs_real_fops(m->file)->write; + struct lruvec *lruvec = v; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (nid == first_memory_node) { + const char *path = memcg ? m->private : ""; + +#ifdef CONFIG_MEMCG + if (memcg) + cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); +#endif + seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); + } + + seq_printf(m, " node %5d\n", nid); + + if (!full) + seq = min_seq[LRU_GEN_ANON]; + else if (max_seq >= MAX_NR_GENS) + seq = max_seq - MAX_NR_GENS + 1; + else + seq = 0; + + for (; seq <= max_seq; seq++) { + int type, zone; + int gen = lru_gen_from_seq(seq); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); + + for (type = 0; type < ANON_AND_FILE; type++) { + unsigned long size = 0; + char mark = full && seq < min_seq[type] ? 
'x' : ' '; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + seq_printf(m, " %10lu%c", size, mark); + } + + seq_putc(m, '\n'); + + if (full) + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); + } + + return 0; +} + +static const struct seq_operations lru_gen_seq_ops = { + .start = lru_gen_seq_start, + .stop = lru_gen_seq_stop, + .next = lru_gen_seq_next, + .show = lru_gen_seq_show, +}; + +static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + bool can_swap, bool force_scan) +{ + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (seq < max_seq) + return 0; + + if (seq > max_seq) + return -EINVAL; + + if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) + return -ERANGE; + + try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); + + return 0; +} + +static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + int swappiness, unsigned long nr_to_reclaim) +{ + DEFINE_MAX_SEQ(lruvec); + + if (seq + MIN_NR_GENS > max_seq) + return -EINVAL; + + sc->nr_reclaimed = 0; + + while (!signal_pending(current)) { + DEFINE_MIN_SEQ(lruvec); + + if (seq < min_seq[!swappiness]) + return 0; + + if (sc->nr_reclaimed >= nr_to_reclaim) + return 0; + + if (!evict_pages(lruvec, sc, swappiness, NULL)) + return 0; + + cond_resched(); + } + + return -EINTR; +} + +static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + struct scan_control *sc, int swappiness, unsigned long opt) +{ + struct lruvec *lruvec; + int err = -EINVAL; + struct mem_cgroup *memcg = NULL; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) + return -EINVAL; + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); +#ifdef CONFIG_MEMCG + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; +#endif + rcu_read_unlock(); + + if (!memcg) + return -EINVAL; + } + + if (memcg_id != mem_cgroup_id(memcg)) + goto done; + + lruvec = get_lruvec(memcg, nid); + + if (swappiness < 0) + swappiness = get_swappiness(lruvec, sc); + else if (swappiness > 200) + goto done; + + switch (cmd) { + case '+': + err = run_aging(lruvec, seq, sc, swappiness, opt); + break; + case '-': + err = run_eviction(lruvec, seq, sc, swappiness, opt); + break; + } +done: + mem_cgroup_put(memcg); + + return err; +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + size_t len, loff_t *pos) +{ + void *buf; + char *cur, *next; + unsigned int flags; + struct blk_plug plug; + int err = -EINVAL; + struct scan_control sc = { + .may_writepage = true, + .may_unmap = true, + .may_swap = true, + .reclaim_idx = MAX_NR_ZONES - 1, + .gfp_mask = GFP_KERNEL, + }; + + buf = kvmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, src, len)) { + kvfree(buf); + return -EFAULT; + } + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + blk_start_plug(&plug); + if (!set_mm_walk(NULL)) { + err = -ENOMEM; + goto done; + } + + next = buf; + next[len] = '\0'; + + while ((cur = strsep(&next, ",;\n"))) { + int n; + int end; + char cmd; + unsigned int memcg_id; + unsigned int nid; + unsigned long seq; + unsigned int swappiness = -1; + unsigned long opt = -1; + + cur = skip_spaces(cur); + if (!*cur) + continue; + + n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, &swappiness, 
&end, &opt, &end); + if (n < 4 || cur[end]) { + err = -EINVAL; + break; + } + + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); + if (err) + break; + } +done: + clear_mm_walk(); + blk_finish_plug(&plug); + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); + + kvfree(buf); + + return err ? : len; +} + +static int lru_gen_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lru_gen_seq_ops); +} + +static const struct file_operations lru_gen_rw_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .write = lru_gen_seq_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations lru_gen_ro_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/****************************************************************************** + * initialization + ******************************************************************************/ + +void lru_gen_init_lruvec(struct lruvec *lruvec) +{ + int i; + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); + + for (i = 0; i <= MIN_NR_GENS + 1; i++) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; + init_waitqueue_head(&lruvec->mm_state.wait); +} + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ + INIT_LIST_HEAD(&memcg->mm_list.fifo); + spin_lock_init(&memcg->mm_list.lock); +} + +void lru_gen_exit_memcg(struct mem_cgroup *memcg) +{ + int i; + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + + for (i = 0; i < NR_BLOOM_FILTERS; i++) { + bitmap_free(lruvec->mm_state.filters[i]); + lruvec->mm_state.filters[i] = NULL; + } + } +} +#endif + +static int __init init_lru_gen(void) +{ + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); + debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + + return 0; +}; +late_initcall(init_lru_gen); + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ +} + +#endif /* CONFIG_LRU_GEN */ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; @@ -2728,6 +5647,12 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_to_reclaim = sc->nr_to_reclaim; bool proportional_reclaim; struct blk_plug plug; + bool do_plug = true; + + if (lru_gen_enabled()) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } get_scan_count(lruvec, sc, nr); @@ -2748,7 +5673,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); - blk_start_plug(&plug); + trace_android_vh_shrink_lruvec_blk_plug(&do_plug); + if (do_plug) + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { 
unsigned long nr_anon, nr_file, percentage; @@ -2818,7 +5745,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); } - blk_finish_plug(&plug); + if (do_plug) + blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; /* @@ -2912,6 +5840,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; unsigned long scanned; + bool skip = false; /* * This loop can become CPU-bound when target memcgs @@ -2921,6 +5850,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) */ cond_resched(); + trace_android_vh_shrink_node_memcgs(memcg, &skip); + if (skip) + continue; + mem_cgroup_calculate_protection(target_memcg, memcg); if (mem_cgroup_below_min(memcg)) { @@ -2965,109 +5898,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long nr_reclaimed, nr_scanned; struct lruvec *target_lruvec; bool reclaimable = false; - unsigned long file; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: - /* - * Flush the memory cgroup stats, so that we read accurate per-memcg - * lruvec stats for heuristics. - */ - mem_cgroup_flush_stats(); - memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - /* - * Determine the scan balance between anon and file LRUs. - */ - spin_lock_irq(&target_lruvec->lru_lock); - sc->anon_cost = target_lruvec->anon_cost; - sc->file_cost = target_lruvec->file_cost; - spin_unlock_irq(&target_lruvec->lru_lock); - - /* - * Target desirable inactive:active list ratios for the anon - * and file LRU lists. - */ - if (!sc->force_deactivate) { - unsigned long refaults; - - refaults = lruvec_page_state(target_lruvec, - WORKINGSET_ACTIVATE_ANON); - if (refaults != target_lruvec->refaults[0] || - inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) - sc->may_deactivate |= DEACTIVATE_ANON; - else - sc->may_deactivate &= ~DEACTIVATE_ANON; - - /* - * When refaults are being observed, it means a new - * workingset is being established. Deactivate to get - * rid of any stale active pages quickly. - */ - refaults = lruvec_page_state(target_lruvec, - WORKINGSET_ACTIVATE_FILE); - if (refaults != target_lruvec->refaults[1] || - inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) - sc->may_deactivate |= DEACTIVATE_FILE; - else - sc->may_deactivate &= ~DEACTIVATE_FILE; - } else - sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; - - /* - * If we have plenty of inactive file pages that aren't - * thrashing, try to reclaim those first before touching - * anonymous pages. - */ - file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); - if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) - sc->cache_trim_mode = 1; - else - sc->cache_trim_mode = 0; - - /* - * Prevent the reclaimer from falling into the cache trap: as - * cache pages start out inactive, every cache fault will tip - * the scan balance towards the file LRU. And as the file LRU - * shrinks, so does the window for rotation from references. - * This means we have a runaway feedback loop where a tiny - * thrashing file LRU becomes infinitely more attractive than - * anon pages. Try to detect this based on file LRU size. 
- */ - if (!cgroup_reclaim(sc)) { - unsigned long total_high_wmark = 0; - unsigned long free, anon; - int z; - - free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); - file = node_page_state(pgdat, NR_ACTIVE_FILE) + - node_page_state(pgdat, NR_INACTIVE_FILE); - - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; - - total_high_wmark += high_wmark_pages(zone); - } - - /* - * Consider anon: if that's low too, this isn't a - * runaway file reclaim problem, but rather just - * extreme pressure. Reclaim as per usual then. - */ - anon = node_page_state(pgdat, NR_INACTIVE_ANON); - - sc->file_is_tiny = - file + free <= total_high_wmark && - !(sc->may_deactivate & DEACTIVATE_ANON) && - anon >> sc->priority; - } + prepare_scan_count(pgdat, sc); shrink_node_memcgs(pgdat, sc); @@ -3287,6 +6127,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; + if (lru_gen_enabled()) + return; + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[0] = refaults; @@ -3649,14 +6492,19 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, return nr_reclaimed; } +EXPORT_SYMBOL_GPL(try_to_free_mem_cgroup_pages); #endif -static void age_active_anon(struct pglist_data *pgdat, - struct scan_control *sc) +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; struct lruvec *lruvec; + if (lru_gen_enabled()) { + lru_gen_age_node(pgdat, sc); + return; + } + if (!can_age_anon_pages(pgdat, sc)) return; @@ -3973,12 +6821,11 @@ restart: sc.may_swap = !nr_boost_reclaim; /* - * Do some background aging of the anon list, to give - * pages a chance to be referenced before reclaiming. All - * pages are rotated regardless of classzone as this is - * about consistent aging. + * Do some background aging, to give pages a chance to be + * referenced before reclaiming. All pages are rotated + * regardless of classzone as this is about consistent aging. */ - age_active_anon(pgdat, &sc); + kswapd_age_node(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage @@ -4191,7 +7038,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ -static int kswapd(void *p) +int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; @@ -4268,6 +7115,7 @@ kswapd_try_sleep: return 0; } +EXPORT_SYMBOL_GPL(kswapd); /* * A zone is low on free memory or too fragmented for high-order memory. 
If @@ -4367,10 +7215,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); + bool skip = false; if (pgdat->kswapd) return; + trace_android_vh_kswapd_per_node(nid, &skip, true); + if (skip) + return; pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ @@ -4387,7 +7239,11 @@ void kswapd_run(int nid) void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + bool skip = false; + trace_android_vh_kswapd_per_node(nid, &skip, false); + if (skip) + return; if (kswapd) { kthread_stop(kswapd); NODE_DATA(nid)->kswapd = NULL; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8ce2620344b2..4349599d0209 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1176,9 +1176,7 @@ const char * const vmstat_text[] = { "nr_zone_write_pending", "nr_mlock", "nr_bounce", -#if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", -#endif "nr_free_cma", /* enum numa_stat_item counters */ @@ -1233,6 +1231,7 @@ const char * const vmstat_text[] = { "nr_shadow_call_stack", #endif "nr_page_table_pages", + "nr_sec_page_table_pages", #ifdef CONFIG_SWAP "nr_swapcached", #endif @@ -1381,6 +1380,32 @@ const char * const vmstat_text[] = { "direct_map_level2_splits", "direct_map_level3_splits", #endif +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + "spf_attempt", + "spf_abort", +#endif +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT_STATS + "SPF_ABORT_ODD", + "SPF_ABORT_UNMAPPED", + "SPF_ABORT_NO_SPECULATE", + "SPF_ABORT_VMA_COPY", + "SPF_ABORT_ACCESS_ERROR", + "SPF_ABORT_PUD", + "SPF_ABORT_PMD", + "SPF_ABORT_ANON_VMA", + "SPF_ABORT_PTE_MAP_LOCK_SEQ1", + "SPF_ABORT_PTE_MAP_LOCK_PMD", + "SPF_ABORT_PTE_MAP_LOCK_PTL", + "SPF_ABORT_PTE_MAP_LOCK_SEQ2", + "SPF_ABORT_USERFAULTFD", + "SPF_ABORT_FAULT", + "SPF_ABORT_SWAP", + "SPF_ATTEMPT_ANON", + "SPF_ATTEMPT_FILE", + "SPF_ATTEMPT_NUMA", + "SPF_ATTEMPT_PTE", + "SPF_ATTEMPT_WP", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ diff --git a/mm/workingset.c b/mm/workingset.c index 880d882f3325..aeba62cebf8c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { - eviction >>= bucket_order; eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; @@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); - *evictionp = entry << bucket_order; + *evictionp = entry; *workingsetp = workingset; } +#ifdef CONFIG_LRU_GEN + +static void *lru_gen_eviction(struct page *page) +{ + int hist; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lru_gen_struct *lrugen; + int type = page_is_file_lru(page); + int delta = thp_nr_pages(page); + int refs = page_lru_refs(page); + int tier = lru_tier_from_refs(refs); + struct mem_cgroup *memcg = page_memcg(page); + struct pglist_data *pgdat = page_pgdat(page); + + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->lrugen; + min_seq = READ_ONCE(lrugen->min_seq[type]); + token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); + + hist = lru_hist_from_seq(min_seq); + atomic_long_add(delta, 
&lrugen->evicted[hist][type][tier]); + + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); +} + +static void lru_gen_refault(struct page *page, void *shadow) +{ + int hist, tier, refs; + int memcg_id; + bool workingset; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lru_gen_struct *lrugen; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + int type = page_is_file_lru(page); + int delta = thp_nr_pages(page); + + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); + + if (pgdat != page_pgdat(page)) + return; + + rcu_read_lock(); + + memcg = page_memcg_rcu(page); + if (memcg_id != mem_cgroup_id(memcg)) + goto unlock; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->lrugen; + + min_seq = READ_ONCE(lrugen->min_seq[type]); + if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) + goto unlock; + + hist = lru_hist_from_seq(min_seq); + /* see the comment in page_lru_refs() */ + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; + tier = lru_tier_from_refs(refs); + + atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); + + /* + * Count the following two cases as stalls: + * 1. For pages accessed through page tables, hotter pages pushed out + * hot pages which refaulted immediately. + * 2. For pages accessed multiple times through file descriptors, + * numbers of accesses might have been out of the range. + */ + if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) { + SetPageWorkingset(page); + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); + } +unlock: + rcu_read_unlock(); +} + +#else /* !CONFIG_LRU_GEN */ + +static void *lru_gen_eviction(struct page *page) +{ + return NULL; +} + +static void lru_gen_refault(struct page *page, void *shadow) +{ +} + +#endif /* CONFIG_LRU_GEN */ + /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged @@ -264,10 +360,14 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); + if (lru_gen_enabled()) + return lru_gen_eviction(page); + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); + eviction >>= bucket_order; workingset_age_nonresident(lruvec, thp_nr_pages(page)); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -296,7 +396,13 @@ void workingset_refault(struct page *page, void *shadow) bool workingset; int memcgid; + if (lru_gen_enabled()) { + lru_gen_refault(page, shadow); + return; + } + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + eviction <<= bucket_order; rcu_read_lock(); /* diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 439deb8decbc..752fe5a338a6 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -347,7 +347,7 @@ static void destroy_cache(struct zs_pool *pool) static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE|__GFP_CMA)); } static void cache_free_handle(struct zs_pool *pool, unsigned long handle) @@ -358,7 +358,7 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle) static struct zspage *cache_alloc_zspage(struct 
zs_pool *pool, gfp_t flags) { return kmem_cache_zalloc(pool->zspage_cachep, - flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE|__GFP_CMA)); } static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) diff --git a/modules.bzl b/modules.bzl new file mode 100644 index 000000000000..88538d040943 --- /dev/null +++ b/modules.bzl @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2022 The Android Open Source Project + +""" +This module contains a full list of kernel modules + compiled by GKI. +""" + +COMMON_GKI_MODULES_LIST = [ + # keep sorted + "drivers/block/zram/zram.ko", + "drivers/bluetooth/btbcm.ko", + "drivers/bluetooth/btqca.ko", + "drivers/bluetooth/btsdio.ko", + "drivers/bluetooth/hci_uart.ko", + "drivers/net/can/dev/can-dev.ko", + "drivers/net/can/slcan.ko", + "drivers/net/can/vcan.ko", + "drivers/net/ppp/bsd_comp.ko", + "drivers/net/ppp/ppp_deflate.ko", + "drivers/net/ppp/ppp_generic.ko", + "drivers/net/ppp/ppp_mppe.ko", + "drivers/net/ppp/pppox.ko", + "drivers/net/ppp/pptp.ko", + "drivers/net/slip/slhc.ko", + "drivers/usb/class/cdc-acm.ko", + "drivers/usb/serial/ftdi_sio.ko", + "drivers/usb/serial/usbserial.ko", + "lib/crypto/libarc4.ko", + "mm/zsmalloc.ko", + "net/6lowpan/6lowpan.ko", + "net/6lowpan/nhc_dest.ko", + "net/6lowpan/nhc_fragment.ko", + "net/6lowpan/nhc_hop.ko", + "net/6lowpan/nhc_ipv6.ko", + "net/6lowpan/nhc_mobility.ko", + "net/6lowpan/nhc_routing.ko", + "net/6lowpan/nhc_udp.ko", + "net/8021q/8021q.ko", + "net/bluetooth/bluetooth.ko", + "net/bluetooth/hidp/hidp.ko", + "net/bluetooth/rfcomm/rfcomm.ko", + "net/can/can.ko", + "net/can/can-bcm.ko", + "net/can/can-gw.ko", + "net/can/can-raw.ko", + "net/ieee802154/6lowpan/ieee802154_6lowpan.ko", + "net/ieee802154/ieee802154.ko", + "net/ieee802154/ieee802154_socket.ko", + "net/l2tp/l2tp_core.ko", + "net/l2tp/l2tp_ppp.ko", + "net/mac80211/mac80211.ko", + "net/mac802154/mac802154.ko", + "net/nfc/nfc.ko", + "net/rfkill/rfkill.ko", + "net/tipc/diag.ko", + "net/tipc/tipc.ko", + "net/wireless/cfg80211.ko", +] diff --git a/net/9p/mod.c b/net/9p/mod.c index 535cf016633c..1bbe9fa8d642 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -186,4 +186,5 @@ MODULE_AUTHOR("Latchesar Ionkov "); MODULE_AUTHOR("Eric Van Hensbergen "); MODULE_AUTHOR("Ron Minnich "); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); MODULE_DESCRIPTION("Plan 9 Resource Sharing Support (9P2000)"); diff --git a/net/OWNERS b/net/OWNERS new file mode 100644 index 000000000000..cbbfa70f7d89 --- /dev/null +++ b/net/OWNERS @@ -0,0 +1,2 @@ +lorenzo@google.com +maze@google.com diff --git a/net/TEST_MAPPING b/net/TEST_MAPPING new file mode 100644 index 000000000000..1eb8d436031a --- /dev/null +++ b/net/TEST_MAPPING @@ -0,0 +1,12 @@ +{ + "presubmit": [ + { + "name": "CtsNetTestCases", + "options": [ + { + "exclude-annotation": "com.android.testutils.SkipPresubmit" + } + ] + } + ] +} diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 51a941b56ec3..4dfcd0d7ff3f 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -134,3 +134,4 @@ static void __exit fini_umh(void) module_init(load_umh); module_exit(fini_umh); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); diff --git a/net/core/dev.c b/net/core/dev.c index 33d6b691e15e..0c4c25876f8f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -150,6 +150,7 @@ #include #include #include +#include #include "net-sysfs.h" 
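/*
 * The next hunk adds an Android vendor hook to ptype_head(): the hook may
 * fill vendor_pt.next, in which case the vendor-provided list is used in
 * place of the default ptype_all/ptype_base buckets. A minimal sketch of a
 * vendor probe (the handler name and the private list are illustrative and
 * not part of this patch; registration is assumed to use the
 * register_trace_android_vh_* helper generated for the hook):
 *
 *	static LIST_HEAD(vendor_ptype_all);
 *
 *	static void vendor_ptype_head(void *data, const struct packet_type *pt,
 *				      struct list_head *vendor_pt)
 *	{
 *		// divert only ETH_P_ALL taps to the private list
 *		if (pt->type == htons(ETH_P_ALL))
 *			vendor_pt->next = &vendor_ptype_all;
 *	}
 *
 *	// at module init:
 *	// register_trace_android_vh_ptype_head(vendor_ptype_head, NULL);
 */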
@@ -516,6 +517,12 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { + struct list_head vendor_pt = { .next = NULL, }; + + trace_android_vh_ptype_head(pt, &vendor_pt); + if (vendor_pt.next) + return vendor_pt.next; + if (pt->type == htons(ETH_P_ALL)) return pt->dev ? &pt->dev->ptype_all : &ptype_all; else diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c40cd8dd75c7..82a8a5c05639 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -35,13 +35,11 @@ #include #include #include -#if IS_ENABLED(CONFIG_BRIDGE) #include EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); -#endif #if IS_ENABLED(CONFIG_PAGE_POOL) #include @@ -56,6 +54,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); +EXPORT_TRACEPOINT_SYMBOL_GPL(consume_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2d3f82b62236..3ebb94112d52 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -78,6 +78,7 @@ #include #include #include +#include #include "datagram.h" #include "sock_destructor.h" @@ -772,6 +773,7 @@ void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) if (!skb_unref(skb)) return; + trace_android_vh_kfree_skb(skb); trace_kfree_skb(skb, __builtin_return_address(0), reason); __kfree_skb(skb); } diff --git a/net/core/sock.c b/net/core/sock.c index b7ac53e72d1a..0aaa12cee40d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -135,6 +135,8 @@ #include #include +#include +#include #include #include @@ -1844,6 +1846,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (security_sk_alloc(sk, family, priority)) goto out_free; + trace_android_rvh_sk_alloc(sk); + if (!try_module_get(prot->owner)) goto out_free_sec; } @@ -1852,6 +1856,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, out_free_sec: security_sk_free(sk); + trace_android_rvh_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); @@ -1871,6 +1876,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); + trace_android_rvh_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); else @@ -3054,9 +3060,19 @@ void sock_def_readable(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) + + if (skwq_has_sleeper(wq)) { + int done = 0; + + trace_android_vh_do_wake_up_sync(&wq->wait, &done); + if (done) + goto out; + wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); + } + +out: sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 91710e5eedff..f1564b07a116 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -500,6 +500,10 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, goto out; snum = ntohs(addr->sin_port); + err = -EPERM; + if (snum && inet_is_local_unbindable_port(net, snum)) + goto out; + err = -EACCES; if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && snum && inet_port_requires_bind_service(net, snum) && diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 6eea1e9e998d..2bb839a42e96 100644 --- 
a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -141,7 +141,7 @@ static void ah_output_done(struct crypto_async_request *base, int err) } kfree(AH_SKB_CB(skb)->tmp); - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb, err); } static int ah_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d747166bb291..f04935ce298b 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -277,7 +277,7 @@ static void esp_output_done(struct crypto_async_request *base, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb, err); } } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 495c58e442e2..9b1e685c8d08 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -706,6 +706,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_do_large_bitmap, }, + { + .procname = "ip_local_unbindable_ports", + .data = &init_net.ipv4.sysctl_local_unbindable_ports, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, @@ -1395,11 +1402,17 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); if (!net->ipv4.sysctl_local_reserved_ports) - goto err_ports; + goto err_reserved_ports; + + net->ipv4.sysctl_local_unbindable_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!net->ipv4.sysctl_local_unbindable_ports) + goto err_unbindable_ports; return 0; -err_ports: +err_unbindable_ports: + kfree(net->ipv4.sysctl_local_reserved_ports); +err_reserved_ports: unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) @@ -1412,6 +1425,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { struct ctl_table *table; + kfree(net->ipv4.sysctl_local_unbindable_ports); kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 51f34560a9d6..22328865fd33 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -281,6 +281,8 @@ #include #include +#include + /* Track pending CMSGs. */ enum { TCP_CMSG_INQ = 1, @@ -1211,6 +1213,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) bool zc = false; long timeo; + trace_android_rvh_tcp_sendmsg_locked(sk, size); flags = msg->msg_flags; if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { @@ -2518,6 +2521,7 @@ found_fin_ok: break; } while (len > 0); + trace_android_rvh_tcp_recvmsg_stat(sk, copied); /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. 
I was just happy when found this 8) --ANK */ @@ -2546,6 +2550,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + trace_android_rvh_tcp_recvmsg(sk); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 102a0436eb29..5b09f400fdfd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -80,6 +80,7 @@ #include #include #include +#include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -4702,6 +4703,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, static void tcp_drop(struct sock *sk, struct sk_buff *skb) { + trace_android_vh_kfree_skb(skb); sk_drops_add(sk, skb); __kfree_skb(skb); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 79d5425bed07..de78a8ef5d2c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -116,6 +116,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include #endif +#include struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -1070,6 +1071,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; + trace_android_rvh_udp_sendmsg(sk); getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; @@ -1864,6 +1866,7 @@ try_again: skb = __skb_recv_udp(sk, flags, noblock, &off, &err); if (!skb) return err; + trace_android_rvh_udp_recvmsg(sk); ulen = udp_skb_len(skb); copied = len; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8800987fdb40..34c3d30765fb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -218,6 +218,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_rt_info_max_plen = 0, #endif #endif + .accept_ra_rt_table = 0, .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, @@ -277,6 +278,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra_rt_info_max_plen = 0, #endif #endif + .accept_ra_rt_table = 0, .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, @@ -2394,6 +2396,26 @@ regen: goto regen; } +u32 addrconf_rt_table(const struct net_device *dev, u32 default_table) +{ + struct inet6_dev *idev = in6_dev_get(dev); + int sysctl; + u32 table; + + if (!idev) + return default_table; + sysctl = idev->cnf.accept_ra_rt_table; + if (sysctl == 0) { + table = default_table; + } else if (sysctl > 0) { + table = (u32) sysctl; + } else { + table = (unsigned) dev->ifindex + (-sysctl); + } + in6_dev_put(idev); + return table; +} + /* * Add prefix route. */ @@ -2404,7 +2426,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric, u32 flags, gfp_t gfp_flags) { struct fib6_config cfg = { - .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX, + .fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_PREFIX), .fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF, .fc_ifindex = dev->ifindex, .fc_expires = expires, @@ -2439,7 +2461,7 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, struct fib6_node *fn; struct fib6_info *rt = NULL; struct fib6_table *table; - u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX; + u32 tb_id = l3mdev_fib_table(dev) ? 
: addrconf_rt_table(dev, RT6_TABLE_PREFIX); table = fib6_get_table(dev_net(dev), tb_id); if (!table) @@ -6811,6 +6833,13 @@ static const struct ctl_table addrconf_sysctl[] = { }, #endif #endif + { + .procname = "accept_ra_rt_table", + .data = &ipv6_devconf.accept_ra_rt_table, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "proxy_ndp", .data = &ipv6_devconf.proxy_ndp, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3a91d0d40aec..f32db6eaa33b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -296,6 +296,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, return -EINVAL; snum = ntohs(addr->sin6_port); + if (snum && inet_is_local_unbindable_port(net, snum)) + return -EPERM; if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 828e62514260..6536d9580cd2 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -316,7 +316,7 @@ static void ah6_output_done(struct crypto_async_request *base, int err) } kfree(AH_SKB_CB(skb)->tmp); - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb, err); } static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 6219d97cac7a..3e2b82b2cce9 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -314,7 +314,7 @@ static void esp_output_done(struct crypto_async_request *base, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb, err); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0655fd8c67e9..a867bf782c89 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4276,7 +4276,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev) { - u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; + u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO); int ifindex = dev->ifindex; struct fib6_node *fn; struct fib6_info *rt = NULL; @@ -4330,7 +4330,7 @@ static struct fib6_info *rt6_add_route_info(struct net *net, .fc_nlinfo.nl_net = net, }; - cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; + cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO); cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; @@ -4348,7 +4348,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev) { - u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; + u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT); struct fib6_info *rt; struct fib6_table *table; @@ -4383,7 +4383,7 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, u32 defrtr_usr_metric) { struct fib6_config cfg = { - .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, + .fc_table = l3mdev_fib_table(dev) ? 
: addrconf_rt_table(dev, RT6_TABLE_DFLT), .fc_metric = defrtr_usr_metric, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | @@ -4408,47 +4408,24 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, return rt6_get_dflt_router(net, gwaddr, dev); } -static void __rt6_purge_dflt_routers(struct net *net, - struct fib6_table *table) +static int rt6_addrconf_purge(struct fib6_info *rt, void *arg) { - struct fib6_info *rt; + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; -restart: - rcu_read_lock(); - for_each_fib6_node_rt_rcu(&table->tb6_root) { - struct net_device *dev = fib6_info_nh_dev(rt); - struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; - - if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && - (!idev || idev->cnf.accept_ra != 2) && - fib6_info_hold_safe(rt)) { - rcu_read_unlock(); - ip6_del_rt(net, rt, false); - goto restart; - } + if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && + (!idev || idev->cnf.accept_ra != 2)) { + /* Delete this route. See fib6_clean_tree() */ + return -1; } - rcu_read_unlock(); - table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; + /* Continue walking */ + return 0; } void rt6_purge_dflt_routers(struct net *net) { - struct fib6_table *table; - struct hlist_head *head; - unsigned int h; - - rcu_read_lock(); - - for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(table, head, tb6_hlist) { - if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) - __rt6_purge_dflt_routers(net, table); - } - } - - rcu_read_unlock(); + fib6_clean_all(net, rt6_addrconf_purge, NULL); } static void rtmsg_to_fib6_config(struct net *net, diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 0d2bab9d351c..053b338d8479 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -260,7 +260,7 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); - if (sta->sta.he_cap.has_he && addbaext) + if (sta->sta.deflink.he_cap.has_he && addbaext) ieee80211_add_addbaext(sdata, skb, addbaext); ieee80211_tx_skb(sdata, skb); @@ -293,7 +293,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } - if (!sta->sta.ht_cap.ht_supported && + if (!sta->sta.deflink.ht_cap.ht_supported && sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", @@ -309,7 +309,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } - if (sta->sta.he_cap.has_he) + if (sta->sta.deflink.eht_cap.has_eht) max_buf_size = IEEE80211_MAX_AMPDU_BUF_HE; else max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT; @@ -319,7 +319,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, * and if buffer size does not exceeds max value */ /* XXX: check own ht delayed BA capability?? 
*/ if (((ba_policy != 1) && - (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || + (!(sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || (buf_size > max_buf_size)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index a4d3fa14f76b..b17745e92be8 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -467,7 +467,7 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, sta->ampdu_mlme.addba_req_num[tid]++; spin_unlock_bh(&sta->lock); - if (sta->sta.he_cap.has_he) { + if (sta->sta.deflink.he_cap.has_he) { buf_size = local->hw.max_tx_aggregation_subframes; } else { /* @@ -598,7 +598,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, "Requested to start BA session on reserved tid=%d", tid)) return -EINVAL; - if (!pubsta->ht_cap.ht_supported && + if (!pubsta->deflink.ht_cap.ht_supported && sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) return -EINVAL; @@ -651,7 +651,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, * is set when we receive a bss info from a probe response or a beacon. */ if (sta->sdata->vif.type == NL80211_IFTYPE_ADHOC && - !sta->sta.ht_cap.ht_supported) { + !sta->sta.deflink.ht_cap.ht_supported) { ht_dbg(sdata, "BA request denied - IBSS STA %pM does not advertise HT support\n", pubsta->addr); diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c index 758ef63669e7..619fc3854525 100644 --- a/net/mac80211/airtime.c +++ b/net/mac80211/airtime.c @@ -653,8 +653,8 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_rx_status stat; - struct ieee80211_tx_rate *rate = &sta->tx_stats.last_rate; - struct rate_info *ri = &sta->tx_stats.last_rate_info; + struct ieee80211_tx_rate *rate = &sta->deflink.tx_stats.last_rate; + struct rate_info *ri = &sta->deflink.tx_stats.last_rate_info; u32 duration, overhead; u8 agg_shift; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4fa216a108ae..0a5f4637562c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -402,8 +402,8 @@ static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata, } static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -519,7 +519,8 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -540,7 +541,8 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, if (pairwise) key = key_mtx_dereference(local, sta->ptk[key_idx]); else - key = key_mtx_dereference(local, sta->gtk[key_idx]); + key = key_mtx_dereference(local, + sta->deflink.gtk[key_idx]); } else key = key_mtx_dereference(local, sdata->keys[key_idx]); @@ -560,8 +562,8 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_get_key(struct wiphy *wiphy, struct 
net_device *dev, - u8 key_idx, bool pairwise, const u8 *mac_addr, - void *cookie, + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params *params)) { @@ -590,7 +592,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, else if (!pairwise && key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS) - key = rcu_dereference(sta->gtk[key_idx]); + key = rcu_dereference(sta->deflink.gtk[key_idx]); } else key = rcu_dereference(sdata->keys[key_idx]); @@ -680,7 +682,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, static int ieee80211_config_default_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, bool uni, + int link_id, u8 key_idx, bool uni, bool multi) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -692,7 +694,7 @@ static int ieee80211_config_default_key(struct wiphy *wiphy, static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx) + int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -703,7 +705,7 @@ static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, static int ieee80211_config_default_beacon_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx) + int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -959,11 +961,29 @@ static int ieee80211_set_ftm_responder_params( return 0; } +static int +ieee80211_copy_mbssid_beacon(u8 *pos, struct cfg80211_mbssid_elems *dst, + struct cfg80211_mbssid_elems *src) +{ + int i, offset = 0; + + for (i = 0; i < src->cnt; i++) { + memcpy(pos + offset, src->elem[i].data, src->elem[i].len); + dst->elem[i].len = src->elem[i].len; + dst->elem[i].data = pos + offset; + offset += dst->elem[i].len; + } + dst->cnt = src->cnt; + + return offset; +} + static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_beacon_data *params, const struct ieee80211_csa_settings *csa, const struct ieee80211_color_change_settings *cca) { + struct cfg80211_mbssid_elems *mbssid = NULL; struct beacon_data *new, *old; int new_head_len, new_tail_len; int size, err;
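ieee80211_copy_mbssid_beacon() above and ieee80211_assign_beacon() below follow mac80211's usual single-allocation pattern: the beacon head, tail, the multiple-BSSID element descriptor array and the raw element bytes are all carved out of one kzalloc()'d block behind struct beacon_data, and the helper rewrites each elem[i].data pointer into that block, so the whole beacon can be released as a unit. (In cfg80211_beacon_dup() further down, the descriptor array is instead a separate allocation, which is why ieee80211_free_next_beacon() frees mbssid_ies and the beacon individually.) A self-contained sketch of the size computation, using illustrative names rather than the patch's code:

/* Illustrative model of the layout used by ieee80211_assign_beacon():
 *
 *   | beacon_data | head | tail | elem descriptors | raw elem bytes |
 *
 * struct_size(new->mbssid_ies, elem, cnt) in the real code expands to
 * sizeof(*mbssid_ies) + cnt * sizeof(elem[0]), as spelled out here.
 */
static size_t beacon_block_size(size_t head_len, size_t tail_len,
				const struct cfg80211_mbssid_elems *mbssid)
{
	size_t size = sizeof(struct beacon_data) + head_len + tail_len;
	size_t i;

	if (mbssid) {
		size += sizeof(struct cfg80211_mbssid_elems) +
			mbssid->cnt * sizeof(mbssid->elem[0]);
		for (i = 0; i < mbssid->cnt; i++)
			size += mbssid->elem[i].len; /* raw element bytes */
	}
	return size;
}

The hunk below performs exactly this sizing before the copy:

@@ -991,6 +1011,17 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, size = sizeof(*new) + new_head_len + new_tail_len; + /* new or old multiple BSSID elements? 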
*/ + if (params->mbssid_ies) { + mbssid = params->mbssid_ies; + size += struct_size(new->mbssid_ies, elem, mbssid->cnt); + size += ieee80211_get_mbssid_beacon_len(mbssid); + } else if (old && old->mbssid_ies) { + mbssid = old->mbssid_ies; + size += struct_size(new->mbssid_ies, elem, mbssid->cnt); + size += ieee80211_get_mbssid_beacon_len(mbssid); + } + new = kzalloc(size, GFP_KERNEL); if (!new) return -ENOMEM; @@ -999,12 +1030,20 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, /* * pointers go into the block we allocated, - * memory is | beacon_data | head | tail | + * memory is | beacon_data | head | tail | mbssid_ies */ new->head = ((u8 *) new) + sizeof(*new); new->tail = new->head + new_head_len; new->head_len = new_head_len; new->tail_len = new_tail_len; + /* copy in optional mbssid_ies */ + if (mbssid) { + u8 *pos = new->tail + new->tail_len; + + new->mbssid_ies = (void *)pos; + pos += struct_size(new->mbssid_ies, elem, mbssid->cnt); + ieee80211_copy_mbssid_beacon(pos, new->mbssid_ies, mbssid); + } if (csa) { new->cntdwn_current_counter = csa->count; @@ -1103,7 +1142,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); changed |= BSS_CHANGED_HE_OBSS_PD; - if (params->he_bss_color.enabled) + if (params->beacon.he_bss_color.enabled) changed |= BSS_CHANGED_HE_BSS_COLOR; } @@ -1152,7 +1191,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; sdata->vif.bss_conf.twt_responder = params->twt_responder; sdata->vif.bss_conf.he_obss_pd = params->he_obss_pd; - sdata->vif.bss_conf.he_bss_color = params->he_bss_color; + sdata->vif.bss_conf.he_bss_color = params->beacon.he_bss_color; sdata->vif.bss_conf.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ; @@ -1260,7 +1299,18 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) +static void ieee80211_free_next_beacon(struct ieee80211_sub_if_data *sdata) +{ + if (!sdata->u.ap.next_beacon) + return; + + kfree(sdata->u.ap.next_beacon->mbssid_ies); + kfree(sdata->u.ap.next_beacon); + sdata->u.ap.next_beacon = NULL; +} + +static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_sub_if_data *vlan; @@ -1294,8 +1344,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) mutex_unlock(&local->mtx); - kfree(sdata->u.ap.next_beacon); - sdata->u.ap.next_beacon = NULL; + ieee80211_free_next_beacon(sdata); /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) @@ -1646,43 +1695,35 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->listen_interval >= 0) sta->listen_interval = params->listen_interval; - if (params->sta_modify_mask & STATION_PARAM_APPLY_STA_TXPOWER) { - sta->sta.txpwr.type = params->txpwr.type; - if (params->txpwr.type == NL80211_TX_POWER_LIMITED) - sta->sta.txpwr.power = params->txpwr.power; - ret = drv_sta_set_txpwr(local, sdata, sta); - if (ret) - return ret; - } - - if (params->supported_rates && params->supported_rates_len) { + if (params->link_sta_params.supported_rates && + params->link_sta_params.supported_rates_len) { ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef, - sband, params->supported_rates, - 
params->supported_rates_len, - &sta->sta.supp_rates[sband->band]); + sband, params->link_sta_params.supported_rates, + params->link_sta_params.supported_rates_len, + &sta->sta.deflink.supp_rates[sband->band]); } - if (params->ht_capa) + if (params->link_sta_params.ht_capa) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - params->ht_capa, sta); + params->link_sta_params.ht_capa, sta); /* VHT can override some HT caps such as the A-MSDU max length */ - if (params->vht_capa) + if (params->link_sta_params.vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - params->vht_capa, sta); + params->link_sta_params.vht_capa, sta); - if (params->he_capa) + if (params->link_sta_params.he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, - (void *)params->he_capa, - params->he_capa_len, - (void *)params->he_6ghz_capa, + (void *)params->link_sta_params.he_capa, + params->link_sta_params.he_capa_len, + (void *)params->link_sta_params.he_6ghz_capa, sta); - if (params->opmode_notif_used) { + if (params->link_sta_params.opmode_notif_used) { /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it */ - __ieee80211_vht_handle_opmode(sdata, sta, params->opmode_notif, + __ieee80211_vht_handle_opmode(sdata, sta, params->link_sta_params.opmode_notif, sband->band); } @@ -2960,6 +3001,7 @@ static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy, static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, const u8 *addr, const struct cfg80211_bitrate_mask *mask) { @@ -3089,12 +3131,24 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + beacon->proberesp_ies_len + beacon->assocresp_ies_len + - beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len; + beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len + + ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies); new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); if (!new_beacon) return NULL; + if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { + new_beacon->mbssid_ies = + kzalloc(struct_size(new_beacon->mbssid_ies, + elem, beacon->mbssid_ies->cnt), + GFP_KERNEL); + if (!new_beacon->mbssid_ies) { + kfree(new_beacon); + return NULL; + } + } + pos = (u8 *)(new_beacon + 1); if (beacon->head_len) { new_beacon->head_len = beacon->head_len; @@ -3132,6 +3186,10 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } + if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) + pos += ieee80211_copy_mbssid_beacon(pos, + new_beacon->mbssid_ies, + beacon->mbssid_ies); /* might copy -1, meaning no changes requested */ new_beacon->ftm_responder = beacon->ftm_responder; @@ -3167,10 +3225,12 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP: + if (!sdata->u.ap.next_beacon) + return -EINVAL; + err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon, NULL, NULL); - kfree(sdata->u.ap.next_beacon); - sdata->u.ap.next_beacon = NULL; + ieee80211_free_next_beacon(sdata); if (err < 0) return err; @@ -3249,7 +3309,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) if (err) return err; - cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef); + cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef, 0); return 0; } @@ -3325,8 +3385,10 @@ static int 
ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, if ((params->n_counter_offsets_beacon > IEEE80211_MAX_CNTDWN_COUNTERS_NUM) || (params->n_counter_offsets_presp > - IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) + IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) { + ieee80211_free_next_beacon(sdata); return -EINVAL; + } csa.counter_offsets_beacon = params->counter_offsets_beacon; csa.counter_offsets_presp = params->counter_offsets_presp; @@ -3336,7 +3398,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, err = ieee80211_assign_beacon(sdata, &params->beacon_csa, &csa, NULL); if (err < 0) { - kfree(sdata->u.ap.next_beacon); + ieee80211_free_next_beacon(sdata); return err; } *changed |= err; @@ -3423,8 +3485,8 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, static void ieee80211_color_change_abort(struct ieee80211_sub_if_data *sdata) { sdata->vif.color_change_active = false; - kfree(sdata->u.ap.next_beacon); - sdata->u.ap.next_beacon = NULL; + + ieee80211_free_next_beacon(sdata); cfg80211_color_change_aborted_notify(sdata->dev); } @@ -3515,7 +3577,8 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, ieee80211_stop_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - cfg80211_ch_switch_started_notify(sdata->dev, &sdata->csa_chandef, + cfg80211_ch_switch_started_notify(sdata->dev, + &sdata->csa_chandef, 0, params->count, params->block_tx); if (changed) { @@ -3752,6 +3815,7 @@ unlock: static int ieee80211_cfg_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); @@ -3812,6 +3876,7 @@ static int ieee80211_set_qos_map(struct wiphy *wiphy, static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -4160,10 +4225,12 @@ ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP: { int ret; + if (!sdata->u.ap.next_beacon) + return -EINVAL; + ret = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon, NULL, NULL); - kfree(sdata->u.ap.next_beacon); - sdata->u.ap.next_beacon = NULL; + ieee80211_free_next_beacon(sdata); if (ret < 0) return ret; @@ -4206,7 +4273,7 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, err = ieee80211_assign_beacon(sdata, &params->beacon_color_change, NULL, &color_change); if (err < 0) { - kfree(sdata->u.ap.next_beacon); + ieee80211_free_next_beacon(sdata); return err; } *changed |= err; @@ -4342,6 +4409,18 @@ out: return err; } +static int +ieee80211_set_radar_background(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (!local->ops->set_radar_background) + return -EOPNOTSUPP; + + return local->ops->set_radar_background(&local->hw, chandef); +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -4446,4 +4525,5 @@ const struct cfg80211_ops mac80211_config_ops = { .reset_tid_config = ieee80211_reset_tid_config, .set_sar_specs = ieee80211_set_sar_specs, .color_change = ieee80211_color_change, + .set_radar_background = ieee80211_set_radar_background, };
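Two things worth noting in the cfg.c hunks above. First, the unsigned int link_id parameters threaded through ieee80211_cfg_get_channel(), ieee80211_set_ap_chanwidth(), the key operations and the channel-switch notifications (which pass a literal 0) are multi-link preparation: cfg80211 can now address a specific link of an MLD interface, while mac80211 here still services only link 0. Second, the new ieee80211_set_radar_background() is a thin dispatcher that returns -EOPNOTSUPP unless the low-level driver provides the matching ieee80211_ops hook, used for background CAC on a dedicated off-channel chain. A sketch of the driver side follows; the mydrv names and helpers are invented for illustration, and the NULL-chandef-means-stop convention is an assumption based on how cfg80211 tears down background radar detection upstream.

/* Hypothetical driver glue; "mydrv" is not a real driver API.
 * mac80211's wrapper above only forwards here, so leaving the op NULL
 * makes user space see -EOPNOTSUPP. chandef == NULL is assumed to mean
 * "stop the ongoing background CAC".
 */
static int mydrv_set_radar_background(struct ieee80211_hw *hw,
				      struct cfg80211_chan_def *chandef)
{
	struct mydrv_dev *dev = hw->priv; /* driver-private state */

	if (!chandef)
		return mydrv_stop_background_cac(dev);

	return mydrv_start_background_cac(dev, chandef);
}

static const struct ieee80211_ops mydrv_ops = {
	/* ...the usual mandatory callbacks... */
	.set_radar_background = mydrv_set_radar_background,
};

diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 63e15f583e0a..afc9f481e3af 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -199,7 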
+199,7 @@ static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta) switch (width) { case IEEE80211_STA_RX_BW_20: - if (sta->sta.ht_cap.ht_supported) + if (sta->sta.deflink.ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20; else return NL80211_CHAN_WIDTH_20_NOHT; @@ -373,15 +373,15 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, new_sta_bw = ieee80211_sta_cur_vht_bw(sta); /* nothing change */ - if (new_sta_bw == sta->sta.bandwidth) + if (new_sta_bw == sta->sta.deflink.bandwidth) continue; /* vif changed to narrow BW and narrow BW for station wasn't * requested or vise versa */ - if ((new_sta_bw < sta->sta.bandwidth) == !narrowed) + if ((new_sta_bw < sta->sta.deflink.bandwidth) == !narrowed) continue; - sta->sta.bandwidth = new_sta_bw; + sta->sta.deflink.bandwidth = new_sta_bw; rate_control_rate_update(local, sband, sta, IEEE80211_RC_BW_CHANGED); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 8be28cfd6f64..d631880fae12 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -437,7 +437,7 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, char buf[512], *p = buf; int i; struct sta_info *sta = file->private_data; - struct ieee80211_sta_ht_cap *htc = &sta->sta.ht_cap; + struct ieee80211_sta_ht_cap *htc = &sta->sta.deflink.ht_cap; p += scnprintf(p, sizeof(buf) + buf - p, "ht %ssupported\n", htc->ht_supported ? "" : "not "); @@ -513,7 +513,7 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, { char buf[512], *p = buf; struct sta_info *sta = file->private_data; - struct ieee80211_sta_vht_cap *vhtc = &sta->sta.vht_cap; + struct ieee80211_sta_vht_cap *vhtc = &sta->sta.deflink.vht_cap; p += scnprintf(p, sizeof(buf) + buf - p, "VHT %ssupported\n", vhtc->vht_supported ? 
"" : "not "); @@ -619,7 +619,7 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, char *buf, *p; size_t buf_sz = PAGE_SIZE; struct sta_info *sta = file->private_data; - struct ieee80211_sta_he_cap *hec = &sta->sta.he_cap; + struct ieee80211_sta_he_cap *hec = &sta->sta.deflink.he_cap; struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp; u8 ppe_size; u8 *cap; @@ -1024,9 +1024,9 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD(vht_capa); DEBUGFS_ADD(he_capa); - DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); - DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); - DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered); + DEBUGFS_ADD_COUNTER(rx_duplicates, deflink.rx_stats.num_duplicates); + DEBUGFS_ADD_COUNTER(rx_fragments, deflink.rx_stats.fragments); + DEBUGFS_ADD_COUNTER(tx_filtered, deflink.status_stats.filtered); if (local->ops->wake_tx_queue) { DEBUGFS_ADD(aqm); diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 99a2e30b3833..cba0e11917b6 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -110,7 +110,7 @@ static void ieee80211_get_stats(struct net_device *dev, sta_set_sinfo(sta, &sinfo, false); i = 0; - ADD_STA_STATS(sta); + ADD_STA_STATS(sta->link[0]); data[i++] = sta->sta_state; @@ -136,7 +136,7 @@ static void ieee80211_get_stats(struct net_device *dev, memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; - ADD_STA_STATS(sta); + ADD_STA_STATS(sta->link[0]); } } diff --git a/net/mac80211/he.c b/net/mac80211/he.c index c05af7018f79..1a61f7552edd 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -49,7 +49,7 @@ ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_ break; } - sta->sta.he_6ghz_capa = *he_6ghz_capa; + sta->sta.deflink.he_6ghz_capa = *he_6ghz_capa; } static void ieee80211_he_mcs_disable(__le16 *he_mcs) @@ -110,7 +110,7 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta) { - struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; + struct ieee80211_sta_he_cap *he_cap = &sta->sta.deflink.he_cap; struct ieee80211_sta_he_cap own_he_cap; struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie; u8 he_ppe_size; @@ -153,8 +153,8 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, he_cap->has_he = true; - sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta); - sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); + sta->deflink.cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta); + sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta); if (sband->band == NL80211_BAND_6GHZ && he_6ghz_capa) ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, sta); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 2eb7641f5556..171bd16b13f3 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -243,9 +243,9 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839; apply: - changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap)); + changed = memcmp(&sta->sta.deflink.ht_cap, &ht_cap, sizeof(ht_cap)); - memcpy(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap)); + memcpy(&sta->sta.deflink.ht_cap, &ht_cap, sizeof(ht_cap)); switch (sdata->vif.bss_conf.chandef.width) { default: @@ -264,9 +264,9 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, break; } - sta->sta.bandwidth = bw; + sta->sta.deflink.bandwidth = bw; - 
sta->cur_max_bandwidth = + sta->deflink.cur_max_bandwidth = ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 48e0260f3424..175253769ee4 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -641,7 +641,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; - sta->sta.supp_rates[band] = supp_rates | + sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband, scan_width); return ieee80211_ibss_finish_sta(sta); @@ -1009,7 +1009,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, if (sta) { u32 prev_rates; - prev_rates = sta->sta.supp_rates[band]; + prev_rates = sta->sta.deflink.supp_rates[band]; /* make sure mandatory rates are always added */ scan_width = NL80211_BSS_CHAN_WIDTH_20; if (rx_status->bw == RATE_INFO_BW_5) @@ -1017,13 +1017,13 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, else if (rx_status->bw == RATE_INFO_BW_10) scan_width = NL80211_BSS_CHAN_WIDTH_10; - sta->sta.supp_rates[band] = supp_rates | + sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband, scan_width); - if (sta->sta.supp_rates[band] != prev_rates) { + if (sta->sta.deflink.supp_rates[band] != prev_rates) { ibss_dbg(sdata, "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n", sta->sta.addr, prev_rates, - sta->sta.supp_rates[band]); + sta->sta.deflink.supp_rates[band]); rates_updated = true; } } else { @@ -1047,7 +1047,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, /* we both use HT */ struct ieee80211_ht_cap htcap_ie; struct cfg80211_chan_def chandef; - enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; + enum ieee80211_sta_rx_bandwidth bw = sta->sta.deflink.bandwidth; cfg80211_chandef_create(&chandef, channel, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(elems->ht_operation, &chandef); @@ -1062,7 +1062,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_40) { /* we both use VHT */ struct ieee80211_vht_cap cap_ie; - struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; + struct ieee80211_sta_vht_cap cap = sta->sta.deflink.vht_cap; u32 vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); @@ -1073,11 +1073,11 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, &cap_ie, sta); - if (memcmp(&cap, &sta->sta.vht_cap, sizeof(cap))) + if (memcmp(&cap, &sta->sta.deflink.vht_cap, sizeof(cap))) rates_updated |= true; } - if (bw != sta->sta.bandwidth) + if (bw != sta->sta.deflink.bandwidth) rates_updated |= true; if (!cfg80211_chandef_compatible(&sdata->u.ibss.chandef, @@ -1087,12 +1087,12 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, if (sta && rates_updated) { u32 changed = IEEE80211_RC_SUPP_RATES_CHANGED; - u8 rx_nss = sta->sta.rx_nss; + u8 rx_nss = sta->sta.deflink.rx_nss; /* Force rx_nss recalculation */ - sta->sta.rx_nss = 0; + sta->sta.deflink.rx_nss = 0; rate_control_rate_init(sta); - if (sta->sta.rx_nss != rx_nss) + if (sta->sta.deflink.rx_nss != rx_nss) changed |= IEEE80211_RC_NSS_CHANGED; drv_sta_rc_update(local, sdata, &sta->sta, changed); @@ -1239,7 +1239,7 @@ void 
ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; - sta->sta.supp_rates[band] = supp_rates | + sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband, scan_width); spin_lock(&ifibss->incomplete_lock); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 21549a440b38..e4cd153d9a08 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -257,6 +257,7 @@ struct beacon_data { struct ieee80211_meshconf_ie *meshconf; u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u8 cntdwn_current_counter; + struct cfg80211_mbssid_elems *mbssid_ies; struct rcu_head rcu_head; }; @@ -1078,6 +1079,20 @@ ieee80211_vif_get_shift(struct ieee80211_vif *vif) return shift; } +static inline int +ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems) +{ + int i, len = 0; + + if (!elems) + return 0; + + for (i = 0; i < elems->cnt; i++) + len += elems->elem[i].len; + + return len; +} + enum { IEEE80211_RX_MSG = 1, IEEE80211_TX_STATUS_MSG = 2, @@ -2002,7 +2017,7 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted, - u64 *cookie); + int link_id, u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); void ieee80211_resort_txq(struct ieee80211_hw *hw, diff --git a/net/mac80211/key.c b/net/mac80211/key.c index f695fc80088b..0fcf8aebedc4 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -476,7 +476,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) _ieee80211_set_tx_key(new, true); } else { - rcu_assign_pointer(sta->gtk[idx], new); + rcu_assign_pointer(sta->deflink.gtk[idx], new); } /* Only needed for transition from no key -> key. 
* Still triggers unnecessary when using Extended Key ID @@ -826,7 +826,8 @@ int ieee80211_key_link(struct ieee80211_key *key, (old_key && old_key->conf.cipher != key->conf.cipher)) goto out; } else if (sta) { - old_key = key_mtx_dereference(sdata->local, sta->gtk[idx]); + old_key = key_mtx_dereference(sdata->local, + sta->deflink.gtk[idx]); } else { old_key = key_mtx_dereference(sdata->local, sdata->keys[idx]); } @@ -1076,8 +1077,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, int i; mutex_lock(&local->key_mtx); - for (i = 0; i < ARRAY_SIZE(sta->gtk); i++) { - key = key_mtx_dereference(local, sta->gtk[i]); + for (i = 0; i < ARRAY_SIZE(sta->deflink.gtk); i++) { + key = key_mtx_dereference(local, sta->deflink.gtk[i]); if (!key) continue; ieee80211_key_replace(key->sdata, key->sta, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 44a6fdb6efbd..58ebdcd69d05 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -310,7 +310,7 @@ void ieee80211s_update_metric(struct ieee80211_local *local, LINK_FAIL_THRESH) mesh_plink_broken(sta); - sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo); + sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate, &rinfo); ewma_mesh_tx_rate_avg_add(&sta->mesh->tx_rate_avg, cfg80211_calculate_bitrate(&rinfo)); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index a829470dd59e..42ba7424589e 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -61,8 +61,8 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, s32 rssi_threshold = sdata->u.mesh.mshcfg.rssi_threshold; return rssi_threshold == 0 || (sta && - (s8)-ewma_signal_read(&sta->rx_stats_avg.signal) > - rssi_threshold); + (s8)-ewma_signal_read(&sta->deflink.rx_stats_avg.signal) > + rssi_threshold); } /** @@ -125,7 +125,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) continue; short_slot = false; - if (erp_rates & sta->sta.supp_rates[sband->band]) + if (erp_rates & sta->sta.deflink.supp_rates[sband->band]) short_slot = true; else break; @@ -175,10 +175,10 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) sta->mesh->plink_state != NL80211_PLINK_ESTAB) continue; - if (sta->sta.bandwidth > IEEE80211_STA_RX_BW_20) + if (sta->sta.deflink.bandwidth > IEEE80211_STA_RX_BW_20) continue; - if (!sta->sta.ht_cap.ht_supported) { + if (!sta->sta.deflink.ht_cap.ht_supported) { mpl_dbg(sdata, "nonHT sta (%pM) is present\n", sta->sta.addr); non_ht_sta = true; @@ -415,7 +415,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; u32 rates, basic_rates = 0, changed = 0; - enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; + enum ieee80211_sta_rx_bandwidth bw = sta->sta.deflink.bandwidth; sband = ieee80211_get_sband(sdata); if (!sband) @@ -425,7 +425,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, &basic_rates); spin_lock_bh(&sta->mesh->plink_lock); - sta->rx_stats.last_rx = jiffies; + sta->deflink.rx_stats.last_rx = jiffies; /* rates and capabilities don't change during peering */ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB && @@ -433,9 +433,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, goto out; sta->mesh->processed_beacon = true; - if (sta->sta.supp_rates[sband->band] != rates) + if (sta->sta.deflink.supp_rates[sband->band] != rates) changed |= IEEE80211_RC_SUPP_RATES_CHANGED; - 
sta->sta.supp_rates[sband->band] = rates; + sta->sta.deflink.supp_rates[sband->band] = rates; if (ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, elems->ht_cap_elem, sta)) @@ -449,16 +449,16 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, elems->he_6ghz_capa, sta); - if (bw != sta->sta.bandwidth) + if (bw != sta->sta.deflink.bandwidth) changed |= IEEE80211_RC_BW_CHANGED; /* HT peer is operating 20MHz-only */ if (elems->ht_operation && !(elems->ht_operation->ht_param & IEEE80211_HT_PARAM_CHAN_WIDTH_ANY)) { - if (sta->sta.bandwidth != IEEE80211_STA_RX_BW_20) + if (sta->sta.deflink.bandwidth != IEEE80211_STA_RX_BW_20) changed |= IEEE80211_RC_BW_CHANGED; - sta->sta.bandwidth = IEEE80211_STA_RX_BW_20; + sta->sta.deflink.bandwidth = IEEE80211_STA_RX_BW_20; } if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index cc6d38a2e6d5..b8620d57f191 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1240,7 +1240,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata) return; } - cfg80211_ch_switch_notify(sdata->dev, &sdata->reserved_chandef); + cfg80211_ch_switch_notify(sdata->dev, &sdata->reserved_chandef, 0); } void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success) @@ -1441,7 +1441,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, IEEE80211_QUEUE_STOP_REASON_CSA); mutex_unlock(&local->mtx); - cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chandef, + cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chandef, 0, csa_ie.count, csa_ie.mode); if (local->ops->channel_switch) { @@ -1502,6 +1502,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, fallthrough; case NL80211_BAND_2GHZ: case NL80211_BAND_60GHZ: + case NL80211_BAND_LC: chan_increment = 1; break; case NL80211_BAND_5GHZ: @@ -2876,8 +2877,13 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, ieee80211_vif_release_channel(sdata); mutex_unlock(&sdata->local->mtx); - if (abandon) - cfg80211_abandon_assoc(sdata->dev, assoc_data->bss); + if (abandon) { + struct cfg80211_assoc_failure data = { + .bss[0] = assoc_data->bss, + }; + + cfg80211_assoc_failure(sdata->dev, &data); + } } kfree(assoc_data); @@ -3254,7 +3260,7 @@ static bool ieee80211_twt_req_supported(const struct sta_info *sta, if (!(elems->ext_capab[9] & WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT)) return false; - return sta->sta.he_cap.he_cap_elem.mac_cap_info[0] & + return sta->sta.deflink.he_cap.he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_RES; } @@ -3281,7 +3287,7 @@ static bool ieee80211_twt_bcast_support(struct ieee80211_sub_if_data *sdata, ieee80211_vif_type_p2p(&sdata->vif)); return bss_conf->he_support && - (sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + (sta->sta.deflink.he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BCAST_TWT) && own_he_cap && (own_he_cap->he_cap_elem.mac_cap_info[2] & @@ -3499,7 +3505,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, elems->he_6ghz_capa, sta); - bss_conf->he_support = sta->sta.he_cap.has_he; + bss_conf->he_support = sta->sta.deflink.he_cap.has_he; if (elems->rsnx && elems->rsnx_len && (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && wiphy_ext_feature_isset(local->hw.wiphy, @@ -3581,7 +3587,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; - 
sta->sta.rx_nss = nss; + sta->sta.deflink.rx_nss = nss; } rate_control_rate_init(sta); @@ -3687,7 +3693,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; u16 capab_info, status_code, aid; struct ieee802_11_elems *elems; - int ac, uapsd_queues = -1; + int ac; u8 *pos; bool reassoc; struct cfg80211_bss *cbss; @@ -3696,6 +3702,9 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, .u.mlme.data = ASSOC_EVENT, }; struct ieee80211_prep_tx_info info = {}; + struct cfg80211_rx_assoc_resp resp = { + .uapsd_queues = -1, + }; sdata_assert_lock(sdata); @@ -3774,8 +3783,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } else { if (!ieee80211_assoc_success(sdata, cbss, mgmt, len, elems)) { /* oops -- internal error -- send timeout for now */ + struct cfg80211_assoc_failure data = { + .timeout = true, + .bss[0] = cbss, + }; ieee80211_destroy_assoc_data(sdata, false, false); - cfg80211_assoc_timeout(sdata->dev, cbss); + cfg80211_assoc_failure(sdata->dev, &data); goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; @@ -3790,16 +3803,20 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, ieee80211_destroy_assoc_data(sdata, true, false); /* get uapsd queues configuration */ - uapsd_queues = 0; + resp.uapsd_queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (sdata->tx_conf[ac].uapsd) - uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; + resp.uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; info.success = 1; } - cfg80211_rx_assoc_resp(sdata->dev, cbss, (u8 *)mgmt, len, uapsd_queues, - ifmgd->assoc_req_ies, ifmgd->assoc_req_ies_len); + resp.links[0].bss = cbss; + resp.buf = (u8 *)mgmt; + resp.len = len; + resp.req_ies = ifmgd->assoc_req_ies; + resp.req_ies_len = ifmgd->assoc_req_ies_len; + cfg80211_rx_assoc_resp(sdata->dev, &resp); notify_driver: drv_mgd_complete_tx(sdata->local, sdata, &info); kfree(elems); @@ -4632,9 +4649,13 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) .u.mlme.data = ASSOC_EVENT, .u.mlme.status = MLME_TIMEOUT, }; + struct cfg80211_assoc_failure data = { + .bss[0] = bss, + .timeout = true, + }; ieee80211_destroy_assoc_data(sdata, false, false); - cfg80211_assoc_timeout(sdata->dev, bss); + cfg80211_assoc_failure(sdata->dev, &data); drv_event_callback(sdata->local, sdata, &event); } } else if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started) @@ -4734,9 +4755,9 @@ static void ieee80211_sta_conn_mon_timer(struct timer_list *t) if (!sta) return; - timeout = sta->status_stats.last_ack; - if (time_before(sta->status_stats.last_ack, sta->rx_stats.last_rx)) - timeout = sta->rx_stats.last_rx; + timeout = sta->deflink.status_stats.last_ack; + if (time_before(sta->deflink.status_stats.last_ack, sta->deflink.rx_stats.last_rx)) + timeout = sta->deflink.rx_stats.last_rx; timeout += IEEE80211_CONNECTION_IDLE_TIME; /* If timeout is after now, then update timer to fire at @@ -5311,7 +5332,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, } if (rates) - new_sta->sta.supp_rates[cbss->channel->band] = rates; + new_sta->sta.deflink.supp_rates[cbss->channel->band] = rates; else sdata_info(sdata, "No rates found, keeping mandatory only\n"); @@ -5952,20 +5973,16 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, u8 bssid[ETH_ALEN]; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; - /* - * cfg80211 should catch this ... 
but it's racy since - * we can receive a disassoc frame, process it, hand it - * to cfg80211 while that's in a locked section already - * trying to tell us that the user wants to disconnect. - */ - if (ifmgd->associated != req->bss) - return -ENOLINK; + if (!ifmgd->associated || + memcmp(ifmgd->associated->bssid, req->ap_addr, ETH_ALEN)) + return -ENOTCONN; sdata_info(sdata, "disassociating from %pM by local choice (Reason: %u=%s)\n", - req->bss->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); + req->ap_addr, req->reason_code, + ieee80211_get_reason_code_string(req->reason_code)); - memcpy(bssid, req->bss->bssid, ETH_ALEN); + memcpy(bssid, req->ap_addr, ETH_ALEN); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DISASSOC, req->reason_code, !req->local_state_change, frame_buf); @@ -5995,8 +6012,13 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) sdata_lock(sdata); if (ifmgd->assoc_data) { struct cfg80211_bss *bss = ifmgd->assoc_data->bss; + struct cfg80211_assoc_failure data = { + .bss[0] = bss, + .timeout = true, + }; + ieee80211_destroy_assoc_data(sdata, false, false); - cfg80211_assoc_timeout(sdata->dev, bss); + cfg80211_assoc_failure(sdata->dev, &data); } if (ifmgd->auth_data) ieee80211_destroy_auth_data(sdata, false);
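The mlme.c changes above replace the single-BSS notification calls with struct-based reporting, another MLO preparation: cfg80211_assoc_timeout(dev, bss) becomes cfg80211_assoc_failure(dev, &data), where the struct carries a per-link bss[] array plus a timeout flag, and cfg80211_rx_assoc_resp() likewise takes a struct whose links[] array can later describe every link of an MLD association. A minimal sketch of the reporting pattern as this patch uses it; the struct layout is inferred from the call sites and trimmed, not copied from the cfg80211 header:

/* Inferred from the callers above; fields trimmed. */
struct cfg80211_assoc_failure_sketch {
	struct cfg80211_bss *bss[IEEE80211_MLD_MAX_NUM_LINKS];
	bool timeout; /* true: no response; false: AP rejected us */
};

static void report_assoc_timeout(struct net_device *dev,
				 struct cfg80211_bss *bss)
{
	struct cfg80211_assoc_failure data = {
		.bss[0] = bss, /* pre-MLO: only link 0 is populated */
		.timeout = true,
	};

	cfg80211_assoc_failure(dev, &data);
}

diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index 7c1a735b9eee..f97cb4c453d3 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -74,7 +74,7 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, /* Add only mandatory rates for now */ sband = local->hw.wiphy->bands[band]; - sta->sta.supp_rates[band] = + sta->sta.deflink.supp_rates[band] = ieee80211_mandatory_rates(sband, scan_width); spin_lock(&ifocb->incomplete_lock); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 8c6416129d5b..ae9700e0a1a5 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -371,7 +371,7 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, WARN_ONCE(i == sband->n_bitrates, "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n", sta ? sta->addr : NULL, - sta ? 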
sta->deflink.supp_rates[sband->band] : -1, sband->band, rate_mask, rate_flags); @@ -781,11 +781,11 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, u16 sta_vht_mask[NL80211_VHT_NSS_MAX]; /* Filter out rates that the STA does not support */ - *mask &= sta->supp_rates[sband->band]; + *mask &= sta->deflink.supp_rates[sband->band]; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) - mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; + mcs_mask[i] &= sta->deflink.ht_cap.mcs.rx_mask[i]; - sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map; + sta_vht_cap = sta->deflink.vht_cap.vht_mcs.rx_mcs_map; ieee80211_get_vht_mask_from_cap(sta_vht_cap, sta_vht_mask); for (i = 0; i < NL80211_VHT_NSS_MAX; i++) vht_mask[i] &= sta_vht_mask[i]; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 90238170dec3..435872a9607d 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -608,7 +608,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) int tmp_max_streams, group, tmp_idx, tmp_prob; int tmp_tp = 0; - if (!mi->sta->ht_cap.ht_supported) + if (!mi->sta->deflink.ht_cap.ht_supported) return; group = MI_RATE_GROUP(mi->max_tp_rate[0]); @@ -998,7 +998,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; u16 tmp_legacy_tp_rate[MAX_THR_RATES], tmp_max_prob_rate; u16 index; - bool ht_supported = mi->sta->ht_cap.ht_supported; + bool ht_supported = mi->sta->deflink.ht_cap.ht_supported; if (mi->ampdu_packets > 0) { if (!ieee80211_hw_check(mp->hw, TX_STATUS_NO_AMPDU_LEN)) @@ -1421,7 +1421,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) * the limit here to avoid the complexity of having to de-aggregate * packets in the queue. 
*/ - if (!mi->sta->vht_cap.vht_supported) + if (!mi->sta->deflink.vht_cap.vht_supported) return IEEE80211_MAX_MPDU_LEN_HT_BA; /* unlimited */ @@ -1538,7 +1538,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (sband->band != NL80211_BAND_2GHZ) return; - if (sta->ht_cap.ht_supported && + if (sta->deflink.ht_cap.ht_supported && !ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) return; @@ -1561,7 +1561,7 @@ minstrel_ht_update_ofdm(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, const u8 *rates; int i; - if (sta->ht_cap.ht_supported) + if (sta->deflink.ht_cap.ht_supported) return; rates = mp->ofdm_rates[sband->band]; @@ -1581,9 +1581,9 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, { struct minstrel_priv *mp = priv; struct minstrel_ht_sta *mi = priv_sta; - struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; - u16 ht_cap = sta->ht_cap.cap; - struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; + struct ieee80211_mcs_info *mcs = &sta->deflink.ht_cap.mcs; + u16 ht_cap = sta->deflink.ht_cap.cap; + struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; const struct ieee80211_rate *ctl_rate; bool ldpc, erp; int use_vht; @@ -1655,7 +1655,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, } if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH && - sta->bandwidth < IEEE80211_STA_RX_BW_40) + sta->deflink.bandwidth < IEEE80211_STA_RX_BW_40) continue; nss = minstrel_mcs_groups[i].streams; @@ -1682,7 +1682,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, continue; if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) { - if (sta->bandwidth < IEEE80211_STA_RX_BW_80 || + if (sta->deflink.bandwidth < IEEE80211_STA_RX_BW_80 || ((gflags & IEEE80211_TX_RC_SHORT_GI) && !(vht_cap->cap & IEEE80211_VHT_CAP_SHORT_GI_80))) { continue; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 175ead6b19cb..177ae2bb5c36 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -221,7 +221,7 @@ static void __ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, skb_queue_tail(&sdata->skb_queue, skb); ieee80211_queue_work(&sdata->local->hw, &sdata->work); if (sta) - sta->rx_stats.packets++; + sta->deflink.rx_stats.packets++; } static void ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, @@ -1459,7 +1459,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) if (unlikely(ieee80211_has_retry(hdr->frame_control) && rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) { I802_DEBUG_INC(rx->local->dot11FrameDuplicateCount); - rx->sta->rx_stats.num_duplicates++; + rx->sta->deflink.rx_stats.num_duplicates++; return RX_DROP_UNUSABLE; } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) { rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl; @@ -1755,46 +1755,47 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) NL80211_IFTYPE_ADHOC); if (ether_addr_equal(bssid, rx->sdata->u.ibss.bssid) && test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { - sta->rx_stats.last_rx = jiffies; + sta->deflink.rx_stats.last_rx = jiffies; if (ieee80211_is_data(hdr->frame_control) && !is_multicast_ether_addr(hdr->addr1)) - sta->rx_stats.last_rate = + sta->deflink.rx_stats.last_rate = sta_stats_encode_rate(status); } } else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) { - sta->rx_stats.last_rx = jiffies; + sta->deflink.rx_stats.last_rx = jiffies; } else if (!ieee80211_is_s1g_beacon(hdr->frame_control) && !is_multicast_ether_addr(hdr->addr1)) { /* * Mesh beacons will update last_rx when if they are 
found to * match the current local configuration when processed. */ - sta->rx_stats.last_rx = jiffies; + sta->deflink.rx_stats.last_rx = jiffies; if (ieee80211_is_data(hdr->frame_control)) - sta->rx_stats.last_rate = sta_stats_encode_rate(status); + sta->deflink.rx_stats.last_rate = sta_stats_encode_rate(status); } - sta->rx_stats.fragments++; + sta->deflink.rx_stats.fragments++; - u64_stats_update_begin(&rx->sta->rx_stats.syncp); - sta->rx_stats.bytes += rx->skb->len; - u64_stats_update_end(&rx->sta->rx_stats.syncp); + u64_stats_update_begin(&rx->sta->deflink.rx_stats.syncp); + sta->deflink.rx_stats.bytes += rx->skb->len; + u64_stats_update_end(&rx->sta->deflink.rx_stats.syncp); if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { - sta->rx_stats.last_signal = status->signal; - ewma_signal_add(&sta->rx_stats_avg.signal, -status->signal); + sta->deflink.rx_stats.last_signal = status->signal; + ewma_signal_add(&sta->deflink.rx_stats_avg.signal, + -status->signal); } if (status->chains) { - sta->rx_stats.chains = status->chains; + sta->deflink.rx_stats.chains = status->chains; for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) { int signal = status->chain_signal[i]; if (!(status->chains & BIT(i))) continue; - sta->rx_stats.chain_signal_last[i] = signal; - ewma_signal_add(&sta->rx_stats_avg.chain_signal[i], + sta->deflink.rx_stats.chain_signal_last[i] = signal; + ewma_signal_add(&sta->deflink.rx_stats_avg.chain_signal[i], -signal); } } @@ -1855,7 +1856,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) * Update counter and free packet here to avoid * counting this as a dropped packed. */ - sta->rx_stats.packets++; + sta->deflink.rx_stats.packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; } @@ -1887,11 +1888,11 @@ ieee80211_rx_get_bigtk(struct ieee80211_rx_data *rx, int idx) } if (rx->sta) - key = rcu_dereference(rx->sta->gtk[idx]); + key = rcu_dereference(rx->sta->deflink.gtk[idx]); if (!key) key = rcu_dereference(sdata->keys[idx]); if (!key && rx->sta) - key = rcu_dereference(rx->sta->gtk[idx2]); + key = rcu_dereference(rx->sta->deflink.gtk[idx2]); if (!key) key = rcu_dereference(sdata->keys[idx2]); @@ -2007,7 +2008,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) test_sta_flag(rx->sta, WLAN_STA_MFP)) return RX_DROP_MONITOR; - rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]); + rx->key = rcu_dereference(rx->sta->deflink.gtk[mmie_keyidx]); } if (!rx->key) rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); @@ -2030,7 +2031,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) } else { if (rx->sta) { for (i = 0; i < NUM_DEFAULT_KEYS; i++) { - key = rcu_dereference(rx->sta->gtk[i]); + key = rcu_dereference(rx->sta->deflink.gtk[i]); if (key) break; } @@ -2067,7 +2068,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* check per-station GTK first, if multicast packet */ if (is_multicast_ether_addr(hdr->addr1) && rx->sta) - rx->key = rcu_dereference(rx->sta->gtk[keyidx]); + rx->key = rcu_dereference(rx->sta->deflink.gtk[keyidx]); /* if not found, try default key */ if (!rx->key) { @@ -2394,7 +2395,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) out: ieee80211_led_rx(rx->local); if (rx->sta) - rx->sta->rx_stats.packets++; + rx->sta->deflink.rx_stats.packets++; return RX_CONTINUE; } @@ -2641,9 +2642,9 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) * for non-QoS-data frames. Here we know it's a data * frame, so count MSDUs. 
*/ - u64_stats_update_begin(&rx->sta->rx_stats.syncp); - rx->sta->rx_stats.msdu[rx->seqno_idx]++; - u64_stats_update_end(&rx->sta->rx_stats.syncp); + u64_stats_update_begin(&rx->sta->deflink.rx_stats.syncp); + rx->sta->deflink.rx_stats.msdu[rx->seqno_idx]++; + u64_stats_update_end(&rx->sta->deflink.rx_stats.syncp); } if ((sdata->vif.type == NL80211_IFTYPE_AP || @@ -3301,7 +3302,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.category) { case WLAN_CATEGORY_HT: /* reject HT action frames from stations not supporting HT */ - if (!rx->sta->sta.ht_cap.ht_supported) + if (!rx->sta->sta.deflink.ht_cap.ht_supported) goto invalid; if (sdata->vif.type != NL80211_IFTYPE_STATION && @@ -3365,7 +3366,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) struct sta_opmode_info sta_opmode = {}; /* If it doesn't support 40 MHz it can't change ... */ - if (!(rx->sta->sta.ht_cap.cap & + if (!(rx->sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) goto handled; @@ -3375,13 +3376,13 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) max_bw = ieee80211_sta_cap_rx_bw(rx->sta); /* set cur_max_bandwidth and recalc sta bw */ - rx->sta->cur_max_bandwidth = max_bw; + rx->sta->deflink.cur_max_bandwidth = max_bw; new_bw = ieee80211_sta_cur_vht_bw(rx->sta); - if (rx->sta->sta.bandwidth == new_bw) + if (rx->sta->sta.deflink.bandwidth == new_bw) goto handled; - rx->sta->sta.bandwidth = new_bw; + rx->sta->sta.deflink.bandwidth = new_bw; sband = rx->local->hw.wiphy->bands[status->band]; sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(rx->sta); @@ -3578,7 +3579,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) handled: if (rx->sta) - rx->sta->rx_stats.packets++; + rx->sta->deflink.rx_stats.packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; @@ -3612,7 +3613,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) ieee80211_rx_status_to_khz(status), sig, rx->skb->data, rx->skb->len, 0)) { if (rx->sta) - rx->sta->rx_stats.packets++; + rx->sta->deflink.rx_stats.packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; } @@ -3650,7 +3651,7 @@ ieee80211_rx_h_action_post_userspace(struct ieee80211_rx_data *rx) handled: if (rx->sta) - rx->sta->rx_stats.packets++; + rx->sta->deflink.rx_stats.packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; } @@ -3870,7 +3871,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, case RX_DROP_MONITOR: I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); if (rx->sta) - rx->sta->rx_stats.dropped++; + rx->sta->deflink.rx_stats.dropped++; fallthrough; case RX_CONTINUE: { struct ieee80211_rate *rate = NULL; @@ -3889,7 +3890,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, case RX_DROP_UNUSABLE: I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); if (rx->sta) - rx->sta->rx_stats.dropped++; + rx->sta->deflink.rx_stats.dropped++; dev_kfree_skb(rx->skb); break; case RX_QUEUED: @@ -4441,15 +4442,15 @@ static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, void *sa = skb->data + ETH_ALEN; void *da = skb->data; - stats = &sta->rx_stats; + stats = &sta->deflink.rx_stats; if (fast_rx->uses_rss) - stats = this_cpu_ptr(sta->pcpu_rx_stats); + stats = this_cpu_ptr(sta->deflink.pcpu_rx_stats); /* statistics part of ieee80211_rx_h_sta_process() */ if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { stats->last_signal = status->signal; if (!fast_rx->uses_rss) - ewma_signal_add(&sta->rx_stats_avg.signal, + ewma_signal_add(&sta->deflink.rx_stats_avg.signal, -status->signal); } @@ -4465,7 +4466,7 @@ static 
void ieee80211_rx_8023(struct ieee80211_rx_data *rx, stats->chain_signal_last[i] = signal; if (!fast_rx->uses_rss) - ewma_signal_add(&sta->rx_stats_avg.chain_signal[i], + ewma_signal_add(&sta->deflink.rx_stats_avg.chain_signal[i], -signal); } } @@ -4541,7 +4542,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, u8 da[ETH_ALEN]; u8 sa[ETH_ALEN]; } addrs __aligned(2); - struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; + struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write * to a common data structure; drivers can implement that per queue @@ -4630,6 +4631,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* do the header conversion - first grab the addresses */ ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs); ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs); + skb_postpull_rcsum(skb, skb->data + snap_offs, + sizeof(rfc1042_header) + 2); /* remove the SNAP but leave the ethertype */ skb_pull(skb, snap_offs + sizeof(rfc1042_header)); /* push the addresses in front */ @@ -4641,7 +4644,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, drop: dev_kfree_skb(skb); if (fast_rx->uses_rss) - stats = this_cpu_ptr(sta->pcpu_rx_stats); + stats = this_cpu_ptr(sta->deflink.pcpu_rx_stats); stats->dropped++; return true; diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index 10b34bc4b67d..c1f964e9991c 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -11,8 +11,8 @@ void ieee80211_s1g_sta_rate_init(struct sta_info *sta) { /* avoid indicating legacy bitrates for S1G STAs */ - sta->tx_stats.last_rate.flags |= IEEE80211_TX_RC_S1G_MCS; - sta->rx_stats.last_rate = + sta->deflink.tx_stats.last_rate.flags |= IEEE80211_TX_RC_S1G_MCS; + sta->deflink.rx_stats.last_rate = STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_S1G); }
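The sta_info.c hunk below is the anchor for the pervasive s/sta->X/sta->deflink.X/ renames seen throughout this patch (aggregation, keys, rx statistics, rate control, debugfs): per-link station state is being split out for multi-link operation, with deflink as the embedded link-0 instance and link[0] pointed at it so pre-MLO code paths keep working without extra allocation. A heavily trimmed sketch of the shape this assumes; see sta_info.h in this tree for the real definitions:

/* Sketch only; field lists trimmed to the ones touched in this diff. */
struct link_sta_info {
	struct ieee80211_sta_rx_stats rx_stats;
	struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats;
	/* gtk[], tx_stats, status_stats, cur_max_bandwidth,
	 * rx_stats_avg, ht/vht/he capabilities, ...
	 */
};

struct sta_info {
	/* per-STA (MLD-wide) fields: addr, mesh, ampdu_mlme, ... */
	struct link_sta_info deflink; /* storage for link 0 */
	struct link_sta_info *link[IEEE80211_MLD_MAX_NUM_LINKS];
};

/* sta_info_alloc() below then does:
 *	sta->link[0] = &sta->deflink;
 *	sta->sta.link[0] = &sta->sta.deflink;
 * so non-MLO paths may reach the same storage under either name,
 * which is what the ethtool hunk's ADD_STA_STATS(sta->link[0]) and
 * the deflink renames elsewhere rely on.
 */

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f1e263b2c295..77806b2fd014 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -287,7 +287,7 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif - free_percpu(sta->pcpu_rx_stats); + free_percpu(sta->deflink.pcpu_rx_stats); kfree(sta); } @@ -346,9 +346,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, return NULL; if (ieee80211_hw_check(hw, USES_RSS)) { - sta->pcpu_rx_stats = + sta->deflink.pcpu_rx_stats = alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp); - if (!sta->pcpu_rx_stats) + if (!sta->deflink.pcpu_rx_stats) goto free; } @@ -377,6 +377,14 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sta.max_rx_aggregation_subframes = local->hw.max_rx_aggregation_subframes; + /* TODO link specific alloc and assignments for MLO Link STA */ + + /* For non MLO STA, link info can be accessed either via deflink + * or link[0] + */ + sta->link[0] = &sta->deflink; + sta->sta.link[0] = &sta->sta.deflink; + /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only. * The Tx path starts to use a key as soon as the key slot ptk_idx * references to is not NULL. 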
To not use the initial Rx-only key @@ -388,9 +396,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->local = local; sta->sdata = sdata; - sta->rx_stats.last_rx = jiffies; + sta->deflink.rx_stats.last_rx = jiffies; - u64_stats_init(&sta->rx_stats.syncp); + u64_stats_init(&sta->deflink.rx_stats.syncp); ieee80211_init_frag_cache(&sta->frags); @@ -400,10 +408,10 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->reserved_tid = IEEE80211_TID_UNRESERVED; sta->last_connected = ktime_get_seconds(); - ewma_signal_init(&sta->rx_stats_avg.signal); - ewma_avg_signal_init(&sta->status_stats.avg_ack_signal); - for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++) - ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]); + ewma_signal_init(&sta->deflink.rx_stats_avg.signal); + ewma_avg_signal_init(&sta->deflink.status_stats.avg_ack_signal); + for (i = 0; i < ARRAY_SIZE(sta->deflink.rx_stats_avg.chain_signal); i++) + ewma_signal_init(&sta->deflink.rx_stats_avg.chain_signal[i]); if (local->ops->wake_tx_queue) { void *txq_data; @@ -444,6 +452,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, switch (i) { case NL80211_BAND_2GHZ: + case NL80211_BAND_LC: /* * We use both here, even if we cannot really know for * sure the station will support both, but the only use @@ -472,7 +481,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (!(rate->flags & mandatory)) continue; - sta->sta.supp_rates[i] |= BIT(r); + sta->sta.deflink.supp_rates[i] |= BIT(r); } } @@ -522,7 +531,7 @@ free_txq: if (sta->sta.txq[0]) kfree(to_txq_info(sta->sta.txq[0])); free: - free_percpu(sta->pcpu_rx_stats); + free_percpu(sta->deflink.pcpu_rx_stats); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif @@ -2082,16 +2091,16 @@ int sta_info_move_state(struct sta_info *sta, u8 sta_info_tx_streams(struct sta_info *sta) { - struct ieee80211_sta_ht_cap *ht_cap = &sta->sta.ht_cap; + struct ieee80211_sta_ht_cap *ht_cap = &sta->sta.deflink.ht_cap; u8 rx_streams; - if (!sta->sta.ht_cap.ht_supported) + if (!sta->sta.deflink.ht_cap.ht_supported) return 1; - if (sta->sta.vht_cap.vht_supported) { + if (sta->sta.deflink.vht_cap.vht_supported) { int i; u16 tx_mcs_map = - le16_to_cpu(sta->sta.vht_cap.vht_mcs.tx_mcs_map); + le16_to_cpu(sta->sta.deflink.vht_cap.vht_mcs.tx_mcs_map); for (i = 7; i >= 0; i--) if ((tx_mcs_map & (0x3 << (i * 2))) != @@ -2118,16 +2127,16 @@ u8 sta_info_tx_streams(struct sta_info *sta) static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { - struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; + struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; int cpu; - if (!sta->pcpu_rx_stats) + if (!sta->deflink.pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; - cpustats = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; @@ -2221,13 +2230,15 @@ static void sta_set_tidstats(struct sta_info *sta, int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { - tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->rx_stats, tid); + tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats, + tid); - if (sta->pcpu_rx_stats) { + if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; - cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + cpurxs = 
per_cpu_ptr(sta->deflink.pcpu_rx_stats, + cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); } @@ -2238,19 +2249,19 @@ static void sta_set_tidstats(struct sta_info *sta, if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); - tidstats->tx_msdu = sta->tx_stats.msdu[tid]; + tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); - tidstats->tx_msdu_retries = sta->status_stats.msdu_retries[tid]; + tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); - tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid]; + tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; } if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) { @@ -2321,26 +2332,27 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) - sinfo->tx_bytes += sta->tx_stats.bytes[ac]; + sinfo->tx_bytes += sta->deflink.tx_stats.bytes[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) - sinfo->tx_packets += sta->tx_stats.packets[ac]; + sinfo->tx_packets += sta->deflink.tx_stats.packets[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { - sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); + sinfo->rx_bytes += sta_get_stats_bytes(&sta->deflink.rx_stats); - if (sta->pcpu_rx_stats) { + if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; - cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, + cpu); sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } @@ -2349,12 +2361,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { - sinfo->rx_packets = sta->rx_stats.packets; - if (sta->pcpu_rx_stats) { + sinfo->rx_packets = sta->deflink.rx_stats.packets; + if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; - cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, + cpu); sinfo->rx_packets += cpurxs->packets; } } @@ -2362,12 +2375,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { - sinfo->tx_retries = sta->status_stats.retry_count; + sinfo->tx_retries = sta->deflink.status_stats.retry_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { - sinfo->tx_failed = sta->status_stats.retry_failed; + sinfo->tx_failed = sta->deflink.status_stats.retry_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } @@ -2388,12 +2401,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } - sinfo->rx_dropped_misc 
= sta->rx_stats.dropped; - if (sta->pcpu_rx_stats) { + sinfo->rx_dropped_misc = sta->deflink.rx_stats.dropped; + if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; - cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_dropped_misc += cpurxs->dropped; } } @@ -2412,10 +2425,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } - if (!sta->pcpu_rx_stats && + if (!sta->deflink.pcpu_rx_stats && !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = - -ewma_signal_read(&sta->rx_stats_avg.signal); + -ewma_signal_read(&sta->deflink.rx_stats_avg.signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } @@ -2428,7 +2441,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); - if (!sta->pcpu_rx_stats) + if (!sta->deflink.pcpu_rx_stats) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; @@ -2437,12 +2450,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = - -ewma_signal_read(&sta->rx_stats_avg.chain_signal[i]); + -ewma_signal_read(&sta->deflink.rx_stats_avg.chain_signal[i]); } } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { - sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, + sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate, &sinfo->txrate); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } @@ -2524,16 +2537,16 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && - sta->status_stats.ack_signal_filled) { - sinfo->ack_signal = sta->status_stats.last_ack_signal; + sta->deflink.status_stats.ack_signal_filled) { + sinfo->ack_signal = sta->deflink.status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && - sta->status_stats.ack_signal_filled) { + sta->deflink.status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( - &sta->status_stats.avg_ack_signal); + &sta->deflink.status_stats.avg_ack_signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } @@ -2568,10 +2581,10 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); - if (!sta->status_stats.last_ack || - time_after(stats->last_rx, sta->status_stats.last_ack)) + if (!sta->deflink.status_stats.last_ack || + time_after(stats->last_rx, sta->deflink.status_stats.last_ack)) return stats->last_rx; - return sta->status_stats.last_ack; + return sta->deflink.status_stats.last_ack; } static void sta_update_codel_params(struct sta_info *sta, u32 thr) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index e7443fc4669c..7cca133786af 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -482,6 +482,86 @@ struct ieee80211_fragment_cache { */ #define STA_SLOW_THRESHOLD 6000 /* 6 Mbps */ +/** + * struct link_sta_info - Link STA information + * All link specific sta info are stored here for reference. 
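The signal_avg and chain_signal_avg values filled in above are read out of averages that mac80211 generates with DECLARE_EWMA(); note the sign convention, where ewma_signal_add() is fed -signal so the accumulator stays positive and the read is negated back. A hedged userspace sketch of the underlying exponentially weighted moving average follows; the precision and weight are arbitrary choices for the example, not the values from sta_info.h:

#include <stdio.h>

#define EWMA_PREC	10	/* fixed point: value stored << 10 */
#define EWMA_WSHIFT	3	/* each new sample weighted 1/8 */

struct ewma {
	unsigned long internal;
};

static void ewma_add(struct ewma *e, unsigned long val)
{
	unsigned long scaled = val << EWMA_PREC;

	/* avg = avg - avg/8 + sample/8; the first sample seeds the average */
	e->internal = e->internal
		? e->internal - (e->internal >> EWMA_WSHIFT) + (scaled >> EWMA_WSHIFT)
		: scaled;
}

static unsigned long ewma_read(const struct ewma *e)
{
	return e->internal >> EWMA_PREC;
}

int main(void)
{
	struct ewma avg = { 0 };
	/* -signal samples in dBm, as fed by ewma_signal_add(..., -signal) */
	unsigned long samples[] = { 40, 42, 38, 41 };
	int i;

	for (i = 0; i < 4; i++)
		ewma_add(&avg, samples[i]);
	printf("signal_avg ~ -%lu dBm\n", ewma_read(&avg));
	return 0;
}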
This can be + * a single entry for non-MLD STA or multiple entries for MLD STA + * @addr: Link MAC address - Can be same as MLD STA mac address and is always + * same for non-MLD STA. This is used as key for searching link STA + * @link_id: Link ID uniquely identifying the link STA. This is 0 for non-MLD + * and set to the corresponding vif LinkId for MLD STA + * @sta: Points to the STA info + * @gtk: group keys negotiated with this station, if any + * @tx_stats: TX statistics + * @tx_stats.packets: # of packets transmitted + * @tx_stats.bytes: # of bytes in all packets transmitted + * @tx_stats.last_rate: last TX rate + * @tx_stats.msdu: # of transmitted MSDUs per TID + * @rx_stats: RX statistics + * @rx_stats_avg: averaged RX statistics + * @rx_stats_avg.signal: averaged signal + * @rx_stats_avg.chain_signal: averaged per-chain signal + * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs + * this (by advertising the USES_RSS hw flag) + * @status_stats: TX status statistics + * @status_stats.filtered: # of filtered frames + * @status_stats.retry_failed: # of frames that failed after retry + * @status_stats.retry_count: # of retries attempted + * @status_stats.lost_packets: # of lost packets + * @status_stats.last_pkt_time: timestamp of last ACKed packet + * @status_stats.msdu_retries: # of MSDU retries + * @status_stats.msdu_failed: # of failed MSDUs + * @status_stats.last_ack: last ack timestamp (jiffies) + * @status_stats.last_ack_signal: last ACK signal + * @status_stats.ack_signal_filled: last ACK signal validity + * @status_stats.avg_ack_signal: average ACK signal + * TODO Move other link params from sta_info as required for MLD operation + */ +struct link_sta_info { + u8 addr[ETH_ALEN]; + u8 link_id; + + /* TODO rhash head/node for finding link_sta based on addr */ + + struct sta_info *sta; + struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + + NUM_DEFAULT_MGMT_KEYS + + NUM_DEFAULT_BEACON_KEYS]; + struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats; + + /* Updated from RX path only, no locking requirements */ + struct ieee80211_sta_rx_stats rx_stats; + struct { + struct ewma_signal signal; + struct ewma_signal chain_signal[IEEE80211_MAX_CHAINS]; + } rx_stats_avg; + + /* Updated from TX status path only, no locking requirements */ + struct { + unsigned long filtered; + unsigned long retry_failed, retry_count; + unsigned int lost_packets; + unsigned long last_pkt_time; + u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; + u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; + unsigned long last_ack; + s8 last_ack_signal; + bool ack_signal_filled; + struct ewma_avg_signal avg_ack_signal; + } status_stats; + + /* Updated from TX path only, no locking requirements */ + struct { + u64 packets[IEEE80211_NUM_ACS]; + u64 bytes[IEEE80211_NUM_ACS]; + struct ieee80211_tx_rate last_rate; + struct rate_info last_rate_info; + u64 msdu[IEEE80211_NUM_TIDS + 1]; + } tx_stats; + + enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; +}; + /** * struct sta_info - STA information * @@ -497,7 +577,6 @@ struct ieee80211_fragment_cache { * @sdata: virtual interface this station belongs to * @ptk: peer keys negotiated with this station, if any * @ptk_idx: last installed peer key index - * @gtk: group keys negotiated with this station, if any * @rate_ctrl: rate control algorithm reference * @rate_ctrl_lock: spinlock used to protect rate control data * (data inside the algorithm, so serializes calls there) @@ -543,30 +622,19 @@ struct ieee80211_fragment_cache { * @fast_rx: RX fastpath information * 
@tdls_chandef: a TDLS peer can have a wider chandef that is compatible to * the BSS one. - * @tx_stats: TX statistics - * @tx_stats.packets: # of packets transmitted - * @tx_stats.bytes: # of bytes in all packets transmitted - * @tx_stats.last_rate: last TX rate - * @tx_stats.msdu: # of transmitted MSDUs per TID - * @rx_stats: RX statistics - * @rx_stats_avg: averaged RX statistics - * @rx_stats_avg.signal: averaged signal - * @rx_stats_avg.chain_signal: averaged per-chain signal - * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs - * this (by advertising the USES_RSS hw flag) - * @status_stats: TX status statistics - * @status_stats.filtered: # of filtered frames - * @status_stats.retry_failed: # of frames that failed after retry - * @status_stats.retry_count: # of retries attempted - * @status_stats.lost_packets: # of lost packets - * @status_stats.last_pkt_time: timestamp of last ACKed packet - * @status_stats.msdu_retries: # of MSDU retries - * @status_stats.msdu_failed: # of failed MSDUs - * @status_stats.last_ack: last ack timestamp (jiffies) - * @status_stats.last_ack_signal: last ACK signal - * @status_stats.ack_signal_filled: last ACK signal validity - * @status_stats.avg_ack_signal: average ACK signal * @frags: fragment cache + * @multi_link_sta: identifies whether this STA is an MLD STA or a regular STA + * @deflink: the default link STA information. For a non-MLO STA, all link- + * specific STA information is accessed through @deflink, or through + * link[0], which points to the address of @deflink. For an MLO link + * STA, the first link STA added points to @deflink. + * @link: array of link STA entries. For a non-MLO STA, every entry except + * link[0] is NULL by default, and link information is accessed via + * @deflink or link[0]. For an MLO STA, the first link STA added has + * its link pointer set to the address of @deflink; the remaining + * entries are allocated and assigned to link[link_id], where link_id + * is the ID assigned by the AP.
*/ struct sta_info { /* General information, mostly static */ @@ -576,9 +644,6 @@ struct sta_info { u8 addr[ETH_ALEN]; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; - struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + - NUM_DEFAULT_MGMT_KEYS + - NUM_DEFAULT_BEACON_KEYS]; struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS]; u8 ptk_idx; struct rate_control_ref *rate_ctrl; @@ -588,7 +653,6 @@ struct sta_info { struct ieee80211_fast_tx __rcu *fast_tx; struct ieee80211_fast_rx __rcu *fast_rx; - struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats; #ifdef CONFIG_MAC80211_MESH struct mesh_sta *mesh; @@ -618,38 +682,9 @@ struct sta_info { u64 assoc_at; long last_connected; - /* Updated from RX path only, no locking requirements */ - struct ieee80211_sta_rx_stats rx_stats; - struct { - struct ewma_signal signal; - struct ewma_signal chain_signal[IEEE80211_MAX_CHAINS]; - } rx_stats_avg; - /* Plus 1 for non-QoS frames */ __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1]; - /* Updated from TX status path only, no locking requirements */ - struct { - unsigned long filtered; - unsigned long retry_failed, retry_count; - unsigned int lost_packets; - unsigned long last_pkt_time; - u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; - u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; - unsigned long last_ack; - s8 last_ack_signal; - bool ack_signal_filled; - struct ewma_avg_signal avg_ack_signal; - } status_stats; - - /* Updated from TX path only, no locking requirements */ - struct { - u64 packets[IEEE80211_NUM_ACS]; - u64 bytes[IEEE80211_NUM_ACS]; - struct ieee80211_tx_rate last_rate; - struct rate_info last_rate_info; - u64 msdu[IEEE80211_NUM_TIDS + 1]; - } tx_stats; u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; struct airtime_info airtime[IEEE80211_NUM_ACS]; @@ -663,8 +698,6 @@ struct sta_info { struct dentry *debugfs_dir; #endif - enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; - enum ieee80211_smps_mode known_smps_mode; const struct ieee80211_cipher_scheme *cipher_scheme; @@ -676,6 +709,10 @@ struct sta_info { struct ieee80211_fragment_cache frags; + bool multi_link_sta; + struct link_sta_info deflink; + struct link_sta_info *link[MAX_STA_LINKS]; + /* keep last! */ struct ieee80211_sta sta; }; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index f6f63a0b1b72..e863305a36e7 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -71,7 +71,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; - sta->status_stats.filtered++; + sta->deflink.status_stats.filtered++; /* * Clear more-data bit on filtered frames, it might be set @@ -774,7 +774,7 @@ static void ieee80211_lost_packet(struct sta_info *sta, !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; - sta->status_stats.lost_packets++; + sta->deflink.status_stats.lost_packets++; if (sta->sta.tdls) { pkt_time = STA_LOST_TDLS_PKT_TIME; pkt_thr = STA_LOST_PKT_THRESHOLD; @@ -787,13 +787,14 @@ static void ieee80211_lost_packet(struct sta_info *sta, * mechanism. 
* For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME */ - if (sta->status_stats.lost_packets < pkt_thr || - !time_after(jiffies, sta->status_stats.last_pkt_time + pkt_time)) + if (sta->deflink.status_stats.lost_packets < pkt_thr || + !time_after(jiffies, sta->deflink.status_stats.last_pkt_time + pkt_time)) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, - sta->status_stats.lost_packets, GFP_ATOMIC); - sta->status_stats.lost_packets = 0; + sta->deflink.status_stats.lost_packets, + GFP_ATOMIC); + sta->deflink.status_stats.lost_packets = 0; } static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, @@ -928,7 +929,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) - sta->tx_stats.last_rate = + sta->deflink.tx_stats.last_rate = info->status.rates[rates_idx]; if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && @@ -974,9 +975,9 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, return; } else if (ieee80211_is_data_present(fc)) { if (!acked && !noack_success) - sta->status_stats.msdu_failed[tid]++; + sta->deflink.status_stats.msdu_failed[tid]++; - sta->status_stats.msdu_retries[tid] += + sta->deflink.status_stats.msdu_retries[tid] += retry_count; } @@ -1109,7 +1110,7 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, sta = container_of(pubsta, struct sta_info, sta); if (status->rate) - sta->tx_stats.last_rate_info = *status->rate; + sta->deflink.tx_stats.last_rate_info = *status->rate; } if (skb && (tx_time_est = @@ -1138,8 +1139,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_sub_if_data *sdata = sta->sdata; if (!acked && !noack_success) - sta->status_stats.retry_failed++; - sta->status_stats.retry_count += retry_count; + sta->deflink.status_stats.retry_failed++; + sta->deflink.status_stats.retry_count += retry_count; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (sdata->vif.type == NL80211_IFTYPE_STATION && @@ -1148,13 +1149,13 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, acked, info->status.tx_time); if (acked) { - sta->status_stats.last_ack = jiffies; + sta->deflink.status_stats.last_ack = jiffies; - if (sta->status_stats.lost_packets) - sta->status_stats.lost_packets = 0; + if (sta->deflink.status_stats.lost_packets) + sta->deflink.status_stats.lost_packets = 0; /* Track when last packet was ACKed */ - sta->status_stats.last_pkt_time = jiffies; + sta->deflink.status_stats.last_pkt_time = jiffies; /* Reset connection monitor */ if (sdata->vif.type == NL80211_IFTYPE_STATION && @@ -1162,10 +1163,10 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, sdata->u.mgd.probe_send_count = 0; if (info->status.is_valid_ack_signal) { - sta->status_stats.last_ack_signal = + sta->deflink.status_stats.last_ack_signal = (s8)info->status.ack_signal; - sta->status_stats.ack_signal_filled = true; - ewma_avg_signal_add(&sta->status_stats.avg_ack_signal, + sta->deflink.status_stats.ack_signal_filled = true; + ewma_avg_signal_add(&sta->deflink.status_stats.avg_ack_signal, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { @@ -1231,7 +1232,7 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw, rate_control_tx_status(local, sband, &status); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) - sta->tx_stats.last_rate = info->status.rates[0]; + sta->deflink.tx_stats.last_rate = info->status.rates[0]; } 
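The ieee80211_lost_packet() hunk above only reroutes the counters through deflink; the notification policy itself is unchanged: userspace is told about packet loss only once both a loss-count threshold and a minimum time since the last ACKed packet have been exceeded, after which the counter resets. A self-contained sketch of that double gate, with placeholder values rather than the real STA_LOST_PKT_THRESHOLD/STA_LOST_*_PKT_TIME constants:

#include <stdbool.h>
#include <stdio.h>

#define LOST_PKT_THRESHOLD	50	/* placeholder, not the kernel value */
#define LOST_PKT_TIME_MS	5000	/* placeholder, not the kernel value */

struct loss_state {
	unsigned int lost_packets;
	unsigned long last_ack_ms;	/* timestamp of last ACKed packet */
};

/* Returns true when userspace should be notified; resets the counter. */
static bool lost_packet(struct loss_state *st, unsigned long now_ms)
{
	st->lost_packets++;

	/* Require BOTH enough losses and enough silence since the last
	 * ACK, mirroring the two-condition check above. */
	if (st->lost_packets < LOST_PKT_THRESHOLD ||
	    now_ms - st->last_ack_ms <= LOST_PKT_TIME_MS)
		return false;

	st->lost_packets = 0;
	return true;
}

int main(void)
{
	struct loss_state st = { 0, 0 };
	bool notified = false;
	unsigned long t;

	for (t = 0; t < 10000 && !notified; t += 100)
		notified = lost_packet(&st, t);
	printf("notified: %s\n", notified ? "yes" : "no");
	return 0;
}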
EXPORT_SYMBOL(ieee80211_tx_rate_update); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 137be9ec94af..4e2d22e47429 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -459,9 +459,9 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap); } else if (action_code == WLAN_TDLS_SETUP_RESPONSE && - ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) { + ht_cap.ht_supported && sta->sta.deflink.ht_cap.ht_supported) { /* the peer caps are already intersected with our own */ - memcpy(&ht_cap, &sta->sta.ht_cap, sizeof(ht_cap)); + memcpy(&ht_cap, &sta->sta.deflink.ht_cap, sizeof(ht_cap)); pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap); @@ -510,9 +510,9 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); ieee80211_ie_build_vht_cap(pos, &vht_cap, vht_cap.cap); } else if (action_code == WLAN_TDLS_SETUP_RESPONSE && - vht_cap.vht_supported && sta->sta.vht_cap.vht_supported) { + vht_cap.vht_supported && sta->sta.deflink.vht_cap.vht_supported) { /* the peer caps are already intersected with our own */ - memcpy(&vht_cap, &sta->sta.vht_cap, sizeof(vht_cap)); + memcpy(&vht_cap, &sta->sta.deflink.vht_cap, sizeof(vht_cap)); /* the AID is present only when VHT is implemented */ ieee80211_tdls_add_aid(sdata, skb); @@ -603,13 +603,13 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, * if HT support is only added in TDLS, we need an HT-operation IE. * add the IE as required by IEEE802.11-2012 9.23.3.2. */ - if (!ap_sta->sta.ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) { + if (!ap_sta->sta.deflink.ht_cap.ht_supported && sta->sta.deflink.ht_cap.ht_supported) { u16 prot = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT | IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation)); - ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap, + ieee80211_ie_build_ht_oper(pos, &sta->sta.deflink.ht_cap, &sdata->vif.bss_conf.chandef, prot, true); } @@ -618,7 +618,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, /* only include VHT-operation if not on the 2.4GHz band */ if (sband->band != NL80211_BAND_2GHZ && - sta->sta.vht_cap.vht_supported) { + sta->sta.deflink.vht_cap.vht_supported) { /* * if both peers support WIDER_BW, we can expand the chandef to * a wider compatible one, up to 80MHz @@ -627,7 +627,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, ieee80211_tdls_chandef_vht_upgrade(sdata, sta); pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation)); - ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap, + ieee80211_ie_build_vht_oper(pos, &sta->sta.deflink.vht_cap, &sta->tdls_chandef); } @@ -1269,8 +1269,8 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata, bw = ieee80211_chan_width_to_rx_bw(conf->def.width); bw = min(bw, ieee80211_sta_cap_rx_bw(sta)); - if (bw != sta->sta.bandwidth) { - sta->sta.bandwidth = bw; + if (bw != sta->sta.deflink.bandwidth) { + sta->sta.deflink.bandwidth = bw; rate_control_rate_update(local, sband, sta, IEEE80211_RC_BW_CHANGED); /* @@ -1296,7 +1296,7 @@ static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata) if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || !test_sta_flag(sta, 
WLAN_STA_AUTHORIZED) || !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) || - !sta->sta.ht_cap.ht_supported) + !sta->sta.deflink.ht_cap.ht_supported) continue; result = true; break; @@ -1321,7 +1321,7 @@ iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata, if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) return; - tdls_ht = (sta && sta->sta.ht_cap.ht_supported) || + tdls_ht = (sta && sta->sta.deflink.ht_cap.ht_supported) || iee80211_tdls_have_ht_peers(sdata); opmode = sdata->vif.bss_conf.ht_operation_mode; @@ -1900,7 +1900,7 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, } /* peer should have known better */ - if (!sta->sta.ht_cap.ht_supported && elems->sec_chan_offs && + if (!sta->sta.deflink.ht_cap.ht_supported && elems->sec_chan_offs && elems->sec_chan_offs->sec_chan_offs) { tdls_dbg(sdata, "TDLS chan switch - wide chan unsupported\n"); ret = -ENOTSUPP; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 9e8381bef7ed..f180b5e30583 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -860,8 +860,8 @@ TRACE_EVENT(drv_sta_set_txpwr, LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; - __entry->txpwr = sta->txpwr.power; - __entry->type = sta->txpwr.type; + __entry->txpwr = sta->deflink.txpwr.power; + __entry->type = sta->deflink.txpwr.type; ), TP_printk( diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8f8dc2625d53..e717ce39743a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -146,7 +146,8 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, rate = DIV_ROUND_UP(r->bitrate, 1 << shift); switch (sband->band) { - case NL80211_BAND_2GHZ: { + case NL80211_BAND_2GHZ: + case NL80211_BAND_LC: { u32 flag; if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) flag = IEEE80211_RATE_MANDATORY_G; @@ -767,9 +768,9 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) if (txrc.reported_rate.idx < 0) { txrc.reported_rate = tx->rate; if (tx->sta && ieee80211_is_tx_data(tx->skb)) - tx->sta->tx_stats.last_rate = txrc.reported_rate; + tx->sta->deflink.tx_stats.last_rate = txrc.reported_rate; } else if (tx->sta) - tx->sta->tx_stats.last_rate = txrc.reported_rate; + tx->sta->deflink.tx_stats.last_rate = txrc.reported_rate; if (ratetbl) return TX_CONTINUE; @@ -836,7 +837,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number); tx->sdata->sequence_number += 0x10; if (tx->sta) - tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++; + tx->sta->deflink.tx_stats.msdu[IEEE80211_NUM_TIDS]++; return TX_CONTINUE; } @@ -850,7 +851,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) /* include per-STA, per-TID sequence counter */ tid = ieee80211_get_tid(hdr); - tx->sta->tx_stats.msdu[tid]++; + tx->sta->deflink.tx_stats.msdu[tid]++; hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); @@ -1003,10 +1004,10 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) skb_queue_walk(&tx->skbs, skb) { ac = skb_get_queue_mapping(skb); - tx->sta->tx_stats.bytes[ac] += skb->len; + tx->sta->deflink.tx_stats.bytes[ac] += skb->len; } if (ac >= 0) - tx->sta->tx_stats.packets[ac]++; + tx->sta->deflink.tx_stats.packets[ac]++; return TX_CONTINUE; } @@ -1158,7 +1159,7 @@ ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) return; - if (!sta || !sta->sta.ht_cap.ht_supported || + if (!sta || !sta->sta.deflink.ht_cap.ht_supported || !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || skb->protocol == 
sdata->control_port_protocol) return; @@ -3461,18 +3462,18 @@ ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, } if (skb_shinfo(skb)->gso_size) - sta->tx_stats.msdu[tid] += + sta->deflink.tx_stats.msdu[tid] += DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); else - sta->tx_stats.msdu[tid]++; + sta->deflink.tx_stats.msdu[tid]++; info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; /* statistics normally done by ieee80211_tx_h_stats (but that * has to consider fragmentation, so is more complex) */ - sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; - sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; + sta->deflink.tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; + sta->deflink.tx_stats.packets[skb_get_queue_mapping(skb)]++; if (pn_offs) { u64 pn; @@ -4480,8 +4481,8 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, dev_sw_netstats_tx_add(dev, 1, skb->len); - sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; - sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; + sta->deflink.tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; + sta->deflink.tx_stats.packets[skb_get_queue_mapping(skb)]++; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -4987,6 +4988,135 @@ static int ieee80211_beacon_protect(struct sk_buff *skb, return 0; } +static void +ieee80211_beacon_get_finish(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_mutable_offsets *offs, + struct beacon_data *beacon, + struct sk_buff *skb, + struct ieee80211_chanctx_conf *chanctx_conf, + u16 csa_off_base) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_tx_info *info; + enum nl80211_band band; + struct ieee80211_tx_rate_control txrc; + + /* CSA offsets */ + if (offs && beacon) { + u16 i; + + for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) { + u16 csa_off = beacon->cntdwn_counter_offsets[i]; + + if (!csa_off) + continue; + + offs->cntdwn_counter_offs[i] = csa_off_base + csa_off; + } + } + + band = chanctx_conf->def.chan->band; + info = IEEE80211_SKB_CB(skb); + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + info->flags |= IEEE80211_TX_CTL_NO_ACK; + info->band = band; + + memset(&txrc, 0, sizeof(txrc)); + txrc.hw = hw; + txrc.sband = local->hw.wiphy->bands[band]; + txrc.bss_conf = &sdata->vif.bss_conf; + txrc.skb = skb; + txrc.reported_rate.idx = -1; + if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) + txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; + else + txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; + txrc.bss = true; + rate_control_get_rate(sdata, NULL, &txrc); + + info->control.vif = vif; + info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT | + IEEE80211_TX_CTL_ASSIGN_SEQ | + IEEE80211_TX_CTL_FIRST_FRAGMENT; +} + +static void +ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon) +{ + int i; + + if (!beacon->mbssid_ies) + return; + + for (i = 0; i < beacon->mbssid_ies->cnt; i++) + skb_put_data(skb, beacon->mbssid_ies->elem[i].data, + beacon->mbssid_ies->elem[i].len); +} + +static struct sk_buff * +ieee80211_beacon_get_ap(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_mutable_offsets *offs, + bool is_template, + struct beacon_data *beacon, + struct ieee80211_chanctx_conf *chanctx_conf) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_if_ap *ap = 
&sdata->u.ap; + struct sk_buff *skb = NULL; + u16 csa_off_base = 0; + int mbssid_len; + + if (beacon->cntdwn_counter_offsets[0]) { + if (!is_template) + ieee80211_beacon_update_cntdwn(vif); + + ieee80211_set_beacon_cntdwn(sdata, beacon); + } + + /* headroom, head length, + * tail length, maximum TIM length and multiple BSSID length + */ + mbssid_len = ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies); + skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + + beacon->tail_len + 256 + + local->hw.extra_beacon_tailroom + mbssid_len); + if (!skb) + return NULL; + + skb_reserve(skb, local->tx_headroom); + skb_put_data(skb, beacon->head, beacon->head_len); + + ieee80211_beacon_add_tim(sdata, &ap->ps, skb, is_template); + + if (offs) { + offs->tim_offset = beacon->head_len; + offs->tim_length = skb->len - beacon->head_len; + offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0]; + + if (mbssid_len) { + ieee80211_beacon_add_mbssid(skb, beacon); + offs->mbssid_off = skb->len - mbssid_len; + } + + /* for AP the csa offsets are from tail */ + csa_off_base = skb->len; + } + + if (beacon->tail) + skb_put_data(skb, beacon->tail, beacon->tail_len); + + if (ieee80211_beacon_protect(skb, local, sdata) < 0) + return NULL; + + ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb, chanctx_conf, + csa_off_base); + return skb; +} + static struct sk_buff * __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4996,12 +5126,8 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_local *local = hw_to_local(hw); struct beacon_data *beacon = NULL; struct sk_buff *skb = NULL; - struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata = NULL; - enum nl80211_band band; - struct ieee80211_tx_rate_control txrc; struct ieee80211_chanctx_conf *chanctx_conf; - int csa_off_base = 0; rcu_read_lock(); @@ -5018,48 +5144,11 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_if_ap *ap = &sdata->u.ap; beacon = rcu_dereference(ap->beacon); - if (beacon) { - if (beacon->cntdwn_counter_offsets[0]) { - if (!is_template) - ieee80211_beacon_update_cntdwn(vif); - - ieee80211_set_beacon_cntdwn(sdata, beacon); - } - - /* - * headroom, head length, - * tail length and maximum TIM length - */ - skb = dev_alloc_skb(local->tx_headroom + - beacon->head_len + - beacon->tail_len + 256 + - local->hw.extra_beacon_tailroom); - if (!skb) - goto out; - - skb_reserve(skb, local->tx_headroom); - skb_put_data(skb, beacon->head, beacon->head_len); - - ieee80211_beacon_add_tim(sdata, &ap->ps, skb, - is_template); - - if (offs) { - offs->tim_offset = beacon->head_len; - offs->tim_length = skb->len - beacon->head_len; - offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0]; - - /* for AP the csa offsets are from tail */ - csa_off_base = skb->len; - } - - if (beacon->tail) - skb_put_data(skb, beacon->tail, - beacon->tail_len); - - if (ieee80211_beacon_protect(skb, local, sdata) < 0) - goto out; - } else + if (!beacon) goto out; + + skb = ieee80211_beacon_get_ap(hw, vif, offs, is_template, + beacon, chanctx_conf); } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_hdr *hdr; @@ -5085,6 +5174,9 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); + + ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb, + chanctx_conf, 0); } else if (ieee80211_vif_is_mesh(&sdata->vif)) { 
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; @@ -5124,51 +5216,13 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, } skb_put_data(skb, beacon->tail, beacon->tail_len); + ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb, + chanctx_conf, 0); } else { WARN_ON(1); goto out; } - /* CSA offsets */ - if (offs && beacon) { - int i; - - for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) { - u16 csa_off = beacon->cntdwn_counter_offsets[i]; - - if (!csa_off) - continue; - - offs->cntdwn_counter_offs[i] = csa_off_base + csa_off; - } - } - - band = chanctx_conf->def.chan->band; - - info = IEEE80211_SKB_CB(skb); - - info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; - info->flags |= IEEE80211_TX_CTL_NO_ACK; - info->band = band; - - memset(&txrc, 0, sizeof(txrc)); - txrc.hw = hw; - txrc.sband = local->hw.wiphy->bands[band]; - txrc.bss_conf = &sdata->vif.bss_conf; - txrc.skb = skb; - txrc.reported_rate.idx = -1; - if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) - txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; - else - txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; - txrc.bss = true; - rate_control_get_rate(sdata, NULL, &txrc); - - info->control.vif = vif; - - info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT | - IEEE80211_TX_CTL_ASSIGN_SEQ | - IEEE80211_TX_CTL_FIRST_FRAGMENT; out: rcu_read_unlock(); return skb; @@ -5671,7 +5725,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted, - u64 *cookie) + int link_id, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index e856f9092137..504410ddddc5 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -118,14 +118,14 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, const struct ieee80211_vht_cap *vht_cap_ie, struct sta_info *sta) { - struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.deflink.vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; bool have_80mhz; memset(vht_cap, 0, sizeof(*vht_cap)); - if (!sta->sta.ht_cap.ht_supported) + if (!sta->sta.deflink.ht_cap.ht_supported) return; if (!vht_cap_ie || !sband->vht_cap.vht_supported) @@ -295,10 +295,10 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; default: - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_80; if (!(vht_cap->vht_mcs.tx_highest & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) @@ -310,10 +310,10 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, * above) between 160 and 80+80 yet. 
*/ if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160; } - sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); + sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta); switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: @@ -332,8 +332,8 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, /* FIXME: move this to some better location - parses HE now */ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) { - struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; - struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.deflink.vht_cap; + struct ieee80211_sta_he_cap *he_cap = &sta->sta.deflink.he_cap; u32 cap_width; if (he_cap->has_he) { @@ -357,7 +357,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) } if (!vht_cap->vht_supported) - return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? + return sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; @@ -380,14 +380,14 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta) { - struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.deflink.vht_cap; u32 cap_width; if (!vht_cap->vht_supported) { - if (!sta->sta.ht_cap.ht_supported) + if (!sta->sta.deflink.ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; - return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? + return sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? 
NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20; } @@ -404,13 +404,13 @@ enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta) enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta) { - enum ieee80211_sta_rx_bandwidth cur_bw = sta->sta.bandwidth; - struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + enum ieee80211_sta_rx_bandwidth cur_bw = sta->sta.deflink.bandwidth; + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.deflink.vht_cap; u32 cap_width; switch (cur_bw) { case IEEE80211_STA_RX_BW_20: - if (!sta->sta.ht_cap.ht_supported) + if (!sta->sta.deflink.ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; else return NL80211_CHAN_WIDTH_20; @@ -459,7 +459,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) enum nl80211_chan_width bss_width = sdata->vif.bss_conf.chandef.width; bw = ieee80211_sta_cap_rx_bw(sta); - bw = min(bw, sta->cur_max_bandwidth); + bw = min(bw, sta->deflink.cur_max_bandwidth); /* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of * IEEE80211-2016 specification makes higher bandwidth operation @@ -487,13 +487,13 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) bool support_160; /* if we received a notification already don't overwrite it */ - if (sta->sta.rx_nss) + if (sta->sta.deflink.rx_nss) return; - if (sta->sta.he_cap.has_he) { + if (sta->sta.deflink.he_cap.has_he) { int i; u8 rx_mcs_80 = 0, rx_mcs_160 = 0; - const struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; + const struct ieee80211_sta_he_cap *he_cap = &sta->sta.deflink.he_cap; u16 mcs_160_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160); u16 mcs_80_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80); @@ -524,23 +524,23 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) he_rx_nss = rx_mcs_80; } - if (sta->sta.ht_cap.ht_supported) { - if (sta->sta.ht_cap.mcs.rx_mask[0]) + if (sta->sta.deflink.ht_cap.ht_supported) { + if (sta->sta.deflink.ht_cap.mcs.rx_mask[0]) ht_rx_nss++; - if (sta->sta.ht_cap.mcs.rx_mask[1]) + if (sta->sta.deflink.ht_cap.mcs.rx_mask[1]) ht_rx_nss++; - if (sta->sta.ht_cap.mcs.rx_mask[2]) + if (sta->sta.deflink.ht_cap.mcs.rx_mask[2]) ht_rx_nss++; - if (sta->sta.ht_cap.mcs.rx_mask[3]) + if (sta->sta.deflink.ht_cap.mcs.rx_mask[3]) ht_rx_nss++; /* FIXME: consider rx_highest? 
*/ } - if (sta->sta.vht_cap.vht_supported) { + if (sta->sta.deflink.vht_cap.vht_supported) { int i; u16 rx_mcs_map; - rx_mcs_map = le16_to_cpu(sta->sta.vht_cap.vht_mcs.rx_mcs_map); + rx_mcs_map = le16_to_cpu(sta->sta.deflink.vht_cap.vht_mcs.rx_mcs_map); for (i = 7; i >= 0; i--) { u8 mcs = (rx_mcs_map >> (2 * i)) & 3; @@ -555,7 +555,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) rx_nss = max(vht_rx_nss, ht_rx_nss); rx_nss = max(he_rx_nss, rx_nss); - sta->sta.rx_nss = max_t(u8, 1, rx_nss); + sta->sta.deflink.rx_nss = max_t(u8, 1, rx_nss); } u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, @@ -575,8 +575,8 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; - if (sta->sta.rx_nss != nss) { - sta->sta.rx_nss = nss; + if (sta->sta.deflink.rx_nss != nss) { + sta->sta.deflink.rx_nss = nss; sta_opmode.rx_nss = nss; changed |= IEEE80211_RC_NSS_CHANGED; sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; @@ -585,27 +585,27 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; + sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; + sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160; else - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: /* legacy only, no longer used by newer spec */ - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } new_bw = ieee80211_sta_cur_vht_bw(sta); - if (new_bw != sta->sta.bandwidth) { - sta->sta.bandwidth = new_bw; + if (new_bw != sta->sta.deflink.bandwidth) { + sta->sta.deflink.bandwidth = new_bw; sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta); changed |= IEEE80211_RC_BW_CHANGED; sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 4f645d51c257..f0fe740acd99 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1510,6 +1510,29 @@ config NETFILTER_XT_MATCH_QUOTA If you want to compile it as a module, say M here and read . If unsure, say `N'. +config NETFILTER_XT_MATCH_QUOTA2 + tristate '"quota2" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `quota2' match, which allows matching on a + byte counter that is counted correctly rather than per CPU. + It allows naming the quotas. + This is based on http://xtables-addons.git.sourceforge.net + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_QUOTA2_LOG + bool '"quota2" Netfilter LOG support' + depends on NETFILTER_XT_MATCH_QUOTA2 + default n + help + This option allows `quota2' to log ONCE when a quota limit + is passed. It logs via NETLINK using the NETLINK_NFLOG family. + It logs similarly to how ipt_ULOG would, but without the packet data. + + If unsure, say `N'.
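The point of "counted correctly rather than per CPU" in the help text above is that quota2 keeps one shared, lock-protected counter per named quota, where per-CPU accounting can overshoot the limit. A rough userspace sketch of the per-packet check-and-decrement; the names and the pthread mutex here are stand-ins for the kernel's spinlock-guarded xt_quota_counter defined later in this patch:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* One named, shared counter -- the point of quota2 vs. per-CPU quota. */
struct quota_counter {
	pthread_mutex_t lock;
	unsigned long long quota;	/* bytes remaining */
};

/* Returns true while the packet still fits in the quota. */
static bool quota_match(struct quota_counter *q, unsigned int pkt_len)
{
	bool ret = false;

	pthread_mutex_lock(&q->lock);
	if (q->quota >= pkt_len) {
		q->quota -= pkt_len;
		ret = true;
	} else {
		q->quota = 0;	/* exhausted: the match fails from now on */
	}
	pthread_mutex_unlock(&q->lock);
	return ret;
}

int main(void)
{
	struct quota_counter q = { PTHREAD_MUTEX_INITIALIZER, 3000 };
	int i;

	for (i = 0; i < 4; i++)
		printf("packet %d (1000B): %s\n", i,
		       quota_match(&q, 1000) ? "pass" : "over quota");
	return 0;
}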
+ config NETFILTER_XT_MATCH_RATEEST tristate '"rateest" match support' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index aab20e575ecd..3f2d184a57cf 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -195,6 +195,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 43ea8cfd374b..a27dac05607e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1627,6 +1628,8 @@ __nf_conntrack_alloc(struct net *net, nf_ct_zone_add(ct, zone); + trace_android_rvh_nf_conn_alloc(ct); + /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ @@ -1662,6 +1665,7 @@ void nf_conntrack_free(struct nf_conn *ct) cnet = nf_ct_pernet(net); smp_mb__before_atomic(); + trace_android_rvh_nf_conn_free(ct); atomic_dec(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 2f7cf5ecebf4..90579d0ee9f7 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -28,6 +28,11 @@ #include #include #include +#include +#include +#include + +#define NLMSG_MAX_SIZE 64 struct idletimer_tg { struct list_head entry; @@ -38,15 +43,112 @@ struct idletimer_tg { struct kobject *kobj; struct device_attribute attr; + struct timespec64 delayed_timer_trigger; + struct timespec64 last_modified_timer; + struct timespec64 last_suspend_time; + struct notifier_block pm_nb; + + int timeout; unsigned int refcnt; u8 timer_type; + + bool work_pending; + bool send_nl_msg; + bool active; + uid_t uid; + bool suspend_time_valid; }; static LIST_HEAD(idletimer_tg_list); static DEFINE_MUTEX(list_mutex); +static DEFINE_SPINLOCK(timestamp_lock); static struct kobject *idletimer_tg_kobj; +static bool check_for_delayed_trigger(struct idletimer_tg *timer, + struct timespec64 *ts) +{ + bool state; + struct timespec64 temp; + spin_lock_bh(&timestamp_lock); + timer->work_pending = false; + if ((ts->tv_sec - timer->last_modified_timer.tv_sec) > timer->timeout || + timer->delayed_timer_trigger.tv_sec != 0) { + state = false; + temp.tv_sec = timer->timeout; + temp.tv_nsec = 0; + if (timer->delayed_timer_trigger.tv_sec != 0) { + temp = timespec64_add(timer->delayed_timer_trigger, + temp); + ts->tv_sec = temp.tv_sec; + ts->tv_nsec = temp.tv_nsec; + timer->delayed_timer_trigger.tv_sec = 0; + timer->work_pending = true; + schedule_work(&timer->work); + } else { + temp = timespec64_add(timer->last_modified_timer, temp); + ts->tv_sec = temp.tv_sec; + ts->tv_nsec = temp.tv_nsec; + } + } else { + state = timer->active; + } + spin_unlock_bh(&timestamp_lock); + return state; +} + +static void notify_netlink_uevent(const char *iface, struct idletimer_tg *timer) +{ + char iface_msg[NLMSG_MAX_SIZE]; + char state_msg[NLMSG_MAX_SIZE]; + char timestamp_msg[NLMSG_MAX_SIZE]; + char uid_msg[NLMSG_MAX_SIZE]; + char *envp[] = { iface_msg, state_msg, timestamp_msg, uid_msg, NULL }; + int res; + struct timespec64 ts; +
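	/* The uevent environment assembled below has the shape
	 * INTERFACE=<iface>, STATE=active|inactive, TIME_NS=<boottime ns>
	 * and UID=<uid>, with UID= left empty when the timer is reported
	 * inactive. */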
u64 time_ns; + bool state; + + res = snprintf(iface_msg, NLMSG_MAX_SIZE, "INTERFACE=%s", + iface); + if (NLMSG_MAX_SIZE <= res) { + pr_err("message too long (%d)\n", res); + return; + } + + ts = ktime_to_timespec64(ktime_get_boottime()); + state = check_for_delayed_trigger(timer, &ts); + res = snprintf(state_msg, NLMSG_MAX_SIZE, "STATE=%s", + state ? "active" : "inactive"); + + if (NLMSG_MAX_SIZE <= res) { + pr_err("message too long (%d)\n", res); + return; + } + + if (state) { + res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=%u", timer->uid); + if (NLMSG_MAX_SIZE <= res) + pr_err("message too long (%d)\n", res); + } else { + res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID="); + if (NLMSG_MAX_SIZE <= res) + pr_err("message too long (%d)\n", res); + } + + time_ns = timespec64_to_ns(&ts); + res = snprintf(timestamp_msg, NLMSG_MAX_SIZE, "TIME_NS=%llu", time_ns); + if (NLMSG_MAX_SIZE <= res) { + timestamp_msg[0] = '\0'; + pr_err("message too long (%d)\n", res); + } + + pr_debug("putting nlmsg: <%s> <%s> <%s> <%s>\n", iface_msg, state_msg, + timestamp_msg, uid_msg); + kobject_uevent_env(idletimer_tg_kobj, KOBJ_CHANGE, envp); + return; +} + static struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { @@ -67,6 +169,7 @@ static ssize_t idletimer_tg_show(struct device *dev, unsigned long expires = 0; struct timespec64 ktimespec = {}; long time_diff = 0; + unsigned long now = jiffies; mutex_lock(&list_mutex); @@ -78,16 +181,20 @@ static ssize_t idletimer_tg_show(struct device *dev, time_diff = ktimespec.tv_sec; } else { expires = timer->timer.expires; - time_diff = jiffies_to_msecs(expires - jiffies) / 1000; + time_diff = jiffies_to_msecs(expires - now) / 1000; } } mutex_unlock(&list_mutex); - if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) - return snprintf(buf, PAGE_SIZE, "%ld\n", time_diff); + if (time_after(expires, now) || ktimespec.tv_sec > 0) - return scnprintf(buf, PAGE_SIZE, "%ld\n", time_diff); - return snprintf(buf, PAGE_SIZE, "0\n"); + return scnprintf(buf, PAGE_SIZE, "%ld\n", time_diff); + if (timer->send_nl_msg) + return scnprintf(buf, PAGE_SIZE, "0 %d\n", + jiffies_to_msecs(now - expires) / 1000); + + return scnprintf(buf, PAGE_SIZE, "0\n"); } static void idletimer_tg_work(struct work_struct *work) @@ -96,6 +203,9 @@ static void idletimer_tg_work(struct work_struct *work) work); sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); + + if (timer->send_nl_msg) + notify_netlink_uevent(timer->attr.attr.name, timer); } static void idletimer_tg_expired(struct timer_list *t) @@ -104,7 +214,62 @@ static void idletimer_tg_expired(struct timer_list *t) pr_debug("timer %s expired\n", timer->attr.attr.name); + spin_lock_bh(&timestamp_lock); + timer->active = false; + timer->work_pending = true; schedule_work(&timer->work); + spin_unlock_bh(&timestamp_lock); +} + +static int idletimer_resume(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + struct timespec64 ts; + unsigned long time_diff, now = jiffies; + struct idletimer_tg *timer = container_of(notifier, + struct idletimer_tg, pm_nb); + if (!timer) + return NOTIFY_DONE; + + switch (pm_event) { + case PM_SUSPEND_PREPARE: + timer->last_suspend_time = + ktime_to_timespec64(ktime_get_boottime()); + timer->suspend_time_valid = true; + break; + case PM_POST_SUSPEND: + if (!timer->suspend_time_valid) + break; + timer->suspend_time_valid = false; + + spin_lock_bh(&timestamp_lock); + if (!timer->active) { + spin_unlock_bh(&timestamp_lock); + break; + } + /* since jiffies are not updated while suspended, 'now' + * represents the time at which the system suspended */ + if 
(time_after(timer->timer.expires, now)) { + ts = ktime_to_timespec64(ktime_get_boottime()); + ts = timespec64_sub(ts, timer->last_suspend_time); + time_diff = timespec64_to_jiffies(&ts); + if (timer->timer.expires > (time_diff + now)) { + mod_timer_pending(&timer->timer, + (timer->timer.expires - time_diff)); + } else { + del_timer(&timer->timer); + timer->timer.expires = 0; + timer->active = false; + timer->work_pending = true; + schedule_work(&timer->work); + } + } + spin_unlock_bh(&timestamp_lock); + break; + default: + break; + } + return NOTIFY_DONE; } static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm, @@ -158,17 +323,34 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { - pr_debug("couldn't add file to sysfs"); + pr_debug("couldn't add file to sysfs\n"); goto out_free_attr; } list_add(&info->timer->entry, &idletimer_tg_list); - - timer_setup(&info->timer->timer, idletimer_tg_expired, 0); + pr_debug("timer type value is 0.\n"); + info->timer->timer_type = 0; info->timer->refcnt = 1; + info->timer->send_nl_msg = false; + info->timer->active = true; + info->timer->timeout = info->timeout; + + info->timer->delayed_timer_trigger.tv_sec = 0; + info->timer->delayed_timer_trigger.tv_nsec = 0; + info->timer->work_pending = false; + info->timer->uid = 0; + info->timer->last_modified_timer = + ktime_to_timespec64(ktime_get_boottime()); + + info->timer->pm_nb.notifier_call = idletimer_resume; + ret = register_pm_notifier(&info->timer->pm_nb); + if (ret) + printk(KERN_WARNING "[%s] Failed to register pm notifier %d\n", + __func__, ret); INIT_WORK(&info->timer->work, idletimer_tg_work); + timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); @@ -186,7 +368,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) { int ret; - info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; @@ -207,7 +389,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { - pr_debug("couldn't add file to sysfs"); + pr_debug("couldn't add file to sysfs\n"); goto out_free_attr; } @@ -215,9 +397,25 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) kobject_uevent(idletimer_tg_kobj,KOBJ_ADD); list_add(&info->timer->entry, &idletimer_tg_list); - pr_debug("timer type value is %u", info->timer_type); + pr_debug("timer type value is %u\n", info->timer_type); info->timer->timer_type = info->timer_type; info->timer->refcnt = 1; + info->timer->send_nl_msg = (info->send_nl_msg != 0); + info->timer->active = true; + info->timer->timeout = info->timeout; + + info->timer->delayed_timer_trigger.tv_sec = 0; + info->timer->delayed_timer_trigger.tv_nsec = 0; + info->timer->work_pending = false; + info->timer->uid = 0; + info->timer->last_modified_timer = + ktime_to_timespec64(ktime_get_boottime()); + + info->timer->pm_nb.notifier_call = idletimer_resume; + ret = register_pm_notifier(&info->timer->pm_nb); + if (ret) + printk(KERN_WARNING "[%s] Failed to register pm notifier %d\n", + __func__, ret); INIT_WORK(&info->timer->work, idletimer_tg_work); @@ -231,7 +429,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 
0); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + msecs_to_jiffies(info->timeout * 1000) + jiffies); } return 0; @@ -244,6 +442,41 @@ out: return ret; } +static void reset_timer(struct idletimer_tg * const info_timer, + const __u32 info_timeout, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + bool timer_prev; + + spin_lock_bh(&timestamp_lock); + timer_prev = info_timer->active; + info_timer->active = true; + /* timer_prev is used to guard against the overflow problem in time_before() */ + if (!timer_prev || time_before(info_timer->timer.expires, now)) { + pr_debug("Starting Checkentry timer (Expired, Jiffies): %lu, %lu\n", + info_timer->timer.expires, now); + + /* Stores the UID responsible for waking up the radio */ + if (skb && (skb->sk)) { + info_timer->uid = from_kuid_munged(current_user_ns(), + sock_i_uid(skb_to_full_sk(skb))); + } + + /* checks if there is a pending inactive notification */ + if (info_timer->work_pending) + info_timer->delayed_timer_trigger = info_timer->last_modified_timer; + else { + info_timer->work_pending = true; + schedule_work(&info_timer->work); + } + } + + info_timer->last_modified_timer = ktime_to_timespec64(ktime_get_boottime()); + mod_timer(&info_timer->timer, msecs_to_jiffies(info_timeout * 1000) + now); + spin_unlock_bh(&timestamp_lock); +} + /* * The actual xt_tables plugin. */ @@ -251,12 +484,21 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info *info = par->targinfo; + unsigned long now = jiffies; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); - mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + info->timer->active = true; + + if (time_before(info->timer->timer.expires, now)) { + schedule_work(&info->timer->work); + pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n", + info->label, info->timer->timer.expires, now); + } + + /* TODO: Avoid modifying timers on each packet */ + reset_timer(info->timer, info->timeout, skb); return XT_CONTINUE; } @@ -268,6 +510,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; + unsigned long now = jiffies; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); @@ -276,8 +519,16 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { - mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + info->timer->active = true; + + if (time_before(info->timer->timer.expires, now)) { + schedule_work(&info->timer->work); + pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n", + info->label, info->timer->timer.expires, now); + } + + /* TODO: Avoid modifying timers on each packet */ + reset_timer(info->timer, info->timeout, skb); } return XT_CONTINUE; @@ -321,9 +572,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { info->timer->refcnt++; - mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); - + reset_timer(info->timer, info->timeout, NULL); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { @@ -346,9 +595,6 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) pr_debug("checkentry 
targinfo%s\n", info->label); - if (info->send_nl_msg) - return -EOPNOTSUPP; - ret = idletimer_tg_helper((struct idletimer_tg_info *)info); if(ret < 0) { @@ -361,6 +607,11 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) return -EINVAL; } + if (info->send_nl_msg > 1) { + pr_debug("invalid value for send_nl_msg\n"); + return -EINVAL; + } + mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); @@ -383,8 +634,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) alarm_start_relative(&info->timer->alarm, tout); } } else { - mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + reset_timer(info->timer, info->timeout, NULL); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -414,8 +664,9 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) list_del(&info->timer->entry); del_timer_sync(&info->timer->timer); - cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + unregister_pm_notifier(&info->timer->pm_nb); + cancel_work_sync(&info->timer->work); kfree(info->timer->attr.attr.name); kfree(info->timer); } else { @@ -443,8 +694,9 @@ static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) } else { del_timer_sync(&info->timer->timer); } - cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + unregister_pm_notifier(&info->timer->pm_nb); + cancel_work_sync(&info->timer->work); kfree(info->timer->attr.attr.name); kfree(info->timer); } else { @@ -540,3 +792,4 @@ MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("ipt_IDLETIMER"); MODULE_ALIAS("ip6t_IDLETIMER"); +MODULE_ALIAS("arpt_IDLETIMER"); diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c new file mode 100644 index 000000000000..641e36e89ce5 --- /dev/null +++ b/net/netfilter/xt_quota2.c @@ -0,0 +1,397 @@ +/* + * xt_quota2 - enhanced xt_quota that can count upwards and in packets + * as a minimal accounting match. + * by Jan Engelhardt , 2008 + * + * Originally based on xt_quota.c: + * netfilter module to enforce network quotas + * Sam Johnston + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License; either + * version 2 of the License, as published by the Free Software Foundation. 
+ */ +#include <linux/list.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <asm/atomic.h> +#include <net/netlink.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_quota2.h> + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG +/* For compatibility, these definitions are copied from the + * deprecated header file <linux/netfilter_ipv4/ipt_ULOG.h> */ +#define ULOG_MAC_LEN 80 +#define ULOG_PREFIX_LEN 32 + +/* Format of the ULOG packets passed through netlink */ +typedef struct ulog_packet_msg { + unsigned long mark; + long timestamp_sec; + long timestamp_usec; + unsigned int hook; + char indev_name[IFNAMSIZ]; + char outdev_name[IFNAMSIZ]; + size_t data_len; + char prefix[ULOG_PREFIX_LEN]; + unsigned char mac_len; + unsigned char mac[ULOG_MAC_LEN]; + unsigned char payload[0]; +} ulog_packet_msg_t; +#endif + +/** + * @lock: lock to protect quota writers from each other + */ +struct xt_quota_counter { + u_int64_t quota; + spinlock_t lock; + struct list_head list; + atomic_t ref; + char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)]; + struct proc_dir_entry *procfs_entry; +}; + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG +/* Harald's favorite number +1 :D From ipt_ULOG.C */ +static int qlog_nl_event = 112; +module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(event_num, + "Event number for NETLINK_NFLOG message. 0 disables log." + "111 is what ipt_ULOG uses."); +static struct sock *nflognl; +#endif + +static LIST_HEAD(counter_list); +static DEFINE_SPINLOCK(counter_list_lock); + +static struct proc_dir_entry *proc_xt_quota; +static unsigned int quota_list_perms = S_IRUGO | S_IWUSR; +static kuid_t quota_list_uid = KUIDT_INIT(0); +static kgid_t quota_list_gid = KGIDT_INIT(0); +module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR); + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG +static void quota2_log(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + ulog_packet_msg_t *pm; + struct sk_buff *log_skb; + size_t size; + struct nlmsghdr *nlh; + + if (!qlog_nl_event) + return; + + size = NLMSG_SPACE(sizeof(*pm)); + size = max(size, (size_t)NLMSG_GOODSIZE); + log_skb = alloc_skb(size, GFP_ATOMIC); + if (!log_skb) { + pr_err("xt_quota2: cannot alloc skb for logging\n"); + return; + } + + nlh = nlmsg_put(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event, + sizeof(*pm), 0); + if (!nlh) { + pr_err("xt_quota2: nlmsg_put failed\n"); + kfree_skb(log_skb); + return; + } + pm = nlmsg_data(nlh); + memset(pm, 0, sizeof(*pm)); + if (skb->tstamp == 0) + __net_timestamp((struct sk_buff *)skb); + pm->hook = hooknum; + if (prefix != NULL) + strlcpy(pm->prefix, prefix, sizeof(pm->prefix)); + if (in) + strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name)); + if (out) + strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); + + NETLINK_CB(log_skb).dst_group = 1; + pr_debug("throwing 1 packets to netlink group 1\n"); + netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC); +} +#else +static void quota2_log(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ +} +#endif /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */ + +static ssize_t quota_proc_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct xt_quota_counter *e = PDE_DATA(file_inode(file)); + char tmp[24]; + size_t tmp_size; + + spin_lock_bh(&e->lock); + tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", e->quota); + spin_unlock_bh(&e->lock); + return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size); +} + +static
ssize_t quota_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *ppos) +{ + struct xt_quota_counter *e = PDE_DATA(file_inode(file)); + char buf[sizeof("18446744073709551616")]; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size) != 0) + return -EFAULT; + buf[sizeof(buf)-1] = '\0'; + if (size < sizeof(buf)) + buf[size] = '\0'; + + spin_lock_bh(&e->lock); + e->quota = simple_strtoull(buf, NULL, 0); + spin_unlock_bh(&e->lock); + return size; +} + +static const struct proc_ops q2_counter_fops = { + .proc_read = quota_proc_read, + .proc_write = quota_proc_write, + .proc_lseek = default_llseek, +}; + +static struct xt_quota_counter * +q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon) +{ + struct xt_quota_counter *e; + unsigned int size; + + /* Do not need all the procfs things for anonymous counters. */ + size = anon ? offsetof(typeof(*e), list) : sizeof(*e); + e = kmalloc(size, GFP_KERNEL); + if (e == NULL) + return NULL; + + e->quota = q->quota; + spin_lock_init(&e->lock); + if (!anon) { + INIT_LIST_HEAD(&e->list); + atomic_set(&e->ref, 1); + strlcpy(e->name, q->name, sizeof(e->name)); + } + return e; +} + +/** + * q2_get_counter - get ref to counter or create new + * @name: name of counter + */ +static struct xt_quota_counter * +q2_get_counter(const struct xt_quota_mtinfo2 *q) +{ + struct proc_dir_entry *p; + struct xt_quota_counter *e = NULL; + struct xt_quota_counter *new_e; + + if (*q->name == '\0') + return q2_new_counter(q, true); + + /* No need to hold a lock while getting a new counter */ + new_e = q2_new_counter(q, false); + if (new_e == NULL) + goto out; + + spin_lock_bh(&counter_list_lock); + list_for_each_entry(e, &counter_list, list) + if (strcmp(e->name, q->name) == 0) { + atomic_inc(&e->ref); + spin_unlock_bh(&counter_list_lock); + kfree(new_e); + pr_debug("xt_quota2: old counter name=%s", e->name); + return e; + } + e = new_e; + pr_debug("xt_quota2: new_counter name=%s", e->name); + list_add_tail(&e->list, &counter_list); + /* The entry having a refcount of 1 is not directly destructible. + * This func has not yet returned the new entry, thus iptables + * has not references for destroying this entry. + * For another rule to try to destroy it, it would 1st need for this + * func* to be re-invoked, acquire a new ref for the same named quota. + * Nobody will access the e->procfs_entry either. + * So release the lock. */ + spin_unlock_bh(&counter_list_lock); + + /* create_proc_entry() is not spin_lock happy */ + p = e->procfs_entry = proc_create_data(e->name, quota_list_perms, + proc_xt_quota, &q2_counter_fops, e); + + if (IS_ERR_OR_NULL(p)) { + spin_lock_bh(&counter_list_lock); + list_del(&e->list); + spin_unlock_bh(&counter_list_lock); + goto out; + } + proc_set_user(p, quota_list_uid, quota_list_gid); + return e; + + out: + kfree(e); + return NULL; +} + +static int quota_mt2_check(const struct xt_mtchk_param *par) +{ + struct xt_quota_mtinfo2 *q = par->matchinfo; + + pr_debug("xt_quota2: check() flags=0x%04x", q->flags); + + if (q->flags & ~XT_QUOTA_MASK) + return -EINVAL; + + q->name[sizeof(q->name)-1] = '\0'; + if (*q->name == '.' 
|| strchr(q->name, '/') != NULL) { + printk(KERN_ERR "xt_quota.3: illegal name\n"); + return -EINVAL; + } + + q->master = q2_get_counter(q); + if (q->master == NULL) { + printk(KERN_ERR "xt_quota.3: memory alloc failure\n"); + return -ENOMEM; + } + + return 0; +} + +static void quota_mt2_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_quota_mtinfo2 *q = par->matchinfo; + struct xt_quota_counter *e = q->master; + + if (*q->name == '\0') { + kfree(e); + return; + } + + spin_lock_bh(&counter_list_lock); + if (!atomic_dec_and_test(&e->ref)) { + spin_unlock_bh(&counter_list_lock); + return; + } + + list_del(&e->list); + spin_unlock_bh(&counter_list_lock); + remove_proc_entry(e->name, proc_xt_quota); + kfree(e); +} + +static bool +quota_mt2(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct xt_quota_mtinfo2 *q = (void *)par->matchinfo; + struct xt_quota_counter *e = q->master; + int charge = (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len; + bool no_change = q->flags & XT_QUOTA_NO_CHANGE; + bool ret = q->flags & XT_QUOTA_INVERT; + + spin_lock_bh(&e->lock); + if (q->flags & XT_QUOTA_GROW) { + /* + * While no_change is pointless in "grow" mode, we will + * implement it here simply to have a consistent behavior. + */ + if (!no_change) + e->quota += charge; + ret = true; /* note: does not respect inversion (bug??) */ + } else { + if (e->quota > charge) { + if (!no_change) + e->quota -= charge; + ret = !ret; + } else if (e->quota) { + /* We are transitioning, log that fact. */ + quota2_log(xt_hooknum(par), + skb, + xt_in(par), + xt_out(par), + q->name); + /* we do not allow even small packets from now on */ + e->quota = 0; + } + } + spin_unlock_bh(&e->lock); + return ret; +} + +static struct xt_match quota_mt2_reg[] __read_mostly = { + { + .name = "quota2", + .revision = 3, + .family = NFPROTO_IPV4, + .checkentry = quota_mt2_check, + .match = quota_mt2, + .destroy = quota_mt2_destroy, + .matchsize = sizeof(struct xt_quota_mtinfo2), + .usersize = offsetof(struct xt_quota_mtinfo2, master), + .me = THIS_MODULE, + }, + { + .name = "quota2", + .revision = 3, + .family = NFPROTO_IPV6, + .checkentry = quota_mt2_check, + .match = quota_mt2, + .destroy = quota_mt2_destroy, + .matchsize = sizeof(struct xt_quota_mtinfo2), + .usersize = offsetof(struct xt_quota_mtinfo2, master), + .me = THIS_MODULE, + }, +}; + +static int __init quota_mt2_init(void) +{ + int ret; + pr_debug("xt_quota2: init()"); + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG + nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, NULL); + if (!nflognl) + return -ENOMEM; +#endif + + proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net); + if (proc_xt_quota == NULL) + return -EACCES; + + ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg)); + if (ret < 0) + remove_proc_entry("xt_quota", init_net.proc_net); + pr_debug("xt_quota2: init() %d", ret); + return ret; +} + +static void __exit quota_mt2_exit(void) +{ + xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg)); + remove_proc_entry("xt_quota", init_net.proc_net); +} + +module_init(quota_mt2_init); +module_exit(quota_mt2_exit); +MODULE_DESCRIPTION("Xtables: countdown quota match; up counter"); +MODULE_AUTHOR("Sam Johnston "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_quota2"); +MODULE_ALIAS("ip6t_quota2"); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 5f6e6a6e91b3..17da10d39ae0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -411,6 +411,9 @@ static int sctp_do_bind(struct sock *sk, 
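The quota_mt2() match above (in xt_quota2.c) either grows a shared counter or counts it down, charging one unit per packet or skb->len bytes, and zeroes the counter once it cannot cover a full charge so later packets also fail. A hedged standalone model of just that decision, with the XT_QUOTA_NO_CHANGE flag and the exhaustion log omitted for brevity; struct and function names are illustrative, not the kernel's:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone model of the quota decision: grow mode only increases the
 * counter; countdown mode subtracts until exhausted, then rejects. */
struct counter { uint64_t quota; };

static bool quota_match(struct counter *c, uint64_t charge,
                        bool grow, bool invert)
{
    bool ret = invert;

    if (grow) {
        c->quota += charge;
        return true;                /* grow mode always matches */
    }
    if (c->quota > charge) {
        c->quota -= charge;
        return !ret;
    }
    c->quota = 0;                   /* exhausted: reject even small packets */
    return ret;
}

int main(void)
{
    struct counter c = { .quota = 100 };

    printf("%d\n", quota_match(&c, 60, false, false)); /* 1, 40 left */
    printf("%d\n", quota_match(&c, 60, false, false)); /* 0, exhausted */
    printf("%llu\n", (unsigned long long)c.quota);     /* 0 */
    return 0;
}
```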
union sctp_addr *addr, int len) } } + if (snum && inet_is_local_unbindable_port(net, snum)) + return -EPERM; + if (snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -1079,6 +1082,8 @@ static int sctp_connect_new_asoc(struct sctp_endpoint *ep, if (sctp_autobind(sk)) return -EAGAIN; } else { + if (inet_is_local_unbindable_port(net, ep->base.bind_addr.port)) + return -EPERM; if (inet_port_requires_bind_service(net, ep->base.bind_addr.port) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 691c0000e9ea..6321bb24f88d 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -149,5 +149,6 @@ cleanup_sunrpc(void) rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ module_exit(cleanup_sunrpc); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0a59a00cb581..ec5d6af3b72d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3440,4 +3440,5 @@ fs_initcall(af_unix_init); module_exit(af_unix_exit); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); MODULE_ALIAS_NETPROTO(PF_UNIX); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index dc36a46ce0e7..44984cdcb7c6 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -333,8 +333,7 @@ void vsock_remove_sock(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_remove_sock); -void vsock_for_each_connected_socket(struct vsock_transport *transport, - void (*fn)(struct sock *sk)) +void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) { int i; @@ -343,12 +342,8 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport, for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { struct vsock_sock *vsk; list_for_each_entry(vsk, &vsock_connected_table[i], - connected_table) { - if (vsk->transport != transport) - continue; - + connected_table) fn(sk_vsock(vsk)); - } } spin_unlock_bh(&vsock_table_lock); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index c5f936fbf876..18a0f075eebc 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -358,11 +358,17 @@ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) static void virtio_vsock_reset_sock(struct sock *sk) { + struct vsock_sock *vsk = vsock_sk(sk); + /* vmci_transport.c doesn't take sk_lock here either. At least we're * under vsock_table_lock so the sock cannot disappear while we're * executing. 
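The vsock hunks above drop the transport argument from vsock_for_each_connected_socket() and instead make each callback (virtio_vsock_reset_sock(), vmci_transport_handle_detach()) skip sockets that belong to another transport. A small illustrative model of that callback-side filtering, with simplified types rather than the kernel API:

```c
#include <stdio.h>

/* Model of the reverted iteration: the walker visits every connected
 * socket and each callback filters by transport identity itself. */
struct sock { const void *transport; int id; };

static const int virtio_transport; /* addresses serve as identity tags */
static const int vmci_transport;

static void for_each_connected(struct sock *tbl, int n,
                               void (*fn)(struct sock *))
{
    for (int i = 0; i < n; i++)
        fn(&tbl[i]);
}

static void virtio_reset_sock(struct sock *sk)
{
    if (sk->transport != &virtio_transport)
        return;                       /* only handle our own sockets */
    printf("reset virtio sock %d\n", sk->id);
}

int main(void)
{
    struct sock table[] = {
        { &virtio_transport, 1 },
        { &vmci_transport, 2 },
    };

    for_each_connected(table, 2, virtio_reset_sock);  /* resets sock 1 only */
    return 0;
}
```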
*/ + /* Only handle our own sockets */ + if (vsk->transport != &virtio_transport.transport) + return; + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk_error_report(sk); @@ -385,8 +391,7 @@ static void virtio_vsock_event_handle(struct virtio_vsock *vsock, switch (le32_to_cpu(event->id)) { case VIRTIO_VSOCK_EVENT_TRANSPORT_RESET: virtio_vsock_update_guest_cid(vsock); - vsock_for_each_connected_socket(&virtio_transport.transport, - virtio_vsock_reset_sock); + vsock_for_each_connected_socket(virtio_vsock_reset_sock); break; } } @@ -667,8 +672,7 @@ static void virtio_vsock_remove(struct virtio_device *vdev) synchronize_rcu(); /* Reset all connected sockets when the device disappear */ - vsock_for_each_connected_socket(&virtio_transport.transport, - virtio_vsock_reset_sock); + vsock_for_each_connected_socket(virtio_vsock_reset_sock); /* Stop all work handlers to make sure no one is accessing the device, * so we can safely call vdev->config->reset(). diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 3a12aee33e92..b94287d2644b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -26,6 +26,10 @@ /* Threshold for detecting small packets to copy */ #define GOOD_COPY_LEN 128 +uint virtio_transport_max_vsock_pkt_buf_size = 64 * 1024; +module_param(virtio_transport_max_vsock_pkt_buf_size, uint, 0444); +EXPORT_SYMBOL_GPL(virtio_transport_max_vsock_pkt_buf_size); + static const struct virtio_transport * virtio_transport_get_ops(struct vsock_sock *vsk) { diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 94c1112f1c8c..c32d4aa90b45 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -805,6 +805,11 @@ static void vmci_transport_handle_detach(struct sock *sk) struct vsock_sock *vsk; vsk = vsock_sk(sk); + + /* Only handle our own sockets */ + if (vsk->transport != &vmci_transport) + return; + if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) { sock_set_flag(sk, SOCK_DONE); @@ -884,8 +889,7 @@ static void vmci_transport_qp_resumed_cb(u32 sub_id, const struct vmci_event_data *e_data, void *client_data) { - vsock_for_each_connected_socket(&vmci_transport, - vmci_transport_handle_detach); + vsock_for_each_connected_socket(vmci_transport_handle_detach); } static void vmci_transport_recv_pkt_work(struct work_struct *work) diff --git a/net/wireless/ap.c b/net/wireless/ap.c index 550ac9d827fe..e68923200018 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -1,4 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * Parts of this file are + * Copyright (C) 2022 Intel Corporation + */ #include #include #include @@ -7,8 +11,9 @@ #include "rdev-ops.h" -int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev, bool notify) +static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev, + struct net_device *dev, unsigned int link_id, + bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; @@ -22,15 +27,16 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; - if (!wdev->beacon_interval) + if (!wdev->links[link_id].ap.beacon_interval) return -ENOENT; - err = rdev_stop_ap(rdev, dev); + err = rdev_stop_ap(rdev, dev, link_id); if (!err) { wdev->conn_owner_nlportid = 0; - wdev->beacon_interval = 0; - memset(&wdev->chandef, 0, sizeof(wdev->chandef)); - wdev->ssid_len = 0; + 
wdev->links[link_id].ap.beacon_interval = 0; + memset(&wdev->links[link_id].ap.chandef, 0, + sizeof(wdev->links[link_id].ap.chandef)); + wdev->u.ap.ssid_len = 0; rdev_set_qos_map(rdev, dev, NULL); if (notify) nl80211_send_ap_stopped(wdev); @@ -46,14 +52,36 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, return err; } +int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, + struct net_device *dev, int link_id, + bool notify) +{ + unsigned int link; + int ret = 0; + + if (link_id >= 0) + return ___cfg80211_stop_ap(rdev, dev, link_id, notify); + + for_each_valid_link(dev->ieee80211_ptr, link) { + int ret1 = ___cfg80211_stop_ap(rdev, dev, link, notify); + + if (ret1) + ret = ret1; + /* try the next one also if one errored */ + } + + return ret; +} + int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev, bool notify) + struct net_device *dev, int link_id, + bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; wdev_lock(wdev); - err = __cfg80211_stop_ap(rdev, dev, notify); + err = __cfg80211_stop_ap(rdev, dev, link_id, notify); wdev_unlock(wdev); return err; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 869c43d4414c..0e5835cd8c61 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018-2021 Intel Corporation + * Copyright 2018-2022 Intel Corporation */ #include @@ -181,6 +181,9 @@ static int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width) case NL80211_CHAN_WIDTH_160: mhz = 160; break; + case NL80211_CHAN_WIDTH_320: + mhz = 320; + break; default: WARN_ON_ONCE(1); return -1; @@ -245,19 +248,7 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) oper_freq - MHZ_TO_KHZ(oper_width) / 2) return false; break; - case NL80211_CHAN_WIDTH_40: - if (chandef->center_freq1 != control_freq + 10 && - chandef->center_freq1 != control_freq - 10) - return false; - if (chandef->center_freq2) - return false; - break; case NL80211_CHAN_WIDTH_80P80: - if (chandef->center_freq1 != control_freq + 30 && - chandef->center_freq1 != control_freq + 10 && - chandef->center_freq1 != control_freq - 10 && - chandef->center_freq1 != control_freq - 30) - return false; if (!chandef->center_freq2) return false; /* adjacent is not allowed -- that's a 160 MHz channel */ @@ -265,28 +256,53 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) chandef->center_freq2 - chandef->center_freq1 == 80) return false; break; - case NL80211_CHAN_WIDTH_80: - if (chandef->center_freq1 != control_freq + 30 && - chandef->center_freq1 != control_freq + 10 && - chandef->center_freq1 != control_freq - 10 && - chandef->center_freq1 != control_freq - 30) - return false; + default: if (chandef->center_freq2) return false; break; + } + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_5: + case NL80211_CHAN_WIDTH_10: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: + /* all checked above */ + break; + case NL80211_CHAN_WIDTH_320: + if (chandef->center_freq1 == control_freq + 150 || + chandef->center_freq1 == control_freq + 130 || + chandef->center_freq1 == control_freq + 110 || + chandef->center_freq1 == control_freq + 90 || + chandef->center_freq1 == control_freq - 90 || + chandef->center_freq1 == control_freq - 110 || + 
chandef->center_freq1 == control_freq - 130 || + chandef->center_freq1 == control_freq - 150) + break; + fallthrough; case NL80211_CHAN_WIDTH_160: - if (chandef->center_freq1 != control_freq + 70 && - chandef->center_freq1 != control_freq + 50 && - chandef->center_freq1 != control_freq + 30 && - chandef->center_freq1 != control_freq + 10 && - chandef->center_freq1 != control_freq - 10 && - chandef->center_freq1 != control_freq - 30 && - chandef->center_freq1 != control_freq - 50 && - chandef->center_freq1 != control_freq - 70) - return false; - if (chandef->center_freq2) - return false; - break; + if (chandef->center_freq1 == control_freq + 70 || + chandef->center_freq1 == control_freq + 50 || + chandef->center_freq1 == control_freq - 50 || + chandef->center_freq1 == control_freq - 70) + break; + fallthrough; + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_80: + if (chandef->center_freq1 == control_freq + 30 || + chandef->center_freq1 == control_freq - 30) + break; + fallthrough; + case NL80211_CHAN_WIDTH_40: + if (chandef->center_freq1 == control_freq + 10 || + chandef->center_freq1 == control_freq - 10) + break; + fallthrough; default: return false; } @@ -305,7 +321,7 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) EXPORT_SYMBOL(cfg80211_chandef_valid); static void chandef_primary_freqs(const struct cfg80211_chan_def *c, - u32 *pri40, u32 *pri80) + u32 *pri40, u32 *pri80, u32 *pri160) { int tmp; @@ -313,9 +329,11 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c, case NL80211_CHAN_WIDTH_40: *pri40 = c->center_freq1; *pri80 = 0; + *pri160 = 0; break; case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_80P80: + *pri160 = 0; *pri80 = c->center_freq1; /* n_P20 */ tmp = (30 + c->chan->center_freq - c->center_freq1)/20; @@ -325,6 +343,7 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c, *pri40 = c->center_freq1 - 20 + 40 * tmp; break; case NL80211_CHAN_WIDTH_160: + *pri160 = c->center_freq1; /* n_P20 */ tmp = (70 + c->chan->center_freq - c->center_freq1)/20; /* n_P40 */ @@ -335,6 +354,20 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c, tmp /= 2; *pri80 = c->center_freq1 - 40 + 80 * tmp; break; + case NL80211_CHAN_WIDTH_320: + /* n_P20 */ + tmp = (150 + c->chan->center_freq - c->center_freq1) / 20; + /* n_P40 */ + tmp /= 2; + /* freq_P40 */ + *pri40 = c->center_freq1 - 140 + 40 * tmp; + /* n_P80 */ + tmp /= 2; + *pri80 = c->center_freq1 - 120 + 80 * tmp; + /* n_P160 */ + tmp /= 2; + *pri160 = c->center_freq1 - 80 + 160 * tmp; + break; default: WARN_ON_ONCE(1); } @@ -344,7 +377,7 @@ const struct cfg80211_chan_def * cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, const struct cfg80211_chan_def *c2) { - u32 c1_pri40, c1_pri80, c2_pri40, c2_pri80; + u32 c1_pri40, c1_pri80, c2_pri40, c2_pri80, c1_pri160, c2_pri160; /* If they are identical, return */ if (cfg80211_chandef_identical(c1, c2)) @@ -379,14 +412,31 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, c2->width == NL80211_CHAN_WIDTH_20) return c1; - chandef_primary_freqs(c1, &c1_pri40, &c1_pri80); - chandef_primary_freqs(c2, &c2_pri40, &c2_pri80); + chandef_primary_freqs(c1, &c1_pri40, &c1_pri80, &c1_pri160); + chandef_primary_freqs(c2, &c2_pri40, &c2_pri80, &c2_pri160); if (c1_pri40 != c2_pri40) return NULL; - WARN_ON(!c1_pri80 && !c2_pri80); - if (c1_pri80 && c2_pri80 && c1_pri80 != c2_pri80) + if (c1->width == NL80211_CHAN_WIDTH_40) + return c2; + + if (c2->width == NL80211_CHAN_WIDTH_40) + return c1; + + if 
(c1_pri80 != c2_pri80) + return NULL; + + if (c1->width == NL80211_CHAN_WIDTH_80 && + c2->width > NL80211_CHAN_WIDTH_80) + return c2; + + if (c2->width == NL80211_CHAN_WIDTH_80 && + c1->width > NL80211_CHAN_WIDTH_80) + return c1; + + WARN_ON(!c1_pri160 && !c2_pri160); + if (c1_pri160 && c2_pri160 && c1_pri160 != c2_pri160) return NULL; if (c1->width > c2->width) @@ -622,14 +672,21 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, * range of chandef. */ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, - struct ieee80211_channel *chan) + struct ieee80211_channel *chan, + bool primary_only) { int width; u32 freq; + if (!chandef->chan) + return false; + if (chandef->chan->center_freq == chan->center_freq) return true; + if (primary_only) + return false; + width = cfg80211_chandef_get_width(chandef); if (width <= 20) return false; @@ -654,23 +711,25 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev) { - bool active = false; + unsigned int link; ASSERT_WDEV_LOCK(wdev); - if (!wdev->chandef.chan) - return false; - switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - active = wdev->beacon_interval != 0; + for_each_valid_link(wdev, link) { + if (wdev->links[link].ap.beacon_interval) + return true; + } break; case NL80211_IFTYPE_ADHOC: - active = wdev->ssid_len != 0; + if (wdev->u.ibss.ssid_len) + return true; break; case NL80211_IFTYPE_MESH_POINT: - active = wdev->mesh_id_len != 0; + if (wdev->u.mesh.id_len) + return true; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_OCB: @@ -687,7 +746,35 @@ bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev) WARN_ON(1); } - return active; + return false; +} + +bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev, + struct ieee80211_channel *chan, + bool primary_only) +{ + unsigned int link; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + for_each_valid_link(wdev, link) { + if (cfg80211_is_sub_chan(&wdev->links[link].ap.chandef, + chan, primary_only)) + return true; + } + break; + case NL80211_IFTYPE_ADHOC: + return cfg80211_is_sub_chan(&wdev->u.ibss.chandef, chan, + primary_only); + case NL80211_IFTYPE_MESH_POINT: + return cfg80211_is_sub_chan(&wdev->u.mesh.chandef, chan, + primary_only); + default: + break; + } + + return false; } static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy, @@ -702,7 +789,7 @@ static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy, continue; } - if (cfg80211_is_sub_chan(&wdev->chandef, chan)) { + if (cfg80211_wdev_on_sub_chan(wdev, chan, false)) { wdev_unlock(wdev); return true; } @@ -712,6 +799,20 @@ static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy, return false; } +static bool +cfg80211_offchan_chain_is_active(struct cfg80211_registered_device *rdev, + struct ieee80211_channel *channel) +{ + if (!rdev->background_radar_wdev) + return false; + + if (!cfg80211_chandef_valid(&rdev->background_radar_chandef)) + return false; + + return cfg80211_is_sub_chan(&rdev->background_radar_chandef, channel, + false); +} + bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, struct ieee80211_channel *chan) { @@ -728,6 +829,9 @@ bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, if (cfg80211_is_wiphy_oper_chan(&rdev->wiphy, chan)) return true; + + if (cfg80211_offchan_chain_is_active(rdev, chan)) + return true; } return false; @@ -942,7 +1046,10 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, struct ieee80211_sta_vht_cap *vht_cap; 
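chandef_primary_freqs() above derives the primary 40/80/160 MHz center frequencies of a 320 MHz channel by repeatedly halving a 20 MHz block index (n_P20 to n_P40 to n_P80 to n_P160). A userspace rendition of that arithmetic for one worked 6 GHz example; the inputs are chosen to satisfy the control_freq +/- 150 rule from cfg80211_chandef_valid(), and this is a sketch, not the kernel function:

```c
#include <stdio.h>

/* Userspace rendition of the 320 MHz branch of chandef_primary_freqs(). */
static void primary_freqs_320(int control, int center1,
                              int *pri40, int *pri80, int *pri160)
{
    int tmp = (150 + control - center1) / 20;  /* n_P20 */
    tmp /= 2;                                  /* n_P40 */
    *pri40 = center1 - 140 + 40 * tmp;
    tmp /= 2;                                  /* n_P80 */
    *pri80 = center1 - 120 + 80 * tmp;
    tmp /= 2;                                  /* n_P160 */
    *pri160 = center1 - 80 + 160 * tmp;
}

int main(void)
{
    int p40, p80, p160;

    /* 6 GHz example: control channel at 5955 MHz, 320 MHz centered at 6105 */
    primary_freqs_320(5955, 6105, &p40, &p80, &p160);
    printf("pri40=%d pri80=%d pri160=%d\n", p40, p80, p160);
    /* prints pri40=5965 pri80=5985 pri160=6025, each block containing 5955 */
    return 0;
}
```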
struct ieee80211_edmg *edmg_cap; u32 width, control_freq, cap; - bool ext_nss_cap, support_80_80 = false; + bool ext_nss_cap, support_80_80 = false, support_320 = false; + const struct ieee80211_sband_iftype_data *iftd; + struct ieee80211_supported_band *sband; + int i; if (WARN_ON(!cfg80211_chandef_valid(chandef))) return false; @@ -1044,6 +1151,32 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK))) return false; break; + case NL80211_CHAN_WIDTH_320: + prohibited_flags |= IEEE80211_CHAN_NO_320MHZ; + width = 320; + + if (chandef->chan->band != NL80211_BAND_6GHZ) + return false; + + sband = wiphy->bands[NL80211_BAND_6GHZ]; + if (!sband) + return false; + + for (i = 0; i < sband->n_iftype_data; i++) { + iftd = &sband->iftype_data[i]; + if (!iftd->eht_cap.has_eht) + continue; + + if (iftd->eht_cap.eht_cap_elem.phy_cap_info[0] & + IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) { + support_320 = true; + break; + } + } + + if (!support_320) + return false; + break; default: WARN_ON_ONCE(1); return false; @@ -1081,6 +1214,68 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_chandef_usable); +static bool cfg80211_ir_permissive_check_wdev(enum nl80211_iftype iftype, + struct wireless_dev *wdev, + struct ieee80211_channel *chan) +{ + struct ieee80211_channel *other_chan = NULL; + unsigned int link_id; + int r1, r2; + + for_each_valid_link(wdev, link_id) { + if (wdev->iftype == NL80211_IFTYPE_STATION && + wdev->links[link_id].client.current_bss) + other_chan = wdev->links[link_id].client.current_bss->pub.channel; + + /* + * If a GO already operates on the same GO_CONCURRENT channel, + * this one (maybe the same one) can beacon as well. We allow + * the operation even if the station we relied on with + * GO_CONCURRENT is disconnected now. But then we must make sure + * we're not outdoor on an indoor-only channel. + */ + if (iftype == NL80211_IFTYPE_P2P_GO && + wdev->iftype == NL80211_IFTYPE_P2P_GO && + wdev->links[link_id].ap.beacon_interval && + !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) + other_chan = wdev->links[link_id].ap.chandef.chan; + + if (!other_chan) + continue; + + if (chan == other_chan) + return true; + + if (chan->band != NL80211_BAND_5GHZ && + chan->band != NL80211_BAND_6GHZ) + continue; + + r1 = cfg80211_get_unii(chan->center_freq); + r2 = cfg80211_get_unii(other_chan->center_freq); + + if (r1 != -EINVAL && r1 == r2) { + /* + * At some locations channels 149-165 are considered a + * bundle, but at other locations, e.g., Indonesia, + * channels 149-161 are considered a bundle while + * channel 165 is left out and considered to be in a + * different bundle. Thus, in case that there is a + * station interface connected to an AP on channel 165, + * it is assumed that channels 149-161 are allowed for + * GO operations. However, having a station interface + * connected to an AP on channels 149-161, does not + * allow GO operation on channel 165. + */ + if (chan->center_freq == 5825 && + other_chan->center_freq != 5825) + continue; + return true; + } + } + + return false; +} + /* * Check if the channel can be used under permissive conditions mandated by * some regulatory bodies, i.e., the channel is marked with @@ -1124,59 +1319,14 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, * the current registered device. 
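The GO_CONCURRENT comment above describes the bundle rule that the refactored helper preserves: two 5 GHz channels in the same UNII range are treated as one bundle, except that candidate channel 165 (5825 MHz) is excluded when the reference channel differs. A sketch of that rule under stated assumptions; the range boundaries below are illustrative, not cfg80211_get_unii()'s actual table:

```c
#include <stdbool.h>
#include <stdio.h>

/* Illustrative UNII ranges; real values live in cfg80211_get_unii(). */
static int unii_range(int freq)
{
    if (freq >= 5150 && freq < 5250) return 0;  /* UNII-1 */
    if (freq >= 5250 && freq < 5350) return 1;  /* UNII-2 */
    if (freq >= 5470 && freq < 5725) return 2;  /* UNII-2e */
    if (freq >= 5725 && freq <= 5850) return 3; /* UNII-3 */
    return -1;
}

static bool same_bundle(int chan, int other)
{
    int r1 = unii_range(chan), r2 = unii_range(other);

    if (r1 < 0 || r1 != r2)
        return false;
    /* channel 165 may belong to a different regulatory bundle */
    if (chan == 5825 && other != 5825)
        return false;
    return true;
}

int main(void)
{
    printf("%d\n", same_bundle(5745, 5785)); /* 1: both UNII-3 */
    printf("%d\n", same_bundle(5825, 5745)); /* 0: 165 excluded */
    return 0;
}
```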
*/ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { - struct ieee80211_channel *other_chan = NULL; - int r1, r2; + bool ret; wdev_lock(wdev); - if (wdev->iftype == NL80211_IFTYPE_STATION && - wdev->current_bss) - other_chan = wdev->current_bss->pub.channel; - - /* - * If a GO already operates on the same GO_CONCURRENT channel, - * this one (maybe the same one) can beacon as well. We allow - * the operation even if the station we relied on with - * GO_CONCURRENT is disconnected now. But then we must make sure - * we're not outdoor on an indoor-only channel. - */ - if (iftype == NL80211_IFTYPE_P2P_GO && - wdev->iftype == NL80211_IFTYPE_P2P_GO && - wdev->beacon_interval && - !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) - other_chan = wdev->chandef.chan; + ret = cfg80211_ir_permissive_check_wdev(iftype, wdev, chan); wdev_unlock(wdev); - if (!other_chan) - continue; - - if (chan == other_chan) - return true; - - if (chan->band != NL80211_BAND_5GHZ && - chan->band != NL80211_BAND_6GHZ) - continue; - - r1 = cfg80211_get_unii(chan->center_freq); - r2 = cfg80211_get_unii(other_chan->center_freq); - - if (r1 != -EINVAL && r1 == r2) { - /* - * At some locations channels 149-165 are considered a - * bundle, but at other locations, e.g., Indonesia, - * channels 149-161 are considered a bundle while - * channel 165 is left out and considered to be in a - * different bundle. Thus, in case that there is a - * station interface connected to an AP on channel 165, - * it is assumed that channels 149-161 are allowed for - * GO operations. However, having a station interface - * connected to an AP on channels 149-161, does not - * allow GO operation on channel 165. - */ - if (chan->center_freq == 5825 && - other_chan->center_freq != 5825) - continue; - return true; - } + if (ret) + return ret; } return false; @@ -1249,97 +1399,6 @@ int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, return rdev_set_monitor_channel(rdev, chandef); } -void -cfg80211_get_chan_state(struct wireless_dev *wdev, - struct ieee80211_channel **chan, - enum cfg80211_chan_mode *chanmode, - u8 *radar_detect) -{ - int ret; - - *chan = NULL; - *chanmode = CHAN_MODE_UNDEFINED; - - ASSERT_WDEV_LOCK(wdev); - - if (wdev->netdev && !netif_running(wdev->netdev)) - return; - - switch (wdev->iftype) { - case NL80211_IFTYPE_ADHOC: - if (wdev->current_bss) { - *chan = wdev->current_bss->pub.channel; - *chanmode = (wdev->ibss_fixed && - !wdev->ibss_dfs_possible) - ? 
CHAN_MODE_SHARED - : CHAN_MODE_EXCLUSIVE; - - /* consider worst-case - IBSS can try to return to the - * original user-specified channel as creator */ - if (wdev->ibss_dfs_possible) - *radar_detect |= BIT(wdev->chandef.width); - return; - } - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - if (wdev->current_bss) { - *chan = wdev->current_bss->pub.channel; - *chanmode = CHAN_MODE_SHARED; - return; - } - break; - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_P2P_GO: - if (wdev->cac_started) { - *chan = wdev->chandef.chan; - *chanmode = CHAN_MODE_SHARED; - *radar_detect |= BIT(wdev->chandef.width); - } else if (wdev->beacon_interval) { - *chan = wdev->chandef.chan; - *chanmode = CHAN_MODE_SHARED; - - ret = cfg80211_chandef_dfs_required(wdev->wiphy, - &wdev->chandef, - wdev->iftype); - WARN_ON(ret < 0); - if (ret > 0) - *radar_detect |= BIT(wdev->chandef.width); - } - return; - case NL80211_IFTYPE_MESH_POINT: - if (wdev->mesh_id_len) { - *chan = wdev->chandef.chan; - *chanmode = CHAN_MODE_SHARED; - - ret = cfg80211_chandef_dfs_required(wdev->wiphy, - &wdev->chandef, - wdev->iftype); - WARN_ON(ret < 0); - if (ret > 0) - *radar_detect |= BIT(wdev->chandef.width); - } - return; - case NL80211_IFTYPE_OCB: - if (wdev->chandef.chan) { - *chan = wdev->chandef.chan; - *chanmode = CHAN_MODE_SHARED; - return; - } - break; - case NL80211_IFTYPE_MONITOR: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_P2P_DEVICE: - case NL80211_IFTYPE_NAN: - /* these interface types don't really have a channel */ - return; - case NL80211_IFTYPE_UNSPECIFIED: - case NL80211_IFTYPE_WDS: - case NUM_NL80211_IFTYPES: - WARN_ON(1); - } -} - bool cfg80211_any_usable_channels(struct wiphy *wiphy, unsigned long sband_mask, u32 prohibited_flags) @@ -1370,3 +1429,34 @@ bool cfg80211_any_usable_channels(struct wiphy *wiphy, return false; } EXPORT_SYMBOL(cfg80211_any_usable_channels); + +struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev, + unsigned int link_id) +{ + /* + * We need to sort out the locking here - in some cases + * where we get here we really just don't care (yet) + * about the valid links, but in others we do. But we + * get here with various driver cases, so we cannot + * easily require the wdev mutex. 
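wdev_chandef() just below returns the channel definition either from the per-iftype union the MLO rework introduces (wdev->u.mesh/ibss/ocb) or from per-link AP state (wdev->links[link_id].ap). A simplified standalone model of that union-plus-links layout; the struct shapes are stand-ins, not the real cfg80211 definitions:

```c
#include <stdio.h>

/* Simplified model: mesh/IBSS keep one chandef in a union, AP keeps
 * one per link. */
struct chan_def { int center_mhz; };

enum iftype { IFTYPE_MESH, IFTYPE_ADHOC, IFTYPE_AP };

struct wdev {
    enum iftype iftype;
    union {
        struct { struct chan_def chandef; } mesh;
        struct { struct chan_def chandef; } ibss;
    } u;
    struct { struct { struct chan_def chandef; } ap; } links[4];
};

static struct chan_def *wdev_chandef(struct wdev *w, unsigned int link_id)
{
    switch (w->iftype) {
    case IFTYPE_MESH:  return &w->u.mesh.chandef;
    case IFTYPE_ADHOC: return &w->u.ibss.chandef;
    case IFTYPE_AP:    return &w->links[link_id].ap.chandef;
    }
    return NULL;
}

int main(void)
{
    struct wdev w = { .iftype = IFTYPE_AP };

    w.links[1].ap.chandef.center_mhz = 5180;
    printf("%d\n", wdev_chandef(&w, 1)->center_mhz);  /* 5180 */
    return 0;
}
```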
+ */ + if (link_id || wdev->valid_links & BIT(0)) { + ASSERT_WDEV_LOCK(wdev); + WARN_ON(!(wdev->valid_links & BIT(link_id))); + } + + switch (wdev->iftype) { + case NL80211_IFTYPE_MESH_POINT: + return &wdev->u.mesh.chandef; + case NL80211_IFTYPE_ADHOC: + return &wdev->u.ibss.chandef; + case NL80211_IFTYPE_OCB: + return &wdev->u.ocb.chandef; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + return &wdev->links[link_id].ap.chandef; + default: + return NULL; + } +} +EXPORT_SYMBOL(wdev_chandef); diff --git a/net/wireless/core.c b/net/wireless/core.c index 441136646f89..5b0c4d5b80cf 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -342,7 +342,7 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); - rdev_del_virtual_intf(rdev, wdev); + cfg80211_remove_virtual_intf(rdev, wdev); wiphy_unlock(&rdev->wiphy); } } @@ -536,6 +536,10 @@ use_default_name: INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work); INIT_WORK(&rdev->conn_work, cfg80211_conn_work); INIT_WORK(&rdev->event_work, cfg80211_event_work); + INIT_WORK(&rdev->background_cac_abort_wk, + cfg80211_background_cac_abort_wk); + INIT_DELAYED_WORK(&rdev->background_cac_done_wk, + cfg80211_background_cac_done_wk); init_waitqueue_head(&rdev->dev_wait); @@ -724,6 +728,7 @@ int wiphy_register(struct wiphy *wiphy) if (wiphy->interface_modes & ~(BIT(NL80211_IFTYPE_STATION) | BIT(NL80211_IFTYPE_P2P_CLIENT) | BIT(NL80211_IFTYPE_AP) | + BIT(NL80211_IFTYPE_MESH_POINT) | BIT(NL80211_IFTYPE_P2P_GO) | BIT(NL80211_IFTYPE_ADHOC) | BIT(NL80211_IFTYPE_P2P_DEVICE) | @@ -855,6 +860,9 @@ int wiphy_register(struct wiphy *wiphy) for (i = 0; i < sband->n_iftype_data; i++) { const struct ieee80211_sband_iftype_data *iftd; + bool has_ap, has_non_ap; + u32 ap_bits = BIT(NL80211_IFTYPE_AP) | + BIT(NL80211_IFTYPE_P2P_GO); iftd = &sband->iftype_data[i]; @@ -874,6 +882,19 @@ int wiphy_register(struct wiphy *wiphy) else have_he = have_he && iftd->he_cap.has_he; + + has_ap = iftd->types_mask & ap_bits; + has_non_ap = iftd->types_mask & ~ap_bits; + + /* + * For EHT 20 MHz STA, the capabilities format differs + * but to simplify, don't check 20 MHz but rather check + * only if AP and non-AP were mentioned at the same time, + * reject if so. 
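The wiphy_register() check above refuses an iftype_data entry that advertises EHT for both AP-type and non-AP-type interfaces at once, since their 20 MHz capability formats differ. The test reduces to two mask operations, shown here with illustrative bit values rather than the nl80211 interface-type enum:

```c
#include <stdint.h>
#include <stdio.h>

/* The EHT sanity check as two mask tests on a types bitmap. */
int main(void)
{
    uint32_t bit_station = 1u << 0;
    uint32_t bit_ap      = 1u << 1;
    uint32_t bit_p2p_go  = 1u << 2;

    uint32_t ap_bits = bit_ap | bit_p2p_go;
    uint32_t types_mask = bit_ap | bit_station;   /* a mixed entry */

    int has_ap = !!(types_mask & ap_bits);
    int has_non_ap = !!(types_mask & ~ap_bits);

    printf("reject=%d\n", has_ap && has_non_ap);  /* 1: AP + non-AP mixed */
    return 0;
}
```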
+ */ + if (WARN_ON(iftd->eht_cap.has_eht && + has_ap && has_non_ap)) + return -EINVAL; } if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ)) @@ -908,6 +929,12 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; #endif + if (!wiphy->max_num_akm_suites) + wiphy->max_num_akm_suites = NL80211_MAX_NR_AKM_SUITES; + else if (wiphy->max_num_akm_suites < NL80211_MAX_NR_AKM_SUITES || + wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES) + return -EINVAL; + /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); @@ -1045,11 +1072,13 @@ void wiphy_unregister(struct wiphy *wiphy) cancel_work_sync(&rdev->conn_work); flush_work(&rdev->event_work); cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); + cancel_delayed_work_sync(&rdev->background_cac_done_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); flush_work(&rdev->mgmt_registrations_update_wk); + flush_work(&rdev->background_cac_abort_wk); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) @@ -1111,6 +1140,7 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + unsigned int link_id; ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); @@ -1160,11 +1190,22 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, */ cfg80211_process_wdev_events(wdev); - if (WARN_ON(wdev->current_bss)) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; + if (wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) { + for (link_id = 0; link_id < ARRAY_SIZE(wdev->links); link_id++) { + struct cfg80211_internal_bss *curbss; + + curbss = wdev->links[link_id].client.current_bss; + + if (WARN_ON(curbss)) { + cfg80211_unhold_bss(curbss); + cfg80211_put_bss(wdev->wiphy, &curbss->pub); + wdev->links[link_id].client.current_bss = NULL; + } + } } + + wdev->connected = false; } void cfg80211_unregister_wdev(struct wireless_dev *wdev) @@ -1198,6 +1239,8 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, cfg80211_pmsr_wdev_down(wdev); + cfg80211_stop_background_radar_detection(wdev); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: __cfg80211_leave_ibss(rdev, dev, true); @@ -1224,7 +1267,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - __cfg80211_stop_ap(rdev, dev, true); + __cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_OCB: __cfg80211_leave_ocb(rdev, dev); @@ -1410,6 +1453,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, case NETDEV_GOING_DOWN: wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); + cfg80211_remove_links(wdev); wiphy_unlock(&rdev->wiphy); break; case NETDEV_DOWN: @@ -1454,9 +1498,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, memcpy(&setup, &default_mesh_setup, sizeof(setup)); /* back compat only needed for mesh_id */ - setup.mesh_id = wdev->ssid; - setup.mesh_id_len = wdev->mesh_id_up_len; - if (wdev->mesh_id_up_len) + setup.mesh_id = wdev->u.mesh.id; + setup.mesh_id_len = wdev->u.mesh.id_up_len; + if (wdev->u.mesh.id_up_len) __cfg80211_join_mesh(rdev, dev, &setup, &default_mesh_config); diff --git a/net/wireless/core.h b/net/wireless/core.h index 1720abf36f92..775e16cb99ed 100644 --- a/net/wireless/core.h 
+++ b/net/wireless/core.h @@ -3,7 +3,7 @@ * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -84,6 +84,11 @@ struct cfg80211_registered_device { struct delayed_work dfs_update_channels_wk; + struct wireless_dev *background_radar_wdev; + struct cfg80211_chan_def background_radar_chandef; + struct delayed_work background_cac_done_wk; + struct work_struct background_cac_abort_wk; + /* netlink port which started critical protocol (0 means not started) */ u32 crit_proto_nlportid; @@ -276,12 +281,6 @@ struct cfg80211_cached_keys { int def; }; -enum cfg80211_chan_mode { - CHAN_MODE_UNDEFINED, - CHAN_MODE_SHARED, - CHAN_MODE_EXCLUSIVE, -}; - struct cfg80211_beacon_registration { struct list_head list; u32 nlportid; @@ -308,6 +307,7 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev); void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, + unsigned int link, struct ieee80211_channel *channel); /* IBSS */ @@ -354,32 +354,25 @@ int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, /* AP */ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev, bool notify); + struct net_device *dev, int link, + bool notify); int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev, bool notify); + struct net_device *dev, int link, + bool notify); /* MLME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct ieee80211_channel *chan, - enum nl80211_auth_type auth_type, - const u8 *bssid, - const u8 *ssid, int ssid_len, - const u8 *ie, int ie_len, - const u8 *key, int key_len, int key_idx, - const u8 *auth_data, int auth_data_len); + struct cfg80211_auth_request *req); int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct ieee80211_channel *chan, - const u8 *bssid, - const u8 *ssid, int ssid_len, struct cfg80211_assoc_request *req); int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, bool local_state_change); int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *bssid, + struct net_device *dev, const u8 *ap_addr, const u8 *ie, int ie_len, u16 reason, bool local_state_change); void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, @@ -491,13 +484,28 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev); +int +cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_chan_def *chandef); + +void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev); + +void cfg80211_background_cac_done_wk(struct work_struct *work); + +void cfg80211_background_cac_abort_wk(struct work_struct *work); + bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, struct ieee80211_channel *chan); bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev); bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, - struct ieee80211_channel *chan); + struct ieee80211_channel *chan, + bool primary_only); +bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev, + struct 
ieee80211_channel *chan, + bool primary_only); static inline unsigned int elapsed_jiffies_msecs(unsigned long start) { @@ -509,12 +517,6 @@ static inline unsigned int elapsed_jiffies_msecs(unsigned long start) return jiffies_to_msecs(end + (ULONG_MAX - start) + 1); } -void -cfg80211_get_chan_state(struct wireless_dev *wdev, - struct ieee80211_channel **chan, - enum cfg80211_chan_mode *chanmode, - u8 *radar_detect); - int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef); @@ -560,4 +562,9 @@ void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid); void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev); void cfg80211_pmsr_free_wk(struct work_struct *work); +void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id); +void cfg80211_remove_links(struct wireless_dev *wdev); +int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); + #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 8f98e546becf..edd062f104f4 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -3,7 +3,7 @@ * Some IBSS support code for cfg80211. * * Copyright 2009 Johannes Berg - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation */ #include @@ -28,7 +28,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return; - if (!wdev->ssid_len) + if (!wdev->u.ibss.ssid_len) return; bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0, @@ -37,13 +37,13 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, if (WARN_ON(!bss)) return; - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); + if (wdev->u.ibss.current_bss) { + cfg80211_unhold_bss(wdev->u.ibss.current_bss); + cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } cfg80211_hold_bss(bss_from_pub(bss)); - wdev->current_bss = bss_from_pub(bss); + wdev->u.ibss.current_bss = bss_from_pub(bss); if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) cfg80211_upload_connect_keys(wdev); @@ -96,7 +96,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, lockdep_assert_held(&rdev->wiphy.mtx); ASSERT_WDEV_LOCK(wdev); - if (wdev->ssid_len) + if (wdev->u.ibss.ssid_len) return -EALREADY; if (!params->basic_rates) { @@ -131,9 +131,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, kfree_sensitive(wdev->connect_keys); wdev->connect_keys = connkeys; - wdev->ibss_fixed = params->channel_fixed; - wdev->ibss_dfs_possible = params->userspace_handles_dfs; - wdev->chandef = params->chandef; + wdev->u.ibss.chandef = params->chandef; if (connkeys) { params->wep_keys = connkeys->params; params->wep_tx_key = connkeys->def; @@ -148,8 +146,8 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, return err; } - memcpy(wdev->ssid, params->ssid, params->ssid_len); - wdev->ssid_len = params->ssid_len; + memcpy(wdev->u.ibss.ssid, params->ssid, params->ssid_len); + wdev->u.ibss.ssid_len = params->ssid_len; return 0; } @@ -173,16 +171,16 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) */ if (rdev->ops->del_key) for (i = 0; i < 6; i++) - rdev_del_key(rdev, dev, i, false, NULL); + rdev_del_key(rdev, dev, -1, i, false, NULL); - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, 
&wdev->current_bss->pub); + if (wdev->u.ibss.current_bss) { + cfg80211_unhold_bss(wdev->u.ibss.current_bss); + cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } - wdev->current_bss = NULL; - wdev->ssid_len = 0; - memset(&wdev->chandef, 0, sizeof(wdev->chandef)); + wdev->u.ibss.current_bss = NULL; + wdev->u.ibss.ssid_len = 0; + memset(&wdev->u.ibss.chandef, 0, sizeof(wdev->u.ibss.chandef)); #ifdef CONFIG_CFG80211_WEXT if (!nowext) wdev->wext.ibss.ssid_len = 0; @@ -207,7 +205,7 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); - if (!wdev->ssid_len) + if (!wdev->u.ibss.ssid_len) return -ENOLINK; err = rdev_leave_ibss(rdev, dev); @@ -341,7 +339,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev, wdev_lock(wdev); err = 0; - if (wdev->ssid_len) + if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); @@ -376,8 +374,8 @@ int cfg80211_ibss_wext_giwfreq(struct net_device *dev, return -EINVAL; wdev_lock(wdev); - if (wdev->current_bss) - chan = wdev->current_bss->pub.channel; + if (wdev->u.ibss.current_bss) + chan = wdev->u.ibss.current_bss->pub.channel; else if (wdev->wext.ibss.chandef.chan) chan = wdev->wext.ibss.chandef.chan; wdev_unlock(wdev); @@ -410,7 +408,7 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev, wdev_lock(wdev); err = 0; - if (wdev->ssid_len) + if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); @@ -421,8 +419,8 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev, if (len > 0 && ssid[len - 1] == '\0') len--; - memcpy(wdev->ssid, ssid, len); - wdev->wext.ibss.ssid = wdev->ssid; + memcpy(wdev->u.ibss.ssid, ssid, len); + wdev->wext.ibss.ssid = wdev->u.ibss.ssid; wdev->wext.ibss.ssid_len = len; wdev_lock(wdev); @@ -445,10 +443,10 @@ int cfg80211_ibss_wext_giwessid(struct net_device *dev, data->flags = 0; wdev_lock(wdev); - if (wdev->ssid_len) { + if (wdev->u.ibss.ssid_len) { data->flags = 1; - data->length = wdev->ssid_len; - memcpy(ssid, wdev->ssid, data->length); + data->length = wdev->u.ibss.ssid_len; + memcpy(ssid, wdev->u.ibss.ssid, data->length); } else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) { data->flags = 1; data->length = wdev->wext.ibss.ssid_len; @@ -496,7 +494,7 @@ int cfg80211_ibss_wext_siwap(struct net_device *dev, wdev_lock(wdev); err = 0; - if (wdev->ssid_len) + if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); @@ -529,8 +527,9 @@ int cfg80211_ibss_wext_giwap(struct net_device *dev, ap_addr->sa_family = ARPHRD_ETHER; wdev_lock(wdev); - if (wdev->current_bss) - memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN); + if (wdev->u.ibss.current_bss) + memcpy(ap_addr->sa_data, wdev->u.ibss.current_bss->pub.bssid, + ETH_ALEN); else if (wdev->wext.ibss.bssid) memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN); else diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index e4e363138279..59a3c5c092b1 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -1,4 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * Portions + * Copyright (C) 2022 Intel Corporation + */ #include #include #include @@ -114,7 +118,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, setup->is_secure) return -EOPNOTSUPP; - if (wdev->mesh_id_len) + if (wdev->u.mesh.id_len) return -EALREADY; if (!setup->mesh_id_len) @@ -125,7 +129,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, if (!setup->chandef.chan) { /* if no channel 
explicitly given, use preset channel */ - setup->chandef = wdev->preset_chandef; + setup->chandef = wdev->u.mesh.preset_chandef; } if (!setup->chandef.chan) { @@ -209,10 +213,10 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, err = rdev_join_mesh(rdev, dev, conf, setup); if (!err) { - memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len); - wdev->mesh_id_len = setup->mesh_id_len; - wdev->chandef = setup->chandef; - wdev->beacon_interval = setup->beacon_interval; + memcpy(wdev->u.mesh.id, setup->mesh_id, setup->mesh_id_len); + wdev->u.mesh.id_len = setup->mesh_id_len; + wdev->u.mesh.chandef = setup->chandef; + wdev->u.mesh.beacon_interval = setup->beacon_interval; } return err; @@ -241,15 +245,15 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, err = rdev_libertas_set_mesh_channel(rdev, wdev->netdev, chandef->chan); if (!err) - wdev->chandef = *chandef; + wdev->u.mesh.chandef = *chandef; return err; } - if (wdev->mesh_id_len) + if (wdev->u.mesh.id_len) return -EBUSY; - wdev->preset_chandef = *chandef; + wdev->u.mesh.preset_chandef = *chandef; return 0; } @@ -267,15 +271,16 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, if (!rdev->ops->leave_mesh) return -EOPNOTSUPP; - if (!wdev->mesh_id_len) + if (!wdev->u.mesh.id_len) return -ENOTCONN; err = rdev_leave_mesh(rdev, dev); if (!err) { wdev->conn_owner_nlportid = 0; - wdev->mesh_id_len = 0; - wdev->beacon_interval = 0; - memset(&wdev->chandef, 0, sizeof(wdev->chandef)); + wdev->u.mesh.id_len = 0; + wdev->u.mesh.beacon_interval = 0; + memset(&wdev->u.mesh.chandef, 0, + sizeof(wdev->u.mesh.chandef)); rdev_set_qos_map(rdev, dev, NULL); cfg80211_sched_dfs_chan_update(rdev); } diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 783acd2c4211..248f15bb99e6 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -4,7 +4,7 @@ * * Copyright (c) 2009, Jouni Malinen * Copyright (c) 2015 Intel Deutschland GmbH - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2020, 2022 Intel Corporation */ #include @@ -21,36 +21,48 @@ #include "rdev-ops.h" -void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, - const u8 *buf, size_t len, int uapsd_queues, - const u8 *req_ies, size_t req_ies_len) +void cfg80211_rx_assoc_resp(struct net_device *dev, + struct cfg80211_rx_assoc_resp *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; - struct cfg80211_connect_resp_params cr; - const u8 *resp_ie = mgmt->u.assoc_resp.variable; - size_t resp_ie_len = len - offsetof(struct ieee80211_mgmt, - u.assoc_resp.variable); + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)data->buf; + struct cfg80211_connect_resp_params cr = { + .timeout_reason = NL80211_TIMEOUT_UNSPECIFIED, + .req_ie = data->req_ies, + .req_ie_len = data->req_ies_len, + .resp_ie = mgmt->u.assoc_resp.variable, + .resp_ie_len = data->len - + offsetof(struct ieee80211_mgmt, + u.assoc_resp.variable), + .status = le16_to_cpu(mgmt->u.assoc_resp.status_code), + .ap_mld_addr = data->ap_mld_addr, + }; + unsigned int link_id; - if (bss->channel->band == NL80211_BAND_S1GHZ) { - resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable; - resp_ie_len = len - offsetof(struct ieee80211_mgmt, - u.s1g_assoc_resp.variable); + for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { + cr.links[link_id].bss = data->links[link_id].bss; + if 
(!cr.links[link_id].bss) + continue; + cr.links[link_id].bssid = data->links[link_id].bss->bssid; + cr.links[link_id].addr = data->links[link_id].addr; + /* need to have local link addresses for MLO connections */ + WARN_ON(cr.ap_mld_addr && !cr.links[link_id].addr); + + if (cr.links[link_id].bss->channel->band == NL80211_BAND_S1GHZ) { + WARN_ON(link_id); + cr.resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable; + cr.resp_ie_len = data->len - + offsetof(struct ieee80211_mgmt, + u.s1g_assoc_resp.variable); + } + + if (cr.ap_mld_addr) + cr.valid_links |= BIT(link_id); } - memset(&cr, 0, sizeof(cr)); - cr.status = (int)le16_to_cpu(mgmt->u.assoc_resp.status_code); - cr.bssid = mgmt->bssid; - cr.bss = bss; - cr.req_ie = req_ies; - cr.req_ie_len = req_ies_len; - cr.resp_ie = resp_ie; - cr.resp_ie_len = resp_ie_len; - cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; - - trace_cfg80211_send_rx_assoc(dev, bss); + trace_cfg80211_send_rx_assoc(dev, data); /* * This is a bit of a hack, we don't notify userspace of @@ -59,13 +71,19 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, * frame instead of reassoc. */ if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) { - cfg80211_unhold_bss(bss_from_pub(bss)); - cfg80211_put_bss(wiphy, bss); + for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { + struct cfg80211_bss *bss = data->links[link_id].bss; + + if (!bss) + continue; + + cfg80211_unhold_bss(bss_from_pub(bss)); + cfg80211_put_bss(wiphy, bss); + } return; } - nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL, uapsd_queues, - req_ies, req_ies_len); + nl80211_send_rx_assoc(rdev, dev, data); /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, &cr, cr.status == WLAN_STATUS_SUCCESS); } @@ -92,8 +110,7 @@ static void cfg80211_process_deauth(struct wireless_dev *wdev, nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); - if (!wdev->current_bss || - !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)) + if (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid)) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); @@ -113,8 +130,8 @@ static void cfg80211_process_disassoc(struct wireless_dev *wdev, nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); - if (WARN_ON(!wdev->current_bss || - !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + if (WARN_ON(!wdev->connected || + !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); @@ -155,33 +172,35 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr) } EXPORT_SYMBOL(cfg80211_auth_timeout); -void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss) +void cfg80211_assoc_failure(struct net_device *dev, + struct cfg80211_assoc_failure *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + const u8 *addr = data->ap_mld_addr ?: data->bss[0]->bssid; + int i; - trace_cfg80211_send_assoc_timeout(dev, bss->bssid); + trace_cfg80211_send_assoc_failure(dev, data); - nl80211_send_assoc_timeout(rdev, dev, bss->bssid, GFP_KERNEL); - cfg80211_sme_assoc_timeout(wdev); + if (data->timeout) { + nl80211_send_assoc_timeout(rdev, dev, addr, GFP_KERNEL); + cfg80211_sme_assoc_timeout(wdev); + } else { + cfg80211_sme_abandon_assoc(wdev); + } - cfg80211_unhold_bss(bss_from_pub(bss)); - 
cfg80211_put_bss(wiphy, bss); + for (i = 0; i < ARRAY_SIZE(data->bss); i++) { + struct cfg80211_bss *bss = data->bss[i]; + + if (!bss) + continue; + + cfg80211_unhold_bss(bss_from_pub(bss)); + cfg80211_put_bss(wiphy, bss); + } } -EXPORT_SYMBOL(cfg80211_assoc_timeout); - -void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct wiphy *wiphy = wdev->wiphy; - - cfg80211_sme_abandon_assoc(wdev); - - cfg80211_unhold_bss(bss_from_pub(bss)); - cfg80211_put_bss(wiphy, bss); -} -EXPORT_SYMBOL(cfg80211_abandon_assoc); +EXPORT_SYMBOL(cfg80211_assoc_failure); void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len, bool reconnect) @@ -233,47 +252,30 @@ EXPORT_SYMBOL(cfg80211_michael_mic_failure); /* some MLME handling for userspace SME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct ieee80211_channel *chan, - enum nl80211_auth_type auth_type, - const u8 *bssid, - const u8 *ssid, int ssid_len, - const u8 *ie, int ie_len, - const u8 *key, int key_len, int key_idx, - const u8 *auth_data, int auth_data_len) + struct cfg80211_auth_request *req) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_auth_request req = { - .ie = ie, - .ie_len = ie_len, - .auth_data = auth_data, - .auth_data_len = auth_data_len, - .auth_type = auth_type, - .key = key, - .key_len = key_len, - .key_idx = key_idx, - }; - int err; ASSERT_WDEV_LOCK(wdev); - if (auth_type == NL80211_AUTHTYPE_SHARED_KEY) - if (!key || !key_len || key_idx < 0 || key_idx > 3) - return -EINVAL; - - if (wdev->current_bss && - ether_addr_equal(bssid, wdev->current_bss->pub.bssid)) - return -EALREADY; - - req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, - IEEE80211_BSS_TYPE_ESS, - IEEE80211_PRIVACY_ANY); - if (!req.bss) + if (!req->bss) return -ENOENT; - err = rdev_auth(rdev, dev, &req); + if (req->link_id >= 0 && + !(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; - cfg80211_put_bss(&rdev->wiphy, req.bss); - return err; + if (req->auth_type == NL80211_AUTHTYPE_SHARED_KEY) { + if (!req->key || !req->key_len || + req->key_idx < 0 || req->key_idx > 3) + return -EINVAL; + } + + if (wdev->connected && + ether_addr_equal(req->bss->bssid, wdev->u.client.connected_addr)) + return -EALREADY; + + return rdev_auth(rdev, dev, req); } /* Do a logical ht_capa &= ht_capa_mask. 
*/ @@ -310,21 +312,28 @@ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, p1[i] &= p2[i]; } +/* Note: caller must cfg80211_put_bss() regardless of result */ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct ieee80211_channel *chan, - const u8 *bssid, - const u8 *ssid, int ssid_len, struct cfg80211_assoc_request *req) { struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; + int err, i, j; ASSERT_WDEV_LOCK(wdev); - if (wdev->current_bss && - (!req->prev_bssid || !ether_addr_equal(wdev->current_bss->pub.bssid, - req->prev_bssid))) + for (i = 1; i < ARRAY_SIZE(req->links); i++) { + if (!req->links[i].bss) + continue; + for (j = 0; j < i; j++) { + if (req->links[i].bss == req->links[j].bss) + return -EINVAL; + } + } + + if (wdev->connected && + (!req->prev_bssid || + !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid))) return -EALREADY; cfg80211_oper_and_ht_capa(&req->ht_capa_mask, @@ -332,18 +341,22 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, cfg80211_oper_and_vht_capa(&req->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); - req->bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, - IEEE80211_BSS_TYPE_ESS, - IEEE80211_PRIVACY_ANY); - if (!req->bss) - return -ENOENT; - err = rdev_assoc(rdev, dev, req); - if (!err) - cfg80211_hold_bss(bss_from_pub(req->bss)); - else - cfg80211_put_bss(&rdev->wiphy, req->bss); + if (!err) { + int link_id; + if (req->bss) { + cfg80211_ref_bss(&rdev->wiphy, req->bss); + cfg80211_hold_bss(bss_from_pub(req->bss)); + } + + for (link_id = 0; link_id < ARRAY_SIZE(req->links); link_id++) { + if (!req->links[link_id].bss) + continue; + cfg80211_ref_bss(&rdev->wiphy, req->links[link_id].bss); + cfg80211_hold_bss(bss_from_pub(req->links[link_id].bss)); + } + } return err; } @@ -364,20 +377,20 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); if (local_state_change && - (!wdev->current_bss || - !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + (!wdev->connected || + !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return 0; if (ether_addr_equal(wdev->disconnect_bssid, bssid) || - (wdev->current_bss && - ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + (wdev->connected && + ether_addr_equal(wdev->u.client.connected_addr, bssid))) wdev->conn_owner_nlportid = 0; return rdev_deauth(rdev, dev, &req); } int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *bssid, + struct net_device *dev, const u8 *ap_addr, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { @@ -387,17 +400,16 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, .local_state_change = local_state_change, .ie = ie, .ie_len = ie_len, + .ap_addr = ap_addr, }; int err; ASSERT_WDEV_LOCK(wdev); - if (!wdev->current_bss) + if (!wdev->connected) return -ENOTCONN; - if (ether_addr_equal(wdev->current_bss->pub.bssid, bssid)) - req.bss = &wdev->current_bss->pub; - else + if (memcmp(wdev->u.client.connected_addr, ap_addr, ETH_ALEN)) return -ENOTCONN; err = rdev_disassoc(rdev, dev, &req); @@ -405,7 +417,7 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, return err; /* driver should have reported the disassoc */ - WARN_ON(wdev->current_bss); + WARN_ON(wdev->connected); return 0; } @@ -420,10 +432,10 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, if (!rdev->ops->deauth) return; - if (!wdev->current_bss) + if 
(!wdev->connected) return; - memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN); + memcpy(bssid, wdev->u.client.connected_addr, ETH_ALEN); cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); } @@ -643,6 +655,18 @@ void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) cfg80211_mgmt_registrations_update(wdev); } +static bool cfg80211_allowed_address(struct wireless_dev *wdev, const u8 *addr) +{ + int i; + + for_each_valid_link(wdev, i) { + if (ether_addr_equal(addr, wdev->links[i].addr)) + return true; + } + + return ether_addr_equal(addr, wdev_address(wdev)); +} + int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) @@ -676,28 +700,34 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: + /* + * check for IBSS DA must be done by driver as + * cfg80211 doesn't track the stations + */ + if (!wdev->u.ibss.current_bss || + !ether_addr_equal(wdev->u.ibss.current_bss->pub.bssid, + mgmt->bssid)) { + err = -ENOTCONN; + break; + } + break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (!wdev->current_bss) { + if (!wdev->connected) { err = -ENOTCONN; break; } - if (!ether_addr_equal(wdev->current_bss->pub.bssid, + /* FIXME: MLD may address this differently */ + + if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->bssid)) { err = -ENOTCONN; break; } - /* - * check for IBSS DA must be done by driver as - * cfg80211 doesn't track the stations - */ - if (wdev->iftype == NL80211_IFTYPE_ADHOC) - break; - /* for station, check that DA is the AP */ - if (!ether_addr_equal(wdev->current_bss->pub.bssid, + if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->da)) { err = -ENOTCONN; break; @@ -735,7 +765,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return err; } - if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) { + if (!cfg80211_allowed_address(wdev, mgmt->sa)) { /* Allow random TA to be used with Public Action frames if the * driver has indicated support for this. Otherwise, only allow * the local address to be used. 
@@ -743,31 +773,31 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, if (!ieee80211_is_action(mgmt->frame_control) || mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) return -EINVAL; - if (!wdev->current_bss && + if (!wdev->connected && !wiphy_ext_feature_isset( &rdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) return -EINVAL; - if (wdev->current_bss && + if (wdev->connected && !wiphy_ext_feature_isset( &rdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) return -EINVAL; } - /* Transmit the Action frame as requested by user space */ + /* Transmit the management frame as requested by user space */ return rdev_mgmt_tx(rdev, wdev, params, cookie); } -bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags) +bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev, + struct cfg80211_rx_info *info) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg; const struct ieee80211_txrx_stypes *stypes = &wiphy->mgmt_stypes[wdev->iftype]; - struct ieee80211_mgmt *mgmt = (void *)buf; + struct ieee80211_mgmt *mgmt = (void *)info->buf; const u8 *data; int data_len; bool result = false; @@ -775,7 +805,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); u16 stype; - trace_cfg80211_rx_mgmt(wdev, freq, sig_dbm); + trace_cfg80211_rx_mgmt(wdev, info); stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; if (!(stypes->rx & BIT(stype))) { @@ -783,8 +813,8 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, return false; } - data = buf + ieee80211_hdrlen(mgmt->frame_control); - data_len = len - ieee80211_hdrlen(mgmt->frame_control); + data = info->buf + ieee80211_hdrlen(mgmt->frame_control); + data_len = info->len - ieee80211_hdrlen(mgmt->frame_control); spin_lock_bh(&rdev->mgmt_registrations_lock); @@ -801,9 +831,8 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, /* found match! 
*/ /* Indicate the received Action frame to user space */ - if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, - freq, sig_dbm, - buf, len, flags, GFP_ATOMIC)) + if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, info, + GFP_ATOMIC)) continue; result = true; @@ -815,7 +844,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, trace_cfg80211_return_bool(result); return result; } -EXPORT_SYMBOL(cfg80211_rx_mgmt_khz); +EXPORT_SYMBOL(cfg80211_rx_mgmt_ext); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { @@ -905,13 +934,13 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) } -void cfg80211_radar_event(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, - gfp_t gfp) +void __cfg80211_radar_event(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + bool offchan, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - trace_cfg80211_radar_event(wiphy, chandef); + trace_cfg80211_radar_event(wiphy, chandef, offchan); /* only set the chandef supplied channel to unavailable, in * case the radar is detected on only one of multiple channels @@ -919,6 +948,9 @@ void cfg80211_radar_event(struct wiphy *wiphy, */ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE); + if (offchan) + queue_work(cfg80211_wq, &rdev->background_cac_abort_wk); + cfg80211_sched_dfs_chan_update(rdev); nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp); @@ -926,7 +958,7 @@ void cfg80211_radar_event(struct wiphy *wiphy, memcpy(&rdev->radar_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk); } -EXPORT_SYMBOL(cfg80211_radar_event); +EXPORT_SYMBOL(__cfg80211_radar_event); void cfg80211_cac_event(struct net_device *netdev, const struct cfg80211_chan_def *chandef, @@ -937,14 +969,15 @@ void cfg80211_cac_event(struct net_device *netdev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long timeout; + /* not yet supported */ + if (wdev->valid_links) + return; + trace_cfg80211_cac_event(netdev, event); if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED)) return; - if (WARN_ON(!wdev->chandef.chan)) - return; - switch (event) { case NL80211_RADAR_CAC_FINISHED: timeout = wdev->cac_start_time + @@ -970,3 +1003,143 @@ void cfg80211_cac_event(struct net_device *netdev, nl80211_radar_notify(rdev, chandef, event, netdev, gfp); } EXPORT_SYMBOL(cfg80211_cac_event); + +static void +__cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + const struct cfg80211_chan_def *chandef, + enum nl80211_radar_event event) +{ + struct wiphy *wiphy = &rdev->wiphy; + struct net_device *netdev; + + lockdep_assert_wiphy(&rdev->wiphy); + + if (!cfg80211_chandef_valid(chandef)) + return; + + if (!rdev->background_radar_wdev) + return; + + switch (event) { + case NL80211_RADAR_CAC_FINISHED: + cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); + memcpy(&rdev->cac_done_chandef, chandef, sizeof(*chandef)); + queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); + cfg80211_sched_dfs_chan_update(rdev); + wdev = rdev->background_radar_wdev; + break; + case NL80211_RADAR_CAC_ABORTED: + if (!cancel_delayed_work(&rdev->background_cac_done_wk)) + return; + wdev = rdev->background_radar_wdev; + break; + case NL80211_RADAR_CAC_STARTED: + break; + default: + return; + } + + netdev = wdev ? 
wdev->netdev : NULL; + nl80211_radar_notify(rdev, chandef, event, netdev, GFP_KERNEL); +} + +static void +cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, + const struct cfg80211_chan_def *chandef, + enum nl80211_radar_event event) +{ + wiphy_lock(&rdev->wiphy); + __cfg80211_background_cac_event(rdev, rdev->background_radar_wdev, + chandef, event); + wiphy_unlock(&rdev->wiphy); +} + +void cfg80211_background_cac_done_wk(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct cfg80211_registered_device *rdev; + + rdev = container_of(delayed_work, struct cfg80211_registered_device, + background_cac_done_wk); + cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef, + NL80211_RADAR_CAC_FINISHED); +} + +void cfg80211_background_cac_abort_wk(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(work, struct cfg80211_registered_device, + background_cac_abort_wk); + cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef, + NL80211_RADAR_CAC_ABORTED); +} + +void cfg80211_background_cac_abort(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + queue_work(cfg80211_wq, &rdev->background_cac_abort_wk); +} +EXPORT_SYMBOL(cfg80211_background_cac_abort); + +int +cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_chan_def *chandef) +{ + unsigned int cac_time_ms; + int err; + + lockdep_assert_wiphy(&rdev->wiphy); + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_RADAR_BACKGROUND)) + return -EOPNOTSUPP; + + /* Offchannel chain already locked by another wdev */ + if (rdev->background_radar_wdev && rdev->background_radar_wdev != wdev) + return -EBUSY; + + /* CAC already in progress on the offchannel chain */ + if (rdev->background_radar_wdev == wdev && + delayed_work_pending(&rdev->background_cac_done_wk)) + return -EBUSY; + + err = rdev_set_radar_background(rdev, chandef); + if (err) + return err; + + cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, chandef); + if (!cac_time_ms) + cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; + + rdev->background_radar_chandef = *chandef; + rdev->background_radar_wdev = wdev; /* Get offchain ownership */ + + __cfg80211_background_cac_event(rdev, wdev, chandef, + NL80211_RADAR_CAC_STARTED); + queue_delayed_work(cfg80211_wq, &rdev->background_cac_done_wk, + msecs_to_jiffies(cac_time_ms)); + + return 0; +} + +void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + lockdep_assert_wiphy(wiphy); + + if (wdev != rdev->background_radar_wdev) + return; + + rdev_set_radar_background(rdev, NULL); + rdev->background_radar_wdev = NULL; /* Release offchain ownership */ + + __cfg80211_background_cac_event(rdev, wdev, + &rdev->background_radar_chandef, + NL80211_RADAR_CAC_ABORTED); +} diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bb46a6a34614..cd6f80a8c44d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #include @@ -285,6 +285,15 @@ static int validate_ie_attr(const struct nlattr *attr, return -EINVAL; } 
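For reference, the off-channel ("background") radar detection added above reduces to a small ownership state machine: one wdev at a time may hold the off-channel chain, starting a CAC queues delayed "CAC done" work sized by the channel's CAC time, and a radar event or an explicit stop cancels it. Below is a minimal, self-contained userspace model of just the ownership rules; the names are illustrative and this is not the kernel API.

/* Standalone model of the off-channel CAC ownership rules above.
 * -1 stands in for -EBUSY; real code also checks the
 * NL80211_EXT_FEATURE_RADAR_BACKGROUND capability first. */
#include <stdbool.h>
#include <stdio.h>

struct radar_chain {
	void *owner;      /* wdev currently holding the off-channel chain */
	bool cac_pending; /* delayed "CAC done" work still queued */
};

static int chain_start_cac(struct radar_chain *c, void *wdev)
{
	if (c->owner && c->owner != wdev)
		return -1;            /* chain locked by another wdev */
	if (c->owner == wdev && c->cac_pending)
		return -1;            /* CAC already in progress */
	c->owner = wdev;              /* take off-chain ownership */
	c->cac_pending = true;        /* queue_delayed_work(cac_done) */
	return 0;
}

static void chain_radar_detected(struct radar_chain *c)
{
	c->cac_pending = false;       /* cancel pending work -> CAC aborted */
}

static void chain_stop_cac(struct radar_chain *c, void *wdev)
{
	if (c->owner != wdev)
		return;               /* only the owner may release */
	c->owner = NULL;
	c->cac_pending = false;
}

int main(void)
{
	struct radar_chain c = { 0 };
	int a = 1, b = 2;             /* stand-ins for two wdevs */

	printf("start A: %d\n", chain_start_cac(&c, &a)); /* 0 */
	printf("start B: %d\n", chain_start_cac(&c, &b)); /* -1, busy */
	chain_radar_detected(&c);
	chain_stop_cac(&c, &a);
	printf("start B: %d\n", chain_start_cac(&c, &b)); /* 0 */
	return 0;
}

Single ownership matches the hardware: there is only one off-channel radar chain, so a second wdev asking for it gets -EBUSY rather than a queue.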
+static int validate_he_capa(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + if (!ieee80211_he_capa_size_ok(nla_data(attr), nla_len(attr))) + return -EINVAL; + + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -437,6 +446,22 @@ sar_policy[NL80211_SAR_ATTR_MAX + 1] = { [NL80211_SAR_ATTR_SPECS] = NLA_POLICY_NESTED_ARRAY(sar_specs_policy), }; +static const struct nla_policy +nl80211_mbssid_config_policy[NL80211_MBSSID_CONFIG_ATTR_MAX + 1] = { + [NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES] = NLA_POLICY_MIN(NLA_U8, 2), + [NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY] = + NLA_POLICY_MIN(NLA_U8, 1), + [NL80211_MBSSID_CONFIG_ATTR_INDEX] = { .type = NLA_U8 }, + [NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX] = { .type = NLA_U32 }, + [NL80211_MBSSID_CONFIG_ATTR_EMA] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = { + [NL80211_STA_WME_UAPSD_QUEUES] = { .type = NLA_U8 }, + [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -541,9 +566,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_USE_MFP] = NLA_POLICY_RANGE(NLA_U32, NL80211_MFP_NO, NL80211_MFP_OPTIONAL), - [NL80211_ATTR_STA_FLAGS2] = { - .len = sizeof(struct nl80211_sta_flag_update), - }, + [NL80211_ATTR_STA_FLAGS2] = + NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_sta_flag_update)), [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 }, [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, @@ -596,6 +620,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, IEEE80211_MAX_DATA_LEN), [NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG }, + [NL80211_ATTR_STA_WME] = NLA_POLICY_NESTED(nl80211_sta_wme_policy), [NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED }, [NL80211_ATTR_TX_NO_CCK_RATE] = { .type = NLA_FLAG }, [NL80211_ATTR_TDLS_ACTION] = { .type = NLA_U8 }, @@ -721,9 +746,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, [NL80211_ATTR_HE_CAPABILITY] = - NLA_POLICY_RANGE(NLA_BINARY, - NL80211_HE_MIN_CAPABILITY_LEN, - NL80211_HE_MAX_CAPABILITY_LEN), + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_he_capa, + NL80211_HE_MAX_CAPABILITY_LEN), [NL80211_ATTR_FTM_RESPONDER] = NLA_POLICY_NESTED(nl80211_ftm_responder_policy), [NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1), @@ -764,6 +788,23 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_COLOR_CHANGE_COUNT] = { .type = NLA_U8 }, [NL80211_ATTR_COLOR_CHANGE_COLOR] = { .type = NLA_U8 }, [NL80211_ATTR_COLOR_CHANGE_ELEMS] = NLA_POLICY_NESTED(nl80211_policy), + [NL80211_ATTR_MBSSID_CONFIG] = + NLA_POLICY_NESTED(nl80211_mbssid_config_policy), + [NL80211_ATTR_MBSSID_ELEMS] = { .type = NLA_NESTED }, + [NL80211_ATTR_RADAR_BACKGROUND] = { .type = NLA_FLAG }, + [NL80211_ATTR_AP_SETTINGS_FLAGS] = { .type = NLA_U32 }, + [NL80211_ATTR_EHT_CAPABILITY] = + NLA_POLICY_RANGE(NLA_BINARY, + NL80211_EHT_MIN_CAPABILITY_LEN, + NL80211_EHT_MAX_CAPABILITY_LEN), + [NL80211_ATTR_DISABLE_EHT] = { .type = NLA_FLAG }, + [NL80211_ATTR_MLO_LINKS] = + 
NLA_POLICY_NESTED_ARRAY(nl80211_policy), + [NL80211_ATTR_MLO_LINK_ID] = + NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS), + [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN), + [NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, + [NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT }, }; /* policy for the key attributes */ @@ -854,6 +895,7 @@ nl80211_match_band_rssi_policy[NUM_NL80211_BANDS] = { [NL80211_BAND_5GHZ] = { .type = NLA_S32 }, [NL80211_BAND_6GHZ] = { .type = NLA_S32 }, [NL80211_BAND_60GHZ] = { .type = NLA_S32 }, + [NL80211_BAND_LC] = { .type = NLA_S32 }, }; static const struct nla_policy @@ -1133,6 +1175,12 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_16MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_EHT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_EHT)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -1190,6 +1238,37 @@ static bool nl80211_put_txq_stats(struct sk_buff *msg, /* netlink command implementations */ +/** + * nl80211_link_id - return link ID + * @attrs: attributes to look at + * + * Returns: the link ID or 0 if not given + * + * Note this function doesn't do any validation of the link + * ID validity wrt. links that were actually added, so it must + * be called only from ops with %NL80211_FLAG_MLO_VALID_LINK_ID + * or if additional validation is done. + */ +static unsigned int nl80211_link_id(struct nlattr **attrs) +{ + struct nlattr *linkid = attrs[NL80211_ATTR_MLO_LINK_ID]; + + if (!linkid) + return 0; + + return nla_get_u8(linkid); +} + +static int nl80211_link_id_or_invalid(struct nlattr **attrs) +{ + struct nlattr *linkid = attrs[NL80211_ATTR_MLO_LINK_ID]; + + if (!linkid) + return -1; + + return nla_get_u8(linkid); +} + struct key_parse { struct key_params p; int idx; @@ -1461,11 +1540,14 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) case NL80211_IFTYPE_MESH_POINT: break; case NL80211_IFTYPE_ADHOC: + if (wdev->u.ibss.current_bss) + return 0; + return -ENOLINK; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (!wdev->current_bss) - return -ENOLINK; - break; + if (wdev->connected) + return 0; + return -ENOLINK; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_MONITOR: @@ -1714,6 +1796,7 @@ nl80211_send_iftype_data(struct sk_buff *msg, const struct ieee80211_sband_iftype_data *iftdata) { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; + const struct ieee80211_sta_eht_cap *eht_cap = &iftdata->eht_cap; if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES, iftdata->types_mask)) @@ -1734,6 +1817,37 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (eht_cap->has_eht && he_cap->has_he) { + u8 mcs_nss_size, ppe_thresh_size; + u16 ppe_thres_hdr; + bool is_ap; + + is_ap = iftdata->types_mask & BIT(NL80211_IFTYPE_AP) || + iftdata->types_mask & BIT(NL80211_IFTYPE_P2P_GO); + + mcs_nss_size = + ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + is_ap); + + ppe_thres_hdr = get_unaligned_le16(&eht_cap->eht_ppe_thres[0]); + ppe_thresh_size = + ieee80211_eht_ppe_size(ppe_thres_hdr, + eht_cap->eht_cap_elem.phy_cap_info); + + if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC, + sizeof(eht_cap->eht_cap_elem.mac_cap_info), 
+ eht_cap->eht_cap_elem.mac_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY, + sizeof(eht_cap->eht_cap_elem.phy_cap_info), + eht_cap->eht_cap_elem.phy_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET, + mcs_nss_size, &eht_cap->eht_mcs_nss_supp) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE, + ppe_thresh_size, eht_cap->eht_ppe_thres)) + return -ENOBUFS; + } + if (sband->band == NL80211_BAND_6GHZ && nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, sizeof(iftdata->he_6ghz_capa), @@ -2212,6 +2326,35 @@ fail: return -ENOBUFS; } +static int nl80211_put_mbssid_support(struct wiphy *wiphy, struct sk_buff *msg) +{ + struct nlattr *config; + + if (!wiphy->mbssid_max_interfaces) + return 0; + + config = nla_nest_start(msg, NL80211_ATTR_MBSSID_CONFIG); + if (!config) + return -ENOBUFS; + + if (nla_put_u8(msg, NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES, + wiphy->mbssid_max_interfaces)) + goto fail; + + if (wiphy->ema_max_profile_periodicity && + nla_put_u8(msg, + NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY, + wiphy->ema_max_profile_periodicity)) + goto fail; + + nla_nest_end(msg, config); + return 0; + +fail: + nla_nest_cancel(msg, config); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -2734,6 +2877,15 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, capab->extended_capabilities_mask)) goto nla_put_failure; + if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO && + (nla_put_u16(msg, + NL80211_ATTR_EML_CAPABILITY, + capab->eml_capabilities) || + nla_put_u16(msg, + NL80211_ATTR_MLD_CAPA_AND_OPS, + capab->mld_capa_and_ops))) + goto nla_put_failure; + nla_nest_end(msg, nested_ext_capab); if (state->split) break; @@ -2797,6 +2949,16 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_sar_specs(rdev, msg)) goto nla_put_failure; + if (nl80211_put_mbssid_support(&rdev->wiphy, msg)) + goto nla_put_failure; + + if (nla_put_u16(msg, NL80211_ATTR_MAX_NUM_AKM_SUITES, + rdev->wiphy.max_num_akm_suites)) + goto nla_put_failure; + + if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO) + nla_put_flag(msg, NL80211_ATTR_MLO_SUPPORT); + /* done */ state->split_start = 0; break; @@ -3138,12 +3300,14 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct genl_info *info) + struct genl_info *info, + int _link_id) { struct cfg80211_chan_def chandef; int result; enum nl80211_iftype iftype = NL80211_IFTYPE_MONITOR; struct wireless_dev *wdev = NULL; + int link_id = _link_id; if (dev) wdev = dev->ieee80211_ptr; @@ -3152,6 +3316,12 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, if (wdev) iftype = wdev->iftype; + if (link_id < 0) { + if (wdev && wdev->valid_links) + return -EINVAL; + link_id = 0; + } + result = nl80211_parse_chandef(rdev, info, &chandef); if (result) return result; @@ -3160,49 +3330,53 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef, - iftype)) { - result = -EINVAL; - break; - } - if (wdev->beacon_interval) { + iftype)) + return -EINVAL; + if (wdev->links[link_id].ap.beacon_interval) { + struct ieee80211_channel *cur_chan; + if (!dev || !rdev->ops->set_ap_chanwidth || !(rdev->wiphy.features & - NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE)) { - result = -EBUSY; - break; - } + 
NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE)) + return -EBUSY; /* Only allow dynamic channel width changes */ - if (chandef.chan != wdev->preset_chandef.chan) { - result = -EBUSY; - break; - } - result = rdev_set_ap_chanwidth(rdev, dev, &chandef); + cur_chan = wdev->links[link_id].ap.chandef.chan; + if (chandef.chan != cur_chan) + return -EBUSY; + + result = rdev_set_ap_chanwidth(rdev, dev, link_id, + &chandef); if (result) - break; + return result; + wdev->links[link_id].ap.chandef = chandef; + } else { + wdev->u.ap.preset_chandef = chandef; } - wdev->preset_chandef = chandef; - result = 0; - break; + return 0; case NL80211_IFTYPE_MESH_POINT: - result = cfg80211_set_mesh_channel(rdev, wdev, &chandef); - break; + return cfg80211_set_mesh_channel(rdev, wdev, &chandef); case NL80211_IFTYPE_MONITOR: - result = cfg80211_set_monitor_channel(rdev, &chandef); - break; + return cfg80211_set_monitor_channel(rdev, &chandef); default: - result = -EINVAL; + break; } - return result; + return -EINVAL; } static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int link_id = nl80211_link_id_or_invalid(info->attrs); struct net_device *netdev = info->user_ptr[1]; + int ret; - return __nl80211_set_channel(rdev, netdev, info); + wdev_lock(netdev->ieee80211_ptr); + ret = __nl80211_set_channel(rdev, netdev, info, link_id); + wdev_unlock(netdev->ieee80211_ptr); + + return ret; } static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) @@ -3306,18 +3480,40 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (result) goto out; - result = rdev_set_txq_params(rdev, netdev, - &txq_params); + txq_params.link_id = + nl80211_link_id_or_invalid(info->attrs); + + wdev_lock(netdev->ieee80211_ptr); + if (txq_params.link_id >= 0 && + !(netdev->ieee80211_ptr->valid_links & + BIT(txq_params.link_id))) + result = -ENOLINK; + else if (txq_params.link_id >= 0 && + !netdev->ieee80211_ptr->valid_links) + result = -EINVAL; + else + result = rdev_set_txq_params(rdev, netdev, + &txq_params); + wdev_unlock(netdev->ieee80211_ptr); if (result) goto out; } } if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - result = __nl80211_set_channel( - rdev, - nl80211_can_set_dev_channel(wdev) ? netdev : NULL, - info); + int link_id = nl80211_link_id_or_invalid(info->attrs); + + if (wdev) { + wdev_lock(wdev); + result = __nl80211_set_channel( + rdev, + nl80211_can_set_dev_channel(wdev) ? 
netdev : NULL, + info, link_id); + wdev_unlock(wdev); + } else { + result = __nl80211_set_channel(rdev, netdev, info, link_id); + } + if (result) goto out; } @@ -3602,15 +3798,13 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr)) goto nla_put_failure; - if (rdev->ops->get_channel) { - int ret; + if (rdev->ops->get_channel && !wdev->valid_links) { struct cfg80211_chan_def chandef = {}; + int ret; - ret = rdev_get_channel(rdev, wdev, &chandef); - if (ret == 0) { - if (nl80211_send_chandef(msg, &chandef)) - goto nla_put_failure; - } + ret = rdev_get_channel(rdev, wdev, 0, &chandef); + if (ret == 0 && nl80211_send_chandef(msg, &chandef)) + goto nla_put_failure; } if (rdev->ops->get_tx_power) { @@ -3627,25 +3821,24 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - if (wdev->ssid_len && - nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) + if (wdev->u.ap.ssid_len && + nla_put(msg, NL80211_ATTR_SSID, wdev->u.ap.ssid_len, + wdev->u.ap.ssid)) goto nla_put_failure_locked; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - case NL80211_IFTYPE_ADHOC: { - const u8 *ssid_ie; - if (!wdev->current_bss) - break; - rcu_read_lock(); - ssid_ie = ieee80211_bss_get_ie(&wdev->current_bss->pub, - WLAN_EID_SSID); - if (ssid_ie && - nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) - goto nla_put_failure_rcu_locked; - rcu_read_unlock(); + if (wdev->u.client.ssid_len && + nla_put(msg, NL80211_ATTR_SSID, wdev->u.client.ssid_len, + wdev->u.client.ssid)) + goto nla_put_failure_locked; + break; + case NL80211_IFTYPE_ADHOC: + if (wdev->u.ibss.ssid_len && + nla_put(msg, NL80211_ATTR_SSID, wdev->u.ibss.ssid_len, + wdev->u.ibss.ssid)) + goto nla_put_failure_locked; break; - } default: /* nothing */ break; @@ -3662,11 +3855,38 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag goto nla_put_failure; } + if (wdev->valid_links) { + unsigned int link_id; + struct nlattr *links = nla_nest_start(msg, + NL80211_ATTR_MLO_LINKS); + + if (!links) + goto nla_put_failure; + + for_each_valid_link(wdev, link_id) { + struct nlattr *link = nla_nest_start(msg, link_id + 1); + struct cfg80211_chan_def chandef = {}; + int ret; + + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)) + goto nla_put_failure; + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, + wdev->links[link_id].addr)) + goto nla_put_failure; + + ret = rdev_get_channel(rdev, wdev, link_id, &chandef); + if (ret == 0 && nl80211_send_chandef(msg, &chandef)) + goto nla_put_failure; + + nla_nest_end(msg, link); + } + + nla_nest_end(msg, links); + } + genlmsg_end(msg, hdr); return 0; - nla_put_failure_rcu_locked: - rcu_read_unlock(); nla_put_failure_locked: wdev_unlock(wdev); nla_put_failure: @@ -3918,10 +4138,11 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) wdev_lock(wdev); BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN); - wdev->mesh_id_up_len = + wdev->u.mesh.id_up_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); - memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]), - wdev->mesh_id_up_len); + memcpy(wdev->u.mesh.id, + nla_data(info->attrs[NL80211_ATTR_MESH_ID]), + wdev->u.mesh.id_up_len); wdev_unlock(wdev); } @@ -4026,10 +4247,11 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) wdev_lock(wdev); 
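The per-link plumbing above keeps repeating one validation pattern: a link ID is only meaningful on an MLD interface, and when given it must name a currently valid link in wdev->valid_links. A simplified standalone sketch of that check (hypothetical helper, with -1/-2 standing in for -EINVAL/-ENOLINK; the real hunks order their checks slightly differently per call site):

#include <stdio.h>

/* returns 0 on success, -1 (-EINVAL-like) or -2 (-ENOLINK-like) */
static int check_link_id(unsigned int valid_links, int link_id)
{
	if (link_id < 0)          /* no link given: only ok on non-MLD */
		return valid_links ? -1 : 0;
	if (!valid_links)         /* link given on a non-MLD interface */
		return -1;
	if (!(valid_links & (1u << link_id)))
		return -2;        /* MLD, but that link was never added */
	return 0;
}

int main(void)
{
	printf("%d\n", check_link_id(0x0, -1)); /* 0: non-MLO, no link */
	printf("%d\n", check_link_id(0x5,  2)); /* 0: link 2 is valid */
	printf("%d\n", check_link_id(0x5,  1)); /* -2: not a valid link */
	printf("%d\n", check_link_id(0x0,  1)); /* -1: not an MLD */
	return 0;
}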
BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN); - wdev->mesh_id_up_len = + wdev->u.mesh.id_up_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); - memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]), - wdev->mesh_id_up_len); + memcpy(wdev->u.mesh.id, + nla_data(info->attrs[NL80211_ATTR_MESH_ID]), + wdev->u.mesh.id_up_len); wdev_unlock(wdev); break; case NL80211_IFTYPE_NAN: @@ -4102,7 +4324,7 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) mutex_lock(&rdev->wiphy.mtx); - return rdev_del_virtual_intf(rdev, wdev); + return cfg80211_remove_virtual_intf(rdev, wdev); } static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info) @@ -4122,6 +4344,38 @@ static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info) return rdev_set_noack_map(rdev, dev, noack_map); } +static int nl80211_validate_key_link_id(struct genl_info *info, + struct wireless_dev *wdev, + int link_id, bool pairwise) +{ + if (pairwise) { + if (link_id != -1) { + GENL_SET_ERR_MSG(info, + "link ID not allowed for pairwise key"); + return -EINVAL; + } + + return 0; + } + + if (wdev->valid_links) { + if (link_id == -1) { + GENL_SET_ERR_MSG(info, + "link ID must for MLO group key"); + return -EINVAL; + } + if (!(wdev->valid_links & BIT(link_id))) { + GENL_SET_ERR_MSG(info, "invalid link ID for MLO group key"); + return -EINVAL; + } + } else if (link_id != -1) { + GENL_SET_ERR_MSG(info, "link ID not allowed for non-MLO group key"); + return -EINVAL; + } + + return 0; +} + struct get_key_cookie { struct sk_buff *msg; int error; @@ -4183,13 +4437,15 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) void *hdr; struct sk_buff *msg; bool bigtk_support = false; + int link_id = nl80211_link_id_or_invalid(info->attrs); + struct wireless_dev *wdev = dev->ieee80211_ptr; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION)) bigtk_support = true; - if ((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_STATION || - dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_CLIENT) && + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) bigtk_support = true; @@ -4241,8 +4497,12 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr)) goto nla_put_failure; - err = rdev_get_key(rdev, dev, key_idx, pairwise, mac_addr, &cookie, - get_key_callback); + err = nl80211_validate_key_link_id(info, wdev, link_id, pairwise); + if (err) + goto free_msg; + + err = rdev_get_key(rdev, dev, link_id, key_idx, pairwise, mac_addr, + &cookie, get_key_callback); if (err) goto free_msg; @@ -4266,6 +4526,8 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) struct key_parse key; int err; struct net_device *dev = info->user_ptr[1]; + int link_id = nl80211_link_id_or_invalid(info->attrs); + struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -4281,7 +4543,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) !(key.p.mode == NL80211_KEY_SET_TX)) return -EINVAL; - wdev_lock(dev->ieee80211_ptr); + wdev_lock(wdev); if (key.def) { if (!rdev->ops->set_default_key) { @@ -4289,18 +4551,22 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) goto out; } - err = nl80211_key_allowed(dev->ieee80211_ptr); + err = nl80211_key_allowed(wdev); if (err) 
goto out; - err = rdev_set_default_key(rdev, dev, key.idx, - key.def_uni, key.def_multi); + err = nl80211_validate_key_link_id(info, wdev, link_id, false); + if (err) + goto out; + + err = rdev_set_default_key(rdev, dev, link_id, key.idx, + key.def_uni, key.def_multi); if (err) goto out; #ifdef CONFIG_CFG80211_WEXT - dev->ieee80211_ptr->wext.default_key = key.idx; + wdev->wext.default_key = key.idx; #endif } else if (key.defmgmt) { if (key.def_uni || !key.def_multi) { @@ -4313,16 +4579,20 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) goto out; } - err = nl80211_key_allowed(dev->ieee80211_ptr); + err = nl80211_key_allowed(wdev); if (err) goto out; - err = rdev_set_default_mgmt_key(rdev, dev, key.idx); + err = nl80211_validate_key_link_id(info, wdev, link_id, false); + if (err) + goto out; + + err = rdev_set_default_mgmt_key(rdev, dev, link_id, key.idx); if (err) goto out; #ifdef CONFIG_CFG80211_WEXT - dev->ieee80211_ptr->wext.default_mgmt_key = key.idx; + wdev->wext.default_mgmt_key = key.idx; #endif } else if (key.defbeacon) { if (key.def_uni || !key.def_multi) { @@ -4335,11 +4605,15 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) goto out; } - err = nl80211_key_allowed(dev->ieee80211_ptr); + err = nl80211_key_allowed(wdev); if (err) goto out; - err = rdev_set_default_beacon_key(rdev, dev, key.idx); + err = nl80211_validate_key_link_id(info, wdev, link_id, false); + if (err) + goto out; + + err = rdev_set_default_beacon_key(rdev, dev, link_id, key.idx); if (err) goto out; } else if (key.p.mode == NL80211_KEY_SET_TX && @@ -4355,14 +4629,18 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) goto out; } - err = rdev_add_key(rdev, dev, key.idx, + err = nl80211_validate_key_link_id(info, wdev, link_id, true); + if (err) + goto out; + + err = rdev_add_key(rdev, dev, link_id, key.idx, NL80211_KEYTYPE_PAIRWISE, mac_addr, &key.p); } else { err = -EINVAL; } out: - wdev_unlock(dev->ieee80211_ptr); + wdev_unlock(wdev); return err; } @@ -4374,6 +4652,8 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct key_parse key; const u8 *mac_addr = NULL; + int link_id = nl80211_link_id_or_invalid(info->attrs); + struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -4415,18 +4695,23 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - wdev_lock(dev->ieee80211_ptr); - err = nl80211_key_allowed(dev->ieee80211_ptr); + wdev_lock(wdev); + err = nl80211_key_allowed(wdev); if (err) GENL_SET_ERR_MSG(info, "key not allowed"); + + if (!err) + err = nl80211_validate_key_link_id(info, wdev, link_id, + key.type == NL80211_KEYTYPE_PAIRWISE); + if (!err) { - err = rdev_add_key(rdev, dev, key.idx, + err = rdev_add_key(rdev, dev, link_id, key.idx, key.type == NL80211_KEYTYPE_PAIRWISE, mac_addr, &key.p); if (err) GENL_SET_ERR_MSG(info, "key addition failed"); } - wdev_unlock(dev->ieee80211_ptr); + wdev_unlock(wdev); return err; } @@ -4438,6 +4723,8 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; u8 *mac_addr = NULL; struct key_parse key; + int link_id = nl80211_link_id_or_invalid(info->attrs); + struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -4465,27 +4752,31 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->del_key) return -EOPNOTSUPP; 
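The key-management hunks above all funnel through nl80211_validate_key_link_id(), whose rule set is compact: pairwise keys belong to the MLD-level peer and must never carry a link ID, group keys on an MLD must name a valid link, and on a non-MLD interface no link ID is allowed at all. A standalone restatement of that logic, for illustration only:

#include <stdbool.h>
#include <stdio.h>

/* returns 0 if the (link_id, pairwise) combination is acceptable,
 * -1 (-EINVAL-like) otherwise; link_id == -1 means "not given" */
static int validate_key_link_id(unsigned int valid_links, int link_id,
				bool pairwise)
{
	if (pairwise)                 /* pairwise keys are per-MLD peer */
		return link_id == -1 ? 0 : -1;
	if (valid_links) {            /* group keys are per-link on an MLD */
		if (link_id == -1)
			return -1;
		return (valid_links & (1u << link_id)) ? 0 : -1;
	}
	return link_id == -1 ? 0 : -1; /* non-MLD: no link ID allowed */
}

int main(void)
{
	printf("%d\n", validate_key_link_id(0x3, -1, true));  /* 0 */
	printf("%d\n", validate_key_link_id(0x3,  1, true));  /* -1 */
	printf("%d\n", validate_key_link_id(0x3,  0, false)); /* 0 */
	printf("%d\n", validate_key_link_id(0x0,  0, false)); /* -1 */
	return 0;
}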
- wdev_lock(dev->ieee80211_ptr); - err = nl80211_key_allowed(dev->ieee80211_ptr); + wdev_lock(wdev); + err = nl80211_key_allowed(wdev); if (key.type == NL80211_KEYTYPE_GROUP && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; if (!err) - err = rdev_del_key(rdev, dev, key.idx, + err = nl80211_validate_key_link_id(info, wdev, link_id, + key.type == NL80211_KEYTYPE_PAIRWISE); + + if (!err) + err = rdev_del_key(rdev, dev, link_id, key.idx, key.type == NL80211_KEYTYPE_PAIRWISE, mac_addr); #ifdef CONFIG_CFG80211_WEXT if (!err) { - if (key.idx == dev->ieee80211_ptr->wext.default_key) - dev->ieee80211_ptr->wext.default_key = -1; - else if (key.idx == dev->ieee80211_ptr->wext.default_mgmt_key) - dev->ieee80211_ptr->wext.default_mgmt_key = -1; + if (key.idx == wdev->wext.default_key) + wdev->wext.default_key = -1; + else if (key.idx == wdev->wext.default_mgmt_key) + wdev->wext.default_mgmt_key = -1; } #endif - wdev_unlock(dev->ieee80211_ptr); + wdev_unlock(wdev); return err; } @@ -4566,7 +4857,7 @@ static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; - if (!dev->ieee80211_ptr->beacon_interval) + if (!dev->ieee80211_ptr->links[0].ap.beacon_interval) return -EINVAL; acl = parse_acl_data(&rdev->wiphy, info); @@ -4722,14 +5013,24 @@ static void he_build_mcs_mask(u16 he_mcs_map, } } -static u16 he_get_txmcsmap(struct genl_info *info, +static u16 he_get_txmcsmap(struct genl_info *info, unsigned int link_id, const struct ieee80211_sta_he_cap *he_cap) { struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - __le16 tx_mcs; + struct cfg80211_chan_def *chandef; + __le16 tx_mcs; - switch (wdev->chandef.width) { + chandef = wdev_chandef(wdev, link_id); + if (!chandef) { + /* + * This is probably broken, but we never maintained + * a chandef in these cases, so it always was. 
+ */ + return le16_to_cpu(he_cap->he_mcs_nss_supp.tx_mcs_80); + } + + switch (chandef->width) { case NL80211_CHAN_WIDTH_80P80: tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80; break; @@ -4740,6 +5041,7 @@ static u16 he_get_txmcsmap(struct genl_info *info, tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80; break; } + return le16_to_cpu(tx_mcs); } @@ -4747,7 +5049,8 @@ static bool he_set_mcs_mask(struct genl_info *info, struct wireless_dev *wdev, struct ieee80211_supported_band *sband, struct nl80211_txrate_he *txrate, - u16 mcs[NL80211_HE_NSS_MAX]) + u16 mcs[NL80211_HE_NSS_MAX], + unsigned int link_id) { const struct ieee80211_sta_he_cap *he_cap; u16 tx_mcs_mask[NL80211_HE_NSS_MAX] = {}; @@ -4760,7 +5063,7 @@ static bool he_set_mcs_mask(struct genl_info *info, memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX); - tx_mcs_map = he_get_txmcsmap(info, he_cap); + tx_mcs_map = he_get_txmcsmap(info, link_id, he_cap); /* Build he_mcs_mask from HE capabilities */ he_build_mcs_mask(tx_mcs_map, tx_mcs_mask); @@ -4780,7 +5083,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask, struct net_device *dev, - bool default_all_enabled) + bool default_all_enabled, + unsigned int link_id) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -4817,7 +5121,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (!he_cap) continue; - he_tx_mcs_map = he_get_txmcsmap(info, he_cap); + he_tx_mcs_map = he_get_txmcsmap(info, link_id, he_cap); he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs); mask->control[i].he_gi = 0xFF; @@ -4882,7 +5186,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (tb[NL80211_TXRATE_HE] && !he_set_mcs_mask(info, wdev, sband, nla_data(tb[NL80211_TXRATE_HE]), - mask->control[band].he_mcs)) + mask->control[band].he_mcs, + link_id)) return -EINVAL; if (tb[NL80211_TXRATE_HE_GI]) @@ -4996,6 +5301,120 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return 0; } +static int nl80211_parse_mbssid_config(struct wiphy *wiphy, + struct net_device *dev, + struct nlattr *attrs, + struct cfg80211_mbssid_config *config, + u8 num_elems) +{ + struct nlattr *tb[NL80211_MBSSID_CONFIG_ATTR_MAX + 1]; + + if (!wiphy->mbssid_max_interfaces) + return -EOPNOTSUPP; + + if (nla_parse_nested(tb, NL80211_MBSSID_CONFIG_ATTR_MAX, attrs, NULL, + NULL) || + !tb[NL80211_MBSSID_CONFIG_ATTR_INDEX]) + return -EINVAL; + + config->ema = nla_get_flag(tb[NL80211_MBSSID_CONFIG_ATTR_EMA]); + if (config->ema) { + if (!wiphy->ema_max_profile_periodicity) + return -EOPNOTSUPP; + + if (num_elems > wiphy->ema_max_profile_periodicity) + return -EINVAL; + } + + config->index = nla_get_u8(tb[NL80211_MBSSID_CONFIG_ATTR_INDEX]); + if (config->index >= wiphy->mbssid_max_interfaces || + (!config->index && !num_elems)) + return -EINVAL; + + if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]) { + u32 tx_ifindex = + nla_get_u32(tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]); + + if ((!config->index && tx_ifindex != dev->ifindex) || + (config->index && tx_ifindex == dev->ifindex)) + return -EINVAL; + + if (tx_ifindex != dev->ifindex) { + struct net_device *tx_netdev = + dev_get_by_index(wiphy_net(wiphy), tx_ifindex); + + if (!tx_netdev || !tx_netdev->ieee80211_ptr || + tx_netdev->ieee80211_ptr->wiphy != wiphy || + tx_netdev->ieee80211_ptr->iftype != + NL80211_IFTYPE_AP) { + dev_put(tx_netdev); + return -EINVAL; + } + + config->tx_wdev = tx_netdev->ieee80211_ptr; + } 
else { + config->tx_wdev = dev->ieee80211_ptr; + } + } else if (!config->index) { + config->tx_wdev = dev->ieee80211_ptr; + } else { + return -EINVAL; + } + + return 0; +} + +static struct cfg80211_mbssid_elems * +nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs) +{ + struct nlattr *nl_elems; + struct cfg80211_mbssid_elems *elems; + int rem_elems; + u8 i = 0, num_elems = 0; + + if (!wiphy->mbssid_max_interfaces) + return ERR_PTR(-EINVAL); + + nla_for_each_nested(nl_elems, attrs, rem_elems) + num_elems++; + + elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL); + if (!elems) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(nl_elems, attrs, rem_elems) { + elems->elem[i].data = nla_data(nl_elems); + elems->elem[i].len = nla_len(nl_elems); + i++; + } + elems->cnt = num_elems; + return elems; +} + +static int nl80211_parse_he_bss_color(struct nlattr *attrs, + struct cfg80211_he_bss_color *he_bss_color) +{ + struct nlattr *tb[NL80211_HE_BSS_COLOR_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NL80211_HE_BSS_COLOR_ATTR_MAX, attrs, + he_bss_color_policy, NULL); + if (err) + return err; + + if (!tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]) + return -EINVAL; + + he_bss_color->color = + nla_get_u8(tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]); + he_bss_color->enabled = + !nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]); + he_bss_color->partial = + nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_PARTIAL]); + + return 0; +} + static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, struct nlattr *attrs[], struct cfg80211_beacon_data *bcn) @@ -5005,6 +5424,8 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, memset(bcn, 0, sizeof(*bcn)); + bcn->link_id = nl80211_link_id(attrs); + if (attrs[NL80211_ATTR_BEACON_HEAD]) { bcn->head = nla_data(attrs[NL80211_ATTR_BEACON_HEAD]); bcn->head_len = nla_len(attrs[NL80211_ATTR_BEACON_HEAD]); @@ -5076,6 +5497,25 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, bcn->ftm_responder = -1; } + if (attrs[NL80211_ATTR_HE_BSS_COLOR]) { + err = nl80211_parse_he_bss_color(attrs[NL80211_ATTR_HE_BSS_COLOR], + &bcn->he_bss_color); + if (err) + return err; + bcn->he_bss_color_valid = true; + } + + if (attrs[NL80211_ATTR_MBSSID_ELEMS]) { + struct cfg80211_mbssid_elems *mbssid = + nl80211_parse_mbssid_elems(&rdev->wiphy, + attrs[NL80211_ATTR_MBSSID_ELEMS]); + + if (IS_ERR(mbssid)) + return PTR_ERR(mbssid); + + bcn->mbssid_ies = mbssid; + } + return 0; } @@ -5123,30 +5563,6 @@ static int nl80211_parse_he_obss_pd(struct nlattr *attrs, return 0; } -static int nl80211_parse_he_bss_color(struct nlattr *attrs, - struct cfg80211_he_bss_color *he_bss_color) -{ - struct nlattr *tb[NL80211_HE_BSS_COLOR_ATTR_MAX + 1]; - int err; - - err = nla_parse_nested(tb, NL80211_HE_BSS_COLOR_ATTR_MAX, attrs, - he_bss_color_policy, NULL); - if (err) - return err; - - if (!tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]) - return -EINVAL; - - he_bss_color->color = - nla_get_u8(tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]); - he_bss_color->enabled = - !nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]); - he_bss_color->partial = - nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_PARTIAL]); - - return 0; -} - static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev, struct nlattr *attrs, struct cfg80211_ap_settings *params) @@ -5207,21 +5623,21 @@ nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev, } static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, - const u8 
*rates) + const struct element *rates) { int i; if (!rates) return; - for (i = 0; i < rates[1]; i++) { - if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY) + for (i = 0; i < rates->datalen; i++) { + if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY) params->ht_required = true; - if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) + if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) params->vht_required = true; - if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) + if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) params->he_required = true; - if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_SAE_H2E) + if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_SAE_H2E) params->sae_h2e_required = true; } } @@ -5231,54 +5647,72 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, * HT/VHT requirements/capabilities, we parse them out of the IEs for the * benefit of drivers that rebuild IEs in the firmware. */ -static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) +static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) { const struct cfg80211_beacon_data *bcn = &params->beacon; size_t ies_len = bcn->tail_len; const u8 *ies = bcn->tail; - const u8 *rates; - const u8 *cap; + const struct element *rates; + const struct element *cap; - rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len); + rates = cfg80211_find_elem(WLAN_EID_SUPP_RATES, ies, ies_len); nl80211_check_ap_rate_selectors(params, rates); - rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len); + rates = cfg80211_find_elem(WLAN_EID_EXT_SUPP_RATES, ies, ies_len); nl80211_check_ap_rate_selectors(params, rates); - cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len); - if (cap && cap[1] >= sizeof(*params->ht_cap)) - params->ht_cap = (void *)(cap + 2); - cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len); - if (cap && cap[1] >= sizeof(*params->vht_cap)) - params->vht_cap = (void *)(cap + 2); - cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len); - if (cap && cap[1] >= sizeof(*params->he_cap) + 1) - params->he_cap = (void *)(cap + 3); - cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ies, ies_len); - if (cap && cap[1] >= sizeof(*params->he_oper) + 1) - params->he_oper = (void *)(cap + 3); + cap = cfg80211_find_elem(WLAN_EID_HT_CAPABILITY, ies, ies_len); + if (cap && cap->datalen >= sizeof(*params->ht_cap)) + params->ht_cap = (void *)cap->data; + cap = cfg80211_find_elem(WLAN_EID_VHT_CAPABILITY, ies, ies_len); + if (cap && cap->datalen >= sizeof(*params->vht_cap)) + params->vht_cap = (void *)cap->data; + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len); + if (cap && cap->datalen >= sizeof(*params->he_cap) + 1) + params->he_cap = (void *)(cap->data + 1); + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ies, ies_len); + if (cap && cap->datalen >= sizeof(*params->he_oper) + 1) + params->he_oper = (void *)(cap->data + 1); + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_CAPABILITY, ies, ies_len); + if (cap) { + if (!cap->datalen) + return -EINVAL; + params->eht_cap = (void *)(cap->data + 1); + if (!ieee80211_eht_capa_size_ok((const u8 *)params->he_cap, + (const u8 *)params->eht_cap, + cap->datalen - 1, true)) + return -EINVAL; + } + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_OPERATION, ies, ies_len); + if (cap) { + if (!cap->datalen) + return -EINVAL; + params->eht_oper = (void *)(cap->data + 1); + if (!ieee80211_eht_oper_size_ok((const u8 *)params->eht_oper, + cap->datalen - 1)) + return
-EINVAL; + } + return 0; } static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, struct cfg80211_ap_settings *params) { struct wireless_dev *wdev; - bool ret = false; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO) continue; - if (!wdev->preset_chandef.chan) + if (!wdev->u.ap.preset_chandef.chan) continue; - params->chandef = wdev->preset_chandef; - ret = true; - break; + params->chandef = wdev->u.ap.preset_chandef; + return true; } - return ret; + return false; } static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, @@ -5336,9 +5770,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_ap_settings params; + struct cfg80211_ap_settings *params; int err; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && @@ -5348,30 +5783,32 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->start_ap) return -EOPNOTSUPP; - if (wdev->beacon_interval) + if (wdev->links[link_id].ap.beacon_interval) return -EALREADY; - memset(&params, 0, sizeof(params)); - /* these are required for START_AP */ if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] || !info->attrs[NL80211_ATTR_DTIM_PERIOD] || !info->attrs[NL80211_ATTR_BEACON_HEAD]) return -EINVAL; - err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon); - if (err) - return err; + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (!params) + return -ENOMEM; - params.beacon_interval = + err = nl80211_parse_beacon(rdev, info->attrs, &params->beacon); + if (err) + goto out; + + params->beacon_interval = nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); - params.dtim_period = + params->dtim_period = nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]); err = cfg80211_validate_beacon_int(rdev, dev->ieee80211_ptr->iftype, - params.beacon_interval); + params->beacon_interval); if (err) - return err; + goto out; /* * In theory, some of these attributes should be required here @@ -5381,178 +5818,246 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) * additional information -- drivers must check!
*/ if (info->attrs[NL80211_ATTR_SSID]) { - params.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); - params.ssid_len = + params->ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + params->ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); - if (params.ssid_len == 0) - return -EINVAL; + if (params->ssid_len == 0) { + err = -EINVAL; + goto out; + } + + if (wdev->u.ap.ssid_len && + (wdev->u.ap.ssid_len != params->ssid_len || + memcmp(wdev->u.ap.ssid, params->ssid, params->ssid_len))) { + /* require identical SSID for MLO */ + err = -EINVAL; + goto out; + } + } else if (wdev->valid_links) { + /* require SSID for MLO */ + err = -EINVAL; + goto out; } if (info->attrs[NL80211_ATTR_HIDDEN_SSID]) - params.hidden_ssid = nla_get_u32( + params->hidden_ssid = nla_get_u32( info->attrs[NL80211_ATTR_HIDDEN_SSID]); - params.privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; + params->privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; if (info->attrs[NL80211_ATTR_AUTH_TYPE]) { - params.auth_type = nla_get_u32( + params->auth_type = nla_get_u32( info->attrs[NL80211_ATTR_AUTH_TYPE]); - if (!nl80211_valid_auth_type(rdev, params.auth_type, - NL80211_CMD_START_AP)) - return -EINVAL; + if (!nl80211_valid_auth_type(rdev, params->auth_type, + NL80211_CMD_START_AP)) { + err = -EINVAL; + goto out; + } } else - params.auth_type = NL80211_AUTHTYPE_AUTOMATIC; + params->auth_type = NL80211_AUTHTYPE_AUTOMATIC; - err = nl80211_crypto_settings(rdev, info, &params.crypto, + err = nl80211_crypto_settings(rdev, info, &params->crypto, NL80211_MAX_NR_CIPHER_SUITES); if (err) - return err; + goto out; if (info->attrs[NL80211_ATTR_INACTIVITY_TIMEOUT]) { - if (!(rdev->wiphy.features & NL80211_FEATURE_INACTIVITY_TIMER)) - return -EOPNOTSUPP; - params.inactivity_timeout = nla_get_u16( + if (!(rdev->wiphy.features & NL80211_FEATURE_INACTIVITY_TIMER)) { + err = -EOPNOTSUPP; + goto out; + } + params->inactivity_timeout = nla_get_u16( info->attrs[NL80211_ATTR_INACTIVITY_TIMEOUT]); } if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) { - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) - return -EINVAL; - params.p2p_ctwindow = + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { + err = -EINVAL; + goto out; + } + params->p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); - if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) - return -EINVAL; + if (params->p2p_ctwindow != 0 && + !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) { + err = -EINVAL; + goto out; + } } if (info->attrs[NL80211_ATTR_P2P_OPPPS]) { u8 tmp; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) - return -EINVAL; + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { + err = -EINVAL; + goto out; + } tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); - params.p2p_opp_ps = tmp; - if (params.p2p_opp_ps != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) - return -EINVAL; + params->p2p_opp_ps = tmp; + if (params->p2p_opp_ps != 0 && + !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) { + err = -EINVAL; + goto out; + } } if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - err = nl80211_parse_chandef(rdev, info, &params.chandef); + err = nl80211_parse_chandef(rdev, info, &params->chandef); if (err) - return err; - } else if (wdev->preset_chandef.chan) { - params.chandef = wdev->preset_chandef; - } else if (!nl80211_get_ap_channel(rdev, &params)) - return -EINVAL; + goto out; + } else if (wdev->valid_links) { + /* with MLD need to specify the channel configuration */ + err = -EINVAL; + goto out; + } else 
if (wdev->u.ap.preset_chandef.chan) { + params->chandef = wdev->u.ap.preset_chandef; + } else if (!nl80211_get_ap_channel(rdev, params)) { + err = -EINVAL; + goto out; + } - if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef, - wdev->iftype)) - return -EINVAL; + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params->chandef, + wdev->iftype)) { + err = -EINVAL; + goto out; + } + + wdev_lock(wdev); if (info->attrs[NL80211_ATTR_TX_RATES]) { err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, - &params.beacon_rate, - dev, false); + &params->beacon_rate, + dev, false, link_id); if (err) - return err; + goto out_unlock; - err = validate_beacon_tx_rate(rdev, params.chandef.chan->band, - &params.beacon_rate); + err = validate_beacon_tx_rate(rdev, params->chandef.chan->band, + &params->beacon_rate); if (err) - return err; + goto out_unlock; } if (info->attrs[NL80211_ATTR_SMPS_MODE]) { - params.smps_mode = + params->smps_mode = nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); - switch (params.smps_mode) { + switch (params->smps_mode) { case NL80211_SMPS_OFF: break; case NL80211_SMPS_STATIC: if (!(rdev->wiphy.features & - NL80211_FEATURE_STATIC_SMPS)) - return -EINVAL; + NL80211_FEATURE_STATIC_SMPS)) { + err = -EINVAL; + goto out_unlock; + } break; case NL80211_SMPS_DYNAMIC: if (!(rdev->wiphy.features & - NL80211_FEATURE_DYNAMIC_SMPS)) - return -EINVAL; + NL80211_FEATURE_DYNAMIC_SMPS)) { + err = -EINVAL; + goto out_unlock; + } break; default: - return -EINVAL; + err = -EINVAL; + goto out_unlock; } } else { - params.smps_mode = NL80211_SMPS_OFF; + params->smps_mode = NL80211_SMPS_OFF; } - params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); - if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) - return -EOPNOTSUPP; + params->pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (params->pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) { + err = -EOPNOTSUPP; + goto out_unlock; + } if (info->attrs[NL80211_ATTR_ACL_POLICY]) { - params.acl = parse_acl_data(&rdev->wiphy, info); - if (IS_ERR(params.acl)) - return PTR_ERR(params.acl); + params->acl = parse_acl_data(&rdev->wiphy, info); + if (IS_ERR(params->acl)) { + err = PTR_ERR(params->acl); + params->acl = NULL; + goto out_unlock; + } } - params.twt_responder = + params->twt_responder = nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); if (info->attrs[NL80211_ATTR_HE_OBSS_PD]) { err = nl80211_parse_he_obss_pd( info->attrs[NL80211_ATTR_HE_OBSS_PD], - &params.he_obss_pd); + &params->he_obss_pd); if (err) - goto out; - } - - if (info->attrs[NL80211_ATTR_HE_BSS_COLOR]) { - err = nl80211_parse_he_bss_color( - info->attrs[NL80211_ATTR_HE_BSS_COLOR], - &params.he_bss_color); - if (err) - goto out; + goto out_unlock; } if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) { err = nl80211_parse_fils_discovery(rdev, info->attrs[NL80211_ATTR_FILS_DISCOVERY], - &params); + params); if (err) - goto out; + goto out_unlock; } if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { err = nl80211_parse_unsol_bcast_probe_resp( rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], - &params); + params); if (err) - goto out; + goto out_unlock; } - nl80211_calculate_ap_params(&params); + if (info->attrs[NL80211_ATTR_MBSSID_CONFIG]) { + err = nl80211_parse_mbssid_config(&rdev->wiphy, dev, + info->attrs[NL80211_ATTR_MBSSID_CONFIG], + &params->mbssid_config, + params->beacon.mbssid_ies ? 
+ params->beacon.mbssid_ies->cnt : + 0); + if (err) + goto out_unlock; + } - if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) - params.flags |= AP_SETTINGS_EXTERNAL_AUTH_SUPPORT; + err = nl80211_calculate_ap_params(params); + if (err) + goto out_unlock; - wdev_lock(wdev); - err = rdev_start_ap(rdev, dev, &params); + if (info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS]) + params->flags = nla_get_u32( + info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS]); + else if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) + params->flags |= NL80211_AP_SETTINGS_EXTERNAL_AUTH_SUPPORT; + + if (wdev->conn_owner_nlportid && + info->attrs[NL80211_ATTR_SOCKET_OWNER] && + wdev->conn_owner_nlportid != info->snd_portid) { + err = -EINVAL; + goto out_unlock; + } + + /* FIXME: validate MLO/link-id against driver capabilities */ + + err = rdev_start_ap(rdev, dev, params); if (!err) { - wdev->preset_chandef = params.chandef; - wdev->beacon_interval = params.beacon_interval; - wdev->chandef = params.chandef; - wdev->ssid_len = params.ssid_len; - memcpy(wdev->ssid, params.ssid, wdev->ssid_len); + wdev->links[link_id].ap.beacon_interval = params->beacon_interval; + wdev->links[link_id].ap.chandef = params->chandef; + wdev->u.ap.ssid_len = params->ssid_len; + memcpy(wdev->u.ap.ssid, params->ssid, + params->ssid_len); if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) wdev->conn_owner_nlportid = info->snd_portid; } +out_unlock: wdev_unlock(wdev); - out: - kfree(params.acl); + kfree(params->acl); + kfree(params->beacon.mbssid_ies); + if (params->mbssid_config.tx_wdev && + params->mbssid_config.tx_wdev->netdev && + params->mbssid_config.tx_wdev->netdev != dev) + dev_put(params->mbssid_config.tx_wdev->netdev); + kfree(params); return err; } @@ -5560,6 +6065,7 @@ out: static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_beacon_data params; @@ -5572,26 +6078,29 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->change_beacon) return -EOPNOTSUPP; - if (!wdev->beacon_interval) + if (!wdev->links[link_id].ap.beacon_interval) return -EINVAL; err = nl80211_parse_beacon(rdev, info->attrs, &params); if (err) - return err; + goto out; wdev_lock(wdev); err = rdev_change_beacon(rdev, dev, &params); wdev_unlock(wdev); +out: + kfree(params.mbssid_ies); return err; } static int nl80211_stop_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; - return cfg80211_stop_ap(rdev, dev, false); + return cfg80211_stop_ap(rdev, dev, link_id, false); } static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = { @@ -5727,6 +6236,14 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) case RATE_INFO_BW_HE_RU: rate_flg = 0; WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS)); + break; + case RATE_INFO_BW_320: + rate_flg = NL80211_RATE_INFO_320_MHZ_WIDTH; + break; + case RATE_INFO_BW_EHT_RU: + rate_flg = 0; + WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS)); + break; } if (rate_flg && nla_put_flag(msg, rate_flg)) @@ -5759,6 +6276,17 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC, info->he_ru_alloc)) return
false; + } else if (info->flags & RATE_INFO_FLAGS_EHT_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_NSS, info->nss)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_GI, info->eht_gi)) + return false; + if (info->bw == RATE_INFO_BW_EHT_RU && + nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC, + info->eht_ru_alloc)) + return false; } nla_nest_end(msg, rate); @@ -6170,10 +6698,12 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; if (params->sta_modify_mask & STATION_PARAM_APPLY_CAPABILITY) return -EINVAL; - if (params->supported_rates) + if (params->link_sta_params.supported_rates) return -EINVAL; - if (params->ext_capab || params->ht_capa || params->vht_capa || - params->he_capa) + if (params->ext_capab || params->link_sta_params.ht_capa || + params->link_sta_params.vht_capa || + params->link_sta_params.he_capa || + params->link_sta_params.eht_capa) return -EINVAL; } @@ -6221,7 +6751,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; /* force (at least) rates when authorizing */ if (params->sta_flags_set & BIT(NL80211_STA_FLAG_AUTHORIZED) && - !params->supported_rates) + !params->link_sta_params.supported_rates) return -EINVAL; break; case CFG80211_STA_TDLS_PEER_ACTIVE: @@ -6245,7 +6775,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, */ if (statype != CFG80211_STA_AP_CLIENT_UNASSOC && statype != CFG80211_STA_TDLS_PEER_SETUP) - params->opmode_notif_used = false; + params->link_sta_params.opmode_notif_used = false; return 0; } @@ -6291,12 +6821,6 @@ static struct net_device *get_vlan(struct genl_info *info, return ERR_PTR(ret); } -static const struct nla_policy -nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = { - [NL80211_STA_WME_UAPSD_QUEUES] = { .type = NLA_U8 }, - [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, -}; - static int nl80211_parse_sta_wme(struct genl_info *info, struct station_parameters *params) { @@ -6366,16 +6890,29 @@ static int nl80211_set_station_tdls(struct genl_info *info, if (info->attrs[NL80211_ATTR_PEER_AID]) params->aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]); if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) - params->ht_capa = + params->link_sta_params.ht_capa = nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) - params->vht_capa = + params->link_sta_params.vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { - params->he_capa = + params->link_sta_params.he_capa = nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); - params->he_capa_len = + params->link_sta_params.he_capa_len = nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) { + params->link_sta_params.eht_capa = + nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + params->link_sta_params.eht_capa_len = + nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + + if (!ieee80211_eht_capa_size_ok((const u8 *)params->link_sta_params.he_capa, + (const u8 *)params->link_sta_params.eht_capa, + params->link_sta_params.eht_capa_len, + false)) + return -EINVAL; + } } err = nl80211_parse_sta_channel_info(info, params); @@ -6386,7 +6923,8 @@ static int nl80211_set_station_tdls(struct genl_info *info, } static int nl80211_parse_sta_txpower_setting(struct genl_info *info, - struct station_parameters *params) + struct sta_txpwr *txpwr, + bool *txpwr_set) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; int 
idx; @@ -6398,18 +6936,20 @@ static int nl80211_parse_sta_txpower_setting(struct genl_info *info, return -EOPNOTSUPP; idx = NL80211_ATTR_STA_TX_POWER_SETTING; - params->txpwr.type = nla_get_u8(info->attrs[idx]); + txpwr->type = nla_get_u8(info->attrs[idx]); - if (params->txpwr.type == NL80211_TX_POWER_LIMITED) { + if (txpwr->type == NL80211_TX_POWER_LIMITED) { idx = NL80211_ATTR_STA_TX_POWER; if (info->attrs[idx]) - params->txpwr.power = - nla_get_s16(info->attrs[idx]); + txpwr->power = nla_get_s16(info->attrs[idx]); else return -EINVAL; } - params->sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER; + + *txpwr_set = true; + } else { + *txpwr_set = false; } return 0; @@ -6454,12 +6994,33 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; - mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + params.link_sta_params.link_id = + nl80211_link_id_or_invalid(info->attrs); + + if (info->attrs[NL80211_ATTR_MLD_ADDR]) { + /* If MLD_ADDR attribute is set then this is an MLD station + * and the MLD_ADDR attribute holds the MLD address and the + * MAC attribute holds the LINK address. + * In that case, the link_id is also expected to be valid. + */ + if (params.link_sta_params.link_id < 0) + return -EINVAL; + + mac_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + params.link_sta_params.mld_mac = mac_addr; + params.link_sta_params.link_mac = + nla_data(info->attrs[NL80211_ATTR_MAC]); + if (!is_valid_ether_addr(params.link_sta_params.link_mac)) + return -EINVAL; + } else { + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + } + if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) { - params.supported_rates = + params.link_sta_params.supported_rates = nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); - params.supported_rates_len = + params.link_sta_params.supported_rates_len = nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); } @@ -6497,13 +7058,13 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]); if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { - params.opmode_notif_used = true; - params.opmode_notif = + params.link_sta_params.opmode_notif_used = true; + params.link_sta_params.opmode_notif = nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); } if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) - params.he_6ghz_capa = + params.link_sta_params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) @@ -6515,7 +7076,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) return -EOPNOTSUPP; - err = nl80211_parse_sta_txpower_setting(info, &params); + err = nl80211_parse_sta_txpower_setting(info, + &params.link_sta_params.txpwr, + &params.link_sta_params.txpwr_set); if (err) return err; @@ -6543,7 +7106,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) } /* driver will call cfg80211_check_station_change() */ + wdev_lock(dev->ieee80211_ptr); err = rdev_change_station(rdev, dev, mac_addr, &params); + wdev_unlock(dev->ieee80211_ptr); out_put_vlan: dev_put(params.vlan); @@ -6556,6 +7121,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; struct station_parameters params; u8 *mac_addr = NULL; u32 auth_assoc 
= BIT(NL80211_STA_FLAG_AUTHENTICATED) | @@ -6579,10 +7145,23 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) !info->attrs[NL80211_ATTR_PEER_AID]) return -EINVAL; - mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); - params.supported_rates = + params.link_sta_params.link_id = + nl80211_link_id_or_invalid(info->attrs); + + if (info->attrs[NL80211_ATTR_MLD_ADDR]) { + mac_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + params.link_sta_params.mld_mac = mac_addr; + params.link_sta_params.link_mac = + nla_data(info->attrs[NL80211_ATTR_MAC]); + if (!is_valid_ether_addr(params.link_sta_params.link_mac)) + return -EINVAL; + } else { + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + } + + params.link_sta_params.supported_rates = nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); - params.supported_rates_len = + params.link_sta_params.supported_rates_len = nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); @@ -6621,27 +7200,40 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) - params.ht_capa = + params.link_sta_params.ht_capa = nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) - params.vht_capa = + params.link_sta_params.vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { - params.he_capa = + params.link_sta_params.he_capa = nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); - params.he_capa_len = + params.link_sta_params.he_capa_len = nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) { + params.link_sta_params.eht_capa = + nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + params.link_sta_params.eht_capa_len = + nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + + if (!ieee80211_eht_capa_size_ok((const u8 *)params.link_sta_params.he_capa, + (const u8 *)params.link_sta_params.eht_capa, + params.link_sta_params.eht_capa_len, + false)) + return -EINVAL; + } } if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) - params.he_6ghz_capa = + params.link_sta_params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { - params.opmode_notif_used = true; - params.opmode_notif = + params.link_sta_params.opmode_notif_used = true; + params.link_sta_params.opmode_notif = nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); } @@ -6658,7 +7250,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) return -EOPNOTSUPP; - err = nl80211_parse_sta_txpower_setting(info, &params); + err = nl80211_parse_sta_txpower_setting(info, + &params.link_sta_params.txpwr, + &params.link_sta_params.txpwr_set); if (err) return err; @@ -6679,16 +7273,19 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) * error in this case. 
*/ if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { - params.ht_capa = NULL; - params.vht_capa = NULL; + params.link_sta_params.ht_capa = NULL; + params.link_sta_params.vht_capa = NULL; - /* HE requires WME */ - if (params.he_capa_len || params.he_6ghz_capa) + /* HE and EHT require WME */ + if (params.link_sta_params.he_capa_len || + params.link_sta_params.he_6ghz_capa || + params.link_sta_params.eht_capa_len) return -EINVAL; } /* Ensure that HT/VHT capabilities are not set for 6 GHz HE STA */ - if (params.he_6ghz_capa && (params.ht_capa || params.vht_capa)) + if (params.link_sta_params.he_6ghz_capa && + (params.link_sta_params.ht_capa || params.link_sta_params.vht_capa)) return -EINVAL; /* When you run into this, adjust the code below for the new flag */ @@ -6779,8 +7376,25 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) /* be aware of params.vlan when changing code here */ + wdev_lock(dev->ieee80211_ptr); + if (wdev->valid_links) { + if (params.link_sta_params.link_id < 0) { + err = -EINVAL; + goto out; + } + if (!(wdev->valid_links & BIT(params.link_sta_params.link_id))) { + err = -ENOLINK; + goto out; + } + } else { + if (params.link_sta_params.link_id >= 0) { + err = -EINVAL; + goto out; + } + } err = rdev_add_station(rdev, dev, mac_addr, &params); - +out: + wdev_unlock(dev->ieee80211_ptr); dev_put(params.vlan); return err; } @@ -6790,6 +7404,7 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct station_del_parameters params; + int ret; memset(&params, 0, sizeof(params)); @@ -6837,7 +7452,11 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) params.reason_code = WLAN_REASON_PREV_AUTH_NOT_VALID; } - return rdev_del_station(rdev, dev, &params); + wdev_lock(dev->ieee80211_ptr); + ret = rdev_del_station(rdev, dev, &params); + wdev_unlock(dev->ieee80211_ptr); + + return ret; } static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq, @@ -7297,7 +7916,7 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, wdev_lock(wdev); /* If not connected, get default parameters */ - if (!wdev->mesh_id_len) + if (!wdev->u.mesh.id_len) memcpy(&cur_params, &default_mesh_config, sizeof(cur_params)); else err = rdev_get_mesh_config(rdev, dev, &cur_params); @@ -7678,7 +8297,7 @@ static int nl80211_update_mesh_config(struct sk_buff *skb, return err; wdev_lock(wdev); - if (!wdev->mesh_id_len) + if (!wdev->u.mesh.id_len) err = -ENOLINK; if (!err) @@ -7756,6 +8375,7 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev; struct wiphy *wiphy = NULL; struct sk_buff *msg; + int err = -EMSGSIZE; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -7774,34 +8394,35 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) rdev = cfg80211_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { - nlmsg_free(msg); - rtnl_unlock(); - return PTR_ERR(rdev); + err = PTR_ERR(rdev); + goto nla_put_failure; } wiphy = &rdev->wiphy; self_managed = wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED; + + rcu_read_lock(); + regdom = get_wiphy_regdom(wiphy); /* a self-managed-reg device must have a private regdom */ if (WARN_ON(!regdom && self_managed)) { - nlmsg_free(msg); - rtnl_unlock(); - return -EINVAL; + err = -EINVAL; + goto nla_put_failure_rcu; } if (regdom && nla_put_u32(msg, NL80211_ATTR_WIPHY, 
get_wiphy_idx(wiphy))) - goto nla_put_failure; + goto nla_put_failure_rcu; + } else { + rcu_read_lock(); } if (!wiphy && reg_last_request_cell_base() && nla_put_u32(msg, NL80211_ATTR_USER_REG_HINT_TYPE, NL80211_USER_REG_HINT_CELL_BASE)) - goto nla_put_failure; - - rcu_read_lock(); + goto nla_put_failure_rcu; if (!regdom) regdom = rcu_dereference(cfg80211_regdomain); @@ -7821,7 +8442,7 @@ nla_put_failure: rtnl_unlock(); put_failure: nlmsg_free(msg); - return -EMSGSIZE; + return err; } static int nl80211_send_regdom(struct sk_buff *msg, struct netlink_callback *cb, @@ -7867,19 +8488,19 @@ static int nl80211_get_reg_dump(struct sk_buff *skb, struct cfg80211_registered_device *rdev; int err, reg_idx, start = cb->args[2]; - rtnl_lock(); + rcu_read_lock(); if (cfg80211_regdomain && start == 0) { err = nl80211_send_regdom(skb, cb, cb->nlh->nlmsg_seq, NLM_F_MULTI, NULL, - rtnl_dereference(cfg80211_regdomain)); + rcu_dereference(cfg80211_regdomain)); if (err < 0) goto out_err; } /* the global regdom is idx 0 */ reg_idx = 1; - list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { regdom = get_wiphy_regdom(&rdev->wiphy); if (!regdom) continue; @@ -7898,7 +8519,7 @@ static int nl80211_get_reg_dump(struct sk_buff *skb, cb->args[2] = reg_idx; err = skb->len; out_err: - rtnl_unlock(); + rcu_read_unlock(); return err; } @@ -8168,14 +8789,44 @@ int nl80211_parse_random_mac(struct nlattr **attrs, return 0; } -static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) +static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan) { + unsigned int link_id; + bool all_ok = true; + ASSERT_WDEV_LOCK(wdev); if (!cfg80211_beaconing_iface_active(wdev)) return true; - if (!(wdev->chandef.chan->flags & IEEE80211_CHAN_RADAR)) + /* + * FIXME: check if we have a free HW resource/link for chan + * + * This, as well as the FIXME below, requires knowing the link + * capabilities of the hardware. + */ + + /* we cannot leave radar channels */ + for_each_valid_link(wdev, link_id) { + struct cfg80211_chan_def *chandef; + + chandef = wdev_chandef(wdev, link_id); + if (!chandef) + continue; + + /* + * FIXME: don't require all_ok, but rather check only the + * correct HW resource/link onto which 'chan' falls, + * as only that link leaves the channel for doing + * the off-channel operation. 
+ */ + + if (chandef->chan->flags & IEEE80211_CHAN_RADAR) + all_ok = false; + } + + if (all_ok) return true; return regulatory_pre_cac_allowed(wdev->wiphy); @@ -8258,7 +8909,7 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, int err; if (!(wiphy->features & randomness_flag) || - (wdev && wdev->current_bss)) + (wdev && wdev->connected)) return -EOPNOTSUPP; err = nl80211_parse_random_mac(attrs, mac_addr, mac_addr_mask); @@ -8395,17 +9046,14 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->n_channels = i; wdev_lock(wdev); - if (!cfg80211_off_channel_oper_allowed(wdev)) { - struct ieee80211_channel *chan; + for (i = 0; i < request->n_channels; i++) { + struct ieee80211_channel *chan = request->channels[i]; - if (request->n_channels != 1) { - wdev_unlock(wdev); - err = -EBUSY; - goto out_free; - } + /* if we can go off-channel to the target channel we're good */ + if (cfg80211_off_channel_oper_allowed(wdev, chan)) + continue; - chan = request->channels[0]; - if (chan->center_freq != wdev->chandef.chan->center_freq) { + if (!cfg80211_wdev_on_sub_chan(wdev, chan, true)) { wdev_unlock(wdev); err = -EBUSY; goto out_free; @@ -9089,38 +9737,60 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, struct cfg80211_chan_def chandef; enum nl80211_dfs_regions dfs_region; unsigned int cac_time_ms; - int err; + int err = -EINVAL; + + flush_delayed_work(&rdev->dfs_update_channels_wk); + + wiphy_lock(wiphy); dfs_region = reg_get_dfs_region(wiphy); if (dfs_region == NL80211_DFS_UNSET) - return -EINVAL; + goto unlock; err = nl80211_parse_chandef(rdev, info, &chandef); if (err) - return err; - - if (netif_carrier_ok(dev)) - return -EBUSY; - - if (wdev->cac_started) - return -EBUSY; + goto unlock; err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype); if (err < 0) - return err; + goto unlock; - if (err == 0) - return -EINVAL; + if (err == 0) { + err = -EINVAL; + goto unlock; + } - if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) - return -EINVAL; + if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) { + err = -EINVAL; + goto unlock; + } + + if (nla_get_flag(info->attrs[NL80211_ATTR_RADAR_BACKGROUND])) { + err = cfg80211_start_background_radar_detection(rdev, wdev, + &chandef); + goto unlock; + } + + if (netif_carrier_ok(dev)) { + err = -EBUSY; + goto unlock; + } + + if (wdev->cac_started) { + err = -EBUSY; + goto unlock; + } /* CAC start is offloaded to HW and can't be started manually */ - if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) - return -EOPNOTSUPP; + if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) { + err = -EOPNOTSUPP; + goto unlock; + } - if (!rdev->ops->start_radar_detection) - return -EOPNOTSUPP; + if (!rdev->ops->start_radar_detection) { + err = -EOPNOTSUPP; + goto unlock; + } cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, &chandef); if (WARN_ON(!cac_time_ms)) @@ -9128,11 +9798,14 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms); if (!err) { - wdev->chandef = chandef; + wdev->links[0].ap.chandef = chandef; wdev->cac_started = true; wdev->cac_start_time = jiffies; wdev->cac_time_ms = cac_time_ms; } +unlock: + wiphy_unlock(wiphy); + return err; } @@ -9193,6 +9866,7 @@ static int nl80211_notify_radar_detection(struct sk_buff *skb, static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + 
unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_csa_settings params; @@ -9219,15 +9893,15 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) need_handle_dfs_flag = false; /* useless if AP is not running */ - if (!wdev->beacon_interval) + if (!wdev->links[link_id].ap.beacon_interval) return -ENOTCONN; break; case NL80211_IFTYPE_ADHOC: - if (!wdev->ssid_len) + if (!wdev->u.ibss.ssid_len) return -ENOTCONN; break; case NL80211_IFTYPE_MESH_POINT: - if (!wdev->mesh_id_len) + if (!wdev->u.mesh.id_len) return -ENOTCONN; break; default: @@ -9259,12 +9933,14 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after); if (err) - return err; + goto free; csa_attrs = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*csa_attrs), GFP_KERNEL); - if (!csa_attrs) - return -ENOMEM; + if (!csa_attrs) { + err = -ENOMEM; + goto free; + } err = nla_parse_nested_deprecated(csa_attrs, NL80211_ATTR_MAX, info->attrs[NL80211_ATTR_CSA_IES], @@ -9382,6 +10058,8 @@ skip_beacons: wdev_unlock(wdev); free: + kfree(params.beacon_after.mbssid_ies); + kfree(params.beacon_csa.mbssid_ies); kfree(csa_attrs); return err; } @@ -9394,6 +10072,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, { struct cfg80211_bss *res = &intbss->pub; const struct cfg80211_bss_ies *ies; + unsigned int link_id; void *hdr; struct nlattr *bss; @@ -9498,13 +10177,20 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, switch (wdev->iftype) { case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: - if (intbss == wdev->current_bss && - nla_put_u32(msg, NL80211_BSS_STATUS, - NL80211_BSS_STATUS_ASSOCIATED)) - goto nla_put_failure; + for_each_valid_link(wdev, link_id) { + if (intbss == wdev->links[link_id].client.current_bss && + (nla_put_u32(msg, NL80211_BSS_STATUS, + NL80211_BSS_STATUS_ASSOCIATED) || + (wdev->valid_links && + (nla_put_u8(msg, NL80211_BSS_MLO_LINK_ID, + link_id) || + nla_put(msg, NL80211_BSS_MLD_ADDR, ETH_ALEN, + wdev->u.client.connected_addr))))) + goto nla_put_failure; + } break; case NL80211_IFTYPE_ADHOC: - if (intbss == wdev->current_bss && + if (intbss == wdev->u.ibss.current_bss && nla_put_u32(msg, NL80211_BSS_STATUS, NL80211_BSS_STATUS_IBSS_JOINED)) goto nla_put_failure; @@ -9688,7 +10374,9 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) } while (1) { + wdev_lock(wdev); res = rdev_dump_survey(rdev, wdev->netdev, survey_idx, &survey); + wdev_unlock(wdev); if (res == -ENOENT) break; if (res) @@ -9730,11 +10418,12 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct ieee80211_channel *chan; - const u8 *bssid, *ssid, *ie = NULL, *auth_data = NULL; - int err, ssid_len, ie_len = 0, auth_data_len = 0; + const u8 *bssid, *ssid; + int err, ssid_len; enum nl80211_auth_type auth_type; struct key_parse key; bool local_state_change; + struct cfg80211_auth_request req = {}; u32 freq; if (!info->attrs[NL80211_ATTR_MAC]) @@ -9805,8 +10494,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); if (info->attrs[NL80211_ATTR_IE]) { - ie = nla_data(info->attrs[NL80211_ATTR_IE]); - ie_len = 
nla_len(info->attrs[NL80211_ATTR_IE]); + req.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); } auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]); @@ -9826,8 +10515,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) auth_type != NL80211_AUTHTYPE_FILS_SK_PFS && auth_type != NL80211_AUTHTYPE_FILS_PK) return -EINVAL; - auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); - auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); + req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); + req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); } local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; @@ -9839,12 +10528,31 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (local_state_change) return 0; + req.auth_type = auth_type; + req.key = key.p.key; + req.key_len = key.p.key_len; + req.key_idx = key.idx; + req.link_id = nl80211_link_id_or_invalid(info->attrs); + if (req.link_id >= 0) { + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; + if (!info->attrs[NL80211_ATTR_MLD_ADDR]) + return -EINVAL; + req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + } + + req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY_ANY); + if (!req.bss) + return -ENOENT; + wdev_lock(dev->ieee80211_ptr); - err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid, - ssid, ssid_len, ie, ie_len, - key.p.key, key.p.key_len, key.idx, - auth_data, auth_data_len); + err = cfg80211_mlme_auth(rdev, dev, &req); wdev_unlock(dev->ieee80211_ptr); + + cfg80211_put_bss(&rdev->wiphy, req.bss); + return err; } @@ -9948,7 +10656,7 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (len % sizeof(u32)) return -EINVAL; - if (settings->n_akm_suites > NL80211_MAX_NR_AKM_SUITES) + if (settings->n_akm_suites > rdev->wiphy.max_num_akm_suites) return -EINVAL; memcpy(settings->akm_suites, data, len); @@ -9986,23 +10694,55 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, return 0; } +static struct cfg80211_bss *nl80211_assoc_bss(struct cfg80211_registered_device *rdev, + const u8 *ssid, int ssid_len, + struct nlattr **attrs, + const u8 **bssid_out) +{ + struct ieee80211_channel *chan; + struct cfg80211_bss *bss; + const u8 *bssid; + u32 freq; + + if (!attrs[NL80211_ATTR_MAC] || !attrs[NL80211_ATTR_WIPHY_FREQ]) + return ERR_PTR(-EINVAL); + + bssid = nla_data(attrs[NL80211_ATTR_MAC]); + + freq = MHZ_TO_KHZ(nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ])); + if (attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); + if (!chan) + return ERR_PTR(-EINVAL); + + bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, + ssid, ssid_len, + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY_ANY); + if (!bss) + return ERR_PTR(-ENOENT); + + *bssid_out = bssid; + return bss; +} + static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; - struct ieee80211_channel *chan; struct cfg80211_assoc_request req = {}; + struct nlattr **attrs = NULL; const u8 *bssid, *ssid; - int err, ssid_len = 0; - u32 freq; + unsigned int link_id; + int err, ssid_len; if (dev->ieee80211_ptr->conn_owner_nlportid && dev->ieee80211_ptr->conn_owner_nlportid != 
info->snd_portid) return -EPERM; - if (!info->attrs[NL80211_ATTR_MAC] || - !info->attrs[NL80211_ATTR_SSID] || - !info->attrs[NL80211_ATTR_WIPHY_FREQ]) + if (!info->attrs[NL80211_ATTR_SSID]) return -EINVAL; if (!rdev->ops->assoc) @@ -10012,22 +10752,19 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - - freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); - if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) - freq += - nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); - chan = nl80211_get_valid_chan(&rdev->wiphy, freq); - if (!chan) - return -EINVAL; - ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); if (info->attrs[NL80211_ATTR_IE]) { req.ie = nla_data(info->attrs[NL80211_ATTR_IE]); req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + + if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + req.ie, req.ie_len)) { + GENL_SET_ERR_MSG(info, + "non-inheritance makes no sense"); + return -EINVAL; + } } if (info->attrs[NL80211_ATTR_USE_MFP]) { @@ -10064,6 +10801,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE])) req.flags |= ASSOC_REQ_DISABLE_HE; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) + req.flags |= ASSOC_REQ_DISABLE_EHT; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&req.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -10112,12 +10852,113 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) sizeof(req.s1g_capa)); } + req.link_id = nl80211_link_id_or_invalid(info->attrs); + + if (info->attrs[NL80211_ATTR_MLO_LINKS]) { + unsigned int attrsize = NUM_NL80211_ATTR * sizeof(*attrs); + struct nlattr *link; + int rem = 0; + + if (req.link_id < 0) + return -EINVAL; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_MAC] || + info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_MLD_ADDR]) + return -EINVAL; + + req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + + attrs = kzalloc(attrsize, GFP_KERNEL); + if (!attrs) + return -ENOMEM; + + nla_for_each_nested(link, + info->attrs[NL80211_ATTR_MLO_LINKS], + rem) { + memset(attrs, 0, attrsize); + + nla_parse_nested(attrs, NL80211_ATTR_MAX, + link, NULL, NULL); + + if (!attrs[NL80211_ATTR_MLO_LINK_ID]) { + err = -EINVAL; + goto free; + } + + link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]); + /* cannot use the same link ID again */ + if (req.links[link_id].bss) { + err = -EINVAL; + goto free; + } + req.links[link_id].bss = + nl80211_assoc_bss(rdev, ssid, ssid_len, attrs, + &bssid); + if (IS_ERR(req.links[link_id].bss)) { + err = PTR_ERR(req.links[link_id].bss); + req.links[link_id].bss = NULL; + goto free; + } + + if (attrs[NL80211_ATTR_IE]) { + req.links[link_id].elems = + nla_data(attrs[NL80211_ATTR_IE]); + req.links[link_id].elems_len = + nla_len(attrs[NL80211_ATTR_IE]); + + if (cfg80211_find_elem(WLAN_EID_FRAGMENT, + req.links[link_id].elems, + req.links[link_id].elems_len)) { + GENL_SET_ERR_MSG(info, + "cannot deal with fragmentation"); + err = -EINVAL; + goto free; + } + + if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + req.links[link_id].elems, + req.links[link_id].elems_len)) { + GENL_SET_ERR_MSG(info, + "cannot deal with non-inheritance"); + err = 
-EINVAL; + goto free; + } + } + } + + if (!req.links[req.link_id].bss) { + err = -EINVAL; + goto free; + } + + if (req.links[req.link_id].elems_len) { + GENL_SET_ERR_MSG(info, + "cannot have per-link elems on assoc link"); + err = -EINVAL; + goto free; + } + + kfree(attrs); + attrs = NULL; + } else { + if (req.link_id >= 0) + return -EINVAL; + + req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs, + &bssid); + if (IS_ERR(req.bss)) + return PTR_ERR(req.bss); + } + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); - err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, - ssid, ssid_len, &req); + err = cfg80211_mlme_assoc(rdev, dev, &req); if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) { dev->ieee80211_ptr->conn_owner_nlportid = @@ -10129,6 +10970,12 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) wdev_unlock(dev->ieee80211_ptr); } +free: + for (link_id = 0; link_id < ARRAY_SIZE(req.links); link_id++) + cfg80211_put_bss(&rdev->wiphy, req.links[link_id].bss); + cfg80211_put_bss(&rdev->wiphy, req.bss); + kfree(attrs); + return err; } @@ -10331,6 +11178,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_VHT_IBSS)) return -EINVAL; break; + case NL80211_CHAN_WIDTH_320: + return -EINVAL; default: return -EINVAL; } @@ -10432,7 +11281,6 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; int mcast_rate[NUM_NL80211_BANDS]; u32 nla_rate; - int err; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && @@ -10451,9 +11299,7 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) if (!nl80211_parse_mcast_rate(rdev, mcast_rate, nla_rate)) return -EINVAL; - err = rdev_set_mcast_rate(rdev, dev, mcast_rate); - - return err; + return rdev_set_mcast_rate(rdev, dev, mcast_rate); } static struct sk_buff * @@ -10850,6 +11696,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE])) connect.flags |= ASSOC_REQ_DISABLE_HE; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) + connect.flags |= ASSOC_REQ_DISABLE_EHT; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&connect.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -10937,6 +11786,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.flags |= CONNECT_REQ_EXTERNAL_AUTH_SUPPORT; } + if (nla_get_flag(info->attrs[NL80211_ATTR_MLO_SUPPORT])) + connect.flags |= CONNECT_REQ_MLO_SUPPORT; + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, @@ -11030,7 +11882,7 @@ static int nl80211_update_connect_params(struct sk_buff *skb, } wdev_lock(dev->ieee80211_ptr); - if (!wdev->current_bss) + if (!wdev->connected) ret = -ENOLINK; else ret = rdev_update_connect_params(rdev, dev, &connect, changed); @@ -11243,9 +12095,9 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_chan_def chandef; - const struct cfg80211_chan_def *compat_chandef; struct sk_buff *msg; void *hdr; u64 cookie; @@ -11275,10 +12127,22 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, return err; wdev_lock(wdev); - 
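/*
 * Aside (hedged summary, not patch text): the remain-on-channel check
 * rewritten just below no longer compares against a single wdev->chandef,
 * since an MLO wdev may beacon on several links.  The new flow is roughly:
 *
 *	if (!cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) {
 *		const struct cfg80211_chan_def *oper;
 *
 *		oper = wdev_chandef(wdev, link_id);
 *		if (!oper ||
 *		    cfg80211_chandef_compatible(&chandef, oper) != &chandef)
 *			return -EBUSY;	/* would have to leave the channel */
 *	}
 *
 * i.e. the request is only allowed when it stays on (a compatible subset
 * of) the given link's own operating channel.
 */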
if (!cfg80211_off_channel_oper_allowed(wdev) && - !cfg80211_chandef_identical(&wdev->chandef, &chandef)) { - compat_chandef = cfg80211_chandef_compatible(&wdev->chandef, - &chandef); + if (!cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) { + const struct cfg80211_chan_def *oper_chandef, *compat_chandef; + + oper_chandef = wdev_chandef(wdev, link_id); + + if (WARN_ON(!oper_chandef)) { + /* cannot happen since we must beacon to get here */ + WARN_ON(1); + wdev_unlock(wdev); + return -EBUSY; + } + + /* note: returns first one if identical chandefs */ + compat_chandef = cfg80211_chandef_compatible(&chandef, + oper_chandef); + if (compat_chandef != &chandef) { wdev_unlock(wdev); return -EBUSY; @@ -11340,6 +12204,7 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_bitrate_mask mask; + unsigned int link_id = nl80211_link_id(info->attrs); struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -11351,11 +12216,11 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, wdev_lock(wdev); err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &mask, - dev, true); + dev, true, link_id); if (err) goto out; - err = rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + err = rdev_set_bitrate_mask(rdev, dev, link_id, NULL, &mask); out: wdev_unlock(wdev); return err; @@ -11480,10 +12345,23 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) return -EINVAL; wdev_lock(wdev); - if (params.offchan && !cfg80211_off_channel_oper_allowed(wdev)) { + if (params.offchan && + !cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) { wdev_unlock(wdev); return -EBUSY; } + + params.link_id = nl80211_link_id_or_invalid(info->attrs); + /* + * This now races due to the unlock, but we cannot check + * the valid links for the _station_ anyway, so that's up + * to the driver. + */ + if (params.link_id >= 0 && + !(wdev->valid_links & BIT(params.link_id))) { + wdev_unlock(wdev); + return -EINVAL; + } wdev_unlock(wdev); params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); @@ -11698,12 +12576,13 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, * connection is established and enough beacons received to calculate * the average. 
*/ - if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss && + if (!wdev->cqm_config->last_rssi_event_value && + wdev->links[0].client.current_bss && rdev->ops->get_station) { struct station_info sinfo = {}; u8 *mac_addr; - mac_addr = wdev->current_bss->pub.bssid; + mac_addr = wdev->links[0].client.current_bss->pub.bssid; err = rdev_get_station(rdev, dev, mac_addr, &sinfo); if (err) @@ -11964,7 +12843,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &setup.beacon_rate, - dev, false); + dev, false, 0); if (err) return err; @@ -12934,7 +13813,7 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]); wdev_lock(wdev); - if (!wdev->current_bss) { + if (!wdev->connected) { err = -ENOTCONN; goto out; } @@ -14203,7 +15082,7 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) switch (wdev->iftype) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (wdev->current_bss) + if (wdev->connected) break; err = -ENOTCONN; goto out; @@ -14376,13 +15255,13 @@ static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info) return -EINVAL; wdev_lock(wdev); - if (!wdev->current_bss) { + if (!wdev->connected) { ret = -ENOTCONN; goto out; } pmk_conf.aa = nla_data(info->attrs[NL80211_ATTR_MAC]); - if (memcmp(pmk_conf.aa, wdev->current_bss->pub.bssid, ETH_ALEN)) { + if (memcmp(pmk_conf.aa, wdev->u.client.connected_addr, ETH_ALEN)) { ret = -EINVAL; goto out; } @@ -14486,6 +15365,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) u16 proto; bool noencrypt; u64 cookie = 0; + int link_id; int err; if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -14510,9 +15390,13 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_MESH_POINT: break; case NL80211_IFTYPE_ADHOC: + if (wdev->u.ibss.current_bss) + break; + err = -ENOTCONN; + goto out; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (wdev->current_bss) + if (wdev->connected) break; err = -ENOTCONN; goto out; @@ -14530,8 +15414,10 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) noencrypt = nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); + link_id = nl80211_link_id_or_invalid(info->attrs); + err = rdev_tx_control_port(rdev, dev, buf, len, - dest, cpu_to_be16(proto), noencrypt, + dest, cpu_to_be16(proto), noencrypt, link_id, dont_wait_for_ack ? 
NULL : &cookie); if (!err && !dont_wait_for_ack) nl_set_extack_cookie_u64(info->extack, cookie); @@ -14548,12 +15434,14 @@ static int nl80211_get_ftm_responder_stats(struct sk_buff *skb, struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_ftm_responder_stats ftm_stats = {}; + unsigned int link_id = nl80211_link_id(info->attrs); struct sk_buff *msg; void *hdr; struct nlattr *ftm_stats_attr; int err; - if (wdev->iftype != NL80211_IFTYPE_AP || !wdev->beacon_interval) + if (wdev->iftype != NL80211_IFTYPE_AP || + !wdev->links[link_id].ap.beacon_interval) return -EOPNOTSUPP; err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats); @@ -14683,7 +15571,8 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) static int parse_tid_conf(struct cfg80211_registered_device *rdev, struct nlattr *attrs[], struct net_device *dev, struct cfg80211_tid_cfg *tid_conf, - struct genl_info *info, const u8 *peer) + struct genl_info *info, const u8 *peer, + unsigned int link_id) { struct netlink_ext_ack *extack = info->extack; u64 mask; @@ -14758,7 +15647,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, attr = NL80211_TID_CONFIG_ATTR_TX_RATE; err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, &tid_conf->txrate_mask, dev, - true); + true, link_id); if (err) return err; @@ -14785,6 +15674,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb, { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct nlattr *attrs[NL80211_TID_CONFIG_ATTR_MAX + 1]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; struct cfg80211_tid_config *tid_config; struct nlattr *tid; @@ -14812,6 +15702,8 @@ static int nl80211_set_tid_config(struct sk_buff *skb, if (info->attrs[NL80211_ATTR_MAC]) tid_config->peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + wdev_lock(dev->ieee80211_ptr); + nla_for_each_nested(tid, info->attrs[NL80211_ATTR_TID_CONFIG], rem_conf) { ret = nla_parse_nested(attrs, NL80211_TID_CONFIG_ATTR_MAX, @@ -14822,7 +15714,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb, ret = parse_tid_conf(rdev, attrs, dev, &tid_config->tid_conf[conf_idx], - info, tid_config->peer); + info, tid_config->peer, link_id); if (ret) goto bad_tid_conf; @@ -14833,6 +15725,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb, bad_tid_conf: kfree(tid_config); + wdev_unlock(dev->ieee80211_ptr); return ret; } @@ -14932,10 +15825,228 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) wdev_unlock(wdev); out: + kfree(params.beacon_next.mbssid_ies); + kfree(params.beacon_color_change.mbssid_ies); kfree(tb); return err; } +static int nl80211_set_fils_aad(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_fils_aad fils_aad = {}; + u8 *nonces; + + if (!info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_FILS_KEK] || + !info->attrs[NL80211_ATTR_FILS_NONCES]) + return -EINVAL; + + fils_aad.macaddr = nla_data(info->attrs[NL80211_ATTR_MAC]); + fils_aad.kek_len = nla_len(info->attrs[NL80211_ATTR_FILS_KEK]); + fils_aad.kek = nla_data(info->attrs[NL80211_ATTR_FILS_KEK]); + nonces = nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]); + fils_aad.snonce = nonces; + fils_aad.anonce = nonces + FILS_NONCE_LEN; + + return rdev_set_fils_aad(rdev, dev, &fils_aad); +} + +static int nl80211_add_link(struct sk_buff *skb, struct 
genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + int ret; + + if (!(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + break; + default: + return -EINVAL; + } + + if (!info->attrs[NL80211_ATTR_MAC] || + !is_valid_ether_addr(nla_data(info->attrs[NL80211_ATTR_MAC]))) + return -EINVAL; + + wdev_lock(wdev); + wdev->valid_links |= BIT(link_id); + ether_addr_copy(wdev->links[link_id].addr, + nla_data(info->attrs[NL80211_ATTR_MAC])); + + ret = rdev_add_intf_link(rdev, wdev, link_id); + if (ret) { + wdev->valid_links &= ~BIT(link_id); + eth_zero_addr(wdev->links[link_id].addr); + } + wdev_unlock(wdev); + + return ret; +} + +static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info) +{ + unsigned int link_id = nl80211_link_id(info->attrs); + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + + /* cannot remove if there's no link */ + if (!info->attrs[NL80211_ATTR_MLO_LINK_ID]) + return -EINVAL; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + break; + default: + return -EINVAL; + } + + wdev_lock(wdev); + cfg80211_remove_link(wdev, link_id); + wdev_unlock(wdev); + + return 0; +} + +static int +nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info, + bool add) +{ + struct link_station_parameters params = {}; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + int err; + + if ((add && !rdev->ops->add_link_station) || + (!add && !rdev->ops->mod_link_station)) + return -EOPNOTSUPP; + + if (add && !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MLD_ADDR]) + return -EINVAL; + + if (add && !info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) + return -EINVAL; + + params.mld_mac = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + + if (info->attrs[NL80211_ATTR_MAC]) { + params.link_mac = nla_data(info->attrs[NL80211_ATTR_MAC]); + if (!is_valid_ether_addr(params.link_mac)) + return -EINVAL; + } + + if (!info->attrs[NL80211_ATTR_MLO_LINK_ID]) + return -EINVAL; + + params.link_id = nla_get_u8(info->attrs[NL80211_ATTR_MLO_LINK_ID]); + + if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) { + params.supported_rates = + nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + params.supported_rates_len = + nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + } + + if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) + params.ht_capa = + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) + params.vht_capa = + nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params.he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params.he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) { + params.eht_capa = + nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + params.eht_capa_len = + nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + + if (!ieee80211_eht_capa_size_ok((const u8 *)params.he_capa, + (const u8 *)params.eht_capa, + params.eht_capa_len, + false)) + return -EINVAL; + } + } + + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) + params.he_6ghz_capa = + nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + + 
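/*
 * Aside (note, not patch text): as in the NEW_STATION/SET_STATION paths
 * earlier in this patch, the EHT capability attribute parsed above is only
 * accepted when nested under an HE capability, and
 * ieee80211_eht_capa_size_ok() re-validates the attribute length: the
 * expected size of the EHT MCS/NSS set depends on bits in the HE PHY
 * capabilities, so a fixed nla_policy length check alone would not be
 * sufficient.
 */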
if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { + params.opmode_notif_used = true; + params.opmode_notif = + nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); + } + + err = nl80211_parse_sta_txpower_setting(info, &params.txpwr, + &params.txpwr_set); + if (err) + return err; + + wdev_lock(dev->ieee80211_ptr); + if (add) + err = rdev_add_link_station(rdev, dev, &params); + else + err = rdev_mod_link_station(rdev, dev, &params); + wdev_unlock(dev->ieee80211_ptr); + + return err; +} + +static int +nl80211_add_link_station(struct sk_buff *skb, struct genl_info *info) +{ + return nl80211_add_mod_link_station(skb, info, true); +} + +static int +nl80211_modify_link_station(struct sk_buff *skb, struct genl_info *info) +{ + return nl80211_add_mod_link_station(skb, info, false); +} + +static int +nl80211_remove_link_station(struct sk_buff *skb, struct genl_info *info) +{ + struct link_station_del_parameters params = {}; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + int ret; + + if (!rdev->ops->del_link_station) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_MLD_ADDR] || + !info->attrs[NL80211_ATTR_MLO_LINK_ID]) + return -EINVAL; + + params.mld_mac = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + params.link_id = nla_get_u8(info->attrs[NL80211_ATTR_MLO_LINK_ID]); + + wdev_lock(dev->ieee80211_ptr); + ret = rdev_del_link_station(rdev, dev, &params); + wdev_unlock(dev->ieee80211_ptr); + + return ret; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -14948,38 +16059,121 @@ out: NL80211_FLAG_CHECK_NETDEV_UP) #define NL80211_FLAG_CLEAR_SKB 0x20 #define NL80211_FLAG_NO_WIPHY_MTX 0x40 +#define NL80211_FLAG_MLO_VALID_LINK_ID 0x80 +#define NL80211_FLAG_MLO_UNSUPPORTED 0x100 + +#define INTERNAL_FLAG_SELECTORS(__sel) \ + SELECTOR(__sel, NONE, 0) /* must be first */ \ + SELECTOR(__sel, WIPHY, \ + NL80211_FLAG_NEED_WIPHY) \ + SELECTOR(__sel, WDEV, \ + NL80211_FLAG_NEED_WDEV) \ + SELECTOR(__sel, NETDEV, \ + NL80211_FLAG_NEED_NETDEV) \ + SELECTOR(__sel, NETDEV_LINK, \ + NL80211_FLAG_NEED_NETDEV | \ + NL80211_FLAG_MLO_VALID_LINK_ID) \ + SELECTOR(__sel, NETDEV_NO_MLO, \ + NL80211_FLAG_NEED_NETDEV | \ + NL80211_FLAG_MLO_UNSUPPORTED) \ + SELECTOR(__sel, WIPHY_RTNL, \ + NL80211_FLAG_NEED_WIPHY | \ + NL80211_FLAG_NEED_RTNL) \ + SELECTOR(__sel, WIPHY_RTNL_NOMTX, \ + NL80211_FLAG_NEED_WIPHY | \ + NL80211_FLAG_NEED_RTNL | \ + NL80211_FLAG_NO_WIPHY_MTX) \ + SELECTOR(__sel, WDEV_RTNL, \ + NL80211_FLAG_NEED_WDEV | \ + NL80211_FLAG_NEED_RTNL) \ + SELECTOR(__sel, NETDEV_RTNL, \ + NL80211_FLAG_NEED_NETDEV | \ + NL80211_FLAG_NEED_RTNL) \ + SELECTOR(__sel, NETDEV_UP, \ + NL80211_FLAG_NEED_NETDEV_UP) \ + SELECTOR(__sel, NETDEV_UP_LINK, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_MLO_VALID_LINK_ID) \ + SELECTOR(__sel, NETDEV_UP_NO_MLO, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_MLO_UNSUPPORTED) \ + SELECTOR(__sel, NETDEV_UP_NO_MLO_CLEAR, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_CLEAR_SKB | \ + NL80211_FLAG_MLO_UNSUPPORTED) \ + SELECTOR(__sel, NETDEV_UP_NOTMX, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_NO_WIPHY_MTX) \ + SELECTOR(__sel, NETDEV_UP_NOTMX_NOMLO, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_NO_WIPHY_MTX | \ + NL80211_FLAG_MLO_UNSUPPORTED) \ + SELECTOR(__sel, NETDEV_UP_CLEAR, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_CLEAR_SKB) \ + SELECTOR(__sel, WDEV_UP, \ + NL80211_FLAG_NEED_WDEV_UP) \ + SELECTOR(__sel, WDEV_UP_LINK, \ + NL80211_FLAG_NEED_WDEV_UP | \ +
NL80211_FLAG_MLO_VALID_LINK_ID) \ + SELECTOR(__sel, WDEV_UP_RTNL, \ + NL80211_FLAG_NEED_WDEV_UP | \ + NL80211_FLAG_NEED_RTNL) \ + SELECTOR(__sel, WIPHY_CLEAR, \ + NL80211_FLAG_NEED_WIPHY | \ + NL80211_FLAG_CLEAR_SKB) + +enum nl80211_internal_flags_selector { +#define SELECTOR(_, name, value) NL80211_IFL_SEL_##name, + INTERNAL_FLAG_SELECTORS(_) +#undef SELECTOR +}; + +static u32 nl80211_internal_flags[] = { +#define SELECTOR(_, name, value) [NL80211_IFL_SEL_##name] = value, + INTERNAL_FLAG_SELECTORS(_) +#undef SELECTOR +}; static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = NULL; - struct wireless_dev *wdev; - struct net_device *dev; + struct wireless_dev *wdev = NULL; + struct net_device *dev = NULL; + u32 internal_flags; + int err; + + if (WARN_ON(ops->internal_flags >= ARRAY_SIZE(nl80211_internal_flags))) + return -EINVAL; + + internal_flags = nl80211_internal_flags[ops->internal_flags]; rtnl_lock(); - if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) { + if (internal_flags & NL80211_FLAG_NEED_WIPHY) { rdev = cfg80211_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { - rtnl_unlock(); - return PTR_ERR(rdev); + err = PTR_ERR(rdev); + goto out_unlock; } info->user_ptr[0] = rdev; - } else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV || - ops->internal_flags & NL80211_FLAG_NEED_WDEV) { + } else if (internal_flags & NL80211_FLAG_NEED_NETDEV || + internal_flags & NL80211_FLAG_NEED_WDEV) { wdev = __cfg80211_wdev_from_attrs(NULL, genl_info_net(info), info->attrs); if (IS_ERR(wdev)) { - rtnl_unlock(); - return PTR_ERR(wdev); + err = PTR_ERR(wdev); + goto out_unlock; } dev = wdev->netdev; + dev_hold(dev); rdev = wiphy_to_rdev(wdev->wiphy); - if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) { + if (internal_flags & NL80211_FLAG_NEED_NETDEV) { if (!dev) { - rtnl_unlock(); - return -EINVAL; + err = -EINVAL; + goto out_unlock; } info->user_ptr[1] = dev; @@ -14987,32 +16181,68 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, info->user_ptr[1] = wdev; } - if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP && + if (internal_flags & NL80211_FLAG_CHECK_NETDEV_UP && !wdev_running(wdev)) { - rtnl_unlock(); - return -ENETDOWN; + err = -ENETDOWN; + goto out_unlock; } - dev_hold(dev); info->user_ptr[0] = rdev; } - if (rdev && !(ops->internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) { + if (internal_flags & NL80211_FLAG_MLO_VALID_LINK_ID) { + struct nlattr *link_id = info->attrs[NL80211_ATTR_MLO_LINK_ID]; + + if (!wdev) { + err = -EINVAL; + goto out_unlock; + } + + /* MLO -> require valid link ID */ + if (wdev->valid_links && + (!link_id || + !(wdev->valid_links & BIT(nla_get_u8(link_id))))) { + err = -EINVAL; + goto out_unlock; + } + + /* non-MLO -> no link ID attribute accepted */ + if (!wdev->valid_links && link_id) { + err = -EINVAL; + goto out_unlock; + } + } + + if (internal_flags & NL80211_FLAG_MLO_UNSUPPORTED) { + if (info->attrs[NL80211_ATTR_MLO_LINK_ID] || + (wdev && wdev->valid_links)) { + err = -EINVAL; + goto out_unlock; + } + } + + if (rdev && !(internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) { wiphy_lock(&rdev->wiphy); /* we keep the mutex locked until post_doit */ __release(&rdev->wiphy.mtx); } - if (!(ops->internal_flags & NL80211_FLAG_NEED_RTNL)) + if (!(internal_flags & NL80211_FLAG_NEED_RTNL)) rtnl_unlock(); return 0; +out_unlock: + rtnl_unlock(); + dev_put(dev); + return err; } static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff 
*skb, struct genl_info *info) { + u32 internal_flags = nl80211_internal_flags[ops->internal_flags]; + if (info->user_ptr[1]) { - if (ops->internal_flags & NL80211_FLAG_NEED_WDEV) { + if (internal_flags & NL80211_FLAG_NEED_WDEV) { struct wireless_dev *wdev = info->user_ptr[1]; dev_put(wdev->netdev); @@ -15022,7 +16252,7 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, } if (info->user_ptr[0] && - !(ops->internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) { + !(internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; /* we kept the mutex locked since pre_doit */ @@ -15030,7 +16260,7 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, wiphy_unlock(&rdev->wiphy); } - if (ops->internal_flags & NL80211_FLAG_NEED_RTNL) + if (internal_flags & NL80211_FLAG_NEED_RTNL) rtnl_unlock(); /* If needed, clear the netlink message payload from the SKB @@ -15038,7 +16268,7 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, * the heap after the SKB is freed. The netlink message header * is still needed for further processing, so leave it intact. */ - if (ops->internal_flags & NL80211_FLAG_CLEAR_SKB) { + if (internal_flags & NL80211_FLAG_CLEAR_SKB) { struct nlmsghdr *nlh = nlmsg_hdr(skb); memset(nlmsg_data(nlh), 0, nlmsg_len(nlh)); @@ -15150,6 +16380,11 @@ error: return err; } +#define SELECTOR(__sel, name, value) \ + ((__sel) == (value)) ? NL80211_IFL_SEL_##name : +int __missing_selector(void); +#define IFLAGS(__val) INTERNAL_FLAG_SELECTORS(__val) __missing_selector() + static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -15158,7 +16393,7 @@ static const struct genl_ops nl80211_ops[] = { .dumpit = nl80211_dump_wiphy, .done = nl80211_dump_wiphy_done, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WIPHY, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY), }, }; @@ -15175,112 +16410,117 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_interface, .dumpit = nl80211_dump_interface, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV), }, { .cmd = NL80211_CMD_SET_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_interface, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL), }, { .cmd = NL80211_CMD_NEW_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_interface, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL | - /* we take the wiphy mutex later ourselves */ - NL80211_FLAG_NO_WIPHY_MTX, + .internal_flags = + IFLAGS(NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL | + /* we take the wiphy mutex later ourselves */ + NL80211_FLAG_NO_WIPHY_MTX), }, { .cmd = NL80211_CMD_DEL_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_interface, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV | + NL80211_FLAG_NEED_RTNL), }, { .cmd = NL80211_CMD_GET_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = 
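The IFLAGS() wrappers used throughout these op tables exist because genl_ops' internal_flags field is only a u8, while the nl80211 flag word now needs bit 0x100 for NL80211_FLAG_MLO_UNSUPPORTED. The SELECTOR() x-macro maps each allowed flag combination to a small enum index into nl80211_internal_flags[], which nl80211_pre_doit() looks up at runtime. As an illustrative sketch of the expansion (not literal patch text):

	IFLAGS(NL80211_FLAG_NEED_WIPHY)
	/* expands to */
	((NL80211_FLAG_NEED_WIPHY) == (0)) ? NL80211_IFL_SEL_NONE :
	((NL80211_FLAG_NEED_WIPHY) == (NL80211_FLAG_NEED_WIPHY)) ? NL80211_IFL_SEL_WIPHY :
	/* ... remaining selectors ... */ __missing_selector()

The compiler constant-folds this chain to NL80211_IFL_SEL_WIPHY; a flag combination with no matching SELECTOR() entry leaves a call to the undefined __missing_selector() in the object code and therefore fails at link time.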
NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_SET_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_CLEAR_SKB, + /* cannot use NL80211_FLAG_MLO_VALID_LINK_ID, depends on key */ + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_NEW_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_CLEAR_SKB, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_DEL_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_SET_BEACON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_set_beacon, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_START_AP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_start_ap, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_STOP_AP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_stop_ap, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_GET_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_station, .dumpit = nl80211_dump_station, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), }, { .cmd = NL80211_CMD_SET_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_NEW_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_DEL_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_GET_MPATH, @@ -15288,7 +16528,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_mpath, .dumpit = nl80211_dump_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_GET_MPP, @@ -15296,42 +16536,41 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_mpp, .dumpit = nl80211_dump_mpp, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + 
.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_SET_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_NEW_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_DEL_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_SET_BSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_bss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_GET_REG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_reg_do, .dumpit = nl80211_get_reg_dump, - .internal_flags = 0, /* can be retrieved by unprivileged users */ }, #ifdef CONFIG_CFG80211_CRDA_SUPPORT @@ -15340,7 +16579,6 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_reg, .flags = GENL_ADMIN_PERM, - .internal_flags = 0, }, #endif { @@ -15360,28 +16598,28 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_mesh_config, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_SET_MESH_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_mesh_config, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_TRIGGER_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_trigger_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_ABORT_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_abort_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_GET_SCAN, @@ -15393,60 +16631,58 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_sched_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_STOP_SCHED_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_stop_sched_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_AUTHENTICATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_authenticate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = 
NL80211_FLAG_NEED_NETDEV_UP | - 0 | - NL80211_FLAG_CLEAR_SKB, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_ASSOCIATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_associate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - 0 | - NL80211_FLAG_CLEAR_SKB, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_DEAUTHENTICATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_deauthenticate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_DISASSOCIATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_disassociate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_JOIN_IBSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_ibss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_LEAVE_IBSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_ibss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, #ifdef CONFIG_NL80211_TESTMODE { @@ -15455,7 +16691,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_testmode_do, .dumpit = nl80211_testmode_dump, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY), }, #endif { @@ -15463,34 +16699,32 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_connect, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - 0 | - NL80211_FLAG_CLEAR_SKB, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_connect_params, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - 0 | - NL80211_FLAG_CLEAR_SKB, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_DISCONNECT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_disconnect, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_SET_WIPHY_NETNS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_wiphy_netns, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL | - NL80211_FLAG_NO_WIPHY_MTX, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_NO_WIPHY_MTX), }, { .cmd = NL80211_CMD_GET_SURVEY, @@ -15502,121 +16736,124 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_setdel_pmksa, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - 0 | - NL80211_FLAG_CLEAR_SKB, + 
.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_DEL_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_setdel_pmksa, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_FLUSH_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_flush_pmksa, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_remain_on_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + /* FIXME: requiring a link ID here is probably not good */ + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_cancel_remain_on_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_tx_bitrate_mask, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_REGISTER_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV), }, { .cmd = NL80211_CMD_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_FRAME_WAIT_CANCEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_mgmt_cancel_wait, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_SET_POWER_SAVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_power_save, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), }, { .cmd = NL80211_CMD_GET_POWER_SAVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_power_save, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), }, { .cmd = NL80211_CMD_SET_CQM, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_cqm, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), }, { .cmd = NL80211_CMD_SET_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_JOIN_MESH, .validate = GENL_DONT_VALIDATE_STRICT | 
GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_mesh, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_LEAVE_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_mesh, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_JOIN_OCB, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_ocb, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_LEAVE_OCB, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_ocb, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, #ifdef CONFIG_PM { @@ -15624,14 +16861,14 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_wowlan, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WIPHY, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY), }, { .cmd = NL80211_CMD_SET_WOWLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_wowlan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY), }, #endif { @@ -15639,125 +16876,127 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_rekey_data, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - 0 | - NL80211_FLAG_CLEAR_SKB, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_TDLS_MGMT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_TDLS_OPER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_oper, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_UNEXPECTED_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_unexpected_frame, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), }, { .cmd = NL80211_CMD_PROBE_CLIENT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_probe_client, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_REGISTER_BEACONS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_beacons, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY), }, { .cmd = NL80211_CMD_SET_NOACK_MAP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_noack_map, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = 
IFLAGS(NL80211_FLAG_NEED_NETDEV), }, { .cmd = NL80211_CMD_START_P2P_DEVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_p2p_device, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV | + NL80211_FLAG_NEED_RTNL), }, { .cmd = NL80211_CMD_STOP_P2P_DEVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_stop_p2p_device, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL), }, { .cmd = NL80211_CMD_START_NAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_nan, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV | + NL80211_FLAG_NEED_RTNL), }, { .cmd = NL80211_CMD_STOP_NAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_stop_nan, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL), }, { .cmd = NL80211_CMD_ADD_NAN_FUNCTION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_add_func, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_DEL_NAN_FUNCTION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_del_func, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_CHANGE_NAN_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_change_config, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_SET_MCAST_RATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mcast_rate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), }, { .cmd = NL80211_CMD_SET_MAC_ACL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mac_acl, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_UNSUPPORTED), }, { .cmd = NL80211_CMD_RADAR_DETECT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_radar_detection, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NO_WIPHY_MTX | + NL80211_FLAG_MLO_UNSUPPORTED), }, { .cmd = NL80211_CMD_GET_PROTOCOL_FEATURES, @@ -15769,41 +17008,42 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_ft_ies, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_CRIT_PROTOCOL_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_crit_protocol_start, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = 
NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_crit_protocol_stop, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_GET_COALESCE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_coalesce, - .internal_flags = NL80211_FLAG_NEED_WIPHY, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY), }, { .cmd = NL80211_CMD_SET_COALESCE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_coalesce, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY), }, { .cmd = NL80211_CMD_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_VENDOR, @@ -15811,132 +17051,174 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_vendor_cmd, .dumpit = nl80211_vendor_cmd_dump, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - 0 | - NL80211_FLAG_CLEAR_SKB, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_SET_QOS_MAP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_qos_map, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_ADD_TX_TS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_add_tx_ts, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_UNSUPPORTED), }, { .cmd = NL80211_CMD_DEL_TX_TS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_tx_ts, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_cancel_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_multicast_to_unicast, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), }, { .cmd = NL80211_CMD_SET_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_pmk, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - 0 | - NL80211_FLAG_CLEAR_SKB, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_CLEAR_SKB), }, { .cmd = NL80211_CMD_DEL_PMK, .validate 
= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_pmk, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_EXTERNAL_AUTH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_external_auth, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_CONTROL_PORT_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_control_port, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_ftm_responder_stats, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_PEER_MEASUREMENT_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_pmsr_start, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_NOTIFY_RADAR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_notify_radar_detection, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_UPDATE_OWE_INFO, .doit = nl80211_update_owe_info, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_PROBE_MESH_LINK, .doit = nl80211_probe_mesh_link, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { .cmd = NL80211_CMD_SET_TID_CONFIG, .doit = nl80211_set_tid_config, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_SET_SAR_SPECS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_sar_specs, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL), }, { .cmd = NL80211_CMD_COLOR_CHANGE_REQUEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_color_change, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + }, + { + .cmd = NL80211_CMD_SET_FILS_AAD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = nl80211_set_fils_aad, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + }, + { + .cmd = NL80211_CMD_ADD_LINK, + .doit = nl80211_add_link, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + }, + { + .cmd = NL80211_CMD_REMOVE_LINK, + .doit = nl80211_remove_link, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), + }, + { + .cmd = NL80211_CMD_ADD_LINK_STA, + .doit = nl80211_add_link_station, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = 
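+		/* the new MLO link and link-station commands all require
+		 * an up netdev; all but ADD_LINK also set
+		 * NL80211_FLAG_MLO_VALID_LINK_ID, so nl80211_pre_doit()
+		 * only accepts a link ID that is set in wdev->valid_links
+		 * (and rejects the attribute entirely on a non-MLO wdev) */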
IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), + }, + { + .cmd = NL80211_CMD_MODIFY_LINK_STA, + .doit = nl80211_modify_link_station, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), + }, + { + .cmd = NL80211_CMD_REMOVE_LINK_STA, + .doit = nl80211_remove_link_station, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, }; @@ -16331,13 +17613,13 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, } void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *buf, - size_t len, gfp_t gfp, int uapsd_queues, - const u8 *req_ies, size_t req_ies_len) + struct net_device *netdev, + struct cfg80211_rx_assoc_resp *data) { - nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_ASSOCIATE, gfp, uapsd_queues, - req_ies, req_ies_len, false); + nl80211_send_mlme_event(rdev, netdev, data->buf, data->len, + NL80211_CMD_ASSOCIATE, GFP_KERNEL, + data->uapsd_queues, + data->req_ies, data->req_ies_len, false); } void nl80211_send_deauth(struct cfg80211_registered_device *rdev, @@ -16446,10 +17728,29 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, { struct sk_buff *msg; void *hdr; + unsigned int link; + size_t link_info_size = 0; + const u8 *connected_addr = cr->valid_links ? + cr->ap_mld_addr : cr->links[0].bssid; + + if (cr->valid_links) { + for_each_valid_link(cr, link) { + /* Nested attribute header */ + link_info_size += NLA_HDRLEN; + /* Link ID */ + link_info_size += nla_total_size(sizeof(u8)); + link_info_size += cr->links[link].addr ? + nla_total_size(ETH_ALEN) : 0; + link_info_size += (cr->links[link].bssid || + cr->links[link].bss) ? + nla_total_size(ETH_ALEN) : 0; + } + } msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len + cr->fils.kek_len + cr->fils.pmk_len + - (cr->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); + (cr->fils.pmkid ? WLAN_PMKID_LEN : 0) + link_info_size, + gfp); if (!msg) return; @@ -16461,8 +17762,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - (cr->bssid && - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cr->bssid)) || + (connected_addr && + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, connected_addr)) || nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, cr->status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE : cr->status) || @@ -16488,6 +17789,38 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->fils.pmkid))))) goto nla_put_failure; + if (cr->valid_links) { + int i = 1; + struct nlattr *nested; + + nested = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); + if (!nested) + goto nla_put_failure; + + for_each_valid_link(cr, link) { + struct nlattr *nested_mlo_links; + const u8 *bssid = cr->links[link].bss ? 
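+			/* each entry in the NL80211_ATTR_MLO_LINKS nest is
+			 * itself a nest, indexed from 1, carrying the link
+			 * ID plus, when known, the link BSSID and the local
+			 * link address */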
+ cr->links[link].bss->bssid : + cr->links[link].bssid; + + nested_mlo_links = nla_nest_start(msg, i); + if (!nested_mlo_links) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link) || + (bssid && + nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) || + (cr->links[link].addr && + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, + cr->links[link].addr))) + goto nla_put_failure; + + nla_nest_end(msg, nested_mlo_links); + i++; + } + nla_nest_end(msg, nested); + } + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -16504,11 +17837,32 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, { struct sk_buff *msg; void *hdr; - const u8 *bssid = info->bss ? info->bss->bssid : info->bssid; + size_t link_info_size = 0; + unsigned int link; + const u8 *connected_addr = info->ap_mld_addr ? + info->ap_mld_addr : + (info->links[0].bss ? + info->links[0].bss->bssid : + info->links[0].bssid); + + if (info->valid_links) { + for_each_valid_link(info, link) { + /* Nested attribute header */ + link_info_size += NLA_HDRLEN; + /* Link ID */ + link_info_size += nla_total_size(sizeof(u8)); + link_info_size += info->links[link].addr ? + nla_total_size(ETH_ALEN) : 0; + link_info_size += (info->links[link].bssid || + info->links[link].bss) ? + nla_total_size(ETH_ALEN) : 0; + } + } msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len + info->fils.kek_len + info->fils.pmk_len + - (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); + (info->fils.pmkid ? WLAN_PMKID_LEN : 0) + + link_info_size, gfp); if (!msg) return; @@ -16520,7 +17874,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, connected_addr) || (info->req_ie && nla_put(msg, NL80211_ATTR_REQ_IE, info->req_ie_len, info->req_ie)) || @@ -16539,6 +17893,38 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid))) goto nla_put_failure; + if (info->valid_links) { + int i = 1; + struct nlattr *nested; + + nested = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); + if (!nested) + goto nla_put_failure; + + for_each_valid_link(info, link) { + struct nlattr *nested_mlo_links; + const u8 *bssid = info->links[link].bss ? 
+ info->links[link].bss->bssid : + info->links[link].bssid; + + nested_mlo_links = nla_nest_start(msg, i); + if (!nested_mlo_links) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link) || + (bssid && + nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) || + (info->links[link].addr && + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, + info->links[link].addr))) + goto nla_put_failure; + + nla_nest_end(msg, nested_mlo_links); + i++; + } + nla_nest_end(msg, nested); + } + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -16830,6 +18216,44 @@ static void nl80211_send_remain_on_chan_event( nlmsg_free(msg); } +void cfg80211_assoc_comeback(struct net_device *netdev, + const u8 *ap_addr, u32 timeout) +{ + struct wireless_dev *wdev = netdev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_assoc_comeback(wdev, ap_addr, timeout); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ASSOC_COMEBACK); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, ap_addr) || + nla_put_u32(msg, NL80211_ATTR_TIMEOUT, timeout)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, GFP_KERNEL); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_assoc_comeback); + void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie, struct ieee80211_channel *chan, unsigned int duration, gfp_t gfp) @@ -17039,14 +18463,13 @@ EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame); int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlportid, - int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags, gfp_t gfp) + struct cfg80211_rx_info *info, gfp_t gfp) { struct net_device *netdev = wdev->netdev; struct sk_buff *msg; void *hdr; - msg = nlmsg_new(100 + len, gfp); + msg = nlmsg_new(100 + info->len, gfp); if (!msg) return -ENOMEM; @@ -17061,13 +18484,23 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, netdev->ifindex)) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, freq % 1000) || - (sig_dbm && - nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || - nla_put(msg, NL80211_ATTR_FRAME, len, buf) || - (flags && - nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, flags))) + (info->have_link_id && + nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, info->link_id)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(info->freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, info->freq % 1000) || + (info->sig_dbm && + nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, info->sig_dbm)) || + nla_put(msg, NL80211_ATTR_FRAME, info->len, info->buf) || + (info->flags && + nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, info->flags)) || + (info->rx_tstamp && nla_put_u64_64bit(msg, + NL80211_ATTR_RX_HW_TIMESTAMP, + info->rx_tstamp, + NL80211_ATTR_PAD)) || + (info->ack_tstamp && nla_put_u64_64bit(msg, + NL80211_ATTR_TX_HW_TIMESTAMP, + info->ack_tstamp, + NL80211_ATTR_PAD))) goto nla_put_failure; 
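+	/* the MLO link ID and the RX/TX hardware timestamps above are all
+	 * optional: the link ID is put only when info->have_link_id is set
+	 * and each timestamp only when nonzero, so the message layout is
+	 * unchanged for non-MLO interfaces and for drivers that do not
+	 * report hardware timestamps */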
genlmsg_end(msg, hdr); @@ -17079,8 +18512,8 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, +static void nl80211_frame_tx_status(struct wireless_dev *wdev, + struct cfg80211_tx_status *status, gfp_t gfp, enum nl80211_commands command) { struct wiphy *wiphy = wdev->wiphy; @@ -17090,11 +18523,13 @@ static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie, void *hdr; if (command == NL80211_CMD_FRAME_TX_STATUS) - trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + trace_cfg80211_mgmt_tx_status(wdev, status->cookie, + status->ack); else - trace_cfg80211_control_port_tx_status(wdev, cookie, ack); + trace_cfg80211_control_port_tx_status(wdev, status->cookie, + status->ack); - msg = nlmsg_new(100 + len, gfp); + msg = nlmsg_new(100 + status->len, gfp); if (!msg) return; @@ -17109,10 +18544,16 @@ static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie, netdev->ifindex)) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put(msg, NL80211_ATTR_FRAME, len, buf) || - nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + nla_put(msg, NL80211_ATTR_FRAME, status->len, status->buf) || + nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, status->cookie, NL80211_ATTR_PAD) || - (ack && nla_put_flag(msg, NL80211_ATTR_ACK))) + (status->ack && nla_put_flag(msg, NL80211_ATTR_ACK)) || + (status->tx_tstamp && + nla_put_u64_64bit(msg, NL80211_ATTR_TX_HW_TIMESTAMP, + status->tx_tstamp, NL80211_ATTR_PAD)) || + (status->ack_tstamp && + nla_put_u64_64bit(msg, NL80211_ATTR_RX_HW_TIMESTAMP, + status->ack_tstamp, NL80211_ATTR_PAD))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -17129,18 +18570,24 @@ void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf, size_t len, bool ack, gfp_t gfp) { - nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + struct cfg80211_tx_status status = { + .cookie = cookie, + .buf = buf, + .len = len, + .ack = ack + }; + + nl80211_frame_tx_status(wdev, &status, gfp, NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS); } EXPORT_SYMBOL(cfg80211_control_port_tx_status); -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) +void cfg80211_mgmt_tx_status_ext(struct wireless_dev *wdev, + struct cfg80211_tx_status *status, gfp_t gfp) { - nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, - NL80211_CMD_FRAME_TX_STATUS); + nl80211_frame_tx_status(wdev, status, gfp, NL80211_CMD_FRAME_TX_STATUS); } -EXPORT_SYMBOL(cfg80211_mgmt_tx_status); +EXPORT_SYMBOL(cfg80211_mgmt_tx_status_ext); static int __nl80211_rx_control_port(struct net_device *dev, struct sk_buff *skb, @@ -17484,11 +18931,13 @@ EXPORT_SYMBOL(cfg80211_pmksa_candidate_notify); static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, struct net_device *netdev, + unsigned int link_id, struct cfg80211_chan_def *chandef, gfp_t gfp, enum nl80211_commands notif, u8 count, bool quiet) { + struct wireless_dev *wdev = netdev->ieee80211_ptr; struct sk_buff *msg; void *hdr; @@ -17505,6 +18954,10 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) goto nla_put_failure; + if (wdev->valid_links && + nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)) + goto nla_put_failure; + if (nl80211_send_chandef(msg, chandef)) goto nla_put_failure; @@ 
-17527,42 +18980,63 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, } void cfg80211_ch_switch_notify(struct net_device *dev, - struct cfg80211_chan_def *chandef) + struct cfg80211_chan_def *chandef, + unsigned int link_id) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); ASSERT_WDEV_LOCK(wdev); + WARN_INVALID_LINK_ID(wdev, link_id); - trace_cfg80211_ch_switch_notify(dev, chandef); + trace_cfg80211_ch_switch_notify(dev, chandef, link_id); - wdev->chandef = *chandef; - wdev->preset_chandef = *chandef; - - if ((wdev->iftype == NL80211_IFTYPE_STATION || - wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && - !WARN_ON(!wdev->current_bss)) - cfg80211_update_assoc_bss_entry(wdev, chandef->chan); + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (!WARN_ON(!wdev->links[link_id].client.current_bss)) + cfg80211_update_assoc_bss_entry(wdev, link_id, + chandef->chan); + break; + case NL80211_IFTYPE_MESH_POINT: + wdev->u.mesh.chandef = *chandef; + wdev->u.mesh.preset_chandef = *chandef; + break; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + wdev->links[link_id].ap.chandef = *chandef; + break; + case NL80211_IFTYPE_ADHOC: + wdev->u.ibss.chandef = *chandef; + break; + default: + WARN_ON(1); + break; + } cfg80211_sched_dfs_chan_update(rdev); - nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, + nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL, NL80211_CMD_CH_SWITCH_NOTIFY, 0, false); } EXPORT_SYMBOL(cfg80211_ch_switch_notify); void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - u8 count, bool quiet) + unsigned int link_id, u8 count, + bool quiet) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - trace_cfg80211_ch_switch_started_notify(dev, chandef); + ASSERT_WDEV_LOCK(wdev); + WARN_INVALID_LINK_ID(wdev, link_id); - nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, + trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id); + + nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL, NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, count, quiet); } diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index d642e3be4ee7..855d540ddfb9 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file - * Copyright (C) 2018, 2020-2021 Intel Corporation + * Copyright (C) 2018, 2020-2022 Intel Corporation */ #ifndef __NET_WIRELESS_NL80211_H #define __NET_WIRELESS_NL80211_H @@ -60,9 +60,7 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, - const u8 *buf, size_t len, gfp_t gfp, - int uapsd_queues, - const u8 *req_ies, size_t req_ies_len); + struct cfg80211_rx_assoc_resp *data); void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, @@ -107,8 +105,7 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlpid, - int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags, gfp_t gfp); + struct cfg80211_rx_info *info, gfp_t gfp); void 
nl80211_radar_notify(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c index 2d26a6d980bf..27a1732264f9 100644 --- a/net/wireless/ocb.c +++ b/net/wireless/ocb.c @@ -4,6 +4,7 @@ * * Copyright: (c) 2014 Czech Technical University in Prague * (c) 2014 Volkswagen Group Research + * Copyright (C) 2022 Intel Corporation * Author: Rostislav Lisovy * Funded by: Volkswagen Group Research */ @@ -34,7 +35,7 @@ int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev, err = rdev_join_ocb(rdev, dev, setup); if (!err) - wdev->chandef = setup->chandef; + wdev->u.ocb.chandef = setup->chandef; return err; } @@ -69,7 +70,7 @@ int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, err = rdev_leave_ocb(rdev, dev); if (!err) - memset(&wdev->chandef, 0, sizeof(wdev->chandef)); + memset(&wdev->u.ocb.chandef, 0, sizeof(wdev->u.ocb.chandef)); return err; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index ce6bf218a1a3..13b209a8db28 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions of this file + * Copyright(c) 2016-2017 Intel Deutschland GmbH + * Copyright (C) 2018, 2021-2022 Intel Corporation + */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS @@ -72,65 +77,69 @@ rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, } static inline int rdev_add_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr, + struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int ret; - trace_rdev_add_key(&rdev->wiphy, netdev, key_index, pairwise, + trace_rdev_add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params->mode); - ret = rdev->ops->add_key(&rdev->wiphy, netdev, key_index, pairwise, - mac_addr, params); + ret = rdev->ops->add_key(&rdev->wiphy, netdev, link_id, key_index, + pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, + void *cookie, void (*callback)(void *cookie, struct key_params*)) { int ret; - trace_rdev_get_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); - ret = rdev->ops->get_key(&rdev->wiphy, netdev, key_index, pairwise, - mac_addr, cookie, callback); + trace_rdev_get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, + mac_addr); + ret = rdev->ops->get_key(&rdev->wiphy, netdev, link_id, key_index, + pairwise, mac_addr, cookie, callback); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr) + struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr) { int ret; - trace_rdev_del_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); - ret = rdev->ops->del_key(&rdev->wiphy, netdev, key_index, pairwise, - mac_addr); + trace_rdev_del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, + mac_addr); + ret = rdev->ops->del_key(&rdev->wiphy, netdev, link_id, key_index, + pairwise, mac_addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_key(struct cfg80211_registered_device 
*rdev, - struct net_device *netdev, u8 key_index, bool unicast, - bool multicast) + struct net_device *netdev, int link_id, u8 key_index, + bool unicast, bool multicast) { int ret; - trace_rdev_set_default_key(&rdev->wiphy, netdev, key_index, + trace_rdev_set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); - ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, key_index, - unicast, multicast); + ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, link_id, + key_index, unicast, multicast); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index) + struct net_device *netdev, int link_id, u8 key_index) { int ret; - trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, key_index); - ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, + trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, link_id, + key_index); + ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; @@ -138,13 +147,15 @@ rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, static inline int rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index) + struct net_device *netdev, int link_id, + u8 key_index) { int ret; - trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, key_index); - ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, - key_index); + trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, link_id, + key_index); + ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, link_id, + key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -172,11 +183,11 @@ static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev, } static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev) + struct net_device *dev, unsigned int link_id) { int ret; - trace_rdev_stop_ap(&rdev->wiphy, dev); - ret = rdev->ops->stop_ap(&rdev->wiphy, dev); + trace_rdev_stop_ap(&rdev->wiphy, dev, link_id); + ret = rdev->ops->stop_ap(&rdev->wiphy, dev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -464,18 +475,9 @@ static inline int rdev_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req) { - const struct cfg80211_bss_ies *bss_ies; int ret; - /* - * Note: we might trace not exactly the data that's processed, - * due to races and the driver/mac80211 getting a newer copy. 
- */ - rcu_read_lock(); - bss_ies = rcu_dereference(req->bss->ies); - trace_rdev_assoc(&rdev->wiphy, dev, req, bss_ies); - rcu_read_unlock(); - + trace_rdev_assoc(&rdev->wiphy, dev, req); ret = rdev->ops->assoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; @@ -651,12 +653,14 @@ static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev, static inline int rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *peer, + struct net_device *dev, unsigned int link_id, + const u8 *peer, const struct cfg80211_bitrate_mask *mask) { int ret; - trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, peer, mask); - ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, peer, mask); + trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); + ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, link_id, + peer, mask); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -748,13 +752,14 @@ static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, - const bool noencrypt, u64 *cookie) + const bool noencrypt, int link, + u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt); + dest, proto, noencrypt, link); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt, cookie); + dest, proto, noencrypt, link, cookie); if (cookie) trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); else @@ -944,12 +949,13 @@ static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev, static inline int rdev_get_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; - trace_rdev_get_channel(&rdev->wiphy, wdev); - ret = rdev->ops->get_channel(&rdev->wiphy, wdev, chandef); + trace_rdev_get_channel(&rdev->wiphy, wdev, link_id); + ret = rdev->ops->get_channel(&rdev->wiphy, wdev, link_id, chandef); trace_rdev_return_chandef(&rdev->wiphy, ret, chandef); return ret; @@ -1107,12 +1113,14 @@ static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev, static inline int rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, - struct net_device *dev, struct cfg80211_chan_def *chandef) + struct net_device *dev, + unsigned int link_id, + struct cfg80211_chan_def *chandef) { int ret; - trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, chandef); - ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, chandef); + trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); + ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; @@ -1381,4 +1389,109 @@ static inline int rdev_color_change(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_set_fils_aad(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_fils_aad *fils_aad) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_set_fils_aad(&rdev->wiphy, dev, fils_aad); + if (rdev->ops->set_fils_aad) + ret = rdev->ops->set_fils_aad(&rdev->wiphy, dev, fils_aad); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + +static inline int +rdev_set_radar_background(struct cfg80211_registered_device *rdev, + struct cfg80211_chan_def *chandef) +{ + struct wiphy *wiphy = &rdev->wiphy; + int ret; + + if (!rdev->ops->set_radar_background) + return -EOPNOTSUPP; + + 
trace_rdev_set_radar_background(wiphy, chandef); + ret = rdev->ops->set_radar_background(wiphy, chandef); + trace_rdev_return_int(wiphy, ret); + + return ret; +} + +static inline int +rdev_add_intf_link(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + unsigned int link_id) +{ + int ret = 0; + + trace_rdev_add_intf_link(&rdev->wiphy, wdev, link_id); + if (rdev->ops->add_intf_link) + ret = rdev->ops->add_intf_link(&rdev->wiphy, wdev, link_id); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + +static inline void +rdev_del_intf_link(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + unsigned int link_id) +{ + trace_rdev_del_intf_link(&rdev->wiphy, wdev, link_id); + if (rdev->ops->del_intf_link) + rdev->ops->del_intf_link(&rdev->wiphy, wdev, link_id); + trace_rdev_return_void(&rdev->wiphy); +} + +static inline int +rdev_add_link_station(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct link_station_parameters *params) +{ + int ret; + + if (!rdev->ops->add_link_station) + return -EOPNOTSUPP; + + trace_rdev_add_link_station(&rdev->wiphy, dev, params); + ret = rdev->ops->add_link_station(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int +rdev_mod_link_station(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct link_station_parameters *params) +{ + int ret; + + if (!rdev->ops->mod_link_station) + return -EOPNOTSUPP; + + trace_rdev_mod_link_station(&rdev->wiphy, dev, params); + ret = rdev->ops->mod_link_station(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int +rdev_del_link_station(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct link_station_del_parameters *params) +{ + int ret; + + if (!rdev->ops->del_link_station) + return -EOPNOTSUPP; + + trace_rdev_del_link_station(&rdev->wiphy, dev, params); + ret = rdev->ops->del_link_station(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index d0fbe822e793..4f3f31244e8b 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. 
Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -133,6 +133,7 @@ static u32 reg_is_indoor_portid; static void restore_regulatory_settings(bool reset_user, bool cached); static void print_regdomain(const struct ieee80211_regdomain *rd); +static void reg_process_hint(struct regulatory_request *reg_request); static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { @@ -1108,6 +1109,8 @@ int reg_reload_regdb(void) const struct firmware *fw; void *db; int err; + const struct ieee80211_regdomain *current_regdomain; + struct regulatory_request *request; err = request_firmware(&fw, "regulatory.db", ®_pdev->dev); if (err) @@ -1128,8 +1131,26 @@ int reg_reload_regdb(void) if (!IS_ERR_OR_NULL(regdb)) kfree(regdb); regdb = db; - rtnl_unlock(); + /* reset regulatory domain */ + current_regdomain = get_cfg80211_regdom(); + + request = kzalloc(sizeof(*request), GFP_KERNEL); + if (!request) { + err = -ENOMEM; + goto out_unlock; + } + + request->wiphy_idx = WIPHY_IDX_INVALID; + request->alpha2[0] = current_regdomain->alpha2[0]; + request->alpha2[1] = current_regdomain->alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_CORE; + request->user_reg_hint_type = NL80211_USER_REG_HINT_USER; + + reg_process_hint(request); + +out_unlock: + rtnl_unlock(); out: release_firmware(fw); return err; @@ -1227,6 +1248,8 @@ unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, { unsigned int bw = reg_get_max_bandwidth_from_range(rd, rule); + if (rule->flags & NL80211_RRF_NO_320MHZ) + bw = min_t(unsigned int, bw, MHZ_TO_KHZ(160)); if (rule->flags & NL80211_RRF_NO_160MHZ) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(80)); if (rule->flags & NL80211_RRF_NO_80MHZ) @@ -1600,6 +1623,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_160MHZ; if (rd_flags & NL80211_RRF_NO_HE) channel_flags |= IEEE80211_CHAN_NO_HE; + if (rd_flags & NL80211_RRF_NO_320MHZ) + channel_flags |= IEEE80211_CHAN_NO_320MHZ; return channel_flags; } @@ -1762,6 +1787,8 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; } return bw_flags; } @@ -2349,6 +2376,7 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); enum nl80211_iftype iftype; bool ret; + int link; wdev_lock(wdev); iftype = wdev->iftype; @@ -2357,60 +2385,87 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) if (!wdev->netdev || !netif_running(wdev->netdev)) goto wdev_inactive_unlock; - switch (iftype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_P2P_GO: - if (!wdev->beacon_interval) - goto wdev_inactive_unlock; - chandef = wdev->chandef; - break; - case NL80211_IFTYPE_ADHOC: - if (!wdev->ssid_len) - goto wdev_inactive_unlock; - chandef = wdev->chandef; - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - if (!wdev->current_bss || - !wdev->current_bss->pub.channel) - goto wdev_inactive_unlock; + for (link = 0; link < ARRAY_SIZE(wdev->links); link++) { + struct 
ieee80211_channel *chan; - if (!rdev->ops->get_channel || - rdev_get_channel(rdev, wdev, &chandef)) - cfg80211_chandef_create(&chandef, - wdev->current_bss->pub.channel, - NL80211_CHAN_NO_HT); - break; - case NL80211_IFTYPE_MONITOR: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_P2P_DEVICE: - /* no enforcement required */ - break; - default: - /* others not implemented for now */ - WARN_ON(1); - break; + if (!wdev->valid_links && link > 0) + break; + if (wdev->valid_links && !(wdev->valid_links & BIT(link))) + continue; + switch (iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + if (!wdev->links[link].ap.beacon_interval) + continue; + chandef = wdev->links[link].ap.chandef; + break; + case NL80211_IFTYPE_MESH_POINT: + if (!wdev->u.mesh.beacon_interval) + continue; + chandef = wdev->u.mesh.chandef; + break; + case NL80211_IFTYPE_ADHOC: + if (!wdev->u.ibss.ssid_len) + continue; + chandef = wdev->u.ibss.chandef; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + /* Maybe we could consider disabling that link only? */ + if (!wdev->links[link].client.current_bss) + continue; + + chan = wdev->links[link].client.current_bss->pub.channel; + if (!chan) + continue; + + if (!rdev->ops->get_channel || + rdev_get_channel(rdev, wdev, link, &chandef)) + cfg80211_chandef_create(&chandef, chan, + NL80211_CHAN_NO_HT); + break; + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_DEVICE: + /* no enforcement required */ + break; + default: + /* others not implemented for now */ + WARN_ON(1); + break; + } + + wdev_unlock(wdev); + + switch (iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_MESH_POINT: + wiphy_lock(wiphy); + ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef, + iftype); + wiphy_unlock(wiphy); + + if (!ret) + return ret; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + ret = cfg80211_chandef_usable(wiphy, &chandef, + IEEE80211_CHAN_DISABLED); + if (!ret) + return ret; + break; + default: + break; + } + + wdev_lock(wdev); } wdev_unlock(wdev); - switch (iftype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_P2P_GO: - case NL80211_IFTYPE_ADHOC: - wiphy_lock(wiphy); - ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype); - wiphy_unlock(wiphy); - - return ret; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - return cfg80211_chandef_usable(wiphy, &chandef, - IEEE80211_CHAN_DISABLED); - default: - break; - } - return true; wdev_inactive_unlock: @@ -4192,8 +4247,17 @@ static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev) * In both cases we should end the CAC on the wdev. 
*/ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { - if (wdev->cac_started && - !cfg80211_chandef_dfs_usable(&rdev->wiphy, &wdev->chandef)) + struct cfg80211_chan_def *chandef; + + if (!wdev->cac_started) + continue; + + /* FIXME: radar detection is tied to link 0 for now */ + chandef = wdev_chandef(wdev, 0); + if (!chandef) + continue; + + if (!cfg80211_chandef_dfs_usable(&rdev->wiphy, chandef)) rdev_end_cac(rdev, wdev->netdev); } } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ef31e401d791..c6c4f136930f 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #include <linux/kernel.h> #include <linux/slab.h> @@ -403,22 +403,20 @@ static int cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, struct cfg80211_bss *nontrans_bss) { - const u8 *ssid; - size_t ssid_len; + const struct element *ssid_elem; struct cfg80211_bss *bss = NULL; rcu_read_lock(); - ssid = ieee80211_bss_get_ie(nontrans_bss, WLAN_EID_SSID); - if (!ssid) { + ssid_elem = ieee80211_bss_get_elem(nontrans_bss, WLAN_EID_SSID); + if (!ssid_elem) { rcu_read_unlock(); return -EINVAL; } - ssid_len = ssid[1]; - ssid = ssid + 2; /* check if nontrans_bss is in the list */ list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) { - if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) { + if (is_bss(bss, nontrans_bss->bssid, ssid_elem->data, + ssid_elem->datalen)) { rcu_read_unlock(); return 0; } @@ -548,7 +546,7 @@ static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, memcpy(entry->bssid, pos, ETH_ALEN); pos += ETH_ALEN; - if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM) { + if (length >= IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM) { memcpy(&entry->short_ssid, pos, sizeof(entry->short_ssid)); entry->short_ssid_valid = true; @@ -2256,7 +2254,8 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, struct ieee80211_mgmt *mgmt, size_t len) { u8 *ie, *new_ie, *pos; - const u8 *nontrans_ssid, *trans_ssid, *mbssid; + const struct element *nontrans_ssid; + const u8 *trans_ssid, *mbssid; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); size_t new_ie_len; @@ -2283,11 +2282,11 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, return; new_ie_len -= mbssid[1]; - nontrans_ssid = ieee80211_bss_get_ie(nontrans_bss, WLAN_EID_SSID); + nontrans_ssid = ieee80211_bss_get_elem(nontrans_bss, WLAN_EID_SSID); if (!nontrans_ssid) return; - new_ie_len += nontrans_ssid[1]; + new_ie_len += nontrans_ssid->datalen; /* generate new ie for nontrans BSS * 1. 
replace SSID with nontrans BSS' SSID @@ -2304,7 +2303,7 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, pos = new_ie; /* copy the nontransmitted SSID */ - cpy_len = nontrans_ssid[1] + 2; + cpy_len = nontrans_ssid->datalen + 2; memcpy(pos, nontrans_ssid, cpy_len); pos += cpy_len; /* copy the IEs between SSID and MBSSID */ @@ -2598,7 +2597,8 @@ void cfg80211_bss_iter(struct wiphy *wiphy, spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { - if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel)) + if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel, + false)) iter(wiphy, &bss->pub, iter_data); } @@ -2607,11 +2607,12 @@ void cfg80211_bss_iter(struct wiphy *wiphy, EXPORT_SYMBOL(cfg80211_bss_iter); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, + unsigned int link_id, struct ieee80211_channel *chan) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - struct cfg80211_internal_bss *cbss = wdev->current_bss; + struct cfg80211_internal_bss *cbss = wdev->links[link_id].client.current_bss; struct cfg80211_internal_bss *new = NULL; struct cfg80211_internal_bss *bss; struct cfg80211_bss *nontrans_bss; @@ -2716,7 +2717,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, struct cfg80211_registered_device *rdev; struct wiphy *wiphy; struct iw_scan_req *wreq = NULL; - struct cfg80211_scan_request *creq = NULL; + struct cfg80211_scan_request *creq; int i, err, n_channels = 0; enum nl80211_band band; @@ -2731,10 +2732,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req || rdev->scan_msg) { - err = -EBUSY; - goto out; - } + if (rdev->scan_req || rdev->scan_msg) + return -EBUSY; wiphy = &rdev->wiphy; @@ -2747,10 +2746,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) + n_channels * sizeof(void *), GFP_ATOMIC); - if (!creq) { - err = -ENOMEM; - goto out; - } + if (!creq) + return -ENOMEM; creq->wiphy = wiphy; creq->wdev = dev->ieee80211_ptr; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 08a70b4f090c..d513536617bd 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg - * Copyright (C) 2009, 2020 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020, 2022 Intel Corporation. All rights reserved. 
* Copyright 2017 Intel Deutschland GmbH */ @@ -147,6 +147,7 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev, { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; + struct cfg80211_auth_request auth_req = {}; struct cfg80211_assoc_request req = {}; int err; @@ -167,13 +168,19 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev, if (WARN_ON(!rdev->ops->auth)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_AUTHENTICATING; - return cfg80211_mlme_auth(rdev, wdev->netdev, - params->channel, params->auth_type, - params->bssid, - params->ssid, params->ssid_len, - NULL, 0, - params->key, params->key_len, - params->key_idx, NULL, 0); + auth_req.key = params->key; + auth_req.key_len = params->key_len; + auth_req.key_idx = params->key_idx; + auth_req.auth_type = params->auth_type; + auth_req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, + params->bssid, + params->ssid, params->ssid_len, + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY_ANY); + auth_req.link_id = -1; + err = cfg80211_mlme_auth(rdev, wdev->netdev, &auth_req); + cfg80211_put_bss(&rdev->wiphy, auth_req.bss); + return err; case CFG80211_CONN_AUTH_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_AUTH; return -ENOTCONN; @@ -192,10 +199,20 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev, req.ht_capa_mask = params->ht_capa_mask; req.vht_capa = params->vht_capa; req.vht_capa_mask = params->vht_capa_mask; + req.link_id = -1; + + req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, + params->bssid, + params->ssid, params->ssid_len, + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY_ANY); + if (!req.bss) { + err = -ENOENT; + } else { + err = cfg80211_mlme_assoc(rdev, wdev->netdev, &req); + cfg80211_put_bss(&rdev->wiphy, req.bss); + } - err = cfg80211_mlme_assoc(rdev, wdev->netdev, params->channel, - params->bssid, params->ssid, - params->ssid_len, &req); if (err) cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, @@ -258,7 +275,7 @@ void cfg80211_conn_work(struct work_struct *work) memset(&cr, 0, sizeof(cr)); cr.status = -1; - cr.bssid = bssid; + cr.links[0].bssid = bssid; cr.timeout_reason = treason; __cfg80211_connect_result(wdev->netdev, &cr, false); } @@ -367,7 +384,7 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) memset(&cr, 0, sizeof(cr)); cr.status = status_code; - cr.bssid = mgmt->bssid; + cr.links[0].bssid = mgmt->bssid; cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; __cfg80211_connect_result(wdev->netdev, &cr, false); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { @@ -454,6 +471,20 @@ void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev) schedule_work(&rdev->conn_work); } +static void cfg80211_wdev_release_bsses(struct wireless_dev *wdev) +{ + unsigned int link; + + for_each_valid_link(wdev, link) { + if (!wdev->links[link].client.current_bss) + continue; + cfg80211_unhold_bss(wdev->links[link].client.current_bss); + cfg80211_put_bss(wdev->wiphy, + &wdev->links[link].client.current_bss->pub); + wdev->links[link].client.current_bss = NULL; + } +} + static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev, const u8 *ies, size_t ies_len, const u8 **out_ies, size_t *out_ies_len) @@ -521,12 +552,11 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, if (!rdev->ops->auth || !rdev->ops->assoc) return -EOPNOTSUPP; - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - 
wdev->current_bss = NULL; + cfg80211_wdev_release_bsses(wdev); + if (wdev->connected) { cfg80211_sme_free(wdev); + wdev->connected = false; } if (wdev->conn) @@ -563,8 +593,8 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, wdev->conn->auto_auth = false; } - wdev->conn->params.ssid = wdev->ssid; - wdev->conn->params.ssid_len = wdev->ssid_len; + wdev->conn->params.ssid = wdev->u.client.ssid; + wdev->conn->params.ssid_len = wdev->u.client.ssid_len; /* see if we have the bss already */ bss = cfg80211_get_conn_bss(wdev); @@ -648,7 +678,7 @@ static bool cfg80211_is_all_idle(void) list_for_each_entry(rdev, &cfg80211_rdev_list, list) { list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); - if (wdev->conn || wdev->current_bss || + if (wdev->conn || wdev->connected || cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; wdev_unlock(wdev); @@ -668,6 +698,19 @@ static void disconnect_work(struct work_struct *work) DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); +static void +cfg80211_connect_result_release_bsses(struct wireless_dev *wdev, + struct cfg80211_connect_resp_params *cr) +{ + unsigned int link; + + for_each_valid_link(cr, link) { + if (!cr->links[link].bss) + continue; + cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); + cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); + } +} /* * API calls for drivers implementing connect/disconnect and @@ -680,25 +723,42 @@ void __cfg80211_connect_result(struct net_device *dev, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; - const u8 *country_ie; + const struct element *country_elem = NULL; + const u8 *country_data; + u8 country_datalen; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif + unsigned int link; + const u8 *connected_addr; + bool bss_not_found = false; ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && - wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) { - cfg80211_put_bss(wdev->wiphy, cr->bss); - return; + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) + goto out; + + if (cr->valid_links) { + if (WARN_ON(!cr->ap_mld_addr)) + goto out; + + for_each_valid_link(cr, link) { + if (WARN_ON(!cr->links[link].addr)) + goto out; + } + + if (WARN_ON(wdev->connect_keys)) + goto out; } wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); + connected_addr = cr->valid_links ? 
cr->ap_mld_addr : cr->links[0].bssid; #ifdef CONFIG_CFG80211_WEXT - if (wextev) { + if (wextev && !cr->valid_links) { if (cr->req_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->req_ie_len; @@ -715,76 +775,149 @@ void __cfg80211_connect_result(struct net_device *dev, memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; - if (cr->bssid && cr->status == WLAN_STATUS_SUCCESS) { - memcpy(wrqu.ap_addr.sa_data, cr->bssid, ETH_ALEN); - memcpy(wdev->wext.prev_bssid, cr->bssid, ETH_ALEN); + if (connected_addr && cr->status == WLAN_STATUS_SUCCESS) { + memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); + memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); wdev->wext.prev_bssid_valid = true; } wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); } #endif - if (!cr->bss && (cr->status == WLAN_STATUS_SUCCESS)) { - WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); - cr->bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->bssid, - wdev->ssid, wdev->ssid_len, - wdev->conn_bss_type, - IEEE80211_PRIVACY_ANY); - if (cr->bss) - cfg80211_hold_bss(bss_from_pub(cr->bss)); + if (cr->status == WLAN_STATUS_SUCCESS) { + if (!wiphy_to_rdev(wdev->wiphy)->ops->connect) { + for_each_valid_link(cr, link) { + if (WARN_ON_ONCE(!cr->links[link].bss)) + break; + } + } + + for_each_valid_link(cr, link) { + if (cr->links[link].bss) + continue; + + cr->links[link].bss = + cfg80211_get_bss(wdev->wiphy, NULL, + cr->links[link].bssid, + wdev->u.client.ssid, + wdev->u.client.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + if (!cr->links[link].bss) { + bss_not_found = true; + break; + } + cfg80211_hold_bss(bss_from_pub(cr->links[link].bss)); + } } - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; - } + cfg80211_wdev_release_bsses(wdev); if (cr->status != WLAN_STATUS_SUCCESS) { kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; - wdev->ssid_len = 0; + wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; - if (cr->bss) { - cfg80211_unhold_bss(bss_from_pub(cr->bss)); - cfg80211_put_bss(wdev->wiphy, cr->bss); - } + cfg80211_connect_result_release_bsses(wdev, cr); cfg80211_sme_free(wdev); return; } - if (WARN_ON(!cr->bss)) + if (WARN_ON(bss_not_found)) { + cfg80211_connect_result_release_bsses(wdev, cr); return; + } - wdev->current_bss = bss_from_pub(cr->bss); + memset(wdev->links, 0, sizeof(wdev->links)); + wdev->valid_links = cr->valid_links; + for_each_valid_link(cr, link) + wdev->links[link].client.current_bss = + bss_from_pub(cr->links[link].bss); + wdev->connected = true; + ether_addr_copy(wdev->u.client.connected_addr, connected_addr); + if (cr->valid_links) { + for_each_valid_link(cr, link) + memcpy(wdev->links[link].addr, cr->links[link].addr, + ETH_ALEN); + } if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) cfg80211_upload_connect_keys(wdev); rcu_read_lock(); - country_ie = ieee80211_bss_get_ie(cr->bss, WLAN_EID_COUNTRY); - if (!country_ie) { + for_each_valid_link(cr, link) { + country_elem = + ieee80211_bss_get_elem(cr->links[link].bss, + WLAN_EID_COUNTRY); + if (country_elem) + break; + } + if (!country_elem) { rcu_read_unlock(); return; } - country_ie = kmemdup(country_ie, 2 + country_ie[1], GFP_ATOMIC); + country_datalen = country_elem->datalen; + country_data = kmemdup(country_elem->data, country_datalen, GFP_ATOMIC); rcu_read_unlock(); - if (!country_ie) + if (!country_data) return; - /* - * 
ieee80211_bss_get_ie() ensures we can access: - * - country_ie + 2, the start of the country ie data, and - * - and country_ie[1] which is the IE length - */ - regulatory_hint_country_ie(wdev->wiphy, cr->bss->channel->band, - country_ie + 2, country_ie[1]); - kfree(country_ie); + regulatory_hint_country_ie(wdev->wiphy, + cr->links[link].bss->channel->band, + country_data, country_datalen); + kfree(country_data); + + return; +out: + for_each_valid_link(cr, link) + cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } -/* Consumes bss object one way or another */ +static void cfg80211_update_link_bss(struct wireless_dev *wdev, + struct cfg80211_bss **bss) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_internal_bss *ibss; + + if (!*bss) + return; + + ibss = bss_from_pub(*bss); + if (list_empty(&ibss->list)) { + struct cfg80211_bss *found = NULL, *tmp = *bss; + + found = cfg80211_get_bss(wdev->wiphy, NULL, + (*bss)->bssid, + wdev->u.client.ssid, + wdev->u.client.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + if (found) { + /* The same BSS is already updated so use it + * instead, as it has latest info. + */ + *bss = found; + } else { + /* Update with BSS provided by driver, it will + * be freshly added and ref cnted, we can free + * the old one. + * + * signal_valid can be false, as we are not + * expecting the BSS to be found. + * + * keep the old timestamp to avoid confusion + */ + cfg80211_bss_update(rdev, ibss, false, + ibss->ts); + } + + cfg80211_put_bss(wdev->wiphy, tmp); + } +} + +/* Consumes bss object(s) one way or another */ void cfg80211_connect_done(struct net_device *dev, struct cfg80211_connect_resp_params *params, gfp_t gfp) @@ -794,55 +927,34 @@ void cfg80211_connect_done(struct net_device *dev, struct cfg80211_event *ev; unsigned long flags; u8 *next; + size_t link_info_size = 0; + unsigned int link; - if (params->bss) { - struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss); - - if (list_empty(&ibss->list)) { - struct cfg80211_bss *found = NULL, *tmp = params->bss; - - found = cfg80211_get_bss(wdev->wiphy, NULL, - params->bss->bssid, - wdev->ssid, wdev->ssid_len, - wdev->conn_bss_type, - IEEE80211_PRIVACY_ANY); - if (found) { - /* The same BSS is already updated so use it - * instead, as it has latest info. - */ - params->bss = found; - } else { - /* Update with BSS provided by driver, it will - * be freshly added and ref cnted, we can free - * the old one. - * - * signal_valid can be false, as we are not - * expecting the BSS to be found. - * - * keep the old timestamp to avoid confusion - */ - cfg80211_bss_update(rdev, ibss, false, - ibss->ts); - } - - cfg80211_put_bss(wdev->wiphy, tmp); - } + for_each_valid_link(params, link) { + cfg80211_update_link_bss(wdev, ¶ms->links[link].bss); + link_info_size += params->links[link].bssid ? ETH_ALEN : 0; + link_info_size += params->links[link].addr ? ETH_ALEN : 0; } - ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) + + ev = kzalloc(sizeof(*ev) + (params->ap_mld_addr ? ETH_ALEN : 0) + params->req_ie_len + params->resp_ie_len + params->fils.kek_len + params->fils.pmk_len + - (params->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); + (params->fils.pmkid ? 
WLAN_PMKID_LEN : 0) + link_info_size, + gfp); + if (!ev) { - cfg80211_put_bss(wdev->wiphy, params->bss); + for_each_valid_link(params, link) + cfg80211_put_bss(wdev->wiphy, + params->links[link].bss); return; } ev->type = EVENT_CONNECT_RESULT; next = ((u8 *)ev) + sizeof(*ev); - if (params->bssid) { - ev->cr.bssid = next; - memcpy((void *)ev->cr.bssid, params->bssid, ETH_ALEN); + if (params->ap_mld_addr) { + ev->cr.ap_mld_addr = next; + memcpy((void *)ev->cr.ap_mld_addr, params->ap_mld_addr, + ETH_ALEN); next += ETH_ALEN; } if (params->req_ie_len) { @@ -882,9 +994,28 @@ void cfg80211_connect_done(struct net_device *dev, ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num; if (params->fils.update_erp_next_seq_num) ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num; - if (params->bss) - cfg80211_hold_bss(bss_from_pub(params->bss)); - ev->cr.bss = params->bss; + ev->cr.valid_links = params->valid_links; + for_each_valid_link(params, link) { + if (params->links[link].bss) + cfg80211_hold_bss( + bss_from_pub(params->links[link].bss)); + ev->cr.links[link].bss = params->links[link].bss; + + if (params->links[link].addr) { + ev->cr.links[link].addr = next; + memcpy((void *)ev->cr.links[link].addr, + params->links[link].addr, + ETH_ALEN); + next += ETH_ALEN; + } + if (params->links[link].bssid) { + ev->cr.links[link].bssid = next; + memcpy((void *)ev->cr.links[link].bssid, + params->links[link].bssid, + ETH_ALEN); + next += ETH_ALEN; + } + } ev->cr.status = params->status; ev->cr.timeout_reason = params->timeout_reason; @@ -902,58 +1033,88 @@ void __cfg80211_roamed(struct wireless_dev *wdev, #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif + unsigned int link; + const u8 *connected_addr; + ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; - if (WARN_ON(!wdev->current_bss)) + if (WARN_ON(!wdev->connected)) goto out; - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; + if (info->valid_links) { + if (WARN_ON(!info->ap_mld_addr)) + goto out; - if (WARN_ON(!info->bss)) - return; + for_each_valid_link(info, link) { + if (WARN_ON(!info->links[link].addr)) + goto out; + } + } - cfg80211_hold_bss(bss_from_pub(info->bss)); - wdev->current_bss = bss_from_pub(info->bss); + cfg80211_wdev_release_bsses(wdev); + for_each_valid_link(info, link) { + if (WARN_ON(!info->links[link].bss)) + goto out; + } + + memset(wdev->links, 0, sizeof(wdev->links)); + wdev->valid_links = info->valid_links; + for_each_valid_link(info, link) { + cfg80211_hold_bss(bss_from_pub(info->links[link].bss)); + wdev->links[link].client.current_bss = + bss_from_pub(info->links[link].bss); + } + + connected_addr = info->valid_links ? 
+ info->ap_mld_addr : + info->links[0].bss->bssid; + ether_addr_copy(wdev->u.client.connected_addr, connected_addr); + if (info->valid_links) { + for_each_valid_link(info, link) + memcpy(wdev->links[link].addr, info->links[link].addr, + ETH_ALEN); + } wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT - if (info->req_ie) { - memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = info->req_ie_len; - wireless_send_event(wdev->netdev, IWEVASSOCREQIE, - &wrqu, info->req_ie); - } + if (!info->valid_links) { + if (info->req_ie) { + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = info->req_ie_len; + wireless_send_event(wdev->netdev, IWEVASSOCREQIE, + &wrqu, info->req_ie); + } - if (info->resp_ie) { - memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = info->resp_ie_len; - wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, - &wrqu, info->resp_ie); - } + if (info->resp_ie) { + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = info->resp_ie_len; + wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, + &wrqu, info->resp_ie); + } - memset(&wrqu, 0, sizeof(wrqu)); - wrqu.ap_addr.sa_family = ARPHRD_ETHER; - memcpy(wrqu.ap_addr.sa_data, info->bss->bssid, ETH_ALEN); - memcpy(wdev->wext.prev_bssid, info->bss->bssid, ETH_ALEN); - wdev->wext.prev_bssid_valid = true; - wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.ap_addr.sa_family = ARPHRD_ETHER; + memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); + memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); + wdev->wext.prev_bssid_valid = true; + wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); + } #endif return; out: - cfg80211_put_bss(wdev->wiphy, info->bss); + for_each_valid_link(info, link) + cfg80211_put_bss(wdev->wiphy, info->links[link].bss); } -/* Consumes info->bss object one way or another */ +/* Consumes info->links.bss object(s) one way or another */ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp) { @@ -962,25 +1123,41 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, struct cfg80211_event *ev; unsigned long flags; u8 *next; + unsigned int link; + size_t link_info_size = 0; + bool bss_not_found = false; - if (!info->bss) { - info->bss = cfg80211_get_bss(wdev->wiphy, info->channel, - info->bssid, wdev->ssid, - wdev->ssid_len, - wdev->conn_bss_type, - IEEE80211_PRIVACY_ANY); + for_each_valid_link(info, link) { + link_info_size += info->links[link].addr ? ETH_ALEN : 0; + link_info_size += info->links[link].bssid ? ETH_ALEN : 0; + + if (info->links[link].bss) + continue; + + info->links[link].bss = + cfg80211_get_bss(wdev->wiphy, + info->links[link].channel, + info->links[link].bssid, + wdev->u.client.ssid, + wdev->u.client.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + + if (!info->links[link].bss) { + bss_not_found = true; + break; + } } - if (WARN_ON(!info->bss)) - return; + if (WARN_ON(bss_not_found)) + goto out; ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len + info->fils.kek_len + info->fils.pmk_len + - (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); - if (!ev) { - cfg80211_put_bss(wdev->wiphy, info->bss); - return; - } + (info->fils.pmkid ? WLAN_PMKID_LEN : 0) + + (info->ap_mld_addr ? 
ETH_ALEN : 0) + link_info_size, gfp); + if (!ev) + goto out; ev->type = EVENT_ROAMED; next = ((u8 *)ev) + sizeof(*ev); @@ -1020,12 +1197,43 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num; if (info->fils.update_erp_next_seq_num) ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num; - ev->rm.bss = info->bss; + if (info->ap_mld_addr) { + ev->rm.ap_mld_addr = next; + memcpy((void *)ev->rm.ap_mld_addr, info->ap_mld_addr, + ETH_ALEN); + next += ETH_ALEN; + } + ev->rm.valid_links = info->valid_links; + for_each_valid_link(info, link) { + ev->rm.links[link].bss = info->links[link].bss; + + if (info->links[link].addr) { + ev->rm.links[link].addr = next; + memcpy((void *)ev->rm.links[link].addr, + info->links[link].addr, + ETH_ALEN); + next += ETH_ALEN; + } + + if (info->links[link].bssid) { + ev->rm.links[link].bssid = next; + memcpy((void *)ev->rm.links[link].bssid, + info->links[link].bssid, + ETH_ALEN); + next += ETH_ALEN; + } + } spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); + + return; +out: + for_each_valid_link(info, link) + cfg80211_put_bss(wdev->wiphy, info->links[link].bss); + } EXPORT_SYMBOL(cfg80211_roamed); @@ -1033,11 +1241,12 @@ void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) { ASSERT_WDEV_LOCK(wdev); - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; - if (WARN_ON(!wdev->current_bss) || - WARN_ON(!ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + if (WARN_ON(!wdev->connected) || + WARN_ON(!ether_addr_equal(wdev->u.client.connected_addr, bssid))) return; nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, @@ -1089,13 +1298,9 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - } - - wdev->current_bss = NULL; - wdev->ssid_len = 0; + cfg80211_wdev_release_bsses(wdev); + wdev->connected = false; + wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; @@ -1123,7 +1328,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) - rdev_del_key(rdev, dev, i, false, NULL); + rdev_del_key(rdev, dev, -1, i, false, NULL); } rdev_set_qos_map(rdev, dev, NULL); @@ -1184,19 +1389,20 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, * already connected, so reject a new SSID unless it's the * same (which is the case for re-association.) */ - if (wdev->ssid_len && - (wdev->ssid_len != connect->ssid_len || - memcmp(wdev->ssid, connect->ssid, wdev->ssid_len))) + if (wdev->u.client.ssid_len && + (wdev->u.client.ssid_len != connect->ssid_len || + memcmp(wdev->u.client.ssid, connect->ssid, wdev->u.client.ssid_len))) return -EALREADY; /* * If connected, reject (re-)association unless prev_bssid * matches the current BSSID. 
*/ - if (wdev->current_bss) { + if (wdev->connected) { if (!prev_bssid) return -EALREADY; - if (!ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid)) + if (!ether_addr_equal(prev_bssid, + wdev->u.client.connected_addr)) return -ENOTCONN; } @@ -1247,8 +1453,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, } wdev->connect_keys = connkeys; - memcpy(wdev->ssid, connect->ssid, connect->ssid_len); - wdev->ssid_len = connect->ssid_len; + memcpy(wdev->u.client.ssid, connect->ssid, connect->ssid_len); + wdev->u.client.ssid_len = connect->ssid_len; wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS : IEEE80211_BSS_TYPE_ESS; @@ -1264,8 +1470,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, * This could be reassoc getting refused, don't clear * ssid_len in that case. */ - if (!wdev->current_bss) - wdev->ssid_len = 0; + if (!wdev->connected) + wdev->u.client.ssid_len = 0; return err; } @@ -1289,7 +1495,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) cfg80211_mlme_down(rdev, dev); - else if (wdev->ssid_len) + else if (wdev->u.client.ssid_len) err = rdev_disconnect(rdev, dev, reason); /* @@ -1297,8 +1503,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, * in which case cfg80211_disconnected() will take care of * this later. */ - if (!wdev->current_bss) - wdev->ssid_len = 0; + if (!wdev->connected) + wdev->u.client.ssid_len = 0; return err; } @@ -1322,7 +1528,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - __cfg80211_stop_ap(rdev, wdev->netdev, false); + __cfg80211_stop_ap(rdev, wdev->netdev, -1, false); break; case NL80211_IFTYPE_MESH_POINT: __cfg80211_leave_mesh(rdev, wdev->netdev); @@ -1334,7 +1540,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) * ops->disconnect not implemented. Otherwise we can * use cfg80211_disconnect. 
*/ - if (rdev->ops->disconnect || wdev->current_bss) + if (rdev->ops->disconnect || wdev->connected) cfg80211_disconnect(rdev, wdev->netdev, WLAN_REASON_DEAUTH_LEAVING, true); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 19b78d472283..a405c3edbc47 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -167,6 +167,19 @@ __entry->center_freq1, __entry->freq1_offset, \ __entry->center_freq2 +#define FILS_AAD_ASSIGN(fa) \ + do { \ + if (fa) { \ + ether_addr_copy(__entry->macaddr, fa->macaddr); \ + __entry->kek_len = fa->kek_len; \ + } else { \ + eth_zero_addr(__entry->macaddr); \ + __entry->kek_len = 0; \ + } \ + } while (0) +#define FILS_AAD_PR_FMT \ + "macaddr: %pM, kek_len: %d" + #define SINFO_ENTRY __field(int, generation) \ __field(u32, connected_time) \ __field(u32, inactive_time) \ @@ -421,13 +434,14 @@ TRACE_EVENT(rdev_change_virtual_intf, ); DECLARE_EVENT_CLASS(key_handle, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr), + TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(mac_addr) + __field(int, link_id) __field(u8, key_index) __field(bool, pairwise) ), @@ -435,34 +449,38 @@ DECLARE_EVENT_CLASS(key_handle, WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); + __entry->link_id = link_id; __entry->key_index = key_index; __entry->pairwise = pairwise; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, pairwise: %s, mac addr: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, - BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key_index: %u, pairwise: %s, mac addr: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + __entry->key_index, BOOL_TO_STR(__entry->pairwise), + MAC_PR_ARG(mac_addr)) ); DEFINE_EVENT(key_handle, rdev_get_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr), + TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr) ); DEFINE_EVENT(key_handle, rdev_del_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr), + TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr) ); TRACE_EVENT(rdev_add_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr, u8 mode), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr, mode), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr, u8 mode), + TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr, mode), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(mac_addr) + __field(int, link_id) __field(u8, key_index) __field(bool, pairwise) __field(u8, mode) @@ -471,24 +489,27 @@ TRACE_EVENT(rdev_add_key, WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); + 
__entry->link_id = link_id; __entry->key_index = key_index; __entry->pairwise = pairwise; __entry->mode = mode; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, " - "mode: %u, pairwise: %s, mac addr: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, - __entry->mode, BOOL_TO_STR(__entry->pairwise), - MAC_PR_ARG(mac_addr)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key_index: %u, mode: %u, pairwise: %s, " + "mac addr: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + __entry->key_index, __entry->mode, + BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) ); TRACE_EVENT(rdev_set_default_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool unicast, bool multicast), - TP_ARGS(wiphy, netdev, key_index, unicast, multicast), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool unicast, bool multicast), + TP_ARGS(wiphy, netdev, link_id, key_index, unicast, multicast), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(int, link_id) __field(u8, key_index) __field(bool, unicast) __field(bool, multicast) @@ -496,48 +517,58 @@ TRACE_EVENT(rdev_set_default_key, TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->link_id = link_id; __entry->key_index = key_index; __entry->unicast = unicast; __entry->multicast = multicast; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u, unicast: %s, multicast: %s", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, - BOOL_TO_STR(__entry->unicast), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key index: %u, unicast: %s, multicast: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + __entry->key_index, BOOL_TO_STR(__entry->unicast), BOOL_TO_STR(__entry->multicast)) ); TRACE_EVENT(rdev_set_default_mgmt_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index), - TP_ARGS(wiphy, netdev, key_index), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index), + TP_ARGS(wiphy, netdev, link_id, key_index), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(int, link_id) __field(u8, key_index) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->link_id = link_id; __entry->key_index = key_index; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->link_id, __entry->key_index) ); TRACE_EVENT(rdev_set_default_beacon_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index), - TP_ARGS(wiphy, netdev, key_index), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index), + TP_ARGS(wiphy, netdev, link_id, key_index), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(int, link_id) __field(u8, key_index) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->link_id = link_id; __entry->key_index = key_index; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->link_id, __entry->key_index) ); TRACE_EVENT(rdev_start_ap, @@ -556,6 +587,7 @@ TRACE_EVENT(rdev_start_ap, __field(bool, privacy) __field(enum nl80211_auth_type, auth_type) __field(int, inactivity_timeout) + __field(unsigned int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; @@ 
-570,16 +602,17 @@ TRACE_EVENT(rdev_start_ap, __entry->inactivity_timeout = settings->inactivity_timeout; memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1); memcpy(__entry->ssid, settings->ssid, settings->ssid_len); + __entry->link_id = settings->beacon.link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, " CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, " "hidden ssid: %d, wpa versions: %u, privacy: %s, " - "auth type: %d, inactivity timeout: %d", + "auth type: %d, inactivity timeout: %d, link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG, __entry->beacon_interval, __entry->dtim_period, __entry->hidden_ssid, __entry->wpa_ver, BOOL_TO_STR(__entry->privacy), __entry->auth_type, - __entry->inactivity_timeout) + __entry->inactivity_timeout, __entry->link_id) ); TRACE_EVENT(rdev_change_beacon, @@ -589,6 +622,7 @@ TRACE_EVENT(rdev_change_beacon, TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(int, link_id) __dynamic_array(u8, head, info ? info->head_len : 0) __dynamic_array(u8, tail, info ? info->tail_len : 0) __dynamic_array(u8, beacon_ies, info ? info->beacon_ies_len : 0) @@ -602,6 +636,7 @@ TRACE_EVENT(rdev_change_beacon, WIPHY_ASSIGN; NETDEV_ASSIGN; if (info) { + __entry->link_id = info->link_id; if (info->head) memcpy(__get_dynamic_array(head), info->head, info->head_len); @@ -622,9 +657,30 @@ TRACE_EVENT(rdev_change_beacon, if (info->probe_resp) memcpy(__get_dynamic_array(probe_resp), info->probe_resp, info->probe_resp_len); + } else { + __entry->link_id = -1; } ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id:%d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id) +); + +TRACE_EVENT(rdev_stop_ap, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + unsigned int link_id), + TP_ARGS(wiphy, netdev, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id) ); DECLARE_EVENT_CLASS(wiphy_netdev_evt, @@ -641,11 +697,6 @@ DECLARE_EVENT_CLASS(wiphy_netdev_evt, TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG) ); -DEFINE_EVENT(wiphy_netdev_evt, rdev_stop_ap, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), - TP_ARGS(wiphy, netdev) -); - DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) @@ -705,7 +756,7 @@ DECLARE_EVENT_CLASS(station_add_change, __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap)) __array(char, vlan, IFNAMSIZ) __dynamic_array(u8, supported_rates, - params->supported_rates_len) + params->link_sta_params.supported_rates_len) __dynamic_array(u8, ext_capab, params->ext_capab_len) __dynamic_array(u8, supported_channels, params->supported_channels_len) @@ -725,20 +776,23 @@ DECLARE_EVENT_CLASS(station_add_change, __entry->plink_state = params->plink_state; __entry->uapsd_queues = params->uapsd_queues; memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap)); - if (params->ht_capa) - memcpy(__entry->ht_capa, params->ht_capa, + if (params->link_sta_params.ht_capa) + memcpy(__entry->ht_capa, + params->link_sta_params.ht_capa, sizeof(struct ieee80211_ht_cap)); memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap)); - if (params->vht_capa) - memcpy(__entry->vht_capa, 
params->vht_capa, + if (params->link_sta_params.vht_capa) + memcpy(__entry->vht_capa, + params->link_sta_params.vht_capa, sizeof(struct ieee80211_vht_cap)); memset(__entry->vlan, 0, sizeof(__entry->vlan)); if (params->vlan) memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ); - if (params->supported_rates && params->supported_rates_len) + if (params->link_sta_params.supported_rates && + params->link_sta_params.supported_rates_len) memcpy(__get_dynamic_array(supported_rates), - params->supported_rates, - params->supported_rates_len); + params->link_sta_params.supported_rates, + params->link_sta_params.supported_rates_len); if (params->ext_capab && params->ext_capab_len) memcpy(__get_dynamic_array(ext_capab), params->ext_capab, @@ -755,8 +809,9 @@ DECLARE_EVENT_CLASS(station_add_change, params->supported_oper_classes_len); __entry->max_sp = params->max_sp; __entry->capability = params->capability; - __entry->opmode_notif = params->opmode_notif; - __entry->opmode_notif_used = params->opmode_notif_used; + __entry->opmode_notif = params->link_sta_params.opmode_notif; + __entry->opmode_notif_used = + params->link_sta_params.opmode_notif_used; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", station flags mask: %u, station flags set: %u, " @@ -1195,9 +1250,8 @@ TRACE_EVENT(rdev_auth, TRACE_EVENT(rdev_assoc, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_assoc_request *req, - const struct cfg80211_bss_ies *bss_ies), - TP_ARGS(wiphy, netdev, req, bss_ies), + struct cfg80211_assoc_request *req), + TP_ARGS(wiphy, netdev, req), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -1205,9 +1259,6 @@ TRACE_EVENT(rdev_assoc, MAC_ENTRY(prev_bssid) __field(bool, use_mfp) __field(u32, flags) - __dynamic_array(u8, bss_elements, bss_ies->len) - __field(bool, bss_elements_bcon) - __field(u64, bss_elements_tsf) __dynamic_array(u8, elements, req->ie_len) __array(u8, ht_capa, sizeof(struct ieee80211_ht_cap)) __array(u8, ht_capa_mask, sizeof(struct ieee80211_ht_cap)) @@ -1227,11 +1278,6 @@ TRACE_EVENT(rdev_assoc, MAC_ASSIGN(prev_bssid, req->prev_bssid); __entry->use_mfp = req->use_mfp; __entry->flags = req->flags; - if (bss_ies->len) - memcpy(__get_dynamic_array(bss_elements), - bss_ies->data, bss_ies->len); - __entry->bss_elements_bcon = bss_ies->from_beacon; - __entry->bss_elements_tsf = bss_ies->tsf; if (req->ie) memcpy(__get_dynamic_array(elements), req->ie, req->ie_len); @@ -1290,10 +1336,7 @@ TRACE_EVENT(rdev_disassoc, TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; - if (req->bss) - MAC_ASSIGN(bssid, req->bss->bssid); - else - eth_zero_addr(__entry->bssid); + MAC_ASSIGN(bssid, req->ap_addr); __entry->reason_code = req->reason_code; __entry->local_state_change = req->local_state_change; ), @@ -1606,20 +1649,24 @@ TRACE_EVENT(rdev_testmode_dump, TRACE_EVENT(rdev_set_bitrate_mask, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask), - TP_ARGS(wiphy, netdev, peer, mask), + TP_ARGS(wiphy, netdev, link_id, peer, mask), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(unsigned int, link_id) MAC_ENTRY(peer) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->link_id = link_id; MAC_ASSIGN(peer, peer); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, peer: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + MAC_PR_ARG(peer)) ); 
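[Editor's note: the trace.h hunks in this patch all apply the same mechanical recipe when making an event link-aware: thread link_id through TP_PROTO/TP_ARGS, add a __field for it, store it in TP_fast_assign, and print it in TP_printk. A minimal sketch of that recipe follows; the event name rdev_example_link_op is hypothetical and not part of this patch, but the shape mirrors the rdev_stop_ap event added above.]

/* Sketch only: a link-aware tracepoint following the pattern of this
 * patch. "rdev_example_link_op" is a hypothetical event name used for
 * illustration.
 */
TRACE_EVENT(rdev_example_link_op,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 unsigned int link_id),
	TP_ARGS(wiphy, netdev, link_id),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(unsigned int, link_id)	/* new: per-link context */
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->link_id = link_id;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
);

[Events that need no fields beyond wiphy/wdev/link_id can instead reuse the wiphy_wdev_link_evt class declared further down in this patch via DEFINE_EVENT, as rdev_get_channel, rdev_add_intf_link, and rdev_del_intf_link do.]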
TRACE_EVENT(rdev_update_mgmt_frame_registrations, @@ -1986,14 +2033,15 @@ TRACE_EVENT(rdev_mgmt_tx, TRACE_EVENT(rdev_tx_control_port, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, - bool unencrypted), - TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted), + bool unencrypted, int link_id), + TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted, link_id), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(dest) __field(__be16, proto) __field(bool, unencrypted) + __field(int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2001,12 +2049,14 @@ TRACE_EVENT(rdev_tx_control_port, MAC_ASSIGN(dest, dest); __entry->proto = proto; __entry->unencrypted = unencrypted; + __entry->link_id = link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT "," - " proto: 0x%x, unencrypted: %s", + " proto: 0x%x, unencrypted: %s, link: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest), be16_to_cpu(__entry->proto), - BOOL_TO_STR(__entry->unencrypted)) + BOOL_TO_STR(__entry->unencrypted), + __entry->link_id) ); TRACE_EVENT(rdev_set_noack_map, @@ -2027,9 +2077,28 @@ TRACE_EVENT(rdev_set_noack_map, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map) ); -DEFINE_EVENT(wiphy_wdev_evt, rdev_get_channel, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), - TP_ARGS(wiphy, wdev) +DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) +); + +DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_channel, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id) ); TRACE_EVENT(rdev_return_chandef, @@ -2283,20 +2352,24 @@ TRACE_EVENT(rdev_set_qos_map, TRACE_EVENT(rdev_set_ap_chanwidth, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + unsigned int link_id, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, netdev, chandef), + TP_ARGS(wiphy, netdev, link_id, chandef), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY CHAN_DEF_ENTRY + __field(unsigned int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); + __entry->link_id = link_id; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, + __entry->link_id) ); TRACE_EVENT(rdev_add_tx_ts, @@ -2614,6 +2687,173 @@ DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr, TP_ARGS(wiphy, wdev, cookie) ); +TRACE_EVENT(rdev_set_fils_aad, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_fils_aad *fils_aad), + TP_ARGS(wiphy, netdev, fils_aad), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, macaddr, ETH_ALEN) + __field(u8, kek_len) + ), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + FILS_AAD_ASSIGN(fils_aad); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " FILS_AAD_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr, + __entry->kek_len) +); + +TRACE_EVENT(rdev_update_owe_info, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info), + TP_ARGS(wiphy, netdev, 
owe_info), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u16, status) + __dynamic_array(u8, ie, owe_info->ie_len)), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, owe_info->peer); + __entry->status = owe_info->status; + memcpy(__get_dynamic_array(ie), + owe_info->ie, owe_info->ie_len);), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT + " status %d", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + __entry->status) +); + +TRACE_EVENT(rdev_probe_mesh_link, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *dest, const u8 *buf, size_t len), + TP_ARGS(wiphy, netdev, dest, buf, len), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(dest) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(dest, dest); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest)) +); + +TRACE_EVENT(rdev_set_tid_config, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_tid_config *tid_conf), + TP_ARGS(wiphy, netdev, tid_conf), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, tid_conf->peer); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) +); + +TRACE_EVENT(rdev_reset_tid_config, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *peer, u8 tids), + TP_ARGS(wiphy, netdev, peer, tids), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tids) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tids = tids; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tids: 0x%x", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tids) +); + +TRACE_EVENT(rdev_set_sar_specs, + TP_PROTO(struct wiphy *wiphy, struct cfg80211_sar_specs *sar), + TP_ARGS(wiphy, sar), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(u16, type) + __field(u16, num) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->type = sar->type; + __entry->num = sar->num_sub_specs; + + ), + TP_printk(WIPHY_PR_FMT ", Set type:%d, num_specs:%d", + WIPHY_PR_ARG, __entry->type, __entry->num) +); + +TRACE_EVENT(rdev_color_change, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_color_change_settings *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u8, count) + __field(u16, bcn_ofs) + __field(u16, pres_ofs) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->count = params->count; + __entry->bcn_ofs = params->counter_offset_beacon; + __entry->pres_ofs = params->counter_offset_presp; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT + ", count: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->count) +); + +TRACE_EVENT(rdev_set_radar_background, + TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), + + TP_ARGS(wiphy, chandef), + + TP_STRUCT__entry( + WIPHY_ENTRY + CHAN_DEF_ENTRY + ), + + TP_fast_assign( + WIPHY_ASSIGN; + CHAN_DEF_ASSIGN(chandef) + ), + + TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, + WIPHY_PR_ARG, CHAN_DEF_PR_ARG) +); + +DEFINE_EVENT(wiphy_wdev_link_evt, rdev_add_intf_link, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id) +); + +DEFINE_EVENT(wiphy_wdev_link_evt, rdev_del_intf_link, + TP_PROTO(struct wiphy *wiphy, 
struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ @@ -2668,20 +2908,20 @@ DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth, ); TRACE_EVENT(cfg80211_send_rx_assoc, - TP_PROTO(struct net_device *netdev, struct cfg80211_bss *bss), - TP_ARGS(netdev, bss), + TP_PROTO(struct net_device *netdev, + struct cfg80211_rx_assoc_resp *data), + TP_ARGS(netdev, data), TP_STRUCT__entry( NETDEV_ENTRY - MAC_ENTRY(bssid) - CHAN_ENTRY + MAC_ENTRY(ap_addr) ), TP_fast_assign( NETDEV_ASSIGN; - MAC_ASSIGN(bssid, bss->bssid); - CHAN_ASSIGN(bss->channel); + MAC_ASSIGN(ap_addr, + data->ap_mld_addr ?: data->links[0].bss->bssid); ), - TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", " CHAN_PR_FMT, - NETDEV_PR_ARG, MAC_PR_ARG(bssid), CHAN_PR_ARG) + TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT, + NETDEV_PR_ARG, MAC_PR_ARG(ap_addr)) ); DECLARE_EVENT_CLASS(netdev_frame_event, @@ -2750,9 +2990,22 @@ DEFINE_EVENT(netdev_mac_evt, cfg80211_send_auth_timeout, TP_ARGS(netdev, mac) ); -DEFINE_EVENT(netdev_mac_evt, cfg80211_send_assoc_timeout, - TP_PROTO(struct net_device *netdev, const u8 *mac), - TP_ARGS(netdev, mac) +TRACE_EVENT(cfg80211_send_assoc_failure, + TP_PROTO(struct net_device *netdev, + struct cfg80211_assoc_failure *data), + TP_ARGS(netdev, data), + TP_STRUCT__entry( + NETDEV_ENTRY + MAC_ENTRY(ap_addr) + __field(bool, timeout) + ), + TP_fast_assign( + NETDEV_ASSIGN; + MAC_ASSIGN(ap_addr, data->ap_mld_addr ?: data->bss[0]->bssid); + __entry->timeout = data->timeout; + ), + TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT ", timeout: %d", + NETDEV_PR_ARG, MAC_PR_ARG(ap_addr), __entry->timeout) ); TRACE_EVENT(cfg80211_michael_mic_failure, @@ -2861,8 +3114,8 @@ DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta, ); TRACE_EVENT(cfg80211_rx_mgmt, - TP_PROTO(struct wireless_dev *wdev, int freq, int sig_dbm), - TP_ARGS(wdev, freq, sig_dbm), + TP_PROTO(struct wireless_dev *wdev, struct cfg80211_rx_info *info), + TP_ARGS(wdev, info), TP_STRUCT__entry( WDEV_ENTRY __field(int, freq) @@ -2870,8 +3123,8 @@ TRACE_EVENT(cfg80211_rx_mgmt, ), TP_fast_assign( WDEV_ASSIGN; - __entry->freq = freq; - __entry->sig_dbm = sig_dbm; + __entry->freq = info->freq; + __entry->sig_dbm = info->sig_dbm; ), TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d", WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) @@ -2991,49 +3244,58 @@ TRACE_EVENT(cfg80211_chandef_dfs_required, TRACE_EVENT(cfg80211_ch_switch_notify, TP_PROTO(struct net_device *netdev, - struct cfg80211_chan_def *chandef), - TP_ARGS(netdev, chandef), + struct cfg80211_chan_def *chandef, + unsigned int link_id), + TP_ARGS(netdev, chandef, link_id), TP_STRUCT__entry( NETDEV_ENTRY CHAN_DEF_ENTRY + __field(unsigned int, link_id) ), TP_fast_assign( NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); + __entry->link_id = link_id; ), - TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT, - NETDEV_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", + NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_ch_switch_started_notify, TP_PROTO(struct net_device *netdev, - struct cfg80211_chan_def *chandef), - TP_ARGS(netdev, chandef), + struct cfg80211_chan_def *chandef, + unsigned int link_id), + TP_ARGS(netdev, chandef, link_id), TP_STRUCT__entry( NETDEV_ENTRY CHAN_DEF_ENTRY + __field(unsigned int, link_id) ), TP_fast_assign( NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); + 
__entry->link_id = link_id; ), - TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT, - NETDEV_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", + NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_radar_event, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, chandef), + TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, + bool offchan), + TP_ARGS(wiphy, chandef, offchan), TP_STRUCT__entry( WIPHY_ENTRY CHAN_DEF_ENTRY + __field(bool, offchan) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_DEF_ASSIGN(chandef); + __entry->offchan = offchan; ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", offchan %d", + WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->offchan) ); TRACE_EVENT(cfg80211_cac_event, @@ -3486,26 +3748,6 @@ TRACE_EVENT(cfg80211_pmsr_complete, (unsigned long long)__entry->cookie) ); -TRACE_EVENT(rdev_update_owe_info, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_update_owe_info *owe_info), - TP_ARGS(wiphy, netdev, owe_info), - TP_STRUCT__entry(WIPHY_ENTRY - NETDEV_ENTRY - MAC_ENTRY(peer) - __field(u16, status) - __dynamic_array(u8, ie, owe_info->ie_len)), - TP_fast_assign(WIPHY_ASSIGN; - NETDEV_ASSIGN; - MAC_ASSIGN(peer, owe_info->peer); - __entry->status = owe_info->status; - memcpy(__get_dynamic_array(ie), - owe_info->ie, owe_info->ie_len);), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT - " status %d", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), - __entry->status) -); - TRACE_EVENT(cfg80211_update_owe_info_event, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_update_owe_info *owe_info), @@ -3523,104 +3765,6 @@ TRACE_EVENT(cfg80211_update_owe_info_event, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) ); -TRACE_EVENT(rdev_probe_mesh_link, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - const u8 *dest, const u8 *buf, size_t len), - TP_ARGS(wiphy, netdev, dest, buf, len), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - MAC_ENTRY(dest) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - MAC_ASSIGN(dest, dest); - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest)) -); - -TRACE_EVENT(rdev_set_tid_config, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_tid_config *tid_conf), - TP_ARGS(wiphy, netdev, tid_conf), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - MAC_ENTRY(peer) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - MAC_ASSIGN(peer, tid_conf->peer); - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) -); - -TRACE_EVENT(rdev_reset_tid_config, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - const u8 *peer, u8 tids), - TP_ARGS(wiphy, netdev, peer, tids), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - MAC_ENTRY(peer) - __field(u8, tids) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - MAC_ASSIGN(peer, peer); - __entry->tids = tids; - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tids: 0x%x", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tids) -); - -TRACE_EVENT(rdev_set_sar_specs, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_sar_specs *sar), - TP_ARGS(wiphy, sar), - TP_STRUCT__entry( - WIPHY_ENTRY - __field(u16, type) - __field(u16, num) - ), - TP_fast_assign( - WIPHY_ASSIGN; - __entry->type 
= sar->type; - __entry->num = sar->num_sub_specs; - - ), - TP_printk(WIPHY_PR_FMT ", Set type:%d, num_specs:%d", - WIPHY_PR_ARG, __entry->type, __entry->num) -); - -TRACE_EVENT(rdev_color_change, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_color_change_settings *params), - TP_ARGS(wiphy, netdev, params), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - __field(u8, count) - __field(u16, bcn_ofs) - __field(u16, pres_ofs) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - __entry->count = params->count; - __entry->bcn_ofs = params->counter_offset_beacon; - __entry->pres_ofs = params->counter_offset_presp; - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT - ", count: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, - __entry->count) -); - TRACE_EVENT(cfg80211_bss_color_notify, TP_PROTO(struct net_device *netdev, enum nl80211_commands cmd, @@ -3643,6 +3787,120 @@ TRACE_EVENT(cfg80211_bss_color_notify, __entry->color_bitmap) ); +TRACE_EVENT(cfg80211_assoc_comeback, + TP_PROTO(struct wireless_dev *wdev, const u8 *ap_addr, u32 timeout), + TP_ARGS(wdev, ap_addr, timeout), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(ap_addr) + __field(u32, timeout) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(ap_addr, ap_addr); + __entry->timeout = timeout; + ), + TP_printk(WDEV_PR_FMT ", " MAC_PR_FMT ", timeout: %u TUs", + WDEV_PR_ARG, MAC_PR_ARG(ap_addr), __entry->timeout) +); + +DECLARE_EVENT_CLASS(link_station_add_mod, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_parameters *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, mld_mac, 6) + __array(u8, link_mac, 6) + __field(u32, link_id) + __dynamic_array(u8, supported_rates, + params->supported_rates_len) + __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap)) + __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap)) + __field(u8, opmode_notif) + __field(bool, opmode_notif_used) + __dynamic_array(u8, he_capa, params->he_capa_len) + __array(u8, he_6ghz_capa, (int)sizeof(struct ieee80211_he_6ghz_capa)) + __dynamic_array(u8, eht_capa, params->eht_capa_len) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memset(__entry->mld_mac, 0, 6); + memset(__entry->link_mac, 0, 6); + if (params->mld_mac) + memcpy(__entry->mld_mac, params->mld_mac, 6); + if (params->link_mac) + memcpy(__entry->link_mac, params->link_mac, 6); + __entry->link_id = params->link_id; + if (params->supported_rates && params->supported_rates_len) + memcpy(__get_dynamic_array(supported_rates), + params->supported_rates, + params->supported_rates_len); + memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap)); + if (params->ht_capa) + memcpy(__entry->ht_capa, params->ht_capa, + sizeof(struct ieee80211_ht_cap)); + memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap)); + if (params->vht_capa) + memcpy(__entry->vht_capa, params->vht_capa, + sizeof(struct ieee80211_vht_cap)); + __entry->opmode_notif = params->opmode_notif; + __entry->opmode_notif_used = params->opmode_notif_used; + if (params->he_capa && params->he_capa_len) + memcpy(__get_dynamic_array(he_capa), params->he_capa, + params->he_capa_len); + memset(__entry->he_6ghz_capa, 0, sizeof(struct ieee80211_he_6ghz_capa)); + if (params->he_6ghz_capa) + memcpy(__entry->he_6ghz_capa, params->he_6ghz_capa, + sizeof(struct ieee80211_he_6ghz_capa)); + if (params->eht_capa && params->eht_capa_len) + memcpy(__get_dynamic_array(eht_capa), params->eht_capa, + params->eht_capa_len); + ), + 
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT + ", link mac: " MAC_PR_FMT ", link id: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(mld_mac), + MAC_PR_ARG(link_mac), __entry->link_id) +); + +DEFINE_EVENT(link_station_add_mod, rdev_add_link_station, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_parameters *params), + TP_ARGS(wiphy, netdev, params) +); + +DEFINE_EVENT(link_station_add_mod, rdev_mod_link_station, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_parameters *params), + TP_ARGS(wiphy, netdev, params) +); + +TRACE_EVENT(rdev_del_link_station, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_del_parameters *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, mld_mac, 6) + __field(u32, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memset(__entry->mld_mac, 0, 6); + if (params->mld_mac) + memcpy(__entry->mld_mac, params->mld_mac, 6); + __entry->link_id = params->link_id; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT + ", link id: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(mld_mac), + __entry->link_id) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index cb15d7f4eb05..814e6833c2b8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #include #include @@ -80,6 +80,7 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) return 0; /* not supported */ switch (band) { case NL80211_BAND_2GHZ: + case NL80211_BAND_LC: if (chan == 14) return MHZ_TO_KHZ(2484); else if (chan < 14) @@ -209,6 +210,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) WARN_ON(want); break; case NL80211_BAND_2GHZ: + case NL80211_BAND_LC: want = 7; for (i = 0; i < sband->n_bitrates; i++) { switch (sband->bitrates[i].bitrate) { @@ -632,12 +634,14 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (likely((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) && tmp.h_proto != htons(ETH_P_AARP) && tmp.h_proto != htons(ETH_P_IPX)) || - ether_addr_equal(payload.hdr, bridge_tunnel_header))) + ether_addr_equal(payload.hdr, bridge_tunnel_header))) { /* remove RFC1042 or Bridge-Tunnel encapsulation and * replace EtherType */ hdrlen += ETH_ALEN + 2; - else + skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); + } else { tmp.h_proto = htons(skb->len - hdrlen); + } pskb_pull(skb, hdrlen); @@ -931,13 +935,13 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) { if (!wdev->connect_keys->params[i].cipher) continue; - if (rdev_add_key(rdev, dev, i, false, NULL, + if (rdev_add_key(rdev, dev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; } if (wdev->connect_keys->def == i && - rdev_set_default_key(rdev, dev, i, true, true)) { + rdev_set_default_key(rdev, dev, -1, i, true, true)) { netdev_err(dev, "failed to set defkey %d\n", i); continue; } @@ -1037,7 +1041,6 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, return -EBUSY; 
dev->ieee80211_ptr->use_4addr = false; - dev->ieee80211_ptr->mesh_id_up_len = 0; wdev_lock(dev->ieee80211_ptr); rdev_set_qos_map(rdev, dev, NULL); wdev_unlock(dev->ieee80211_ptr); @@ -1045,7 +1048,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, switch (otype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, dev, true); + cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); @@ -1069,6 +1072,11 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); + + memset(&dev->ieee80211_ptr->u, 0, + sizeof(dev->ieee80211_ptr->u)); + memset(&dev->ieee80211_ptr->links, 0, + sizeof(dev->ieee80211_ptr->links)); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); @@ -1426,6 +1434,137 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) return result / 10000; } +static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) +{ +#define SCALE 6144 + static const u32 mcs_divisors[16] = { + 102399, /* 16.666666... */ + 51201, /* 8.333333... */ + 34134, /* 5.555555... */ + 25599, /* 4.166666... */ + 17067, /* 2.777777... */ + 12801, /* 2.083333... */ + 11377, /* 1.851725... */ + 10239, /* 1.666666... */ + 8532, /* 1.388888... */ + 7680, /* 1.250000... */ + 6828, /* 1.111111... */ + 6144, /* 1.000000... */ + 5690, /* 0.926106... */ + 5120, /* 0.833333... */ + 409600, /* 66.666666... */ + 204800, /* 33.333333... */ + }; + static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; + static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; + static const u32 rates_242[3] = { 114711111, 108333333, 97500000 }; + static const u32 rates_106[3] = { 40000000, 37777777, 34000000 }; + static const u32 rates_52[3] = { 18820000, 17777777, 16000000 }; + static const u32 rates_26[3] = { 9411111, 8888888, 8000000 }; + u64 tmp; + u32 result; + + if (WARN_ON_ONCE(rate->mcs > 15)) + return 0; + if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) + return 0; + if (WARN_ON_ONCE(rate->eht_ru_alloc > + NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) + return 0; + if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) + return 0; + + /* Bandwidth checks for MCS 14 */ + if (rate->mcs == 14) { + if ((rate->bw != RATE_INFO_BW_EHT_RU && + rate->bw != RATE_INFO_BW_80 && + rate->bw != RATE_INFO_BW_160 && + rate->bw != RATE_INFO_BW_320) || + (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 && + rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 && + rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) { + WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n", + rate->bw, rate->eht_ru_alloc); + return 0; + } + } + + if (rate->bw == RATE_INFO_BW_320 || + (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) + result = 4 * rates_996[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484) + result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996) + result = 3 * rates_996[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484) + result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_160 || + 
(rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996)) + result = 2 * rates_996[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242) + result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi] + + rates_242[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484) + result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_80 || + (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996)) + result = rates_996[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242) + result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_40 || + (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484)) + result = rates_484[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_20 || + (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242)) + result = rates_242[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26) + result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106) + result = rates_106[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26) + result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52) + result = rates_52[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) + result = rates_26[rate->eht_gi]; + else { + WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", + rate->bw, rate->eht_ru_alloc); + return 0; + } + + /* now scale to the appropriate MCS */ + tmp = result; + tmp *= SCALE; + do_div(tmp, mcs_divisors[rate->mcs]); + + /* and take NSS */ + tmp *= rate->nss; + do_div(tmp, 8); + + result = tmp; + + return result / 10000; +} + u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) @@ -1440,6 +1579,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) return cfg80211_calculate_bitrate_he(rate); + if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) + return cfg80211_calculate_bitrate_eht(rate); return rate->legacy; } @@ -1795,6 +1936,24 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, } EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); +static int cfg80211_wdev_bi(struct wireless_dev *wdev) +{ + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + WARN_ON(wdev->valid_links); + return wdev->links[0].ap.beacon_interval; + case NL80211_IFTYPE_MESH_POINT: + return wdev->u.mesh.beacon_interval; + case NL80211_IFTYPE_ADHOC: + return wdev->u.ibss.beacon_interval; + default: + break; + } + + return 0; +} + static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, u32 *beacon_int_gcd, bool *beacon_int_different) @@ -1805,19 +1964,27 @@ static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, *beacon_int_different = false; 
list_for_each_entry(wdev, &wiphy->wdev_list, list) { - if (!wdev->beacon_interval) + int wdev_bi; + + /* this feature isn't supported with MLO */ + if (wdev->valid_links) + continue; + + wdev_bi = cfg80211_wdev_bi(wdev); + + if (!wdev_bi) continue; if (!*beacon_int_gcd) { - *beacon_int_gcd = wdev->beacon_interval; + *beacon_int_gcd = wdev_bi; continue; } - if (wdev->beacon_interval == *beacon_int_gcd) + if (wdev_bi == *beacon_int_gcd) continue; *beacon_int_different = true; - *beacon_int_gcd = gcd(*beacon_int_gcd, wdev->beacon_interval); + *beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi); } if (new_beacon_int && *beacon_int_gcd != new_beacon_int) { @@ -2282,3 +2449,60 @@ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, return false; } EXPORT_SYMBOL(cfg80211_iftype_allowed); + +void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + + ASSERT_WDEV_LOCK(wdev); + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + __cfg80211_stop_ap(rdev, wdev->netdev, link_id, true); + break; + default: + /* per-link not relevant */ + break; + } + + wdev->valid_links &= ~BIT(link_id); + + rdev_del_intf_link(rdev, wdev, link_id); + + eth_zero_addr(wdev->links[link_id].addr); +} + +void cfg80211_remove_links(struct wireless_dev *wdev) +{ + unsigned int link_id; + + wdev_lock(wdev); + if (wdev->valid_links) { + for_each_valid_link(wdev, link_id) + cfg80211_remove_link(wdev, link_id); + } + wdev_unlock(wdev); +} + +int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + cfg80211_remove_links(wdev); + + return rdev_del_virtual_intf(rdev, wdev); +} + +const struct wiphy_iftype_ext_capab * +cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) +{ + int i; + + for (i = 0; i < wiphy->num_iftype_ext_capab; i++) { + if (wiphy->iftype_ext_capab[i].iftype == type) + return &wiphy->iftype_ext_capab[i]; + } + + return NULL; +} +EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index a32065d600a1..ddf340bfa07a 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -7,7 +7,7 @@ * we directly assign the wireless handlers of wireless interfaces. * * Copyright 2008-2009 Johannes Berg - * Copyright (C) 2019-2021 Intel Corporation + * Copyright (C) 2019-2022 Intel Corporation */ #include @@ -415,6 +415,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, int err, i; bool rejoin = false; + if (wdev->valid_links) + return -EINVAL; + if (pairwise && !addr) return -EINVAL; @@ -437,7 +440,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, return -EOPNOTSUPP; if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) { - if (!wdev->current_bss) + if (!wdev->connected) return -ENOLINK; if (!rdev->ops->set_default_mgmt_key) @@ -450,7 +453,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (remove) { err = 0; - if (wdev->current_bss) { + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) { /* * If removing the current TX key, we will need to * join a new IBSS without the privacy bit clear. 
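The bare -1 threaded through the rdev_add_key()/rdev_del_key()/rdev_set_default_key()/rdev_set_default_mgmt_key() calls in these wext hunks is the new link ID parameter. The wext compat layer refuses to operate on MLO interfaces (it bails out with -EOPNOTSUPP or -EINVAL whenever wdev->valid_links is set), so it always passes -1, meaning "no specific link". A minimal sketch of the convention follows; the helper name is hypothetical and does not exist in the patch:

/*
 * Hypothetical example, not part of the patch: illustrates the link_id
 * convention of the key ops after this change.  -1 selects the default
 * (non-MLO) link; a real MLO caller would pass a valid link ID instead.
 */
static int example_install_wep_key(struct cfg80211_registered_device *rdev,
				   struct net_device *dev, int idx,
				   struct key_params *params)
{
	/* link_id == -1: this device is not using MLO links */
	int err = rdev_add_key(rdev, dev, -1, idx, false, NULL, params);

	if (!err)
		err = rdev_set_default_key(rdev, dev, -1, idx, true, true);
	return err;
}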
@@ -465,7 +470,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; else - err = rdev_del_key(rdev, dev, idx, pairwise, + err = rdev_del_key(rdev, dev, -1, idx, pairwise, addr); } wdev->wext.connect.privacy = false; @@ -501,8 +506,10 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, return -EINVAL; err = 0; - if (wdev->current_bss) - err = rdev_add_key(rdev, dev, idx, pairwise, addr, params); + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) + err = rdev_add_key(rdev, dev, -1, idx, pairwise, addr, params); else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 && params->cipher != WLAN_CIPHER_SUITE_WEP104) return -EINVAL; @@ -526,7 +533,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 || params->cipher == WLAN_CIPHER_SUITE_WEP104) && (tx_key || (!addr && wdev->wext.default_key == -1))) { - if (wdev->current_bss) { + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) { /* * If we are getting a new TX key from not having * had one before we need to join a new IBSS with @@ -537,7 +546,8 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, __cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } - err = rdev_set_default_key(rdev, dev, idx, true, true); + err = rdev_set_default_key(rdev, dev, -1, idx, true, + true); } if (!err) { wdev->wext.default_key = idx; @@ -549,8 +559,10 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC && (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) { - if (wdev->current_bss) - err = rdev_set_default_mgmt_key(rdev, dev, idx); + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) + err = rdev_set_default_mgmt_key(rdev, dev, -1, idx); if (!err) wdev->wext.default_mgmt_key = idx; return err; @@ -595,6 +607,11 @@ static int cfg80211_wext_siwencode(struct net_device *dev, return -EOPNOTSUPP; wiphy_lock(&rdev->wiphy); + if (wdev->valid_links) { + err = -EOPNOTSUPP; + goto out; + } + idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; @@ -613,8 +630,10 @@ static int cfg80211_wext_siwencode(struct net_device *dev, /* No key data - just set the default TX key index */ err = 0; wdev_lock(wdev); - if (wdev->current_bss) - err = rdev_set_default_key(rdev, dev, idx, true, + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) + err = rdev_set_default_key(rdev, dev, -1, idx, true, true); if (!err) wdev->wext.default_key = idx; @@ -667,6 +686,13 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, !rdev->ops->set_default_key) return -EOPNOTSUPP; + wdev_lock(wdev); + if (wdev->valid_links) { + wdev_unlock(wdev); + return -EOPNOTSUPP; + } + wdev_unlock(wdev); + switch (ext->alg) { case IW_ENCODE_ALG_NONE: remove = true; @@ -865,7 +891,7 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, break; } - ret = rdev_get_channel(rdev, wdev, &chandef); + ret = rdev_get_channel(rdev, wdev, 0, &chandef); if (ret) break; freq->m = chandef.chan->center_freq; @@ -1270,7 +1296,10 @@ static int cfg80211_wext_siwrate(struct net_device *dev, return -EINVAL; wiphy_lock(&rdev->wiphy); - ret = rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + if 
(dev->ieee80211_ptr->valid_links) + ret = -EOPNOTSUPP; + else + ret = rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); wiphy_unlock(&rdev->wiphy); return ret; @@ -1294,8 +1323,9 @@ static int cfg80211_wext_giwrate(struct net_device *dev, err = 0; wdev_lock(wdev); - if (wdev->current_bss) - memcpy(addr, wdev->current_bss->pub.bssid, ETH_ALEN); + if (!wdev->valid_links && wdev->links[0].client.current_bss) + memcpy(addr, wdev->links[0].client.current_bss->pub.bssid, + ETH_ALEN); else err = -EOPNOTSUPP; wdev_unlock(wdev); @@ -1339,11 +1369,11 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) /* Grab BSSID of current BSS, if any */ wdev_lock(wdev); - if (!wdev->current_bss) { + if (wdev->valid_links || !wdev->links[0].client.current_bss) { wdev_unlock(wdev); return NULL; } - memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN); + memcpy(bssid, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); wdev_unlock(wdev); memset(&sinfo, 0, sizeof(sinfo)); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 193a18a53142..68f45afc352d 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -3,7 +3,7 @@ * cfg80211 wext compat for managed mode. * * Copyright 2009 Johannes Berg - * Copyright (C) 2009, 2020-2021 Intel Corporation. + * Copyright (C) 2009, 2020-2022 Intel Corporation */ #include @@ -124,9 +124,12 @@ int cfg80211_mgd_wext_giwfreq(struct net_device *dev, if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; + if (wdev->valid_links) + return -EOPNOTSUPP; + wdev_lock(wdev); - if (wdev->current_bss) - chan = wdev->current_bss->pub.channel; + if (wdev->links[0].client.current_bss) + chan = wdev->links[0].client.current_bss->pub.channel; else if (wdev->wext.connect.channel) chan = wdev->wext.connect.channel; wdev_unlock(wdev); @@ -208,22 +211,26 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; + if (wdev->valid_links) + return -EINVAL; + data->flags = 0; wdev_lock(wdev); - if (wdev->current_bss) { - const u8 *ie; + if (wdev->links[0].client.current_bss) { + const struct element *ssid_elem; rcu_read_lock(); - ie = ieee80211_bss_get_ie(&wdev->current_bss->pub, - WLAN_EID_SSID); - if (ie) { + ssid_elem = ieee80211_bss_get_elem( + &wdev->links[0].client.current_bss->pub, + WLAN_EID_SSID); + if (ssid_elem) { data->flags = 1; - data->length = ie[1]; + data->length = ssid_elem->datalen; if (data->length > IW_ESSID_MAX_SIZE) ret = -EINVAL; else - memcpy(ssid, ie + 2, data->length); + memcpy(ssid, ssid_elem->data, data->length); } rcu_read_unlock(); } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { @@ -300,8 +307,14 @@ int cfg80211_mgd_wext_giwap(struct net_device *dev, ap_addr->sa_family = ARPHRD_ETHER; wdev_lock(wdev); - if (wdev->current_bss) - memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN); + if (wdev->valid_links) { + wdev_unlock(wdev); + return -EOPNOTSUPP; + } + if (wdev->links[0].client.current_bss) + memcpy(ap_addr->sa_data, + wdev->links[0].client.current_bss->pub.bssid, + ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); wdev_unlock(wdev); diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 4dae3ab8d030..2e80e38bcfd8 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -237,7 +237,7 @@ static struct xfrm_algo_desc aalg_list[] = { .uinfo = { .auth = { - .icv_truncbits = 96, + .icv_truncbits = IS_ENABLED(CONFIG_ANDROID) ? 
128 : 96, .icv_fullbits = 256, } }, diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 4dc4a7bbe51c..e46afab0939e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -273,6 +273,7 @@ static int xfrm4_beet_encap_add(struct xfrm_state *x, struct sk_buff *skb) */ static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) { + bool small_ipv6 = (skb->protocol == htons(ETH_P_IPV6)) && (skb->len <= IPV6_MIN_MTU); struct dst_entry *dst = skb_dst(skb); struct iphdr *top_iph; int flags; @@ -303,7 +304,7 @@ static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) if (flags & XFRM_STATE_NOECN) IP_ECN_clear(top_iph); - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) || small_ipv6 ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); @@ -479,13 +480,11 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) return -EOPNOTSUPP; } -#if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) { return xfrm_outer_mode_output(x, skb); } EXPORT_SYMBOL_GPL(pktgen_xfrm_outer_mode_output); -#endif static int xfrm_output_one(struct sk_buff *skb, int err) { @@ -580,22 +579,22 @@ out: return err; } -int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err) +int xfrm_output_resume(struct sk_buff *skb, int err) { struct net *net = xs_net(skb_dst(skb)->xfrm); while (likely((err = xfrm_output_one(skb, err)) == 0)) { nf_reset_ct(skb); - err = skb_dst(skb)->ops->local_out(net, sk, skb); + err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) goto out; if (!skb_dst(skb)->xfrm) - return dst_output(net, sk, skb); + return dst_output(net, skb->sk, skb); err = nf_hook(skb_dst(skb)->ops->family, - NF_INET_POST_ROUTING, net, sk, skb, + NF_INET_POST_ROUTING, net, skb->sk, skb, NULL, skb_dst(skb)->dev, xfrm_output2); if (unlikely(err != 1)) goto out; @@ -611,7 +610,7 @@ EXPORT_SYMBOL_GPL(xfrm_output_resume); static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { - return xfrm_output_resume(sk, skb, 1); + return xfrm_output_resume(skb, 1); } static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 15132b080614..aebfe9b40047 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2441,19 +2441,22 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) if (IS_ERR(data)) return PTR_ERR(data); - if (in_compat_syscall()) { - struct xfrm_translator *xtr = xfrm_get_translator(); + /* Use the 64-bit / untranslated format on Android, even for compat */ + if (!IS_ENABLED(CONFIG_ANDROID) || IS_ENABLED(CONFIG_XFRM_USER_COMPAT)) { + if (in_compat_syscall()) { + struct xfrm_translator *xtr = xfrm_get_translator(); - if (!xtr) { - kfree(data); - return -EOPNOTSUPP; - } + if (!xtr) { + kfree(data); + return -EOPNOTSUPP; + } - err = xtr->xlate_user_policy_sockptr(&data, optlen); - xfrm_put_translator(xtr); - if (err) { - kfree(data); - return err; + err = xtr->xlate_user_policy_sockptr(&data, optlen); + xfrm_put_translator(xtr); + if (err) { + kfree(data); + return err; + } } } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5fba82757ce5..3afcec07dc10 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2875,19 +2875,22 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, 
struct nlmsghdr *nlh,
 	if (!netlink_net_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
-	if (in_compat_syscall()) {
-		struct xfrm_translator *xtr = xfrm_get_translator();
+	/* Use the 64-bit / untranslated format on Android, even for compat */
+	if (!IS_ENABLED(CONFIG_ANDROID) || IS_ENABLED(CONFIG_XFRM_USER_COMPAT)) {
+		if (in_compat_syscall()) {
+			struct xfrm_translator *xtr = xfrm_get_translator();
 
-		if (!xtr)
-			return -EOPNOTSUPP;
+			if (!xtr)
+				return -EOPNOTSUPP;
 
-		nlh64 = xtr->rcv_msg_compat(nlh, link->nla_max,
-					    link->nla_pol, extack);
-		xfrm_put_translator(xtr);
-		if (IS_ERR(nlh64))
-			return PTR_ERR(nlh64);
-		if (nlh64)
-			nlh = nlh64;
+			nlh64 = xtr->rcv_msg_compat(nlh, link->nla_max,
+						    link->nla_pol, extack);
+			xfrm_put_translator(xtr);
+			if (IS_ERR(nlh64))
+				return PTR_ERR(nlh64);
+			if (nlh64)
+				nlh = nlh64;
+		}
 	}
 
 	if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
diff --git a/samples/crypto/fips140_lab_util.c b/samples/crypto/fips140_lab_util.c
new file mode 100644
index 000000000000..5f8e9018013a
--- /dev/null
+++ b/samples/crypto/fips140_lab_util.c
@@ -0,0 +1,638 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2021 Google LLC
+ *
+ * This program provides commands that dump certain types of output from the
+ * fips140 kernel module, as required by the FIPS lab for evaluation purposes.
+ *
+ * While the fips140 kernel module can only be accessed directly by other kernel
+ * code, an easy-to-use userspace utility program was desired for lab testing.
+ * When possible, this program uses AF_ALG to access the crypto algorithms; this
+ * requires that the kernel has AF_ALG enabled.  Where AF_ALG isn't sufficient,
+ * a custom device node /dev/fips140 is used instead; this requires that the
+ * fips140 module is loaded and has evaluation testing support compiled in.
+ *
+ * This program can be compiled and run on an Android device as follows:
+ *
+ *	NDK_DIR=$HOME/android-ndk-r23b  # adjust directory path as needed
+ *	$NDK_DIR/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang \
+ *		fips140_lab_util.c -O2 -Wall -o fips140_lab_util
+ *	adb push fips140_lab_util /data/local/tmp/
+ *	adb root
+ *	adb shell /data/local/tmp/fips140_lab_util
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <linux/if_alg.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+
+#include "../../crypto/fips140-eval-testing-uapi.h"
+
+/* ---------------------------------------------------------------------------
+ *			      Utility functions
+ * ---------------------------------------------------------------------------*/
+
+#define ARRAY_SIZE(A)	(sizeof(A) / sizeof((A)[0]))
+#define MIN(a, b)	((a) < (b) ? (a) : (b))
+#define MAX(a, b)	((a) > (b) ? (a) : (b))
+
+static void __attribute__((noreturn))
+do_die(const char *format, va_list va, int err)
+{
+	fputs("ERROR: ", stderr);
+	vfprintf(stderr, format, va);
+	if (err)
+		fprintf(stderr, ": %s", strerror(err));
+	putc('\n', stderr);
+	exit(1);
+}
+
+static void __attribute__((noreturn, format(printf, 1, 2)))
+die_errno(const char *format, ...)
+{
+	va_list va;
+
+	va_start(va, format);
+	do_die(format, va, errno);
+	va_end(va);
+}
+
+static void __attribute__((noreturn, format(printf, 1, 2)))
+die(const char *format, ...)
+{ + va_list va; + + va_start(va, format); + do_die(format, va, 0); + va_end(va); +} + +static void __attribute__((noreturn)) +assertion_failed(const char *expr, const char *file, int line) +{ + die("Assertion failed: %s at %s:%d", expr, file, line); +} + +#define ASSERT(e) ({ if (!(e)) assertion_failed(#e, __FILE__, __LINE__); }) + +static void rand_bytes(uint8_t *bytes, size_t count) +{ + size_t i; + + for (i = 0; i < count; i++) + bytes[i] = rand(); +} + +static const char *booltostr(bool b) +{ + return b ? "true" : "false"; +} + +static const char *bytes_to_hex(const uint8_t *bytes, size_t count) +{ + static char hex[1025]; + size_t i; + + ASSERT(count <= 512); + for (i = 0; i < count; i++) + sprintf(&hex[2*i], "%02x", bytes[i]); + return hex; +} + +static void full_write(int fd, const void *buf, size_t count) +{ + while (count) { + ssize_t ret = write(fd, buf, count); + + if (ret < 0) + die_errno("write failed"); + buf += ret; + count -= ret; + } +} + +enum { + OPT_AMOUNT, + OPT_ITERATIONS, +}; + +static void usage(void); + +/* --------------------------------------------------------------------------- + * /dev/fips140 ioctls + * ---------------------------------------------------------------------------*/ + +static int get_fips140_device_number(void) +{ + FILE *f; + char line[128]; + int number; + char name[32]; + + f = fopen("/proc/devices", "r"); + if (!f) + die_errno("Failed to open /proc/devices"); + while (fgets(line, sizeof(line), f)) { + if (sscanf(line, "%d %31s", &number, name) == 2 && + strcmp(name, "fips140") == 0) + return number; + } + fclose(f); + die("fips140 device node is unavailable.\n" +"The fips140 device node is only available when the fips140 module is loaded\n" +"and has been built with evaluation testing support."); +} + +static void create_fips140_node_if_needed(void) +{ + struct stat stbuf; + int major; + + if (stat("/dev/fips140", &stbuf) == 0) + return; + + major = get_fips140_device_number(); + if (mknod("/dev/fips140", S_IFCHR | 0600, makedev(major, 1)) != 0) + die_errno("Failed to create fips140 device node"); +} + +static int fips140_dev_fd = -1; + +static int fips140_ioctl(int cmd, const void *arg) +{ + if (fips140_dev_fd < 0) { + create_fips140_node_if_needed(); + fips140_dev_fd = open("/dev/fips140", O_RDONLY); + if (fips140_dev_fd < 0) + die_errno("Failed to open /dev/fips140"); + } + return ioctl(fips140_dev_fd, cmd, arg); +} + +static bool fips140_is_approved_service(const char *name) +{ + int ret = fips140_ioctl(FIPS140_IOCTL_IS_APPROVED_SERVICE, name); + + if (ret < 0) + die_errno("FIPS140_IOCTL_IS_APPROVED_SERVICE unexpectedly failed"); + if (ret == 1) + return true; + if (ret == 0) + return false; + die("FIPS140_IOCTL_IS_APPROVED_SERVICE returned unexpected value %d", + ret); +} + +static const char *fips140_module_version(void) +{ + static char buf[256]; + int ret; + + memset(buf, 0, sizeof(buf)); + ret = fips140_ioctl(FIPS140_IOCTL_MODULE_VERSION, buf); + if (ret < 0) + die_errno("FIPS140_IOCTL_MODULE_VERSION unexpectedly failed"); + if (ret != 0) + die("FIPS140_IOCTL_MODULE_VERSION returned unexpected value %d", + ret); + return buf; +} + +/* --------------------------------------------------------------------------- + * AF_ALG utilities + * ---------------------------------------------------------------------------*/ + +#define AF_ALG_MAX_RNG_REQUEST_SIZE 128 + +static int get_alg_fd(const char *alg_type, const char *alg_name) +{ + struct sockaddr_alg addr = {}; + int alg_fd; + + alg_fd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (alg_fd < 
0) + die("Failed to create AF_ALG socket.\n" +"AF_ALG is only available when it has been enabled in the kernel.\n"); + + strncpy((char *)addr.salg_type, alg_type, sizeof(addr.salg_type) - 1); + strncpy((char *)addr.salg_name, alg_name, sizeof(addr.salg_name) - 1); + + if (bind(alg_fd, (void *)&addr, sizeof(addr)) != 0) + die_errno("Failed to bind AF_ALG socket to %s %s", + alg_type, alg_name); + return alg_fd; +} + +static int get_req_fd(int alg_fd, const char *alg_name) +{ + int req_fd = accept(alg_fd, NULL, NULL); + + if (req_fd < 0) + die_errno("Failed to get request file descriptor for %s", + alg_name); + return req_fd; +} + +/* --------------------------------------------------------------------------- + * dump_jitterentropy command + * ---------------------------------------------------------------------------*/ + +static void dump_from_jent_fd(int fd, size_t count) +{ + uint8_t buf[AF_ALG_MAX_RNG_REQUEST_SIZE]; + + while (count) { + ssize_t ret; + + memset(buf, 0, sizeof(buf)); + ret = read(fd, buf, MIN(count, sizeof(buf))); + if (ret < 0) + die_errno("error reading from jitterentropy_rng"); + full_write(STDOUT_FILENO, buf, ret); + count -= ret; + } +} + +static int cmd_dump_jitterentropy(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "amount", required_argument, NULL, OPT_AMOUNT }, + { "iterations", required_argument, NULL, OPT_ITERATIONS }, + { NULL, 0, NULL, 0 }, + }; + size_t amount = 128; + size_t iterations = 1; + size_t i; + int c; + + while ((c = getopt_long(argc, argv, "", longopts, NULL)) != -1) { + switch (c) { + case OPT_AMOUNT: + amount = strtoul(optarg, NULL, 0); + if (amount <= 0 || amount >= ULONG_MAX) + die("invalid argument to --amount"); + break; + case OPT_ITERATIONS: + iterations = strtoul(optarg, NULL, 0); + if (iterations <= 0 || iterations >= ULONG_MAX) + die("invalid argument to --iterations"); + break; + default: + usage(); + return 1; + } + } + + for (i = 0; i < iterations; i++) { + int alg_fd = get_alg_fd("rng", "jitterentropy_rng"); + int req_fd = get_req_fd(alg_fd, "jitterentropy_rng"); + + dump_from_jent_fd(req_fd, amount); + + close(req_fd); + close(alg_fd); + } + return 0; +} + +/* --------------------------------------------------------------------------- + * show_invalid_inputs command + * ---------------------------------------------------------------------------*/ + +enum direction { + UNSPECIFIED, + DECRYPT, + ENCRYPT, +}; + +static const struct invalid_input_test { + const char *alg_type; + const char *alg_name; + const char *key; + size_t key_size; + const char *msg; + size_t msg_size; + const char *iv; + size_t iv_size; + enum direction direction; + int setkey_error; + int crypt_error; +} invalid_input_tests[] = { + { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 16, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 17, + .setkey_error = EINVAL, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 24, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 32, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 33, + .setkey_error = EINVAL, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 16, + .msg_size = 1, + .direction = DECRYPT, + .crypt_error = EINVAL, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 16, + .msg_size = 16, + .direction = ENCRYPT, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 16, + .msg_size = 17, + .direction = ENCRYPT, + 
.crypt_error = EINVAL, + }, { + .alg_type = "hash", + .alg_name = "cmac(aes)", + .key_size = 29, + .setkey_error = EINVAL, + }, { + .alg_type = "skcipher", + .alg_name = "xts(aes)", + .key_size = 32, + }, { + .alg_type = "skcipher", + .alg_name = "xts(aes)", + .key = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", + .key_size = 32, + .setkey_error = EINVAL, + } +}; + +static const char *describe_crypt_op(const struct invalid_input_test *t) +{ + if (t->direction == ENCRYPT) + return "encryption"; + if (t->direction == DECRYPT) + return "decryption"; + if (strcmp(t->alg_type, "hash") == 0) + return "hashing"; + ASSERT(0); +} + +static bool af_alg_setkey(const struct invalid_input_test *t, int alg_fd) +{ + const uint8_t *key = (const uint8_t *)t->key; + uint8_t _key[t->key_size]; + + if (t->key_size == 0) + return true; + + if (t->key == NULL) { + rand_bytes(_key, t->key_size); + key = _key; + } + if (setsockopt(alg_fd, SOL_ALG, ALG_SET_KEY, key, t->key_size) != 0) { + printf("%s: setting %zu-byte key failed with error '%s'\n", + t->alg_name, t->key_size, strerror(errno)); + printf("\tkey was %s\n\n", bytes_to_hex(key, t->key_size)); + ASSERT(t->setkey_error == errno); + return false; + } + printf("%s: setting %zu-byte key succeeded\n", + t->alg_name, t->key_size); + printf("\tkey was %s\n\n", bytes_to_hex(key, t->key_size)); + ASSERT(t->setkey_error == 0); + return true; +} + +static void af_alg_process_msg(const struct invalid_input_test *t, int alg_fd) +{ + struct iovec iov; + struct msghdr hdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + const uint8_t *msg = (const uint8_t *)t->msg; + uint8_t *_msg = NULL; + uint8_t *output = NULL; + uint8_t *control = NULL; + size_t controllen = 0; + struct cmsghdr *cmsg; + int req_fd; + + if (t->msg_size == 0) + return; + + req_fd = get_req_fd(alg_fd, t->alg_name); + + if (t->msg == NULL) { + _msg = malloc(t->msg_size); + rand_bytes(_msg, t->msg_size); + msg = _msg; + } + output = malloc(t->msg_size); + iov.iov_base = (void *)msg; + iov.iov_len = t->msg_size; + + if (t->direction != UNSPECIFIED) + controllen += CMSG_SPACE(sizeof(uint32_t)); + if (t->iv_size) + controllen += CMSG_SPACE(sizeof(struct af_alg_iv) + t->iv_size); + control = calloc(1, controllen); + hdr.msg_control = control; + hdr.msg_controllen = controllen; + cmsg = CMSG_FIRSTHDR(&hdr); + if (t->direction != UNSPECIFIED) { + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_OP; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); + *(uint32_t *)CMSG_DATA(cmsg) = t->direction == DECRYPT ? 
+ ALG_OP_DECRYPT : ALG_OP_ENCRYPT; + cmsg = CMSG_NXTHDR(&hdr, cmsg); + } + if (t->iv_size) { + struct af_alg_iv *alg_iv; + + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_IV; + cmsg->cmsg_len = CMSG_LEN(sizeof(*alg_iv) + t->iv_size); + alg_iv = (struct af_alg_iv *)CMSG_DATA(cmsg); + alg_iv->ivlen = t->iv_size; + memcpy(alg_iv->iv, t->iv, t->iv_size); + } + + if (sendmsg(req_fd, &hdr, 0) != t->msg_size) + die_errno("sendmsg failed"); + + if (read(req_fd, output, t->msg_size) != t->msg_size) { + printf("%s: %s of %zu-byte message failed with error '%s'\n", + t->alg_name, describe_crypt_op(t), t->msg_size, + strerror(errno)); + printf("\tmessage was %s\n\n", bytes_to_hex(msg, t->msg_size)); + ASSERT(t->crypt_error == errno); + } else { + printf("%s: %s of %zu-byte message succeeded\n", + t->alg_name, describe_crypt_op(t), t->msg_size); + printf("\tmessage was %s\n\n", bytes_to_hex(msg, t->msg_size)); + ASSERT(t->crypt_error == 0); + } + free(_msg); + free(output); + free(control); + close(req_fd); +} + +static void test_invalid_input(const struct invalid_input_test *t) +{ + int alg_fd = get_alg_fd(t->alg_type, t->alg_name); + + if (af_alg_setkey(t, alg_fd)) + af_alg_process_msg(t, alg_fd); + + close(alg_fd); +} + +static int cmd_show_invalid_inputs(int argc, char *argv[]) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(invalid_input_tests); i++) + test_invalid_input(&invalid_input_tests[i]); + return 0; +} + +/* --------------------------------------------------------------------------- + * show_module_version command + * ---------------------------------------------------------------------------*/ + +static int cmd_show_module_version(int argc, char *argv[]) +{ + printf("fips140_module_version() => \"%s\"\n", + fips140_module_version()); + return 0; +} + +/* --------------------------------------------------------------------------- + * show_service_indicators command + * ---------------------------------------------------------------------------*/ + +static const char * const default_services_to_show[] = { + "aes", + "cbc(aes)", + "cbcmac(aes)", + "cmac(aes)", + "ctr(aes)", + "cts(cbc(aes))", + "ecb(aes)", + "essiv(cbc(aes),sha256)", + "gcm(aes)", + "hmac(sha1)", + "hmac(sha224)", + "hmac(sha256)", + "hmac(sha384)", + "hmac(sha512)", + "jitterentropy_rng", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "stdrng", + "xcbc(aes)", + "xts(aes)", +}; + +static int cmd_show_service_indicators(int argc, char *argv[]) +{ + const char * const *services = default_services_to_show; + int count = ARRAY_SIZE(default_services_to_show); + int i; + + if (argc > 1) { + services = (const char **)(argv + 1); + count = argc - 1; + } + for (i = 0; i < count; i++) { + printf("fips140_is_approved_service(\"%s\") => %s\n", + services[i], + booltostr(fips140_is_approved_service(services[i]))); + } + return 0; +} + +/* --------------------------------------------------------------------------- + * main() + * ---------------------------------------------------------------------------*/ + +static const struct command { + const char *name; + int (*func)(int argc, char *argv[]); +} commands[] = { + { "dump_jitterentropy", cmd_dump_jitterentropy }, + { "show_invalid_inputs", cmd_show_invalid_inputs }, + { "show_module_version", cmd_show_module_version }, + { "show_service_indicators", cmd_show_service_indicators }, +}; + +static void usage(void) +{ + fprintf(stderr, +"Usage:\n" +" fips140_lab_util dump_jitterentropy [OPTION]...\n" +" fips140_lab_util show_invalid_inputs\n" +" fips140_lab_util 
show_module_version\n" +" fips140_lab_util show_service_indicators [SERVICE]...\n" +"\n" +"Options for dump_jitterentropy:\n" +" --amount=AMOUNT Amount to dump in bytes per iteration (default 128)\n" +" --iterations=COUNT Number of start-up iterations (default 1)\n" + ); +} + +int main(int argc, char *argv[]) +{ + int i; + + if (argc < 2) { + usage(); + return 2; + } + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "--help") == 0) { + usage(); + return 2; + } + } + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(commands[i].name, argv[1]) == 0) + return commands[i].func(argc - 1, argv + 1); + } + fprintf(stderr, "Unknown command: %s\n\n", argv[1]); + usage(); + return 2; +} diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 17aa8ef2d52a..81c91386e1b9 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -86,7 +86,8 @@ ifdef need-builtin targets-for-builtin += $(obj)/built-in.a endif -targets-for-modules := $(patsubst %.o, %.mod, $(filter %.o, $(obj-m))) +targets-for-modules := $(foreach x, o mod $(if $(CONFIG_TRIM_UNUSED_KSYMS), usyms), \ + $(patsubst %.o, %.$x, $(filter %.o, $(obj-m)))) ifdef CONFIG_LTO_CLANG targets-for-modules += $(patsubst %.o, %.lto.o, $(filter %.o, $(obj-m))) @@ -261,9 +262,6 @@ objtool_dep = $(objtool_obj) \ ifdef CONFIG_TRIM_UNUSED_KSYMS cmd_gen_ksymdeps = \ $(CONFIG_SHELL) $(srctree)/scripts/gen_ksymdeps.sh $@ >> $(dot-target).cmd - -# List module undefined symbols -undefined_syms = $(NM) $< | $(AWK) '$$1 == "U" { printf("%s%s", x++ ? " " : "", $$2) }'; endif define rule_cc_o_c @@ -310,14 +308,18 @@ $(obj)/%.lto.o: $(obj)/%.o FORCE $(call if_changed,cc_lto_link_modules) endif -cmd_mod = { \ - echo $(if $($*-objs)$($*-y)$($*-m), $(addprefix $(obj)/, $($*-objs) $($*-y) $($*-m)), $(@:.mod=.o)); \ - $(undefined_syms) echo; \ - } > $@ +cmd_mod = printf '%s\n' $(call real-search, $*.o, .o, -objs -y -m) | \ + $(AWK) '!x[$$0]++ { print("$(obj)/"$$0) }' > $@ -$(obj)/%.mod: $(obj)/%$(mod-prelink-ext).o FORCE +$(obj)/%.mod: FORCE $(call if_changed,mod) +# List module undefined symbols +cmd_undefined_syms = $(NM) $< | sed -n 's/^ *U //p' > $@ + +$(obj)/%.usyms: $(obj)/%$(mod-prelink-ext).o FORCE + $(call if_changed,undefined_syms) + quiet_cmd_cc_lst_c = MKLST $@ cmd_cc_lst_c = $(CC) $(c_flags) -g -c -o $*.o $< && \ $(CONFIG_SHELL) $(srctree)/scripts/makelst $*.o \ @@ -467,26 +469,21 @@ quiet_cmd_ar_lib = AR $@ $(obj)/lib.a: $(lib-y) FORCE $(call if_changed,ar_lib) -# NOTE: -# Do not replace $(filter %.o,^) with $(real-prereqs). When a single object -# module is turned into a multi object module, $^ will contain header file -# dependencies recorded in the .*.cmd file. 
ifdef CONFIG_LTO_CLANG quiet_cmd_link_multi-m = AR [M] $@ cmd_link_multi-m = \ $(cmd_update_lto_symversions); \ rm -f $@; \ - $(AR) cDPrsT $@ $(filter %.o,$^) + $(AR) cDPrsT $@ @$(patsubst %.o,%.mod,$@) else quiet_cmd_link_multi-m = LD [M] $@ - cmd_link_multi-m = $(LD) $(ld_flags) -r -o $@ $(filter %.o,$^) + cmd_link_multi-m = $(LD) $(ld_flags) -r -o $@ @$(patsubst %.o,%.mod,$@) endif -$(multi-obj-m): FORCE +$(multi-obj-m): %.o: %.mod FORCE $(call if_changed,link_multi-m) $(call multi_depend, $(multi-obj-m), .o, -objs -y -m) -targets += $(multi-obj-m) targets := $(filter-out $(PHONY), $(targets)) # Add intermediate targets: diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index fa25e95035a9..ea365b16b2a6 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -219,7 +219,9 @@ cpp_flags = -Wp,-MMD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ ld_flags = $(KBUILD_LDFLAGS) $(ldflags-y) $(LDFLAGS_$(@F)) -DTC_INCLUDE := $(srctree)/scripts/dtc/include-prefixes +# ANDROID: Allow DTC_INCLUDE to be set by the BUILD_CONFIG. This allows one to +# compile an out-of-tree device tree. +DTC_INCLUDE += $(srctree)/scripts/dtc/include-prefixes dtc_cpp_flags = -Wp,-MMD,$(depfile).pre.tmp -nostdinc \ $(addprefix -I,$(DTC_INCLUDE)) \ @@ -236,9 +238,9 @@ endif # Usage: # $(call multi_depend, multi_used_targets, suffix_to_remove, suffix_to_add) define multi_depend -$(foreach m, $(notdir $1), \ - $(eval $(obj)/$m: \ - $(addprefix $(obj)/, $(foreach s, $3, $($(m:%$(strip $2)=%$(s))))))) +$(foreach m, $1, \ + $(eval $m: \ + $(addprefix $(obj)/, $(call suffix-search, $(patsubst $(obj)/%,%,$m), $2, $3)))) endef quiet_cmd_copy = COPY $@ @@ -415,19 +417,31 @@ printf "%08x\n" $$dec_size | \ ) quiet_cmd_bzip2 = BZIP2 $@ - cmd_bzip2 = { cat $(real-prereqs) | $(KBZIP2) -9; $(size_append); } > $@ + cmd_bzip2 = cat $(real-prereqs) | $(KBZIP2) -9 > $@ + +quiet_cmd_bzip2_with_size = BZIP2 $@ + cmd_bzip2_with_size = { cat $(real-prereqs) | $(KBZIP2) -9; $(size_append); } > $@ # Lzma # --------------------------------------------------------------------------- quiet_cmd_lzma = LZMA $@ - cmd_lzma = { cat $(real-prereqs) | $(LZMA) -9; $(size_append); } > $@ + cmd_lzma = cat $(real-prereqs) | $(LZMA) -9 > $@ + +quiet_cmd_lzma_with_size = LZMA $@ + cmd_lzma_with_size = { cat $(real-prereqs) | $(LZMA) -9; $(size_append); } > $@ quiet_cmd_lzo = LZO $@ - cmd_lzo = { cat $(real-prereqs) | $(KLZOP) -9; $(size_append); } > $@ + cmd_lzo = cat $(real-prereqs) | $(KLZOP) -9 > $@ + +quiet_cmd_lzo_with_size = LZO $@ + cmd_lzo_with_size = { cat $(real-prereqs) | $(KLZOP) -9; $(size_append); } > $@ quiet_cmd_lz4 = LZ4 $@ - cmd_lz4 = { cat $(real-prereqs) | $(LZ4) -l -c1 stdin stdout; \ + cmd_lz4 = cat $(real-prereqs) | $(LZ4) -l -12 --favor-decSpeed stdin stdout > $@ + +quiet_cmd_lz4_with_size = LZ4 $@ + cmd_lz4_with_size = { cat $(real-prereqs) | $(LZ4) -l -12 --favor-decSpeed stdin stdout; \ $(size_append); } > $@ # U-Boot mkimage @@ -470,7 +484,10 @@ quiet_cmd_uimage = UIMAGE $@ # big dictionary would increase the memory usage too much in the multi-call # decompression mode. A BCJ filter isn't used either. 
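# Note on the *_with_size command variants added in this file: the plain
# commands now emit only the compressed stream, while the *_with_size
# variants keep appending $(size_append), i.e. the uncompressed size as a
# 4-byte little-endian trailer that the in-kernel decompressors read to size
# their output buffer:
#
#	[compressed data][u32 LE uncompressed size]
#
# (xzkern and zstd22 below get the same treatment.)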
quiet_cmd_xzkern = XZKERN $@ - cmd_xzkern = { cat $(real-prereqs) | sh $(srctree)/scripts/xz_wrap.sh; \ + cmd_xzkern = cat $(real-prereqs) | sh $(srctree)/scripts/xz_wrap.sh > $@ + +quiet_cmd_xzkern_with_size = XZKERN $@ + cmd_xzkern_with_size = { cat $(real-prereqs) | sh $(srctree)/scripts/xz_wrap.sh; \ $(size_append); } > $@ quiet_cmd_xzmisc = XZMISC $@ @@ -496,7 +513,10 @@ quiet_cmd_zstd = ZSTD $@ cmd_zstd = { cat $(real-prereqs) | $(ZSTD) -19; $(size_append); } > $@ quiet_cmd_zstd22 = ZSTD22 $@ - cmd_zstd22 = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ + cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@ + +quiet_cmd_zstd22_with_size = ZSTD22 $@ + cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ # ASM offsets # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index ce9661d968a3..10d9a902f851 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -12,6 +12,8 @@ include $(srctree)/scripts/Kbuild.include # for c_flags and mod-prelink-ext include $(srctree)/scripts/Makefile.lib +mixed-build-prefix = $(if $(KBUILD_MIXED_TREE),$(KBUILD_MIXED_TREE)/) + # find all modules listed in modules.order modules := $(sort $(shell cat $(MODORDER))) @@ -39,10 +41,10 @@ quiet_cmd_ld_ko_o = LD [M] $@ quiet_cmd_btf_ko = BTF [M] $@ cmd_btf_ko = \ - if [ -f vmlinux ]; then \ - LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) --btf_base vmlinux $@; \ + if [ -f $(mixed-build-prefix)vmlinux ]; then \ + LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) --btf_base $(mixed-build-prefix)vmlinux $@; \ else \ - printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ + printf "Skipping BTF generation for %s due to unavailability of $(mixed-build-prefix)vmlinux\n" $@ 1>&2; \ fi; # Same as newer-prereqs, but allows to exclude specified extra dependencies @@ -55,8 +57,8 @@ if_changed_except = $(if $(call newer_prereqs_except,$(2))$(cmd-check), \ # Re-generate module BTFs if either module's .ko or vmlinux changed -$(modules): %.ko: %$(mod-prelink-ext).o %.mod.o scripts/module.lds $(if $(KBUILD_BUILTIN),vmlinux) FORCE - +$(call if_changed_except,ld_ko_o,vmlinux) +$(modules): %.ko: %$(mod-prelink-ext).o %.mod.o scripts/module.lds $(if $(KBUILD_BUILTIN),$(mixed-build-prefix)vmlinux) FORCE + +$(call if_changed_except,ld_ko_o,$(mixed-build-prefix)vmlinux) ifdef CONFIG_DEBUG_INFO_BTF_MODULES +$(if $(newer-prereqs),$(call cmd,btf_ko)) endif diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index ff9b09e4cfca..3cf77187456b 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -24,6 +24,10 @@ suffix-$(CONFIG_MODULE_COMPRESS_XZ) := .xz suffix-$(CONFIG_MODULE_COMPRESS_ZSTD) := .zst modules := $(patsubst $(extmod_prefix)%, $(dst)/%$(suffix-y), $(modules)) +ifneq ($(KBUILD_EXTMOD),) +extmod_suffix := $(shell echo "${KBUILD_EXTMOD}" | md5sum | cut -d " " -f 1) +modules += $(dst)/modules.order.$(extmod_suffix) +endif __modinst: $(modules) @: @@ -82,6 +86,12 @@ $(dst)/%.ko: $(extmod_prefix)%.ko FORCE $(call cmd,strip) $(call cmd,sign) +ifneq ($(KBUILD_EXTMOD),) +$(dst)/modules.order.$(extmod_suffix): $(MODORDER) FORCE + $(call cmd,install) + @sed -i "s:^$(KBUILD_EXTMOD):$(INSTALL_MOD_DIR):g" $@ +endif + else $(dst)/%.ko: FORCE diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 0273bf7375e2..92624482b32a 100644 --- a/scripts/Makefile.modpost +++ 
b/scripts/Makefile.modpost @@ -44,6 +44,8 @@ include $(srctree)/scripts/Kbuild.include # for mod-prelink-ext include $(srctree)/scripts/Makefile.lib +mixed-build-prefix = $(if $(KBUILD_MIXED_TREE),$(KBUILD_MIXED_TREE)/) + MODPOST = scripts/mod/modpost \ $(if $(CONFIG_MODVERSIONS),-m) \ $(if $(CONFIG_MODULE_SRCVERSION_ALL),-a) \ @@ -64,16 +66,17 @@ else ifeq ($(KBUILD_EXTMOD),) -input-symdump := vmlinux.symvers +input-symdump := $(mixed-build-prefix)vmlinux.symvers output-symdump := modules-only.symvers +module_srcpath := $(srctree) quiet_cmd_cat = GEN $@ cmd_cat = cat $(real-prereqs) > $@ -ifneq ($(wildcard vmlinux.symvers),) +ifneq ($(wildcard $(mixed-build-prefix)vmlinux.symvers),) __modpost: Module.symvers -Module.symvers: vmlinux.symvers modules-only.symvers FORCE +Module.symvers: $(mixed-build-prefix)vmlinux.symvers modules-only.symvers FORCE $(call if_changed,cat) targets += Module.symvers @@ -95,6 +98,27 @@ MODPOST += -e input-symdump := Module.symvers $(KBUILD_EXTRA_SYMBOLS) output-symdump := $(KBUILD_EXTMOD)/Module.symvers +# Get the external module's source path. KBUILD_EXTMOD could either be an +# absolute path or relative path from $(srctree). This makes sure that we +# aren't using a relative path from a separate working directory (O= or +# KBUILD_OUTPUT) since that may not be the actual module's SCM project path. So +# check the path relative to $(srctree) first. +ifneq ($(realpath $(srctree)/$(KBUILD_EXTMOD) 2>/dev/null),) + module_srcpath := $(srctree)/$(KBUILD_EXTMOD) +else + module_srcpath := $(KBUILD_EXTMOD) +endif + +endif + +ifeq ($(CONFIG_MODULE_SCMVERSION),y) +# Get the SCM version of the module. `sed` verifies setlocalversion returns +# a proper revision based on the SCM type, e.g. git, mercurial, or svn. +module_scmversion := $(shell $(srctree)/scripts/setlocalversion $(module_srcpath) | \ + sed -n 's/.*-\(\(g\|hg\)[a-fA-F0-9]\+\(-dirty\)\?\|svn[0-9]\+\).*/\1/p') +ifneq ($(module_scmversion),) +MODPOST += -v $(module_scmversion) +endif endif existing-input-symdump := $(wildcard $(input-symdump)) diff --git a/scripts/adjust_autoksyms.sh b/scripts/adjust_autoksyms.sh index 59fdb875e818..f1b5ac818411 100755 --- a/scripts/adjust_autoksyms.sh +++ b/scripts/adjust_autoksyms.sh @@ -35,7 +35,7 @@ case "$KBUILD_VERBOSE" in esac # Generate a new symbol list file -$CONFIG_SHELL $srctree/scripts/gen_autoksyms.sh "$new_ksyms_file" +$CONFIG_SHELL $srctree/scripts/gen_autoksyms.sh --modorder "$new_ksyms_file" # Extract changes between old and new list and touch corresponding # dependency files. diff --git a/scripts/clang-tools/gen_compile_commands.py b/scripts/clang-tools/gen_compile_commands.py index 1d1bde1fd45e..47da25b3ba7d 100755 --- a/scripts/clang-tools/gen_compile_commands.py +++ b/scripts/clang-tools/gen_compile_commands.py @@ -157,10 +157,10 @@ def cmdfiles_for_modorder(modorder): if ext != '.ko': sys.exit('{}: module path must end with .ko'.format(ko)) mod = base + '.mod' - # The first line of *.mod lists the objects that compose the module. + # Read from *.mod, to get a list of objects that compose the module. 
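        # For illustration (hypothetical module): if drivers/foo/foo.mod now
        # contains the two lines drivers/foo/foo_main.o and
        # drivers/foo/foo_util.o, this loop yields the corresponding
        # .foo_main.o.cmd and .foo_util.o.cmd paths via to_cmdfile().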
with open(mod) as m: - for obj in m.readline().split(): - yield to_cmdfile(obj) + for mod_line in m: + yield to_cmdfile(mod_line.rstrip()) def process_line(root_directory, command_prefix, file_path): diff --git a/scripts/depmod.sh b/scripts/depmod.sh index 3643b4f896ed..2de598d4691a 100755 --- a/scripts/depmod.sh +++ b/scripts/depmod.sh @@ -3,14 +3,15 @@ # # A depmod wrapper used by the toplevel Makefile -if test $# -ne 2; then - echo "Usage: $0 /sbin/depmod <kernelrelease>" >&2 +if test $# -ne 2 -a $# -ne 3; then + echo "Usage: $0 /sbin/depmod <kernelrelease> [System.map folder]" >&2 exit 1 fi DEPMOD=$1 KERNELRELEASE=$2 +KBUILD_MIXED_TREE=$3 -if ! test -r System.map ; then +if ! test -r ${KBUILD_MIXED_TREE}System.map ; then echo "Warning: modules_install: missing 'System.map' file. Skipping depmod." >&2 exit 0 fi @@ -41,7 +42,7 @@ if $depmod_hack_needed; then KERNELRELEASE=99.98.$KERNELRELEASE fi -set -- -ae -F System.map +set -- -ae -F ${KBUILD_MIXED_TREE}System.map if test -n "$INSTALL_MOD_PATH"; then set -- "$@" -b "$INSTALL_MOD_PATH" fi diff --git a/scripts/extract-cert.c b/scripts/extract-cert.c index 79ecbbfe37cd..71b298023022 100644 --- a/scripts/extract-cert.c +++ b/scripts/extract-cert.c @@ -56,6 +56,7 @@ static void display_openssl_errors(int l) } } +#ifndef OPENSSL_IS_BORINGSSL static void drain_openssl_errors(void) { const char *file; @@ -65,6 +66,7 @@ static void drain_openssl_errors(void) return; while (ERR_get_error_line(&file, &line)) {} } +#endif #define ERR(cond, fmt, ...) \ do { \ @@ -119,6 +121,10 @@ int main(int argc, char **argv) fclose(f); exit(0); } else if (!strncmp(cert_src, "pkcs11:", 7)) { +#ifdef OPENSSL_IS_BORINGSSL + ERR(1, "BoringSSL does not support extracting from PKCS#11"); + exit(1); +#else ENGINE *e; struct { const char *cert_id; @@ -141,6 +147,7 @@ int main(int argc, char **argv) ENGINE_ctrl_cmd(e, "LOAD_CERT_CTRL", 0, &parms, NULL, 1); ERR(!parms.cert, "Get X.509 from PKCS#11"); write_cert(parms.cert); +#endif } else { BIO *b; X509 *x509; diff --git a/scripts/gen_autoksyms.sh b/scripts/gen_autoksyms.sh index 6ed0d225c8b1..117cb07bd043 100755 --- a/scripts/gen_autoksyms.sh +++ b/scripts/gen_autoksyms.sh @@ -2,13 +2,10 @@ # SPDX-License-Identifier: GPL-2.0-only # Create an autoksyms.h header file from the list of all module's needed symbols -# as recorded on the second line of *.mod files and the user-provided symbol -# whitelist. +# as recorded in *.usyms files and the user-provided symbol whitelist. set -e -output_file="$1" - # Use "make V=1" to debug this script. case "$KBUILD_VERBOSE" in *1*) @@ -19,6 +16,15 @@ esac # We need access to CONFIG_ symbols . include/config/auto.conf +read_modorder= + +if [ "$1" = --modorder ]; then + shift + read_modorder=1 +fi + +output_file="$1" + needed_symbols= # Special case for modversions (see modpost.c) @@ -46,10 +52,8 @@ cat > "$output_file" << EOT EOT -[ -f modules.order ] && modlist=modules.order || modlist=/dev/null - { - sed 's/ko$/mod/' $modlist | xargs -n1 sed -n -e '2p' + [ -n "${read_modorder}" ] && sed 's/ko$/usyms/' modules.order | xargs cat echo "$needed_symbols" [ -n "$ksym_wl" ] && cat "$ksym_wl" } | sed -e 's/ /\n/g' | sed -n -e '/^$/!p' | @@ -57,4 +61,7 @@ EOT # point addresses. sed -e 's/^\.//' | sort -u | +# Ignore __this_module. It's not an exported symbol, and will be resolved +# when the final .ko's are linked.
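# (Each line that survives this pipeline becomes one define in the generated
#  header; e.g. a needed export named foo_bar, a made-up symbol, ends up as
#  "#define __KSYM_foo_bar 1".)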
+grep -v '^__this_module$' | sed -e 's/\(.*\)/#define __KSYM_\1 1/' >> "$output_file" diff --git a/scripts/gen_gki_modules_headers.sh b/scripts/gen_gki_modules_headers.sh new file mode 100755 index 000000000000..cab6e4f37c44 --- /dev/null +++ b/scripts/gen_gki_modules_headers.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright 2022 Google LLC +# Author: ramjiyani@google.com (Ramji Jiyani) +# + +# +# Generates header file with list of unprotected symbols +# +# Called By: KERNEL_SRC/kernel/Makefile if CONFIG_MODULE_SIG_PROTECT=y +# +# gki_module_unprotected.h: Symbols allowed to be _accessed_ by unsigned modules +# +# If a valid symbol file doesn't exist, still generate valid C header files so +# that compilation can proceed with no symbols to protect +# + +# Collect arguments from Makefile +TARGET=$1 +SRCTREE=$2 +SYMBOL_LIST=$3 + +set -e + +# +# Common Definitions +# +# Use "make V=1" to debug this script. +case "$KBUILD_VERBOSE" in +*1*) + set -x + ;; +esac + +# +# generate_header(): +# Args: $1 = Name of the header file +# $2 = Input symbol list +# $3 = Symbol type (protected/exported) +# +generate_header() { + local header_file=$1 + local symbol_file=$2 + local symbol_type=$3 + + if [ -f "${header_file}" ]; then + rm -f -- "${header_file}" + fi + + # If symbol_file exists, preprocess it and find the maximum name length + if [ -s "${symbol_file}" ]; then + # Remove White Spaces, empty lines and symbol list markers if any + sed -i '/^[[:space:]]*$/d; /^#/d; /\[abi_symbol_list\]/d' "${symbol_file}" + + # Sort in byte order for kernel binary search at runtime + LC_ALL=C sort -u -o "${symbol_file}" "${symbol_file}" + + # Trim white spaces & +1 for null termination + local max_name_len=$(awk ' + { + $1=$1; + if ( length > L ) { + L=length + } + } END { print ++L }' "${symbol_file}") + else + # Set to 1 to generate valid C header file + local max_name_len=1 + fi + + # Header generation + cat > "${header_file}" <<- EOT + /* + * DO NOT EDIT + * + * Build generated header file with ${symbol_type} + */ + + #define NR_$(printf ${symbol_type} | tr [:lower:] [:upper:])_SYMBOLS \\ + $(printf '\t')(ARRAY_SIZE(gki_${symbol_type}_symbols)) + #define MAX_$(printf ${symbol_type} | tr [:lower:] [:upper:])_NAME_LEN (${max_name_len}) + + static const char gki_${symbol_type}_symbols[][MAX_$(printf ${symbol_type} | + tr [:lower:] [:upper:])_NAME_LEN] = { + EOT + + # If a valid symbol_file is present, add its symbols to the array + if [ -s "${symbol_file}" ]; then + sed -e 's/^[ \t]*/\t"/;s/[ \t]*$/",/' "${symbol_file}" >> "${header_file}" + fi + + # Terminate the file + echo "};" >> "${header_file}" +} + +if [ "$(basename "${TARGET}")" = "gki_module_unprotected.h" ]; then + # Union of vendor symbol lists + GKI_VENDOR_SYMBOLS="${SYMBOL_LIST}" + generate_header "${TARGET}" "${GKI_VENDOR_SYMBOLS}" "unprotected" +else + # Sorted list of exported symbols + GKI_EXPORTED_SYMBOLS="${SYMBOL_LIST}" + + generate_header "${TARGET}" "${GKI_EXPORTED_SYMBOLS}" "protected_exports" +fi + diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 54ad86d13784..bfac62a73f82 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -111,7 +111,8 @@ static bool is_ignored_symbol(const char *name, char type) ".LASANPC", /* s390 kasan local symbols */ "__crc_", /* modversions */ "__efistub_", /* arm64 EFI stub namespace */ - "__kvm_nvhe_", /* arm64 non-VHE KVM namespace */ + "__kvm_nvhe_$", /* arm64 local symbols in non-VHE KVM namespace */ + "__kvm_nvhe_.L", /* arm64 local symbols in non-VHE 
KVM namespace */ "__AArch64ADRPThunk_", /* arm64 lld */ "__ARMV5PILongThunk_", /* arm lld */ "__ARMV7PILongThunk_", diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index b284ee01fdeb..44a55aeb35e2 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -28,6 +28,8 @@ static int modversions = 0; static int all_versions = 0; /* If we are modposting external module set to 1 */ static int external_module = 0; +#define MODULE_SCMVERSION_SIZE 64 +static char module_scmversion[MODULE_SCMVERSION_SIZE]; /* Only warn about unresolved symbols */ static int warn_unresolved = 0; /* How a symbol is exported */ @@ -418,10 +420,9 @@ static struct symbol *sym_add_exported(const char *name, struct module *mod, s = new_symbol(name, mod, export); } else if (!external_module || s->module->is_vmlinux || s->module == mod) { - warn("%s: '%s' exported twice. Previous export was in %s%s\n", - mod->name, name, s->module->name, - s->module->is_vmlinux ? "" : ".ko"); - return s; + fatal("%s: '%s' exported twice. Previous export was in %s%s\n", + mod->name, name, s->module->name, + s->module->is_vmlinux ? "" : ".ko"); } s->module = mod; @@ -2235,6 +2236,20 @@ static void add_intree_flag(struct buffer *b, int is_intree) buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n"); } +/** + * add_scmversion() - Adds the MODULE_INFO macro for the scmversion. + * @b: Buffer to append to. + * + * This function fills in the module attribute `scmversion` for the kernel + * module. This is useful for determining a given module's SCM version on + * device via /sys/module/<module_name>/scmversion and/or using the modinfo tool. + */ +static void add_scmversion(struct buffer *b) +{ + if (module_scmversion[0] != '\0') + buf_printf(b, "\nMODULE_INFO(scmversion, \"%s\");\n", module_scmversion); +} + /* Cannot check for assembler */ static void add_retpoline(struct buffer *b) { @@ -2504,7 +2519,7 @@ int main(int argc, char **argv) struct dump_list *dump_read_start = NULL; struct dump_list **dump_read_iter = &dump_read_start; - while ((opt = getopt(argc, argv, "ei:mnT:o:awENd:")) != -1) { + while ((opt = getopt(argc, argv, "ei:mnT:o:awENd:v:")) != -1) { switch (opt) { case 'e': external_module = 1; @@ -2542,6 +2557,9 @@ int main(int argc, char **argv) case 'd': missing_namespace_deps = optarg; break; + case 'v': + strncpy(module_scmversion, optarg, sizeof(module_scmversion) - 1); + break; default: exit(1); } @@ -2581,6 +2599,7 @@ int main(int argc, char **argv) add_depends(&buf, mod); add_moddevtable(&buf, mod); add_srcversion(&buf, mod); + add_scmversion(&buf); sprintf(fname, "%s.mod.c", mod->name); write_if_changed(&buf, fname); diff --git a/scripts/mod/sumversion.c b/scripts/mod/sumversion.c index 905c0ec291e1..79bb9eaa65ac 100644 --- a/scripts/mod/sumversion.c +++ b/scripts/mod/sumversion.c @@ -387,7 +387,7 @@ out_file: /* Calc and record src checksum. 
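   (With this change a module's .mod file lists one object per line, so the
   rewritten get_src_version() below splits it with strsep(&buf, "\n")
   instead of parsing a single space-separated first line.)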
*/ void get_src_version(const char *modname, char sum[], unsigned sumlen) { - char *buf, *pos, *firstline; + char *buf; struct md4_ctx md; char *fname; char filelist[PATH_MAX + 1]; @@ -397,15 +397,8 @@ void get_src_version(const char *modname, char sum[], unsigned sumlen) buf = read_text_file(filelist); - pos = buf; - firstline = get_line(&pos); - if (!firstline) { - warn("bad ending versions file for %s\n", modname); - goto free; - } - md4_init(&md); - while ((fname = strsep(&firstline, " "))) { + while ((fname = strsep(&buf, "\n"))) { if (!*fname) continue; if (!(is_static_library(fname)) && diff --git a/scripts/module.lds.S b/scripts/module.lds.S index 3a3aa2354ed8..0dabe5b43cc0 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -33,7 +33,36 @@ SECTIONS { __patchable_function_entries : { *(__patchable_function_entries) } -#ifdef CONFIG_LTO_CLANG +#if IS_ENABLED(CONFIG_CRYPTO_FIPS140_MOD) + /* + * The FIPS140 module incorporates copies of builtin code, which gets + * integrity checked at module load time, and registered in a way that + * ensures that the integrity checked versions supersede the builtin + * ones. These objects are compiled as builtin code, and so their init + * hooks will be exported from the binary in the same way as builtin + * initcalls are, i.e., annotated with a level that defines the order + * in which the hooks are expected to be invoked. + */ +#define INIT_CALLS_LEVEL(level) \ + KEEP(*(.initcall##level##.init*)) \ + KEEP(*(.initcall##level##s.init*)) + + .initcalls : { + *(.initcalls._start) + INIT_CALLS_LEVEL(0) + INIT_CALLS_LEVEL(1) + INIT_CALLS_LEVEL(2) + INIT_CALLS_LEVEL(3) + INIT_CALLS_LEVEL(4) + INIT_CALLS_LEVEL(5) + INIT_CALLS_LEVEL(rootfs) + INIT_CALLS_LEVEL(6) + INIT_CALLS_LEVEL(7) + *(.initcalls._end) + } +#endif + +#if defined(CONFIG_LTO_CLANG) || IS_ENABLED(CONFIG_CRYPTO_FIPS140_MOD) /* * With CONFIG_LTO_CLANG, LLD always enables -fdata-sections and * -ffunction-sections, which increases the size of the final module. @@ -50,8 +79,10 @@ SECTIONS { } .rodata : { + *(.rodata.._start) *(.rodata .rodata.[0-9a-zA-Z_]*) *(.rodata..L*) + *(.rodata.._end) } /* @@ -59,8 +90,11 @@ SECTIONS { * of the .text section, and is aligned to PAGE_SIZE. */ .text : ALIGN_CFI { + *(.text.._start) *(.text.__cfi_check) *(.text .text.[0-9a-zA-Z_]* .text..L.cfi*) + *(.text.._end) + *(.text.._fips140_unchecked) } #endif } diff --git a/scripts/nsdeps b/scripts/nsdeps index 04c4b96e95ec..f1718cc0d700 100644 --- a/scripts/nsdeps +++ b/scripts/nsdeps @@ -34,9 +34,8 @@ generate_deps() { local mod=${1%.ko:} shift local namespaces="$*" - local mod_source_files="`cat $mod.mod | sed -n 1p \ - | sed -e 's/\.o/\.c/g' \ - | sed "s|[^ ]* *|${src_prefix}&|g"`" + local mod_source_files=$(sed "s|^\(.*\)\.o$|${src_prefix}\1.c|" $mod.mod) + for ns in $namespaces; do echo "Adding namespace $ns to module $mod.ko." generate_deps_for_ns $ns "$mod_source_files" diff --git a/scripts/setlocalversion b/scripts/setlocalversion index 6b54e46a0f12..3fd158348562 100755 --- a/scripts/setlocalversion +++ b/scripts/setlocalversion @@ -11,12 +11,14 @@ # usage() { - echo "Usage: $0 [--save-scmversion] [srctree]" >&2 + echo "Usage: $0 [--save-scmversion] [srctree] [branch] [kmi-generation]" >&2 exit 1 } scm_only=false srctree=. +android_release= +kmi_generation= if test "$1" = "--save-scmversion"; then scm_only=true shift @@ -25,6 +27,22 @@ if test $# -gt 0; then srctree=$1 shift fi +if test $# -gt 0; then + # Extract the Android release version. 
If there is no match, then return 255 + # and clear the var $android_release + android_release=`echo "$1" | sed -e '/android[0-9]\{2,\}/!{q255}; \ + s/^\(android[0-9]\{2,\}\)-.*/\1/'` + if test $? -ne 0; then + android_release= + fi + shift + + if test $# -gt 0; then + kmi_generation=$1 + [ $(expr $kmi_generation : '^[0-9]\+$') -eq 0 ] && usage + shift + fi +fi if test $# -gt 0 -o ! -d "$srctree"; then usage fi @@ -44,8 +62,13 @@ scm_version() fi # Check for git and a git repo. - if test -z "$(git rev-parse --show-cdup 2>/dev/null)" && - head=$(git rev-parse --verify HEAD 2>/dev/null); then + if head=$(git rev-parse --verify HEAD 2>/dev/null); then + + if [ -n "$android_release" ] && [ -n "$kmi_generation" ]; then + printf '%s' "-$android_release-$kmi_generation" + elif [ -n "$android_release" ]; then + printf '%s' "-$android_release" + fi # If we are at a tagged commit (like "v2.6.30-rc6"), we ignore # it, because this version is defined in the top level Makefile. @@ -143,4 +166,9 @@ elif [ "${LOCALVERSION+set}" != "set" ]; then res="$res${scm:++}" fi +# finally, add the abXXX number if BUILD_NUMBER is set +if test -n "${BUILD_NUMBER}"; then + res="$res-ab${BUILD_NUMBER}" +fi + echo "$res" diff --git a/scripts/sign-file.c b/scripts/sign-file.c index 7434e9ea926e..513367e634be 100644 --- a/scripts/sign-file.c +++ b/scripts/sign-file.c @@ -99,6 +99,7 @@ static void display_openssl_errors(int l) } } +#ifndef OPENSSL_NO_ENGINE static void drain_openssl_errors(void) { const char *file; @@ -108,6 +109,7 @@ static void drain_openssl_errors(void) return; while (ERR_get_error_line(&file, &line)) {} } +#endif #define ERR(cond, fmt, ...) \ do { \ @@ -142,7 +144,9 @@ static int pem_pw_cb(char *buf, int len, int w, void *v) static EVP_PKEY *read_private_key(const char *private_key_name) { EVP_PKEY *private_key; + BIO *b; +#ifndef OPENSSL_NO_ENGINE if (!strncmp(private_key_name, "pkcs11:", 7)) { ENGINE *e; @@ -160,17 +164,16 @@ static EVP_PKEY *read_private_key(const char *private_key_name) private_key = ENGINE_load_private_key(e, private_key_name, NULL, NULL); ERR(!private_key, "%s", private_key_name); - } else { - BIO *b; - - b = BIO_new_file(private_key_name, "rb"); - ERR(!b, "%s", private_key_name); - private_key = PEM_read_bio_PrivateKey(b, NULL, pem_pw_cb, - NULL); - ERR(!private_key, "%s", private_key_name); - BIO_free(b); + return private_key; } +#endif + b = BIO_new_file(private_key_name, "rb"); + ERR(!b, "%s", private_key_name); + private_key = PEM_read_bio_PrivateKey(b, NULL, pem_pw_cb, + NULL); + ERR(!private_key, "%s", private_key_name); + BIO_free(b); return private_key; } diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 97f4c944a20f..e0582fed1279 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -44,6 +44,9 @@ #define avc_cache_stats_incr(field) do {} while (0) #endif +#undef CREATE_TRACE_POINTS +#include + struct avc_entry { u32 ssid; u32 tsid; @@ -441,6 +444,7 @@ static void avc_node_free(struct rcu_head *rhead) static void avc_node_delete(struct selinux_avc *avc, struct avc_node *node) { + trace_android_rvh_selinux_avc_node_delete(node); hlist_del_rcu(&node->list); call_rcu(&node->rhead, avc_node_free); atomic_dec(&avc->avc_cache.active_nodes); @@ -457,6 +461,7 @@ static void avc_node_kill(struct selinux_avc *avc, struct avc_node *node) static void avc_node_replace(struct selinux_avc *avc, struct avc_node *new, struct avc_node *old) { + trace_android_rvh_selinux_avc_node_replace(old, new); hlist_replace_rcu(&old->list, &new->list); 
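	/*
	 * RCU-safe replacement: a reader walking the cache list concurrently
	 * sees either the old or the new node, never an inconsistent state;
	 * the old node is freed only after a grace period, via the call_rcu()
	 * below.
	 */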
call_rcu(&old->rhead, avc_node_free); atomic_dec(&avc->avc_cache.active_nodes); @@ -565,8 +570,10 @@ static struct avc_node *avc_lookup(struct selinux_avc *avc, avc_cache_stats_incr(lookups); node = avc_search_node(avc, ssid, tsid, tclass); - if (node) + if (node) { + trace_android_rvh_selinux_avc_lookup(node, ssid, tsid, tclass); return node; + } avc_cache_stats_incr(misses); return NULL; @@ -650,6 +657,7 @@ static struct avc_node *avc_insert(struct selinux_avc *avc, } } hlist_add_head_rcu(&node->list, head); + trace_android_rvh_selinux_avc_insert(node); found: spin_unlock_irqrestore(lock, flag); return node; diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 084757ff4390..d49b0c29726d 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -117,7 +117,8 @@ struct security_class_mapping secclass_map[] = { { COMMON_IPC_PERMS, NULL } }, { "netlink_route_socket", { COMMON_SOCK_PERMS, - "nlmsg_read", "nlmsg_write", NULL } }, + "nlmsg_read", "nlmsg_write", "nlmsg_readpriv", "nlmsg_getneigh", + NULL } }, { "netlink_tcpdiag_socket", { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", NULL } }, diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index c0d966020ebd..064786cef0a2 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -97,6 +97,8 @@ struct selinux_state { bool checkreqprot; bool initialized; bool policycap[__POLICYDB_CAPABILITY_MAX]; + bool android_netlink_route; + bool android_netlink_getneigh; struct page *status_page; struct mutex status_lock; @@ -226,6 +228,20 @@ static inline bool selinux_policycap_ioctl_skip_cloexec(void) return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_IOCTL_SKIP_CLOEXEC]); } +static inline bool selinux_android_nlroute_getlink(void) +{ + struct selinux_state *state = &selinux_state; + + return state->android_netlink_route; +} + +static inline bool selinux_android_nlroute_getneigh(void) +{ + struct selinux_state *state = &selinux_state; + + return state->android_netlink_getneigh; +} + struct selinux_policy_convert_data; struct selinux_load_state { @@ -459,5 +475,6 @@ extern void avtab_cache_init(void); extern void ebitmap_cache_init(void); extern void hashtab_cache_init(void); extern int security_sidtab_hash_stats(struct selinux_state *state, char *page); +extern void selinux_nlmsg_init(void); #endif /* _SELINUX_SECURITY_H_ */ diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 94ea2a8b2bb7..4fee2e26b2c4 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -25,7 +25,7 @@ struct nlmsg_perm { u32 perm; }; -static const struct nlmsg_perm nlmsg_route_perms[] = +static struct nlmsg_perm nlmsg_route_perms[] = { { RTM_NEWLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, @@ -216,3 +216,43 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) return err; } + +static void nlmsg_set_perm_for_type(u32 perm, u16 type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nlmsg_route_perms); i++) { + if (nlmsg_route_perms[i].nlmsg_type == type) { + nlmsg_route_perms[i].perm = perm; + break; + } + } +} + +/** + * Use nlmsg_readpriv as the permission for RTM_GETLINK messages if the + * netlink_route_getlink policy capability is set. Otherwise use nlmsg_read. + * Similarly, use nlmsg_getneigh for RTM_GETNEIGH and RTM_GETNEIGHTBL if the + * netlink_route_getneigh policy capability is set. Otherwise use nlmsg_read. 
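 *
 * In short, the permission used for each message type becomes:
 *   RTM_GETLINK                  -> nlmsg_readpriv or nlmsg_read
 *   RTM_GETNEIGH/RTM_GETNEIGHTBL -> nlmsg_getneigh or nlmsg_read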
+ */ +void selinux_nlmsg_init(void) +{ + if (selinux_android_nlroute_getlink()) + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READPRIV, + RTM_GETLINK); + else + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETLINK); + + if (selinux_android_nlroute_getneigh()) { + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_GETNEIGH, + RTM_GETNEIGH); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_GETNEIGH, + RTM_GETNEIGHTBL); + } else { + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETNEIGH); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETNEIGHTBL); + } +} diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 0ae1b718194a..ad91f52c8258 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -2491,6 +2491,14 @@ int policydb_read(struct policydb *p, void *fp) p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); + if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE)) { + p->android_netlink_route = 1; + } + + if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH)) { + p->android_netlink_getneigh = 1; + } + if (p->policyvers >= POLICYDB_VERSION_POLCAP) { rc = ebitmap_read(&p->policycaps, fp); if (rc) diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index ffc4e7bad205..1d758118e224 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -238,6 +238,8 @@ struct genfs { /* The policy database */ struct policydb { int mls_enabled; + int android_netlink_route; + int android_netlink_getneigh; /* symbol tables */ struct symtab symtab[SYM_NUM]; @@ -334,6 +336,8 @@ extern struct role_trans_datum *policydb_roletr_search( struct policydb *p, struct role_trans_key *key); #define POLICYDB_CONFIG_MLS 1 +#define POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE (1 << 31) +#define POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH (1 << 30) /* the config flags related to unknown classes/perms are bits 2 and 3 */ #define REJECT_UNKNOWN 0x00000002 diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 01716ed76592..0527a5ca73c1 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -68,6 +68,8 @@ #include "policycap_names.h" #include "ima.h" +#include + struct convert_context_args { struct selinux_state *state; struct policydb *oldp; @@ -2165,6 +2167,10 @@ static void security_load_policycaps(struct selinux_state *state, pr_info("SELinux: unknown policy capability %u\n", i); } + + state->android_netlink_route = p->android_netlink_route; + state->android_netlink_getneigh = p->android_netlink_getneigh; + selinux_nlmsg_init(); } static int security_preserve_bools(struct selinux_policy *oldpolicy, @@ -2258,6 +2264,7 @@ void selinux_policy_commit(struct selinux_state *state, */ selinux_mark_initialized(state); selinux_complete_init(); + trace_android_rvh_selinux_is_initialized(state); } /* Free the old policy */ diff --git a/sound/core/Kconfig b/sound/core/Kconfig index db2e3c63ff41..29bda025c44b 100644 --- a/sound/core/Kconfig +++ b/sound/core/Kconfig @@ -154,6 +154,16 @@ config SND_VERBOSE_PRINTK You don't need this unless you're debugging ALSA. +config SND_CTL_FAST_LOOKUP + bool "Fast lookup of control elements" if EXPERT + default y + select XARRAY_MULTI + help + This option enables the faster lookup of control elements. + It will consume more memory because of the additional Xarray. 
+ If you would rather save memory at the expense of lookup performance, turn this off. + config SND_DEBUG bool "Debug" help diff --git a/sound/core/control.c b/sound/core/control.c index b83ec284d611..6f824ae19afb 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -365,6 +365,93 @@ static int snd_ctl_find_hole(struct snd_card *card, unsigned int count) return 0; } +/* check whether the given id is contained in the given kctl */ +static bool elem_id_matches(const struct snd_kcontrol *kctl, + const struct snd_ctl_elem_id *id) +{ + return kctl->id.iface == id->iface && + kctl->id.device == id->device && + kctl->id.subdevice == id->subdevice && + !strncmp(kctl->id.name, id->name, sizeof(kctl->id.name)) && + kctl->id.index <= id->index && + kctl->id.index + kctl->count > id->index; +} + +#ifdef CONFIG_SND_CTL_FAST_LOOKUP +/* Compute a hash key for the corresponding ctl id + * It's for the name lookup, hence the numid is excluded. + * The hash key is bound in LONG_MAX to be used for Xarray key. + */ +#define MULTIPLIER 37 +static unsigned long get_ctl_id_hash(const struct snd_ctl_elem_id *id) +{ + unsigned long h; + const unsigned char *p; + + h = id->iface; + h = MULTIPLIER * h + id->device; + h = MULTIPLIER * h + id->subdevice; + for (p = id->name; *p; p++) + h = MULTIPLIER * h + *p; + h = MULTIPLIER * h + id->index; + h &= LONG_MAX; + return h; +} + +/* add hash entries to numid and ctl xarray tables */ +static void add_hash_entries(struct snd_card *card, + struct snd_kcontrol *kcontrol) +{ + struct snd_ctl_elem_id id = kcontrol->id; + int i; + + xa_store_range(&card->ctl_numids, kcontrol->id.numid, + kcontrol->id.numid + kcontrol->count - 1, + kcontrol, GFP_KERNEL); + + for (i = 0; i < kcontrol->count; i++) { + id.index = kcontrol->id.index + i; + if (xa_insert(&card->ctl_hash, get_ctl_id_hash(&id), + kcontrol, GFP_KERNEL)) { + /* skip hash for this entry, noting we had collision */ + card->ctl_hash_collision = true; + dev_dbg(card->dev, "ctl_hash collision %d:%s:%d\n", + id.iface, id.name, id.index); + } + } +} + +/* remove hash entries that have been added */ +static void remove_hash_entries(struct snd_card *card, + struct snd_kcontrol *kcontrol) +{ + struct snd_ctl_elem_id id = kcontrol->id; + struct snd_kcontrol *matched; + unsigned long h; + int i; + + for (i = 0; i < kcontrol->count; i++) { + xa_erase(&card->ctl_numids, id.numid); + h = get_ctl_id_hash(&id); + matched = xa_load(&card->ctl_hash, h); + if (matched && (matched == kcontrol || + elem_id_matches(matched, &id))) + xa_erase(&card->ctl_hash, h); + id.index++; + id.numid++; + } +} +#else /* CONFIG_SND_CTL_FAST_LOOKUP */ +static inline void add_hash_entries(struct snd_card *card, + struct snd_kcontrol *kcontrol) +{ +} +static inline void remove_hash_entries(struct snd_card *card, + struct snd_kcontrol *kcontrol) +{ +} +#endif /* CONFIG_SND_CTL_FAST_LOOKUP */ + enum snd_ctl_add_mode { CTL_ADD_EXCLUSIVE, CTL_REPLACE, CTL_ADD_ON_REPLACE, }; @@ -409,6 +496,8 @@ static int __snd_ctl_add_replace(struct snd_card *card, kcontrol->id.numid = card->last_numid + 1; card->last_numid += kcontrol->count; + add_hash_entries(card, kcontrol); + for (idx = 0; idx < kcontrol->count; idx++) snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_ADD, kcontrol, idx); @@ -480,6 +569,26 @@ int snd_ctl_replace(struct snd_card *card, struct snd_kcontrol *kcontrol, } EXPORT_SYMBOL(snd_ctl_replace); +static int __snd_ctl_remove(struct snd_card *card, + struct snd_kcontrol *kcontrol, + bool remove_hash) +{ + unsigned int idx; + + if 
(snd_BUG_ON(!card || !kcontrol)) + return -EINVAL; + list_del(&kcontrol->list); + + if (remove_hash) + remove_hash_entries(card, kcontrol); + + card->controls_count -= kcontrol->count; + for (idx = 0; idx < kcontrol->count; idx++) + snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_REMOVE, kcontrol, idx); + snd_ctl_free_one(kcontrol); + return 0; +} + /** * snd_ctl_remove - remove the control from the card and release it * @card: the card instance @@ -493,16 +602,7 @@ EXPORT_SYMBOL(snd_ctl_replace); */ int snd_ctl_remove(struct snd_card *card, struct snd_kcontrol *kcontrol) { - unsigned int idx; - - if (snd_BUG_ON(!card || !kcontrol)) - return -EINVAL; - list_del(&kcontrol->list); - card->controls_count -= kcontrol->count; - for (idx = 0; idx < kcontrol->count; idx++) - snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_REMOVE, kcontrol, idx); - snd_ctl_free_one(kcontrol); - return 0; + return __snd_ctl_remove(card, kcontrol, true); } EXPORT_SYMBOL(snd_ctl_remove); @@ -643,14 +743,30 @@ int snd_ctl_rename_id(struct snd_card *card, struct snd_ctl_elem_id *src_id, up_write(&card->controls_rwsem); return -ENOENT; } + remove_hash_entries(card, kctl); kctl->id = *dst_id; kctl->id.numid = card->last_numid + 1; card->last_numid += kctl->count; + add_hash_entries(card, kctl); up_write(&card->controls_rwsem); return 0; } EXPORT_SYMBOL(snd_ctl_rename_id); +#ifndef CONFIG_SND_CTL_FAST_LOOKUP +static struct snd_kcontrol * +snd_ctl_find_numid_slow(struct snd_card *card, unsigned int numid) +{ + struct snd_kcontrol *kctl; + + list_for_each_entry(kctl, &card->controls, list) { + if (kctl->id.numid <= numid && kctl->id.numid + kctl->count > numid) + return kctl; + } + return NULL; +} +#endif /* !CONFIG_SND_CTL_FAST_LOOKUP */ + /** * snd_ctl_find_numid - find the control instance with the given number-id * @card: the card instance @@ -666,15 +782,13 @@ EXPORT_SYMBOL(snd_ctl_rename_id); */ struct snd_kcontrol *snd_ctl_find_numid(struct snd_card *card, unsigned int numid) { - struct snd_kcontrol *kctl; - if (snd_BUG_ON(!card || !numid)) return NULL; - list_for_each_entry(kctl, &card->controls, list) { - if (kctl->id.numid <= numid && kctl->id.numid + kctl->count > numid) - return kctl; - } - return NULL; +#ifdef CONFIG_SND_CTL_FAST_LOOKUP + return xa_load(&card->ctl_numids, numid); +#else + return snd_ctl_find_numid_slow(card, numid); +#endif } EXPORT_SYMBOL(snd_ctl_find_numid); @@ -700,21 +814,18 @@ struct snd_kcontrol *snd_ctl_find_id(struct snd_card *card, return NULL; if (id->numid != 0) return snd_ctl_find_numid(card, id->numid); - list_for_each_entry(kctl, &card->controls, list) { - if (kctl->id.iface != id->iface) - continue; - if (kctl->id.device != id->device) - continue; - if (kctl->id.subdevice != id->subdevice) - continue; - if (strncmp(kctl->id.name, id->name, sizeof(kctl->id.name))) - continue; - if (kctl->id.index > id->index) - continue; - if (kctl->id.index + kctl->count <= id->index) - continue; +#ifdef CONFIG_SND_CTL_FAST_LOOKUP + kctl = xa_load(&card->ctl_hash, get_ctl_id_hash(id)); + if (kctl && elem_id_matches(kctl, id)) return kctl; - } + if (!card->ctl_hash_collision) + return NULL; /* we can rely on only hash table */ +#endif + /* no matching in hash table - try all as the last resort */ + list_for_each_entry(kctl, &card->controls, list) + if (elem_id_matches(kctl, id)) + return kctl; + return NULL; } EXPORT_SYMBOL(snd_ctl_find_id); @@ -2202,8 +2313,13 @@ static int snd_ctl_dev_free(struct snd_device *device) down_write(&card->controls_rwsem); while (!list_empty(&card->controls)) { control = 
snd_kcontrol(card->controls.next); - snd_ctl_remove(card, control); + __snd_ctl_remove(card, control, false); } + +#ifdef CONFIG_SND_CTL_FAST_LOOKUP + xa_destroy(&card->ctl_numids); + xa_destroy(&card->ctl_hash); +#endif up_write(&card->controls_rwsem); put_device(&card->ctl_dev); return 0; diff --git a/sound/core/init.c b/sound/core/init.c index 7b3618997d34..f6aa08c334a6 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -310,6 +310,10 @@ static int snd_card_init(struct snd_card *card, struct device *parent, rwlock_init(&card->ctl_files_rwlock); INIT_LIST_HEAD(&card->controls); INIT_LIST_HEAD(&card->ctl_files); +#ifdef CONFIG_SND_CTL_FAST_LOOKUP + xa_init(&card->ctl_numids); + xa_init(&card->ctl_hash); +#endif spin_lock_init(&card->files_lock); INIT_LIST_HEAD(&card->files_list); mutex_init(&card->memory_mutex); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 47bc308b86c4..d7990a13f8ac 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -3219,6 +3219,39 @@ int snd_soc_get_dai_id(struct device_node *ep) } EXPORT_SYMBOL_GPL(snd_soc_get_dai_id); +/** + * snd_soc_info_multi_ext - external single mixer info callback + * @kcontrol: mixer control + * @uinfo: control element information + * + * Callback to provide information about a single external mixer control + * that accepts multiple inputs. + * + * Returns 0 for success. + */ +int snd_soc_info_multi_ext(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + struct soc_multi_mixer_control *mc = + (struct soc_multi_mixer_control *)kcontrol->private_value; + int platform_max; + + if (!mc->platform_max) + mc->platform_max = mc->max; + platform_max = mc->platform_max; + + if (platform_max == 1 && !strnstr(kcontrol->id.name, " Volume", 30)) + uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; + else + uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + + uinfo->count = mc->count; + uinfo->value.integer.min = 0; + uinfo->value.integer.max = platform_max; + return 0; +} +EXPORT_SYMBOL_GPL(snd_soc_info_multi_ext); + int snd_soc_get_dai_name(const struct of_phandle_args *args, const char **dai_name) { diff --git a/sound/soc/soc-generic-dmaengine-pcm.c b/sound/soc/soc-generic-dmaengine-pcm.c index 4aa48c74f21a..8659cb1794f1 100644 --- a/sound/soc/soc-generic-dmaengine-pcm.c +++ b/sound/soc/soc-generic-dmaengine-pcm.c @@ -15,6 +15,10 @@ #include +static unsigned int prealloc_buffer_size_kbytes = 512; +module_param(prealloc_buffer_size_kbytes, uint, 0444); +MODULE_PARM_DESC(prealloc_buffer_size_kbytes, "Preallocate DMA buffer size (KB)."); + /* * The platforms dmaengine driver does not support reporting the amount of * bytes that are still left to transfer. 
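/*
 * A minimal usage sketch for the module parameter added above, assuming this
 * file is linked into the snd-soc-core module as in mainline (the value is
 * illustrative):
 *
 *	snd_soc_core.prealloc_buffer_size_kbytes=1024   # kernel command line
 *
 *	cat /sys/module/snd_soc_core/parameters/prealloc_buffer_size_kbytes
 *
 * The parameter is read-only at runtime (mode 0444) and only takes effect
 * when the pcm_config does not specify its own prealloc_buffer_size; its
 * default keeps the former hard-coded 512 KiB.
 */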
@@ -233,13 +237,15 @@ static int dmaengine_pcm_new(struct snd_soc_component *component, size_t max_buffer_size; unsigned int i; - if (config && config->prealloc_buffer_size) { + if (config && config->prealloc_buffer_size) prealloc_buffer_size = config->prealloc_buffer_size; + else + prealloc_buffer_size = prealloc_buffer_size_kbytes * 1024; + + if (config && config->pcm_hardware && config->pcm_hardware->buffer_bytes_max) max_buffer_size = config->pcm_hardware->buffer_bytes_max; - } else { - prealloc_buffer_size = 512 * 1024; + else max_buffer_size = SIZE_MAX; - } for_each_pcm_streams(i) { struct snd_pcm_substream *substream = rtd->pcm->streams[i].substream; diff --git a/sound/usb/card.c b/sound/usb/card.c index b1ad8f1fd2ae..5a29406a7fc4 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -41,6 +41,8 @@ #include #include +#include + #include "usbaudio.h" #include "card.h" #include "midi.h" @@ -790,6 +792,8 @@ static int usb_audio_probe(struct usb_interface *intf, if (err < 0) return err; + trace_android_vh_audio_usb_offload_vendor_set(intf); + /* * found a config. now register to ALSA */ @@ -864,6 +868,8 @@ static int usb_audio_probe(struct usb_interface *intf, usb_disable_autosuspend(interface_to_usbdev(intf)); } + trace_android_vh_audio_usb_offload_connect(intf, chip); + /* * For devices with more than one control interface, we assume the * first contains the audio controls. We might need a more specific @@ -950,6 +956,8 @@ static void usb_audio_disconnect(struct usb_interface *intf) card = chip->card; + trace_android_rvh_audio_usb_offload_disconnect(intf); + mutex_lock(®ister_mutex); if (atomic_inc_return(&chip->shutdown) == 1) { struct snd_usb_stream *as; @@ -1053,6 +1061,7 @@ int snd_usb_autoresume(struct snd_usb_audio *chip) } return 0; } +EXPORT_SYMBOL_GPL(snd_usb_autoresume); void snd_usb_autosuspend(struct snd_usb_audio *chip) { @@ -1066,6 +1075,7 @@ void snd_usb_autosuspend(struct snd_usb_audio *chip) for (i = 0; i < chip->num_interfaces; i++) usb_autopm_put_interface(chip->intf[i]); } +EXPORT_SYMBOL_GPL(snd_usb_autosuspend); static int usb_audio_suspend(struct usb_interface *intf, pm_message_t message) { diff --git a/sound/usb/card.h b/sound/usb/card.h index 87f042d06ce0..a85e45d186aa 100644 --- a/sound/usb/card.h +++ b/sound/usb/card.h @@ -2,6 +2,8 @@ #ifndef __USBAUDIO_CARD_H #define __USBAUDIO_CARD_H +#include + #define MAX_NR_RATES 1024 #define MAX_PACKS 6 /* per URB */ #define MAX_PACKS_HS (MAX_PACKS * 8) /* in high speed mode */ @@ -142,6 +144,11 @@ struct snd_usb_endpoint { spinlock_t lock; struct list_head list; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; struct media_ctl; @@ -193,6 +200,8 @@ struct snd_usb_substream { bool trigger_tstamp_pending_update; /* trigger timestamp being updated from initial estimate */ bool lowlatency_playback; /* low-latency playback mode */ struct media_ctl *media_ctl; + + ANDROID_KABI_RESERVE(1); }; struct snd_usb_stream { diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 092350eb5f4e..814f7af99663 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -13,6 +13,8 @@ #include #include +#include + #include "usbaudio.h" #include "helper.h" #include "card.h" @@ -818,6 +820,7 @@ snd_usb_endpoint_open(struct snd_usb_audio *chip, mutex_unlock(&chip->mutex); return ep; } +EXPORT_SYMBOL_GPL(snd_usb_endpoint_open); /* * snd_usb_endpoint_set_sync: Link data and sync endpoints @@ -902,6 +905,7 @@ void snd_usb_endpoint_close(struct snd_usb_audio *chip, } 
mutex_unlock(&chip->mutex); } +EXPORT_SYMBOL_GPL(snd_usb_endpoint_close); /* Prepare for suspending EP, called from the main suspend handler */ void snd_usb_endpoint_suspend(struct snd_usb_endpoint *ep) @@ -1404,6 +1408,7 @@ unlock: mutex_unlock(&chip->mutex); return err; } +EXPORT_SYMBOL_GPL(snd_usb_endpoint_configure); /* get the current rate set to the given clock by any endpoint */ int snd_usb_endpoint_get_clock_rate(struct snd_usb_audio *chip, int clock) @@ -1481,6 +1486,8 @@ int snd_usb_endpoint_start(struct snd_usb_endpoint *ep) goto fill_rest; } + trace_android_vh_audio_usb_offload_ep_action(ep, true); + for (i = 0; i < ep->nurbs; i++) { struct urb *urb = ep->urb[i].urb; @@ -1566,6 +1573,7 @@ void snd_usb_endpoint_stop(struct snd_usb_endpoint *ep, bool keep_pending) if (ep->sync_source) WRITE_ONCE(ep->sync_source->sync_sink, NULL); stop_urbs(ep, false, keep_pending); + trace_android_vh_audio_usb_offload_ep_action(ep, false); } } diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 87a30be64324..b1a3fad14dc2 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -14,6 +14,8 @@ #include #include +#include + #include "usbaudio.h" #include "card.h" #include "quirks.h" @@ -95,6 +97,7 @@ find_format(struct list_head *fmt_list_head, snd_pcm_format_t format, const struct audioformat *fp; const struct audioformat *found = NULL; int cur_attr = 0, attr; + bool need_ignore = false; list_for_each_entry(fp, fmt_list_head, list) { if (strict_match) { @@ -114,6 +117,11 @@ find_format(struct list_head *fmt_list_head, snd_pcm_format_t format, continue; } attr = fp->ep_attr & USB_ENDPOINT_SYNCTYPE; + + trace_android_vh_audio_usb_offload_synctype(subs, attr, &need_ignore); + if (need_ignore) + continue; + if (!found) { found = fp; cur_attr = attr; diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index ec06f441e890..6a5af24a8f37 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -12,6 +12,8 @@ #define USB_ID_VENDOR(id) ((id) >> 16) #define USB_ID_PRODUCT(id) ((u16)(id)) +#include + /* * */ @@ -60,6 +62,11 @@ struct snd_usb_audio { struct usb_host_interface *ctrl_intf; /* the audio control interface */ struct media_device *media_dev; struct media_intf_devnode *ctl_intf_media_devnode; + + ANDROID_KABI_RESERVE(1); + ANDROID_KABI_RESERVE(2); + ANDROID_KABI_RESERVE(3); + ANDROID_KABI_RESERVE(4); }; #define USB_AUDIO_IFACE_UNUSED ((void *)-1L) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index b3edde68bc3e..1d0a0a2a9711 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -362,6 +362,7 @@ struct kvm_arm_copy_mte_tags { #define KVM_ARM_VCPU_PMU_V3_IRQ 0 #define KVM_ARM_VCPU_PMU_V3_INIT 1 #define KVM_ARM_VCPU_PMU_V3_FILTER 2 +#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3 #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index af9f9d3534c9..35f092065c4f 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -23,6 +23,8 @@ CC = $(HOSTCC) LD = $(HOSTLD) ARCH = $(HOSTARCH) RM ?= rm +CFLAGS := $(KBUILD_HOSTCFLAGS) +LDFLAGS := $(KBUILD_HOSTLDFLAGS) OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/ @@ -45,9 +47,10 @@ $(SUBCMDOBJ): fixdep FORCE | $(OUTPUT)/libsubcmd $(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@) $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf - $(Q)$(MAKE) 
$(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@) + $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(abspath $(dir $@))/ \ + EXTRA_CFLAGS="$(CFLAGS)" $(abspath $@) -CFLAGS := -g \ +CFLAGS += -g \ -I$(srctree)/tools/include \ -I$(srctree)/tools/include/uapi \ -I$(LIBBPF_SRC) \ diff --git a/tools/crypto/gen_fips140_testvecs.py b/tools/crypto/gen_fips140_testvecs.py new file mode 100755 index 000000000000..825c4872235a --- /dev/null +++ b/tools/crypto/gen_fips140_testvecs.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright 2021 Google LLC +# +# Generate most of the test vectors for the FIPS 140 cryptographic self-tests. +# +# Usage: +# tools/crypto/gen_fips140_testvecs.py > crypto/fips140-generated-testvecs.h +# +# Prerequisites: +# Debian: apt-get install python3-pycryptodome python3-cryptography +# Arch Linux: pacman -S python-pycryptodomex python-cryptography + +import hashlib +import hmac +import os + +import Cryptodome.Cipher.AES +import Cryptodome.Hash.CMAC +import Cryptodome.Util.Counter + +import cryptography.hazmat.primitives.ciphers +import cryptography.hazmat.primitives.ciphers.algorithms +import cryptography.hazmat.primitives.ciphers.modes + +scriptname = os.path.basename(__file__) + +message = bytes('This is a 32-byte test message.\0', 'ascii') +aes_key = bytes('128-bit AES key\0', 'ascii') +aes_xts_key = bytes('This is an AES-128-XTS key.\0\0\0\0\0', 'ascii') +aes_iv = bytes('ABCDEFGHIJKLMNOP', 'ascii') +assoc = bytes('associated data string', 'ascii') +hmac_key = bytes('128-bit HMAC key', 'ascii') + +def warn_generated(): + print(f'''/* + * This header was automatically generated by {scriptname}. + * Don't edit it directly. + */''') + +def is_string_value(value): + return (value.isascii() and + all(c == '\x00' or c.isprintable() for c in str(value, 'ascii'))) + +def format_value(value, is_string): + if is_string: + return value + hexstr = '' + for byte in value: + hexstr += f'\\x{byte:02x}' + return hexstr + +def print_value(name, value): + is_string = is_string_value(value) + hdr = f'static const u8 fips_{name}[{len(value)}] __initconst =' + print(hdr, end='') + if is_string: + value = str(value, 'ascii').rstrip('\x00') + chars_per_byte = 1 + else: + chars_per_byte = 4 + bytes_per_line = 64 // chars_per_byte + + if len(hdr) + (chars_per_byte * len(value)) + 4 <= 80: + print(f' "{format_value(value, is_string)}"', end='') + else: + for chunk in [value[i:i+bytes_per_line] + for i in range(0, len(value), bytes_per_line)]: + print(f'\n\t"{format_value(chunk, is_string)}"', end='') + print(';') + print('') + +def generate_aes_testvecs(): + print_value('aes_key', aes_key) + print_value('aes_iv', aes_iv) + + cbc = Cryptodome.Cipher.AES.new(aes_key, Cryptodome.Cipher.AES.MODE_CBC, + iv=aes_iv) + print_value('aes_cbc_ciphertext', cbc.encrypt(message)) + + ecb = Cryptodome.Cipher.AES.new(aes_key, Cryptodome.Cipher.AES.MODE_ECB) + print_value('aes_ecb_ciphertext', ecb.encrypt(message)) + + ctr = Cryptodome.Cipher.AES.new(aes_key, Cryptodome.Cipher.AES.MODE_CTR, + nonce=bytes(), initial_value=aes_iv) + print_value('aes_ctr_ciphertext', ctr.encrypt(message)) + + print_value('aes_gcm_assoc', assoc) + gcm = Cryptodome.Cipher.AES.new(aes_key, Cryptodome.Cipher.AES.MODE_GCM, + nonce=aes_iv[:12], mac_len=16) + gcm.update(assoc) + raw_ciphertext, tag = gcm.encrypt_and_digest(message) + print_value('aes_gcm_ciphertext', raw_ciphertext + tag) + + # Unfortunately, pycryptodome doesn't support XTS, so for it we need to use + # a different 
Python package (the "cryptography" package). + print_value('aes_xts_key', aes_xts_key) + xts = cryptography.hazmat.primitives.ciphers.Cipher( + cryptography.hazmat.primitives.ciphers.algorithms.AES(aes_xts_key), + cryptography.hazmat.primitives.ciphers.modes.XTS(aes_iv)).encryptor() + ciphertext = xts.update(message) + xts.finalize() + print_value('aes_xts_ciphertext', ciphertext) + + cmac = Cryptodome.Hash.CMAC.new(aes_key, ciphermod=Cryptodome.Cipher.AES) + cmac.update(message) + print_value('aes_cmac_digest', cmac.digest()) + +def generate_sha_testvecs(): + print_value('hmac_key', hmac_key) + for alg in ['sha1', 'sha256', 'hmac_sha256', 'sha512']: + if alg.startswith('hmac_'): + h = hmac.new(hmac_key, message, alg.removeprefix('hmac_')) + else: + h = hashlib.new(alg, message) + print_value(f'{alg}_digest', h.digest()) + +print('/* SPDX-License-Identifier: GPL-2.0-only */') +print('/* Copyright 2021 Google LLC */') +print('') +warn_generated() +print('') +print_value('message', message) +generate_aes_testvecs() +generate_sha_testvecs() +warn_generated() diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 43bd7f713c39..de45fcd2dcbe 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -235,7 +235,7 @@ struct prctl_mm_map { #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) /* MTE tag check fault modes */ -# define PR_MTE_TCF_NONE 0 +# define PR_MTE_TCF_NONE 0UL # define PR_MTE_TCF_SYNC (1UL << 1) # define PR_MTE_TCF_ASYNC (1UL << 2) # define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index ecda972e8777..b3d88b0a6987 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -57,6 +57,19 @@ test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" echo "$orig_content" > "$file" +# Test schemes file +# ================= + +file="$DBGFS/schemes" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ + "$orig_content" "valid input" +test_write_fail "$file" "1 2 +3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" +test_write_succ "$file" "" "$orig_content" "disabling" +echo "$orig_content" > "$file" + # Test target_ids file # ==================== diff --git a/tools/testing/selftests/filesystems/fuse/.gitignore b/tools/testing/selftests/filesystems/fuse/.gitignore new file mode 100644 index 000000000000..3ee9a27fe66a --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/.gitignore @@ -0,0 +1,2 @@ +fuse_test +*.raw diff --git a/tools/testing/selftests/filesystems/fuse/Makefile b/tools/testing/selftests/filesystems/fuse/Makefile new file mode 100644 index 000000000000..261d7606560a --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/Makefile @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -D_FILE_OFFSET_BITS=64 -Wall -Werror -I../.. -I../../../../.. 
-I../../../../include +LDLIBS := -lpthread -lelf +TEST_GEN_PROGS := fuse_test fuse_daemon +TEST_GEN_FILES := \ + test_bpf.bpf \ + fd_bpf.bpf \ + fd.sh \ + +EXTRA_CLEAN := *.bpf +BPF_FLAGS = -Wall -Werror -O2 -g -emit-llvm \ + -I ../../../../../include \ + -idirafter /usr/lib/gcc/x86_64-linux-gnu/10/include \ + -idirafter /usr/local/include \ + -idirafter /usr/include/x86_64-linux-gnu \ + -idirafter /usr/include \ + +include ../../lib.mk + +# Put after include ../../lib.mk since that changes $(TEST_GEN_PROGS) +# Otherwise you get multiple targets, this becomes the default, and it's a mess +EXTRA_SOURCES := bpf_loader.c +$(TEST_GEN_PROGS) : $(EXTRA_SOURCES) + +$(OUTPUT)/%.ir: %.c + clang $(BPF_FLAGS) -c $< -o $@ + +$(OUTPUT)/%.bpf: $(OUTPUT)/%.ir + llc -march=bpf -filetype=obj -o $@ $< + +$(OUTPUT)/fd.sh: fd.txt + cp $< $@ + chmod 755 $@ + diff --git a/tools/testing/selftests/filesystems/fuse/OWNERS b/tools/testing/selftests/filesystems/fuse/OWNERS new file mode 100644 index 000000000000..5eb371e1a5a3 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/OWNERS @@ -0,0 +1,2 @@ +# include OWNERS from the authoritative android-mainline branch +include kernel/common:android-mainline:/tools/testing/selftests/filesystems/incfs/OWNERS diff --git a/tools/testing/selftests/filesystems/fuse/bpf_loader.c b/tools/testing/selftests/filesystems/fuse/bpf_loader.c new file mode 100644 index 000000000000..5bf26eadd421 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/bpf_loader.c @@ -0,0 +1,791 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + */ + +#include "test_fuse.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include + +struct _test_options test_options; + +struct s s(const char *s1) +{ + struct s s = {0}; + + if (!s1) + return s; + + s.s = malloc(strlen(s1) + 1); + if (!s.s) + return s; + + strcpy(s.s, s1); + return s; +} + +struct s sn(const char *s1, const char *s2) +{ + struct s s = {0}; + + if (!s1) + return s; + + s.s = malloc(s2 - s1 + 1); + if (!s.s) + return s; + + strncpy(s.s, s1, s2 - s1); + s.s[s2 - s1] = 0; + return s; +} + +int s_cmp(struct s s1, struct s s2) +{ + int result = -1; + + if (!s1.s || !s2.s) + goto out; + result = strcmp(s1.s, s2.s); +out: + free(s1.s); + free(s2.s); + return result; +} + +struct s s_cat(struct s s1, struct s s2) +{ + struct s s = {0}; + + if (!s1.s || !s2.s) + goto out; + + s.s = malloc(strlen(s1.s) + strlen(s2.s) + 1); + if (!s.s) + goto out; + + strcpy(s.s, s1.s); + strcat(s.s, s2.s); +out: + free(s1.s); + free(s2.s); + return s; +} + +struct s s_splitleft(struct s s1, char c) +{ + struct s s = {0}; + char *split; + + if (!s1.s) + return s; + + split = strchr(s1.s, c); + if (split) + s = sn(s1.s, split); + + free(s1.s); + return s; +} + +struct s s_splitright(struct s s1, char c) +{ + struct s s2 = {0}; + char *split; + + if (!s1.s) + return s2; + + split = strchr(s1.s, c); + if (split) + s2 = s(split + 1); + + free(s1.s); + return s2; +} + +struct s s_word(struct s s1, char c, size_t n) +{ + while (n--) + s1 = s_splitright(s1, c); + return s_splitleft(s1, c); +} + +struct s s_path(struct s s1, struct s s2) +{ + return s_cat(s_cat(s1, s("/")), s2); +} + +struct s s_pathn(size_t n, struct s s1, ...) 
+{ + va_list argp; + + va_start(argp, s1); + while (--n) + s1 = s_path(s1, va_arg(argp, struct s)); + va_end(argp); + return s1; +} + +int s_link(struct s src_pathname, struct s dst_pathname) +{ + int res; + + if (src_pathname.s && dst_pathname.s) { + res = link(src_pathname.s, dst_pathname.s); + } else { + res = -1; + errno = ENOMEM; + } + + free(src_pathname.s); + free(dst_pathname.s); + return res; +} + +int s_symlink(struct s src_pathname, struct s dst_pathname) +{ + int res; + + if (src_pathname.s && dst_pathname.s) { + res = symlink(src_pathname.s, dst_pathname.s); + } else { + res = -1; + errno = ENOMEM; + } + + free(src_pathname.s); + free(dst_pathname.s); + return res; +} + + +int s_mkdir(struct s pathname, mode_t mode) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = mkdir(pathname.s, mode); + free(pathname.s); + return res; +} + +int s_rmdir(struct s pathname) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = rmdir(pathname.s); + free(pathname.s); + return res; +} + +int s_unlink(struct s pathname) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = unlink(pathname.s); + free(pathname.s); + return res; +} + +int s_open(struct s pathname, int flags, ...) +{ + va_list ap; + int res; + + va_start(ap, flags); + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + if (flags & (O_CREAT | O_TMPFILE)) + res = open(pathname.s, flags, va_arg(ap, mode_t)); + else + res = open(pathname.s, flags); + + free(pathname.s); + va_end(ap); + return res; +} + +int s_openat(int dirfd, struct s pathname, int flags, ...) +{ + va_list ap; + int res; + + va_start(ap, flags); + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + if (flags & (O_CREAT | O_TMPFILE)) + res = openat(dirfd, pathname.s, flags, va_arg(ap, mode_t)); + else + res = openat(dirfd, pathname.s, flags); + + free(pathname.s); + va_end(ap); + return res; +} + +int s_creat(struct s pathname, mode_t mode) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = open(pathname.s, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode); + free(pathname.s); + return res; +} + +int s_mkfifo(struct s pathname, mode_t mode) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = mknod(pathname.s, S_IFIFO | mode, 0); + free(pathname.s); + return res; +} + +int s_stat(struct s pathname, struct stat *st) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = stat(pathname.s, st); + free(pathname.s); + return res; +} + +int s_statfs(struct s pathname, struct statfs *st) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = statfs(pathname.s, st); + free(pathname.s); + return res; +} + +DIR *s_opendir(struct s pathname) +{ + DIR *res; + + res = opendir(pathname.s); + free(pathname.s); + return res; +} + +int s_getxattr(struct s pathname, const char name[], void *value, size_t size, + ssize_t *ret_size) +{ + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + *ret_size = getxattr(pathname.s, name, value, size); + free(pathname.s); + return *ret_size >= 0 ? 0 : -1; +} + +int s_listxattr(struct s pathname, void *list, size_t size, ssize_t *ret_size) +{ + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + *ret_size = listxattr(pathname.s, list, size); + free(pathname.s); + return *ret_size >= 0 ? 
0 : -1; +} + +int s_setxattr(struct s pathname, const char name[], const void *value, size_t size, int flags) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = setxattr(pathname.s, name, value, size, flags); + free(pathname.s); + return res; +} + +int s_removexattr(struct s pathname, const char name[]) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = removexattr(pathname.s, name); + free(pathname.s); + return res; +} + +int s_rename(struct s oldpathname, struct s newpathname) +{ + int res; + + if (!oldpathname.s || !newpathname.s) { + errno = ENOMEM; + return -1; + } + + res = rename(oldpathname.s, newpathname.s); + free(oldpathname.s); + free(newpathname.s); + return res; +} + +int s_fuse_attr(struct s pathname, struct fuse_attr *fuse_attr_out) +{ + + struct stat st; + int result = TEST_FAILURE; + + TESTSYSCALL(s_stat(pathname, &st)); + + fuse_attr_out->ino = st.st_ino; + fuse_attr_out->mode = st.st_mode; + fuse_attr_out->nlink = st.st_nlink; + fuse_attr_out->uid = st.st_uid; + fuse_attr_out->gid = st.st_gid; + fuse_attr_out->rdev = st.st_rdev; + fuse_attr_out->size = st.st_size; + fuse_attr_out->blksize = st.st_blksize; + fuse_attr_out->blocks = st.st_blocks; + fuse_attr_out->atime = st.st_atime; + fuse_attr_out->mtime = st.st_mtime; + fuse_attr_out->ctime = st.st_ctime; + fuse_attr_out->atimensec = UINT32_MAX; + fuse_attr_out->mtimensec = UINT32_MAX; + fuse_attr_out->ctimensec = UINT32_MAX; + + result = TEST_SUCCESS; +out: + return result; +} + +struct s tracing_folder(void) +{ + struct s trace = {0}; + FILE *mounts = NULL; + char *line = NULL; + size_t size = 0; + + TEST(mounts = fopen("/proc/mounts", "re"), mounts); + while (getline(&line, &size, mounts) != -1) { + if (!s_cmp(s_word(sn(line, line + size), ' ', 2), + s("tracefs"))) { + trace = s_word(sn(line, line + size), ' ', 1); + break; + } + + if (!s_cmp(s_word(sn(line, line + size), ' ', 2), s("debugfs"))) + trace = s_path(s_word(sn(line, line + size), ' ', 1), + s("tracing")); + } + +out: + free(line); + fclose(mounts); + return trace; +} + +int tracing_on(void) +{ + int result = TEST_FAILURE; + int tracing_on = -1; + + TEST(tracing_on = s_open(s_path(tracing_folder(), s("tracing_on")), + O_WRONLY | O_CLOEXEC), + tracing_on != -1); + TESTEQUAL(write(tracing_on, "1", 1), 1); + result = TEST_SUCCESS; +out: + close(tracing_on); + return result; +} + +char *concat_file_name(const char *dir, const char *file) +{ + char full_name[FILENAME_MAX] = ""; + + if (snprintf(full_name, ARRAY_SIZE(full_name), "%s/%s", dir, file) < 0) + return NULL; + return strdup(full_name); +} + +char *setup_mount_dir(const char *name) +{ + struct stat st; + char *current_dir = getcwd(NULL, 0); + char *mount_dir = concat_file_name(current_dir, name); + + free(current_dir); + if (stat(mount_dir, &st) == 0) { + if (S_ISDIR(st.st_mode)) + return mount_dir; + + ksft_print_msg("%s is a file, not a dir.\n", mount_dir); + return NULL; + } + + if (mkdir(mount_dir, 0777)) { + ksft_print_msg("Can't create mount dir."); + return NULL; + } + + return mount_dir; +} + +int delete_dir_tree(const char *dir_path, bool remove_root) +{ + DIR *dir = NULL; + struct dirent *dp; + int result = 0; + + dir = opendir(dir_path); + if (!dir) { + result = -errno; + goto out; + } + + while ((dp = readdir(dir))) { + char *full_path; + + if (!strcmp(dp->d_name, ".") || !strcmp(dp->d_name, "..")) + continue; + + full_path = concat_file_name(dir_path, dp->d_name); + if (dp->d_type == DT_DIR) + result = delete_dir_tree(full_path, 
true); + else + result = unlink(full_path); + free(full_path); + if (result) + goto out; + } + +out: + if (dir) + closedir(dir); + if (!result && remove_root) + rmdir(dir_path); + return result; +} + +static int mount_fuse_maybe_init(const char *mount_dir, int bpf_fd, int dir_fd, + int *fuse_dev_ptr, bool init) +{ + int result = TEST_FAILURE; + int fuse_dev = -1; + char options[FILENAME_MAX]; + uint8_t bytes_in[FUSE_MIN_READ_BUFFER]; + uint8_t bytes_out[FUSE_MIN_READ_BUFFER]; + + DECL_FUSE_IN(init); + + TEST(fuse_dev = open("/dev/fuse", O_RDWR | O_CLOEXEC), fuse_dev != -1); + snprintf(options, FILENAME_MAX, "fd=%d,user_id=0,group_id=0,rootmode=0040000", + fuse_dev); + if (bpf_fd != -1) + snprintf(options + strlen(options), + sizeof(options) - strlen(options), + ",root_bpf=%d", bpf_fd); + if (dir_fd != -1) + snprintf(options + strlen(options), + sizeof(options) - strlen(options), + ",root_dir=%d", dir_fd); + TESTSYSCALL(mount("ABC", mount_dir, "fuse", 0, options)); + + if (init) { + TESTFUSEIN(FUSE_INIT, init_in); + TESTEQUAL(init_in->major, FUSE_KERNEL_VERSION); + TESTEQUAL(init_in->minor, FUSE_KERNEL_MINOR_VERSION); + TESTFUSEOUT1(fuse_init_out, ((struct fuse_init_out) { + .major = FUSE_KERNEL_VERSION, + .minor = FUSE_KERNEL_MINOR_VERSION, + .max_readahead = 4096, + .flags = 0, + .max_background = 0, + .congestion_threshold = 0, + .max_write = 4096, + .time_gran = 1000, + .max_pages = 12, + .map_alignment = 4096, + })); + } + + *fuse_dev_ptr = fuse_dev; + fuse_dev = -1; + result = TEST_SUCCESS; +out: + close(fuse_dev); + return result; +} + +int mount_fuse(const char *mount_dir, int bpf_fd, int dir_fd, int *fuse_dev_ptr) +{ + return mount_fuse_maybe_init(mount_dir, bpf_fd, dir_fd, fuse_dev_ptr, + true); +} + +int mount_fuse_no_init(const char *mount_dir, int bpf_fd, int dir_fd, + int *fuse_dev_ptr) +{ + return mount_fuse_maybe_init(mount_dir, bpf_fd, dir_fd, fuse_dev_ptr, + false); +} + +struct fuse_bpf_map { + unsigned int map_type; + size_t key_size; + size_t value_size; + unsigned int max_entries; +}; + +static int install_maps(Elf_Data *maps, int maps_index, Elf *elf, + Elf_Data *symbols, int symbol_index, + struct map_relocation **mr, size_t *map_count) +{ + int result = TEST_FAILURE; + int i; + GElf_Sym symbol; + + TESTNE((void *)symbols, NULL); + + for (i = 0; i < symbols->d_size / sizeof(symbol); ++i) { + TESTNE((void *)gelf_getsym(symbols, i, &symbol), 0); + if (symbol.st_shndx == maps_index) { + struct fuse_bpf_map *map; + union bpf_attr attr; + int map_fd; + + map = (struct fuse_bpf_map *) + ((char *)maps->d_buf + symbol.st_value); + + attr = (union bpf_attr) { + .map_type = map->map_type, + .key_size = map->key_size, + .value_size = map->value_size, + .max_entries = map->max_entries, + }; + + TEST(*mr = realloc(*mr, ++*map_count * + sizeof(struct fuse_bpf_map)), + *mr); + TEST(map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, + &attr, sizeof(attr)), + map_fd != -1); + (*mr)[*map_count - 1] = (struct map_relocation) { + .name = strdup(elf_strptr(elf, symbol_index, + symbol.st_name)), + .fd = map_fd, + .value = symbol.st_value, + }; + } + } + + result = TEST_SUCCESS; +out: + return result; +} + +static inline int relocate_maps(GElf_Shdr *rel_header, Elf_Data *rel_data, + Elf_Data *prog_data, Elf_Data *symbol_data, + struct map_relocation *map_relocations, + size_t map_count) +{ + int result = TEST_FAILURE; + int i; + struct bpf_insn *insns = (struct bpf_insn *) prog_data->d_buf; + + for (i = 0; i < rel_header->sh_size / rel_header->sh_entsize; ++i) { + GElf_Sym sym; + GElf_Rel rel; + 
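+		/*
+		 * Each relocation entry names a BPF_LD_IMM64 instruction that
+		 * loads a map address: turn the relocation's byte offset into
+		 * an instruction index, mark the source register as
+		 * BPF_PSEUDO_MAP_FD, and patch the immediate with the fd of
+		 * the map (created by install_maps()) whose symbol value
+		 * matches.
+		 */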
unsigned int insn_idx; + int map_idx; + + gelf_getrel(rel_data, i, &rel); + insn_idx = rel.r_offset / sizeof(struct bpf_insn); + insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; + + gelf_getsym(symbol_data, GELF_R_SYM(rel.r_info), &sym); + for (map_idx = 0; map_idx < map_count; map_idx++) { + if (map_relocations[map_idx].value == sym.st_value) { + insns[insn_idx].imm = + map_relocations[map_idx].fd; + break; + } + } + TESTNE(map_idx, map_count); + } + + result = TEST_SUCCESS; +out: + return result; +} + +int install_elf_bpf(const char *file, const char *section, int *fd, + struct map_relocation **map_relocations, size_t *map_count) +{ + int result = TEST_FAILURE; + char path[PATH_MAX] = {}; + char *last_slash; + int filter_fd = -1; + union bpf_attr bpf_attr; + static char log[1 << 20]; + Elf *elf = NULL; + GElf_Ehdr ehdr; + Elf_Data *data_prog = NULL, *data_maps = NULL, *data_symbols = NULL; + int maps_index, symbol_index, prog_index; + int i; + int bpf_prog_type_fuse_fd = -1; + char buffer[10] = {0}; + int bpf_prog_type_fuse; + + TESTNE(readlink("/proc/self/exe", path, PATH_MAX), -1); + TEST(last_slash = strrchr(path, '/'), last_slash); + strcpy(last_slash + 1, file); + TEST(filter_fd = open(path, O_RDONLY | O_CLOEXEC), filter_fd != -1); + TESTNE(elf_version(EV_CURRENT), EV_NONE); + TEST(elf = elf_begin(filter_fd, ELF_C_READ, NULL), elf); + TESTEQUAL((void *) gelf_getehdr(elf, &ehdr), &ehdr); + for (i = 1; i < ehdr.e_shnum; i++) { + char *shname; + GElf_Shdr shdr; + Elf_Scn *scn; + + TEST(scn = elf_getscn(elf, i), scn); + TESTEQUAL((void *)gelf_getshdr(scn, &shdr), &shdr); + TEST(shname = elf_strptr(elf, ehdr.e_shstrndx, shdr.sh_name), + shname); + + if (!strcmp(shname, "maps")) { + TEST(data_maps = elf_getdata(scn, 0), data_maps); + maps_index = i; + } else if (shdr.sh_type == SHT_SYMTAB) { + TEST(data_symbols = elf_getdata(scn, 0), data_symbols); + symbol_index = shdr.sh_link; + } else if (!strcmp(shname, section)) { + TEST(data_prog = elf_getdata(scn, 0), data_prog); + prog_index = i; + } + } + TESTNE((void *) data_prog, NULL); + + if (data_maps) + TESTEQUAL(install_maps(data_maps, maps_index, elf, + data_symbols, symbol_index, + map_relocations, map_count), 0); + + /* Now relocate maps */ + for (i = 1; i < ehdr.e_shnum; i++) { + GElf_Shdr rel_header; + Elf_Scn *scn; + Elf_Data *rel_data; + + TEST(scn = elf_getscn(elf, i), scn); + TESTEQUAL((void *)gelf_getshdr(scn, &rel_header), + &rel_header); + if (rel_header.sh_type != SHT_REL) + continue; + TEST(rel_data = elf_getdata(scn, 0), rel_data); + + if (rel_header.sh_info != prog_index) + continue; + TESTEQUAL(relocate_maps(&rel_header, rel_data, + data_prog, data_symbols, + *map_relocations, *map_count), + 0); + } + + TEST(bpf_prog_type_fuse_fd = open("/sys/fs/fuse/bpf_prog_type_fuse", + O_RDONLY | O_CLOEXEC), + bpf_prog_type_fuse_fd != -1); + TESTGE(read(bpf_prog_type_fuse_fd, buffer, sizeof(buffer)), 1); + TEST(bpf_prog_type_fuse = strtol(buffer, NULL, 10), + bpf_prog_type_fuse != 0); + + bpf_attr = (union bpf_attr) { + .prog_type = bpf_prog_type_fuse, + .insn_cnt = data_prog->d_size / 8, + .insns = ptr_to_u64(data_prog->d_buf), + .license = ptr_to_u64("GPL"), + .log_buf = test_options.verbose ? ptr_to_u64(log) : 0, + .log_size = test_options.verbose ? sizeof(log) : 0, + .log_level = test_options.verbose ? 
2 : 0,
+	};
+	*fd = syscall(__NR_bpf, BPF_PROG_LOAD, &bpf_attr, sizeof(bpf_attr));
+	if (test_options.verbose)
+		ksft_print_msg("%s\n", log);
+	if (*fd == -1 && errno == ENOSPC)
+		ksft_print_msg("bpf log size too small!\n");
+	TESTNE(*fd, -1);
+
+	result = TEST_SUCCESS;
+out:
+	close(filter_fd);
+	close(bpf_prog_type_fuse_fd);
+	return result;
+}
+
+
diff --git a/tools/testing/selftests/filesystems/fuse/fd.txt b/tools/testing/selftests/filesystems/fuse/fd.txt
new file mode 100644
index 000000000000..15ce77180d55
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fuse/fd.txt
@@ -0,0 +1,21 @@
+fuse_daemon $*
+cd fd-dst
+ls
+cd show
+ls
+fsstress -s 123 -d . -p 4 -n 100 -l5
+echo test > wibble
+ls
+cat wibble
+fallocate -l 1000 wobble
+mkdir testdir
+mkdir tmpdir
+rmdir tmpdir
+touch tmp
+mv tmp tmp2
+rm tmp2
+
+# FUSE_LINK
+echo "ln_src contents" > ln_src
+ln ln_src ln_link
+cat ln_link
diff --git a/tools/testing/selftests/filesystems/fuse/fd_bpf.c b/tools/testing/selftests/filesystems/fuse/fd_bpf.c
new file mode 100644
index 000000000000..3cd82d67e759
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fuse/fd_bpf.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2021 Google LLC
+
+#include "test_fuse_bpf.h"
+
+SEC("maps") struct fuse_bpf_map test_map = {
+	BPF_MAP_TYPE_ARRAY,
+	sizeof(uint32_t),
+	sizeof(uint32_t),
+	1000,
+};
+
+SEC("maps") struct fuse_bpf_map test_map2 = {
+	BPF_MAP_TYPE_HASH,
+	sizeof(uint32_t),
+	sizeof(uint64_t),
+	76,
+};
+
+SEC("test_daemon") int trace_daemon(struct fuse_bpf_args *fa)
+{
+	uint64_t uid_gid = bpf_get_current_uid_gid();
+	uint32_t uid = uid_gid & 0xffffffff;
+	uint64_t pid_tgid = bpf_get_current_pid_tgid();
+	uint32_t pid = pid_tgid & 0xffffffff;
+	uint32_t key = 23;
+	uint32_t *pvalue;
+
+	pvalue = bpf_map_lookup_elem(&test_map, &key);
+	if (pvalue) {
+		uint32_t value = *pvalue;
+
+		bpf_printk("pid %u uid %u value %u", pid, uid, value);
+		value++;
+		bpf_map_update_elem(&test_map, &key, &value, BPF_ANY);
+	}
+
+	switch (fa->opcode) {
+	case FUSE_ACCESS | FUSE_PREFILTER: {
+		bpf_printk("Access: %d", fa->nodeid);
+		return FUSE_BPF_BACKING;
+	}
+
+	case FUSE_GETATTR | FUSE_PREFILTER: {
+		const struct fuse_getattr_in *fgi = fa->in_args[0].value;
+
+		bpf_printk("Get Attr %d", fgi->fh);
+		return FUSE_BPF_BACKING;
+	}
+
+	case FUSE_SETATTR | FUSE_PREFILTER: {
+		const struct fuse_setattr_in *fsi = fa->in_args[0].value;
+
+		bpf_printk("Set Attr %d", fsi->fh);
+		return FUSE_BPF_BACKING;
+	}
+
+	case FUSE_OPENDIR | FUSE_PREFILTER: {
+		bpf_printk("Open Dir: %d", fa->nodeid);
+		return FUSE_BPF_BACKING;
+	}
+
+	case FUSE_READDIR | FUSE_PREFILTER: {
+		const struct fuse_read_in *fri = fa->in_args[0].value;
+
+		bpf_printk("Read Dir: fh: %lu, offset: %lu", fri->fh, fri->offset);
+		return FUSE_BPF_BACKING;
+	}
+
+	case FUSE_LOOKUP | FUSE_PREFILTER: {
+		const char *name = fa->in_args[0].value;
+
+		bpf_printk("Lookup: %lx %s", fa->nodeid, name);
+		if (fa->nodeid == 1)
+			return FUSE_BPF_USER_FILTER | FUSE_BPF_BACKING;
+		else
+			return FUSE_BPF_BACKING;
+	}
+
+	case FUSE_MKNOD | FUSE_PREFILTER: {
+		const struct fuse_mknod_in *fmi = fa->in_args[0].value;
+		const char *name = fa->in_args[1].value;
+
+		bpf_printk("mknod %s %x %x", name, fmi->rdev | fmi->mode, fmi->umask);
+		return FUSE_BPF_BACKING;
+	}
+
+	case FUSE_MKDIR | FUSE_PREFILTER: {
+		const struct fuse_mkdir_in *fmi = fa->in_args[0].value;
+		const char *name = fa->in_args[1].value;
+
+		bpf_printk("mkdir: %s %x %x", name, fmi->mode, fmi->umask);
+		return FUSE_BPF_BACKING;
+	}
+
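+	/*
+	 * The handlers below all follow the same pattern: trace the request
+	 * with bpf_printk() and return FUSE_BPF_BACKING so the kernel
+	 * completes the operation against the backing filesystem instead of
+	 * round-tripping through the userspace daemon.
+	 */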
case FUSE_RMDIR | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("rmdir: %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_RENAME | FUSE_PREFILTER: { + const char *oldname = fa->in_args[1].value; + const char *newname = fa->in_args[2].value; + + bpf_printk("rename from %s", oldname); + bpf_printk("rename to %s", newname); + return FUSE_BPF_BACKING; + } + + case FUSE_RENAME2 | FUSE_PREFILTER: { + const struct fuse_rename2_in *fri = fa->in_args[0].value; + uint32_t flags = fri->flags; + const char *oldname = fa->in_args[1].value; + const char *newname = fa->in_args[2].value; + + bpf_printk("rename(%x) from %s", flags, oldname); + bpf_printk("rename to %s", newname); + return FUSE_BPF_BACKING; + } + + case FUSE_UNLINK | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("unlink: %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_LINK | FUSE_PREFILTER: { + const struct fuse_link_in *fli = fa->in_args[0].value; + const char *dst_name = fa->in_args[1].value; + + bpf_printk("Link: %d %s", fli->oldnodeid, dst_name); + return FUSE_BPF_BACKING; + } + + case FUSE_SYMLINK | FUSE_PREFILTER: { + const char *link_name = fa->in_args[0].value; + const char *link_dest = fa->in_args[1].value; + + bpf_printk("symlink from %s", link_name); + bpf_printk("symlink to %s", link_dest); + return FUSE_BPF_BACKING; + } + + case FUSE_READLINK | FUSE_PREFILTER: { + const char *link_name = fa->in_args[0].value; + + bpf_printk("readlink from %s", link_name); + return FUSE_BPF_BACKING; + } + + case FUSE_RELEASE | FUSE_PREFILTER: { + const struct fuse_release_in *fri = fa->in_args[0].value; + + bpf_printk("Release: %d", fri->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_RELEASEDIR | FUSE_PREFILTER: { + const struct fuse_release_in *fri = fa->in_args[0].value; + + bpf_printk("Release Dir: %d", fri->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_CREATE | FUSE_PREFILTER: { + bpf_printk("Create %s", fa->in_args[1].value); + return FUSE_BPF_BACKING; + } + + case FUSE_OPEN | FUSE_PREFILTER: { + bpf_printk("Open: %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + case FUSE_READ | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("Read: fh: %lu, offset %lu, size %lu", + fri->fh, fri->offset, fri->size); + return FUSE_BPF_BACKING; + } + + case FUSE_WRITE | FUSE_PREFILTER: { + const struct fuse_write_in *fwi = fa->in_args[0].value; + + bpf_printk("Write: fh: %lu, offset %lu, size %lu", + fwi->fh, fwi->offset, fwi->size); + return FUSE_BPF_BACKING; + } + + case FUSE_FLUSH | FUSE_PREFILTER: { + const struct fuse_flush_in *ffi = fa->in_args[0].value; + + bpf_printk("Flush %d", ffi->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_FALLOCATE | FUSE_PREFILTER: { + const struct fuse_fallocate_in *ffa = fa->in_args[0].value; + + bpf_printk("Fallocate %d %lu", ffa->fh, ffa->length); + return FUSE_BPF_BACKING; + } + + case FUSE_GETXATTR | FUSE_PREFILTER: { + const char *name = fa->in_args[1].value; + + bpf_printk("Getxattr %d %s", fa->nodeid, name); + return FUSE_BPF_BACKING; + } + + case FUSE_LISTXATTR | FUSE_PREFILTER: { + const char *name = fa->in_args[1].value; + + bpf_printk("Listxattr %d %s", fa->nodeid, name); + return FUSE_BPF_BACKING; + } + + case FUSE_SETXATTR | FUSE_PREFILTER: { + const char *name = fa->in_args[1].value; + + bpf_printk("Setxattr %d %s", fa->nodeid, name); + return FUSE_BPF_BACKING; + } + + case FUSE_STATFS | FUSE_PREFILTER: { + bpf_printk("statfs %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + 
case FUSE_LSEEK | FUSE_PREFILTER: { + const struct fuse_lseek_in *fli = fa->in_args[0].value; + + bpf_printk("lseek type:%d, offset:%lld", fli->whence, fli->offset); + return FUSE_BPF_BACKING; + } + + default: + if (fa->opcode & FUSE_PREFILTER) + bpf_printk("prefilter *** UNKNOWN *** opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else if (fa->opcode & FUSE_POSTFILTER) + bpf_printk("postfilter *** UNKNOWN *** opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else + bpf_printk("*** UNKNOWN *** opcode: %d", fa->opcode); + return FUSE_BPF_BACKING; + } +} diff --git a/tools/testing/selftests/filesystems/fuse/fuse_daemon.c b/tools/testing/selftests/filesystems/fuse/fuse_daemon.c new file mode 100644 index 000000000000..1b6f8c2acf2b --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/fuse_daemon.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + */ + +#include "test_fuse.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +bool user_messages; +bool kernel_messages; + +static int display_trace(void) +{ + int pid = -1; + int tp = -1; + char c; + ssize_t bytes_read; + static char line[256] = {0}; + + if (!kernel_messages) + return TEST_SUCCESS; + + TEST(pid = fork(), pid != -1); + if (pid != 0) + return pid; + + TESTEQUAL(tracing_on(), 0); + TEST(tp = s_open(s_path(tracing_folder(), s("trace_pipe")), + O_RDONLY | O_CLOEXEC), tp != -1); + for (;;) { + TEST(bytes_read = read(tp, &c, sizeof(c)), + bytes_read == 1); + if (c == '\n') { + printf("%s\n", line); + line[0] = 0; + } else + sprintf(line + strlen(line), "%c", c); + } +out: + if (pid == 0) { + close(tp); + exit(TEST_FAILURE); + } + return pid; +} + +static const char *fuse_opcode_to_string(int opcode) +{ + switch (opcode & FUSE_OPCODE_FILTER) { + case FUSE_LOOKUP: + return "FUSE_LOOKUP"; + case FUSE_FORGET: + return "FUSE_FORGET"; + case FUSE_GETATTR: + return "FUSE_GETATTR"; + case FUSE_SETATTR: + return "FUSE_SETATTR"; + case FUSE_READLINK: + return "FUSE_READLINK"; + case FUSE_SYMLINK: + return "FUSE_SYMLINK"; + case FUSE_MKNOD: + return "FUSE_MKNOD"; + case FUSE_MKDIR: + return "FUSE_MKDIR"; + case FUSE_UNLINK: + return "FUSE_UNLINK"; + case FUSE_RMDIR: + return "FUSE_RMDIR"; + case FUSE_RENAME: + return "FUSE_RENAME"; + case FUSE_LINK: + return "FUSE_LINK"; + case FUSE_OPEN: + return "FUSE_OPEN"; + case FUSE_READ: + return "FUSE_READ"; + case FUSE_WRITE: + return "FUSE_WRITE"; + case FUSE_STATFS: + return "FUSE_STATFS"; + case FUSE_RELEASE: + return "FUSE_RELEASE"; + case FUSE_FSYNC: + return "FUSE_FSYNC"; + case FUSE_SETXATTR: + return "FUSE_SETXATTR"; + case FUSE_GETXATTR: + return "FUSE_GETXATTR"; + case FUSE_LISTXATTR: + return "FUSE_LISTXATTR"; + case FUSE_REMOVEXATTR: + return "FUSE_REMOVEXATTR"; + case FUSE_FLUSH: + return "FUSE_FLUSH"; + case FUSE_INIT: + return "FUSE_INIT"; + case FUSE_OPENDIR: + return "FUSE_OPENDIR"; + case FUSE_READDIR: + return "FUSE_READDIR"; + case FUSE_RELEASEDIR: + return "FUSE_RELEASEDIR"; + case FUSE_FSYNCDIR: + return "FUSE_FSYNCDIR"; + case FUSE_GETLK: + return "FUSE_GETLK"; + case FUSE_SETLK: + return "FUSE_SETLK"; + case FUSE_SETLKW: + return "FUSE_SETLKW"; + case FUSE_ACCESS: + return "FUSE_ACCESS"; + case FUSE_CREATE: + return "FUSE_CREATE"; + case FUSE_INTERRUPT: + return "FUSE_INTERRUPT"; + case FUSE_BMAP: + return "FUSE_BMAP"; + case FUSE_DESTROY: + return "FUSE_DESTROY"; + case FUSE_IOCTL: + return "FUSE_IOCTL"; + case FUSE_POLL: + return "FUSE_POLL"; + case 
FUSE_NOTIFY_REPLY: + return "FUSE_NOTIFY_REPLY"; + case FUSE_BATCH_FORGET: + return "FUSE_BATCH_FORGET"; + case FUSE_FALLOCATE: + return "FUSE_FALLOCATE"; + case FUSE_READDIRPLUS: + return "FUSE_READDIRPLUS"; + case FUSE_RENAME2: + return "FUSE_RENAME2"; + case FUSE_LSEEK: + return "FUSE_LSEEK"; + case FUSE_COPY_FILE_RANGE: + return "FUSE_COPY_FILE_RANGE"; + case FUSE_SETUPMAPPING: + return "FUSE_SETUPMAPPING"; + case FUSE_REMOVEMAPPING: + return "FUSE_REMOVEMAPPING"; + //case FUSE_SYNCFS: + // return "FUSE_SYNCFS"; + case CUSE_INIT: + return "CUSE_INIT"; + case CUSE_INIT_BSWAP_RESERVED: + return "CUSE_INIT_BSWAP_RESERVED"; + case FUSE_INIT_BSWAP_RESERVED: + return "FUSE_INIT_BSWAP_RESERVED"; + } + return "?"; +} + +static int parse_options(int argc, char *const *argv) +{ + signed char c; + + while ((c = getopt(argc, argv, "kuv")) != -1) + switch (c) { + case 'v': + test_options.verbose = true; + break; + + case 'u': + user_messages = true; + break; + + case 'k': + kernel_messages = true; + break; + + default: + return -EINVAL; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + int result = TEST_FAILURE; + int trace_pid = -1; + char *mount_dir = NULL; + char *src_dir = NULL; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + struct map_relocation *map_relocations = NULL; + size_t map_count = 0; + int i; + + if (geteuid() != 0) + ksft_print_msg("Not a root, might fail to mount.\n"); + TESTEQUAL(parse_options(argc, argv), 0); + + TEST(trace_pid = display_trace(), trace_pid != -1); + + delete_dir_tree("fd-src", true); + TEST(src_dir = setup_mount_dir("fd-src"), src_dir); + delete_dir_tree("fd-dst", true); + TEST(mount_dir = setup_mount_dir("fd-dst"), mount_dir); + + TESTEQUAL(install_elf_bpf("fd_bpf.bpf", "test_daemon", &bpf_fd, + &map_relocations, &map_count), 0); + + TEST(src_fd = open("fd-src", O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTSYSCALL(mkdirat(src_fd, "show", 0777)); + TESTSYSCALL(mkdirat(src_fd, "hide", 0777)); + + for (i = 0; i < map_count; ++i) + if (!strcmp(map_relocations[i].name, "test_map")) { + uint32_t key = 23; + uint32_t value = 1234; + union bpf_attr attr = { + .map_fd = map_relocations[i].fd, + .key = ptr_to_u64(&key), + .value = ptr_to_u64(&value), + .flags = BPF_ANY, + }; + TESTSYSCALL(syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, + &attr, sizeof(attr))); + } + + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + if (fork()) + return 0; + + for (;;) { + uint8_t bytes_in[FUSE_MIN_READ_BUFFER]; + uint8_t bytes_out[FUSE_MIN_READ_BUFFER] __maybe_unused; + struct fuse_in_header *in_header = + (struct fuse_in_header *)bytes_in; + ssize_t res = read(fuse_dev, bytes_in, sizeof(bytes_in)); + + if (res == -1) + break; + + switch (in_header->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: { + char *name = (char *)(bytes_in + sizeof(*in_header)); + + if (user_messages) + printf("Lookup %s\n", name); + if (!strcmp(name, "hide")) + TESTFUSEOUTERROR(-ENOENT); + else + TESTFUSEOUTREAD(name, strlen(name) + 1); + break; + } + default: + if (user_messages) { + printf("opcode is %d (%s)\n", in_header->opcode, + fuse_opcode_to_string( + in_header->opcode)); + } + break; + } + } + + result = TEST_SUCCESS; + +out: + for (i = 0; i < map_count; ++i) { + free(map_relocations[i].name); + close(map_relocations[i].fd); + } + free(map_relocations); + umount2(mount_dir, MNT_FORCE); + delete_dir_tree(mount_dir, true); + free(mount_dir); + delete_dir_tree(src_dir, true); + free(src_dir); + if (trace_pid != -1) + kill(trace_pid, SIGKILL); + return 
result; +} diff --git a/tools/testing/selftests/filesystems/fuse/fuse_test.c b/tools/testing/selftests/filesystems/fuse/fuse_test.c new file mode 100644 index 000000000000..c23f75be15d5 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/fuse_test.c @@ -0,0 +1,2142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + */ +#define _GNU_SOURCE + +#include "test_fuse.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +static const char *ft_src = "ft-src"; +static const char *ft_dst = "ft-dst"; + +static void fill_buffer(uint8_t *data, size_t len, int file, int block) +{ + int i; + int seed = 7919 * file + block; + + for (i = 0; i < len; i++) { + seed = 1103515245 * seed + 12345; + data[i] = (uint8_t)(seed >> (i % 13)); + } +} + +static bool test_buffer(uint8_t *data, size_t len, int file, int block) +{ + int i; + int seed = 7919 * file + block; + + for (i = 0; i < len; i++) { + seed = 1103515245 * seed + 12345; + if (data[i] != (uint8_t)(seed >> (i % 13))) + return false; + } + + return true; +} + +static int create_file(int dir, struct s name, int index, size_t blocks) +{ + int result = TEST_FAILURE; + int fd = -1; + int i; + uint8_t data[PAGE_SIZE]; + + TEST(fd = s_openat(dir, name, O_CREAT | O_WRONLY, 0777), fd != -1); + for (i = 0; i < blocks; ++i) { + fill_buffer(data, PAGE_SIZE, index, i); + TESTEQUAL(write(fd, data, sizeof(data)), PAGE_SIZE); + } + TESTSYSCALL(close(fd)); + result = TEST_SUCCESS; + +out: + close(fd); + return result; +} + +static int bpf_clear_trace(void) +{ + int result = TEST_FAILURE; + int tp = -1; + + TEST(tp = s_open(s_path(tracing_folder(), s("trace")), + O_WRONLY | O_TRUNC | O_CLOEXEC), tp != -1); + + result = TEST_SUCCESS; +out: + close(tp); + return result; +} + +static int bpf_test_trace_maybe(const char *substr, bool present) +{ + int result = TEST_FAILURE; + int tp = -1; + char trace_buffer[4096] = {}; + ssize_t bytes_read; + + TEST(tp = s_open(s_path(tracing_folder(), s("trace_pipe")), + O_RDONLY | O_CLOEXEC), + tp != -1); + fcntl(tp, F_SETFL, O_NONBLOCK); + + for (;;) { + bytes_read = read(tp, trace_buffer, sizeof(trace_buffer)); + if (present) + TESTCOND(bytes_read > 0); + else if (bytes_read <= 0) { + result = TEST_SUCCESS; + break; + } + + if (test_options.verbose) + ksft_print_msg("%s\n", trace_buffer); + + if (strstr(trace_buffer, substr)) { + if (present) + result = TEST_SUCCESS; + break; + } + } +out: + close(tp); + return result; +} + +static int bpf_test_trace(const char *substr) +{ + return bpf_test_trace_maybe(substr, true); +} + +static int bpf_test_no_trace(const char *substr) +{ + return bpf_test_trace_maybe(substr, false); +} + +static int basic_test(const char *mount_dir) +{ + const char *test_name = "test"; + const char *test_data = "data"; + + int result = TEST_FAILURE; + int fuse_dev = -1; + char *filename = NULL; + int fd = -1; + FUSE_DECLARE_DAEMON; + + TESTEQUAL(mount_fuse(mount_dir, -1, -1, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + char data[256]; + + filename = concat_file_name(mount_dir, test_name); + TESTERR(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(read(fd, data, strlen(test_data)), strlen(test_data)); + TESTCOND(!strcmp(data, test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + } else { + DECL_FUSE_IN(open); + DECL_FUSE_IN(read); + DECL_FUSE_IN(flush); + DECL_FUSE_IN(release); + + TESTFUSELOOKUP(test_name, 0); + TESTFUSEOUT1(fuse_entry_out, ((struct 
fuse_entry_out) { + .nodeid = 2, + .generation = 1, + .attr.ino = 100, + .attr.size = 4, + .attr.blksize = 512, + .attr.mode = S_IFREG | 0777, + })); + + TESTFUSEIN(FUSE_OPEN, open_in); + TESTFUSEOUT1(fuse_open_out, ((struct fuse_open_out) { + .fh = 1, + .open_flags = open_in->flags, + })); + + //TESTFUSEINNULL(FUSE_CANONICAL_PATH); + //TESTFUSEOUTREAD("ignored", 7); + + TESTFUSEIN(FUSE_READ, read_in); + TESTFUSEOUTREAD(test_data, strlen(test_data)); + + TESTFUSEIN(FUSE_FLUSH, flush_in); + TESTFUSEOUTEMPTY(); + + TESTFUSEIN(FUSE_RELEASE, release_in); + TESTFUSEOUTEMPTY(); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(fd); + free(filename); + umount(mount_dir); + return result; +} + +static int bpf_test_real(const char *mount_dir) +{ + const char *test_name = "real"; + const char *test_data = "Weebles wobble but they don't fall down"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + char *filename = NULL; + int fd = -1; + char read_buffer[256] = {}; + ssize_t bytes_read; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TEST(fd = openat(src_fd, test_name, O_CREAT | O_RDWR | O_CLOEXEC, 0777), + fd != -1); + TESTEQUAL(write(fd, test_data, strlen(test_data)), strlen(test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + filename = concat_file_name(mount_dir, test_name); + TESTERR(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + bytes_read = read(fd, read_buffer, strlen(test_data)); + TESTEQUAL(bytes_read, strlen(test_data)); + TESTEQUAL(strcmp(test_data, read_buffer), 0); + TESTEQUAL(bpf_test_trace("read"), 0); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + close(fd); + free(filename); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + + +static int bpf_test_partial(const char *mount_dir) +{ + const char *test_name = "partial"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + char *filename = NULL; + int fd = -1; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(create_file(src_fd, s(test_name), 1, 2), 0); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + uint8_t data[PAGE_SIZE]; + + TEST(filename = concat_file_name(mount_dir, test_name), + filename); + TESTERR(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(read(fd, data, PAGE_SIZE), PAGE_SIZE); + TESTEQUAL(bpf_test_trace("read"), 0); + TESTCOND(test_buffer(data, PAGE_SIZE, 2, 0)); + TESTCOND(!test_buffer(data, PAGE_SIZE, 1, 0)); + TESTEQUAL(read(fd, data, PAGE_SIZE), PAGE_SIZE); + TESTCOND(test_buffer(data, PAGE_SIZE, 1, 1)); + TESTCOND(!test_buffer(data, PAGE_SIZE, 2, 1)); + TESTSYSCALL(close(fd)); + fd = -1; + } else { + DECL_FUSE(open); + DECL_FUSE(read); + DECL_FUSE(release); + uint8_t data[PAGE_SIZE]; + + TESTFUSEIN2(FUSE_OPEN | FUSE_POSTFILTER, open_in, open_out); + TESTFUSEOUT1(fuse_open_out, ((struct fuse_open_out) { + .fh = 1, + .open_flags = open_in->flags, + })); + + TESTFUSEIN(FUSE_READ, read_in); + fill_buffer(data, PAGE_SIZE, 2, 0); + TESTFUSEOUTREAD(data, PAGE_SIZE); + + TESTFUSEIN(FUSE_RELEASE, release_in); + TESTFUSEOUTEMPTY(); + exit(TEST_SUCCESS); + } + 
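+	/*
+	 * The daemon branch above scripts the full reply sequence for one
+	 * open/read cycle: LOOKUP -> entry, OPEN -> file handle, READ ->
+	 * data, then empty replies to FLUSH and RELEASE.
+	 */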
FUSE_END_DAEMON(); + close(fuse_dev); + close(fd); + free(filename); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static int bpf_test_attrs(const char *mount_dir) +{ + const char *test_name = "partial"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + char *filename = NULL; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(create_file(src_fd, s(test_name), 1, 2), 0); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(filename = concat_file_name(mount_dir, test_name), filename); + TESTSYSCALL(stat(filename, &st)); + TESTSYSCALL(chmod(filename, 0111)); + TESTSYSCALL(stat(filename, &st)); + TESTEQUAL(st.st_mode & 0777, 0111); + TESTSYSCALL(chmod(filename, 0777)); + TESTSYSCALL(stat(filename, &st)); + TESTEQUAL(st.st_mode & 0777, 0777); + TESTSYSCALL(chown(filename, 5, 6)); + TESTSYSCALL(stat(filename, &st)); + TESTEQUAL(st.st_uid, 5); + TESTEQUAL(st.st_gid, 6); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + free(filename); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static int bpf_test_readdir(const char *mount_dir) +{ + static const char * const names[] = { + "real", "partial", "fake", ".", ".." + }; + bool used[ARRAY_SIZE(names)] = { false }; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + DIR *dir = NULL; + struct dirent *dirent; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(create_file(src_fd, s(names[0]), 1, 2), 0); + TESTEQUAL(create_file(src_fd, s(names[1]), 1, 2), 0); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + int i, j; + + TEST(dir = s_opendir(s(mount_dir)), dir); + TESTEQUAL(bpf_test_trace("opendir"), 0); + + for (i = 0; i < ARRAY_SIZE(names); ++i) { + TEST(dirent = readdir(dir), dirent); + + for (j = 0; j < ARRAY_SIZE(names); ++j) + if (!used[j] && + strcmp(names[j], dirent->d_name) == 0) { + used[j] = true; + break; + } + TESTNE(j, ARRAY_SIZE(names)); + } + TEST(dirent = readdir(dir), dirent == NULL); + TESTSYSCALL(closedir(dir)); + dir = NULL; + TESTEQUAL(bpf_test_trace("readdir"), 0); + } else { + struct fuse_in_header *in_header = + (struct fuse_in_header *)bytes_in; + ssize_t res = read(fuse_dev, bytes_in, sizeof(bytes_in)); + struct fuse_read_out *read_out = + (struct fuse_read_out *) (bytes_in + + sizeof(*in_header) + + sizeof(struct fuse_read_in)); + struct fuse_dirent *fuse_dirent = + (struct fuse_dirent *) (bytes_in + res); + + TESTGE(res, sizeof(*in_header) + sizeof(struct fuse_read_in)); + TESTEQUAL(in_header->opcode, FUSE_READDIR | FUSE_POSTFILTER); + *fuse_dirent = (struct fuse_dirent) { + .ino = 100, + .off = 5, + .namelen = strlen("fake"), + .type = DT_REG, + }; + strcpy((char *)(bytes_in + res + sizeof(*fuse_dirent)), "fake"); + res += FUSE_DIRENT_ALIGN(sizeof(*fuse_dirent) + strlen("fake") + + 1); + TESTFUSEDIROUTREAD(read_out, + bytes_in + + sizeof(struct fuse_in_header) + + sizeof(struct fuse_read_in) + + sizeof(struct fuse_read_out), + res - sizeof(struct fuse_in_header) - + sizeof(struct fuse_read_in) - + sizeof(struct fuse_read_out)); + res = read(fuse_dev, bytes_in, sizeof(bytes_in)); + TESTEQUAL(res, 
sizeof(*in_header) + + sizeof(struct fuse_read_in) + + sizeof(struct fuse_read_out)); + TESTEQUAL(in_header->opcode, FUSE_READDIR | FUSE_POSTFILTER); + TESTFUSEDIROUTREAD(read_out, bytes_in, 0); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + closedir(dir); + close(fuse_dev); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static int bpf_test_redact_readdir(const char *mount_dir) +{ + static const char * const names[] = { + "f1", "f2", "f3", "f4", "f5", "f6", ".", ".." + }; + bool used[ARRAY_SIZE(names)] = { false }; + int num_shown = (ARRAY_SIZE(names) - 2) / 2 + 2; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + DIR *dir = NULL; + struct dirent *dirent; + int i; + int count = 0; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + for (i = 0; i < ARRAY_SIZE(names) - 2; i++) + TESTEQUAL(create_file(src_fd, s(names[i]), 1, 2), 0); + + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_readdir_redact", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + int j; + + TEST(dir = s_opendir(s(mount_dir)), dir); + while ((dirent = readdir(dir))) { + errno = 0; + TESTEQUAL(errno, 0); + + for (j = 0; j < ARRAY_SIZE(names); ++j) + if (!used[j] && + strcmp(names[j], dirent->d_name) == 0) { + used[j] = true; + count++; + break; + } + TESTNE(j, ARRAY_SIZE(names)); + TESTGE(num_shown, count); + } + TESTEQUAL(count, num_shown); + TESTSYSCALL(closedir(dir)); + dir = NULL; + } else { + bool skip = true; + + for (int i = 0; i < ARRAY_SIZE(names) + 1; i++) { + uint8_t bytes_in[FUSE_MIN_READ_BUFFER]; + uint8_t bytes_out[FUSE_MIN_READ_BUFFER]; + struct fuse_in_header *in_header = + (struct fuse_in_header *)bytes_in; + ssize_t res = read(fuse_dev, bytes_in, sizeof(bytes_in)); + int length_out = 0; + uint8_t *pos; + uint8_t *dirs_in; + uint8_t *dirs_out; + struct fuse_read_in *fuse_read_in; + struct fuse_read_out *fuse_read_out_in; + struct fuse_read_out *fuse_read_out_out; + struct fuse_dirent *fuse_dirent_in = NULL; + struct fuse_dirent *next = NULL; + bool again = false; + int dir_ent_len = 0; + + TESTGE(res, sizeof(struct fuse_in_header) + + sizeof(struct fuse_read_in) + + sizeof(struct fuse_read_out)); + + pos = bytes_in + sizeof(struct fuse_in_header); + fuse_read_in = (struct fuse_read_in *) pos; + pos += sizeof(*fuse_read_in); + fuse_read_out_in = (struct fuse_read_out *) pos; + pos += sizeof(*fuse_read_out_in); + dirs_in = pos; + + pos = bytes_out + sizeof(struct fuse_out_header); + fuse_read_out_out = (struct fuse_read_out *) pos; + pos += sizeof(*fuse_read_out_out); + dirs_out = pos; + + if (dirs_in < bytes_in + res) { + bool is_dot; + + fuse_dirent_in = (struct fuse_dirent *) dirs_in; + is_dot = (fuse_dirent_in->namelen == 1 && + !strncmp(fuse_dirent_in->name, ".", 1)) || + (fuse_dirent_in->namelen == 2 && + !strncmp(fuse_dirent_in->name, "..", 2)); + + dir_ent_len = FUSE_DIRENT_ALIGN( + sizeof(*fuse_dirent_in) + + fuse_dirent_in->namelen); + + if (dirs_in + dir_ent_len < bytes_in + res) + next = (struct fuse_dirent *) + (dirs_in + dir_ent_len); + + if (!skip || is_dot) { + memcpy(dirs_out, fuse_dirent_in, + sizeof(struct fuse_dirent) + + fuse_dirent_in->namelen); + length_out += dir_ent_len; + } + again = ((skip && !is_dot) && next); + + if (!is_dot) + skip = !skip; + } + + fuse_read_out_out->offset = next ? 
next->off : + fuse_read_out_in->offset; + fuse_read_out_out->again = again; + + { + struct fuse_out_header *out_header = + (struct fuse_out_header *)bytes_out; + + *out_header = (struct fuse_out_header) { + .len = sizeof(*out_header) + + sizeof(*fuse_read_out_out) + length_out, + .unique = in_header->unique, + }; + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), + out_header->len); + } + } + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + closedir(dir); + close(fuse_dev); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +/* + * This test is more to show what classic fuse does with a creat in a subdir + * than a test of any new functionality + */ +static int bpf_test_creat(const char *mount_dir) +{ + const char *dir_name = "show"; + const char *file_name = "file"; + int result = TEST_FAILURE; + int fuse_dev = -1; + int fd = -1; + FUSE_DECLARE_DAEMON; + + TESTEQUAL(mount_fuse(mount_dir, -1, -1, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + TEST(fd = s_creat(s_path(s_path(s(mount_dir), s(dir_name)), + s(file_name)), + 0777), + fd != -1); + TESTSYSCALL(close(fd)); + } else { + DECL_FUSE_IN(create); + DECL_FUSE_IN(release); + DECL_FUSE_IN(flush); + + TESTFUSELOOKUP(dir_name, 0); + TESTFUSEOUT1(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = 3, + .generation = 1, + .attr.ino = 100, + .attr.size = 4, + .attr.blksize = 512, + .attr.mode = S_IFDIR | 0777, + })); + + TESTFUSELOOKUP(file_name, 0); + TESTFUSEOUTERROR(-ENOENT); + + TESTFUSEINEXT(FUSE_CREATE, create_in, strlen(file_name) + 1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = 2, + .generation = 1, + .attr.ino = 200, + .attr.size = 4, + .attr.blksize = 512, + .attr.mode = S_IFREG, + }), + fuse_open_out, ((struct fuse_open_out) { + .fh = 1, + .open_flags = create_in->flags, + })); + + //TESTFUSEINNULL(FUSE_CANONICAL_PATH); + //TESTFUSEOUTREAD("ignored", 7); + + TESTFUSEIN(FUSE_FLUSH, flush_in); + TESTFUSEOUTEMPTY(); + + TESTFUSEIN(FUSE_RELEASE, release_in); + TESTFUSEOUTEMPTY(); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + umount(mount_dir); + return result; +} + +static int bpf_test_hidden_entries(const char *mount_dir) +{ + static const char * const dir_names[] = { + "show", + "hide", + }; + const char *file_name = "file"; + const char *data = "The quick brown fox jumps over the lazy dog\n"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + int fd = -1; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTSYSCALL(mkdirat(src_fd, dir_names[0], 0777)); + TESTSYSCALL(mkdirat(src_fd, dir_names[1], 0777)); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_hidden", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_creat(s_path(s_path(s(mount_dir), s(dir_names[0])), + s(file_name)), + 0777), + fd != -1); + TESTSYSCALL(fallocate(fd, 0, 0, 4096)); + TEST(write(fd, data, strlen(data)), strlen(data)); + TESTSYSCALL(close(fd)); + TESTEQUAL(bpf_test_trace("Create"), 0); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_dir(const char *mount_dir) +{ + const char *dir_name = "dir"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", 
"test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(dir_name)), 0777)); + TESTEQUAL(bpf_test_trace("mkdir"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(dir_name)), &st)); + TESTSYSCALL(s_rmdir(s_path(s(mount_dir), s(dir_name)))); + TESTEQUAL(s_stat(s_path(s(ft_src), s(dir_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_file(const char *mount_dir, bool close_first) +{ + const char *file_name = "real"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + int fd = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_creat(s_path(s(mount_dir), s(file_name)), + 0777), + fd != -1); + TESTEQUAL(bpf_test_trace("Create"), 0); + if (close_first) { + TESTSYSCALL(close(fd)); + fd = -1; + } + TESTSYSCALL(s_stat(s_path(s(ft_src), s(file_name)), &st)); + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(file_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + TESTEQUAL(s_stat(s_path(s(ft_src), s(file_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + if (!close_first) { + TESTSYSCALL(close(fd)); + fd = -1; + } + result = TEST_SUCCESS; +out: + close(fd); + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_file_early_close(const char *mount_dir) +{ + return bpf_test_file(mount_dir, true); +} + +static int bpf_test_file_late_close(const char *mount_dir) +{ + return bpf_test_file(mount_dir, false); +} + +static int bpf_test_alter_errcode_bpf(const char *mount_dir) +{ + const char *dir_name = "dir"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_error", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(dir_name)), 0777)); + //TESTEQUAL(bpf_test_trace("mkdir"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(dir_name)), &st)); + TESTEQUAL(s_mkdir(s_path(s(mount_dir), s(dir_name)), 0777), -EPERM); + TESTSYSCALL(s_rmdir(s_path(s(mount_dir), s(dir_name)))); + TESTEQUAL(s_stat(s_path(s(ft_src), s(dir_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_alter_errcode_userspace(const char *mount_dir) +{ + const char *dir_name = "doesnotexist"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_error", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + TESTEQUAL(s_unlink(s_path(s(mount_dir), s(dir_name))), + -1); + TESTEQUAL(errno, ENOMEM); + } else { + TESTFUSELOOKUP("doesnotexist", FUSE_POSTFILTER); + TESTFUSEOUTERROR(-ENOMEM); + exit(TEST_SUCCESS); + } + 
FUSE_END_DAEMON(); + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_mknod(const char *mount_dir) +{ + const char *file_name = "real"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkfifo(s_path(s(mount_dir), s(file_name)), 0777)); + TESTEQUAL(bpf_test_trace("mknod"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(file_name)), &st)); + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(file_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + TESTEQUAL(s_stat(s_path(s(ft_src), s(file_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_largedir(const char *mount_dir) +{ + const char *show = "show"; + const int files = 1000; + + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct map_relocation *map_relocations = NULL; + size_t map_count = 0; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("fd_bpf.bpf", "test_daemon", + &bpf_fd, &map_relocations, &map_count), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + int i; + int fd; + DIR *dir = NULL; + struct dirent *dirent; + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(show)), 0777)); + for (i = 0; i < files; ++i) { + char filename[NAME_MAX]; + + sprintf(filename, "%d", i); + TEST(fd = s_creat(s_path(s_path(s(mount_dir), s(show)), + s(filename)), 0777), fd != -1); + TESTSYSCALL(close(fd)); + } + + TEST(dir = s_opendir(s_path(s(mount_dir), s(show))), dir); + for (dirent = readdir(dir); dirent; dirent = readdir(dir)) + ; + closedir(dir); + } else { + int i; + + for (i = 0; i < files + 2; ++i) { + TESTFUSELOOKUP(show, FUSE_PREFILTER); + TESTFUSEOUTREAD(show, 5); + } + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_link(const char *mount_dir) +{ + const char *file_name = "real"; + const char *link_name = "partial"; + int result = TEST_FAILURE; + int fd = -1; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", &bpf_fd, NULL, + NULL), + 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_creat(s_path(s(mount_dir), s(file_name)), 0777), fd != -1); + TESTEQUAL(bpf_test_trace("Create"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(file_name)), &st)); + + TESTSYSCALL(s_link(s_path(s(mount_dir), s(file_name)), + s_path(s(mount_dir), s(link_name)))); + + TESTEQUAL(bpf_test_trace("link"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(link_name)), &st)); + + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(link_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + TESTEQUAL(s_stat(s_path(s(ft_src), s(link_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(file_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + 
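+	/* With its last link gone, the file must vanish from ft-src too. */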
TESTEQUAL(s_stat(s_path(s(ft_src), s(file_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + + result = TEST_SUCCESS; +out: + close(fd); + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_symlink(const char *mount_dir) +{ + const char *test_name = "real"; + const char *symlink_name = "partial"; + const char *test_data = "Weebles wobble but they don't fall down"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + int fd = -1; + char read_buffer[256] = {}; + ssize_t bytes_read; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TEST(fd = openat(src_fd, test_name, O_CREAT | O_RDWR | O_CLOEXEC, 0777), + fd != -1); + TESTEQUAL(write(fd, test_data, strlen(test_data)), strlen(test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_symlink(s_path(s(mount_dir), s(test_name)), + s_path(s(mount_dir), s(symlink_name)))); + TESTEQUAL(bpf_test_trace("symlink"), 0); + + TESTERR(fd = s_open(s_path(s(mount_dir), s(symlink_name)), O_RDONLY | O_CLOEXEC), fd != -1); + bytes_read = read(fd, read_buffer, strlen(test_data)); + TESTEQUAL(bpf_test_trace("readlink"), 0); + TESTEQUAL(bytes_read, strlen(test_data)); + TESTEQUAL(strcmp(test_data, read_buffer), 0); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + close(fd); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static int bpf_test_xattr(const char *mount_dir) +{ + static const char file_name[] = "real"; + static const char xattr_name[] = "user.xattr_test_name"; + static const char xattr_value[] = "this_is_a_test"; + const size_t xattr_size = sizeof(xattr_value); + char xattr_value_ret[256]; + ssize_t xattr_size_ret; + int result = TEST_FAILURE; + int fd = -1; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + memset(xattr_value_ret, '\0', sizeof(xattr_value_ret)); + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", &bpf_fd, NULL, + NULL), + 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_creat(s_path(s(mount_dir), s(file_name)), 0777), fd != -1); + TESTEQUAL(bpf_test_trace("Create"), 0); + TESTSYSCALL(close(fd)); + + TESTSYSCALL(s_stat(s_path(s(ft_src), s(file_name)), &st)); + TEST(result = s_getxattr(s_path(s(mount_dir), s(file_name)), xattr_name, + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret), + result == -1); + TESTEQUAL(errno, ENODATA); + TESTEQUAL(bpf_test_trace("getxattr"), 0); + + TESTSYSCALL(s_listxattr(s_path(s(mount_dir), s(file_name)), + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret)); + TESTEQUAL(bpf_test_trace("listxattr"), 0); + TESTEQUAL(xattr_size_ret, 0); + + TESTSYSCALL(s_setxattr(s_path(s(mount_dir), s(file_name)), xattr_name, + xattr_value, xattr_size, 0)); + TESTEQUAL(bpf_test_trace("setxattr"), 0); + + TESTSYSCALL(s_listxattr(s_path(s(mount_dir), s(file_name)), + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret)); + TESTEQUAL(bpf_test_trace("listxattr"), 0); + TESTEQUAL(xattr_size_ret, sizeof(xattr_name)); + TESTEQUAL(strcmp(xattr_name, xattr_value_ret), 0); + + TESTSYSCALL(s_getxattr(s_path(s(mount_dir), s(file_name)), xattr_name, + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret)); + 
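+	/* This time the attribute exists, so value and size must round-trip. */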
TESTEQUAL(bpf_test_trace("getxattr"), 0); + TESTEQUAL(xattr_size, xattr_size_ret); + TESTEQUAL(strcmp(xattr_value, xattr_value_ret), 0); + + TESTSYSCALL(s_removexattr(s_path(s(mount_dir), s(file_name)), xattr_name)); + TESTEQUAL(bpf_test_trace("removexattr"), 0); + + TESTEQUAL(s_getxattr(s_path(s(mount_dir), s(file_name)), xattr_name, + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret), -1); + TESTEQUAL(errno, ENODATA); + + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(file_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + TESTEQUAL(s_stat(s_path(s(ft_src), s(file_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_set_backing(const char *mount_dir) +{ + const char *backing_name = "backing"; + const char *test_data = "data"; + const char *test_name = "test"; + + int result = TEST_FAILURE; + int fuse_dev = -1; + int fd = -1; + FUSE_DECLARE_DAEMON; + + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, -1, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + char data[256] = {0}; + + TESTERR(fd = s_open(s_path(s(mount_dir), s(test_name)), + O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(read(fd, data, strlen(test_data)), strlen(test_data)); + TESTCOND(!strcmp(data, test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + TESTSYSCALL(umount(mount_dir)); + } else { + int bpf_fd = -1; + int backing_fd = -1; + + TESTERR(backing_fd = s_creat(s_path(s(ft_src), s(backing_name)), 0777), + backing_fd != -1); + TESTEQUAL(write(backing_fd, test_data, strlen(test_data)), + strlen(test_data)); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_simple", + &bpf_fd, NULL, NULL), 0); + + TESTFUSEINIT(); + TESTFUSELOOKUP(test_name, 0); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) {0}), + fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + .bpf_action = FUSE_ACTION_REPLACE, + .bpf_fd = bpf_fd, + })); + read(fuse_dev, bytes_in, sizeof(bytes_in)); + TESTSYSCALL(close(bpf_fd)); + TESTSYSCALL(close(backing_fd)); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(fd); + umount(mount_dir); + return result; +} + +static int bpf_test_remove_backing(const char *mount_dir) +{ + const char *folder1 = "folder1"; + const char *folder2 = "folder2"; + const char *file = "file1"; + const char *contents1 = "contents1"; + const char *contents2 = "contents2"; + + int result = TEST_FAILURE; + int fuse_dev = -1; + int fd = -1; + int src_fd = -1; + int bpf_fd = -1; + char data[256] = {0}; + FUSE_DECLARE_DAEMON; + + /* + * Create folder1/file + * folder2/file + * + * test will install bpf into mount. + * bpf will postfilter root lookup to daemon. + * daemon will remove bpf and redirect opens on folder1 to folder2. + * test will open folder1/file which will be redirected to folder2. + * test will check no traces for file, and contents are folder2/file. 
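+ * The trace buffer is cleared first so the bpf_test_no_trace("file")
+ * check below only sees events from this test.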
+ */ + TESTEQUAL(bpf_clear_trace(), 0); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder1)), 0777)); + TEST(fd = s_creat(s_pathn(3, s(ft_src), s(folder1), s(file)), 0777), + fd != -1); + TESTEQUAL(write(fd, contents1, strlen(contents1)), strlen(contents1)); + TESTSYSCALL(close(fd)); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder2)), 0777)); + TEST(fd = s_creat(s_pathn(3, s(ft_src), s(folder2), s(file)), 0777), + fd != -1); + TESTEQUAL(write(fd, contents2, strlen(contents2)), strlen(contents2)); + TESTSYSCALL(close(fd)); + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_passthrough", &bpf_fd, + NULL, NULL), 0); + TESTEQUAL(mount_fuse_no_init(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + TESTERR(fd = s_open(s_pathn(3, s(mount_dir), s(folder1), + s(file)), + O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(read(fd, data, sizeof(data)), strlen(contents2)); + TESTCOND(!strcmp(data, contents2)); + TESTEQUAL(bpf_test_no_trace("file"), 0); + TESTSYSCALL(close(fd)); + fd = -1; + TESTSYSCALL(umount(mount_dir)); + } else { + struct { + char name[8]; + struct fuse_entry_out feo; + struct fuse_entry_bpf_out febo; + } __packed in; + int backing_fd = -1; + + TESTFUSEINIT(); + TESTFUSEIN(FUSE_LOOKUP | FUSE_POSTFILTER, &in); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder2)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) {0}), + fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .bpf_action = FUSE_ACTION_REMOVE, + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + + while (read(fuse_dev, bytes_in, sizeof(bytes_in)) != -1) + ; + TESTSYSCALL(close(backing_fd)); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(fd); + close(src_fd); + close(bpf_fd); + umount(mount_dir); + return result; +} + +static int bpf_test_dir_rename(const char *mount_dir) +{ + const char *dir_name = "dir"; + const char *dir_name2 = "dir2"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(dir_name)), 0777)); + TESTEQUAL(bpf_test_trace("mkdir"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(dir_name)), &st)); + TESTSYSCALL(s_rename(s_path(s(mount_dir), s(dir_name)), + s_path(s(mount_dir), s(dir_name2)))); + TESTEQUAL(s_stat(s_path(s(ft_src), s(dir_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(dir_name2)), &st)); + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_file_rename(const char *mount_dir) +{ + const char *dir = "dir"; + const char *file1 = "file1"; + const char *file2 = "file2"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + int fd = -1; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(dir)), 0777)); + TEST(fd = s_creat(s_pathn(3, 
s(mount_dir), s(dir), s(file1)), 0777), + fd != -1); + TESTSYSCALL(s_rename(s_pathn(3, s(mount_dir), s(dir), s(file1)), + s_pathn(3, s(mount_dir), s(dir), s(file2)))); + result = TEST_SUCCESS; +out: + close(fd); + umount(mount_dir); + close(fuse_dev); + close(bpf_fd); + close(src_fd); + return result; +} + +static int mmap_test(const char *mount_dir) +{ + const char *file = "file"; + int result = TEST_FAILURE; + int src_fd = -1; + int fuse_dev = -1; + int fd = -1; + char *addr = NULL; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse(mount_dir, -1, src_fd, &fuse_dev), 0); + TEST(fd = s_open(s_path(s(mount_dir), s(file)), + O_CREAT | O_RDWR | O_CLOEXEC, 0777), + fd != -1); + TESTSYSCALL(fallocate(fd, 0, 4096, SEEK_CUR)); + TEST(addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0), + addr != (void *) -1); + memset(addr, 'a', 4096); + + result = TEST_SUCCESS; +out: + munmap(addr, 4096); + close(fd); + umount(mount_dir); + close(fuse_dev); + close(src_fd); + return result; +} + +static int readdir_perms_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + struct __user_cap_header_struct uchs = { _LINUX_CAPABILITY_VERSION_3 }; + struct __user_cap_data_struct ucds[2]; + int src_fd = -1; + int fuse_dev = -1; + DIR *dir = NULL; + + /* Must remove capabilities for this test. */ + TESTSYSCALL(syscall(SYS_capget, &uchs, ucds)); + ucds[0].effective &= ~(1 << CAP_DAC_OVERRIDE | 1 << CAP_DAC_READ_SEARCH); + TESTSYSCALL(syscall(SYS_capset, &uchs, ucds)); + + /* This is what we are testing in fuseland. First test without fuse, */ + TESTSYSCALL(mkdir("test", 0111)); + TEST(dir = opendir("test"), dir == NULL); + closedir(dir); + dir = NULL; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse(mount_dir, -1, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s("test")), 0111)); + TEST(dir = s_opendir(s_path(s(mount_dir), s("test"))), dir == NULL); + + result = TEST_SUCCESS; +out: + ucds[0].effective |= 1 << CAP_DAC_OVERRIDE | 1 << CAP_DAC_READ_SEARCH; + syscall(SYS_capset, &uchs, ucds); + + closedir(dir); + s_rmdir(s_path(s(mount_dir), s("test"))); + umount(mount_dir); + close(fuse_dev); + close(src_fd); + rmdir("test"); + return result; +} + +static int inotify_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + int src_fd = -1; + int fuse_dev = -1; + struct s dir; + int inotify_fd = -1; + int watch; + int fd = -1; + char buffer[sizeof(struct inotify_event) + NAME_MAX + 1]; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse(mount_dir, -1, src_fd, &fuse_dev), 0); + + TEST(inotify_fd = inotify_init1(IN_CLOEXEC), inotify_fd != -1); + dir = s_path(s(mount_dir), s("dir")); + TESTSYSCALL(mkdir(dir.s, 0777)); + TEST(watch = inotify_add_watch(inotify_fd, dir.s, IN_CREATE), watch); + TEST(fd = s_creat(s_path(s(ft_src), s("dir/file")), 0777), fd != -1); + // buffer will be two struct lengths, as "file" gets rounded up to the + // next multiple of struct inotify_event + TESTEQUAL(read(inotify_fd, &buffer, sizeof(buffer)), + sizeof(struct inotify_event) * 2); + + result = TEST_SUCCESS; +out: + close(fd); + s_unlink(s_path(s(ft_src), s("dir/file"))); + close(inotify_fd); + rmdir(dir.s); + free(dir.s); + umount(mount_dir); + close(fuse_dev); + close(src_fd); + return result; +} + +static int bpf_test_statfs(const char *mount_dir) +{ + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int 
fuse_dev = -1; + int fd = -1; + struct statfs st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_statfs(s(mount_dir), &st)); + TESTEQUAL(bpf_test_trace("statfs"), 0); + TESTEQUAL(st.f_type, 0x65735546); + result = TEST_SUCCESS; +out: + close(fd); + umount(mount_dir); + close(fuse_dev); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_lseek(const char *mount_dir) +{ + const char *file = "real"; + const char *test_data = "data"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + int fd = -1; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TEST(fd = openat(src_fd, file, O_CREAT | O_RDWR | O_CLOEXEC, 0777), + fd != -1); + TESTEQUAL(write(fd, test_data, strlen(test_data)), strlen(test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_open(s_path(s(mount_dir), s(file)), O_RDONLY | O_CLOEXEC), + fd != -1); + TESTEQUAL(lseek(fd, 3, SEEK_SET), 3); + TESTEQUAL(bpf_test_trace("lseek"), 0); + TESTEQUAL(lseek(fd, 5, SEEK_END), 9); + TESTEQUAL(bpf_test_trace("lseek"), 0); + TESTEQUAL(lseek(fd, 1, SEEK_CUR), 10); + TESTEQUAL(bpf_test_trace("lseek"), 0); + TESTEQUAL(lseek(fd, 1, SEEK_DATA), 1); + TESTEQUAL(bpf_test_trace("lseek"), 0); + result = TEST_SUCCESS; +out: + close(fd); + umount(mount_dir); + close(fuse_dev); + close(bpf_fd); + close(src_fd); + return result; +} + +/* + * State: + * Original: dst/folder1/content.txt + * ^ + * | + * | + * Backing: src/folder1/content.txt + * + * Step 1: open(folder1) - set backing to src/folder1 + * Check 1: cat(content.txt) - check not receiving call on the fuse daemon + * and content is the same + * Step 2: readdirplus(dst) + * Check 2: cat(content.txt) - check not receiving call on the fuse daemon + * and content is the same + */ +static int bpf_test_readdirplus_not_overriding_backing(const char *mount_dir) +{ + const char *folder1 = "folder1"; + const char *content_file = "content.txt"; + const char *content = "hello world"; + + int result = TEST_FAILURE; + int fuse_dev = -1; + int src_fd = -1; + int content_fd = -1; + FUSE_DECLARE_DAEMON; + + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder1)), 0777)); + TEST(content_fd = s_creat(s_pathn(3, s(ft_src), s(folder1), s(content_file)), 0777), + content_fd != -1); + TESTEQUAL(write(content_fd, content, strlen(content)), strlen(content)); + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, -1, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + DIR *open_mount_dir = NULL; + struct dirent *mount_dirent; + int dst_folder1_fd = -1; + int dst_content_fd = -1; + char content_buffer[12]; + + // Step 1: Lookup folder1 + TESTERR(dst_folder1_fd = s_open(s_path(s(mount_dir), s(folder1)), + O_RDONLY | O_CLOEXEC), dst_folder1_fd != -1); + + // Check 1: Read content file (backed) + TESTERR(dst_content_fd = + s_open(s_pathn(3, s(mount_dir), s(folder1), s(content_file)), + O_RDONLY | O_CLOEXEC), dst_content_fd != -1); + + TESTEQUAL(read(dst_content_fd, content_buffer, strlen(content)), + strlen(content)); + TESTEQUAL(strncmp(content, content_buffer, strlen(content)), 0); + + TESTSYSCALL(close(dst_content_fd)); + dst_content_fd = -1; + 
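/* Drop both handles; Check 2 re-opens the file after the readdirplus pass. */ +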
TESTSYSCALL(close(dst_folder1_fd)); + dst_folder1_fd = -1; + memset(content_buffer, 0, strlen(content)); + + // Step 2: readdir folder 1 + TEST(open_mount_dir = s_opendir(s(mount_dir)), + open_mount_dir != NULL); + TEST(mount_dirent = readdir(open_mount_dir), mount_dirent != NULL); + TESTSYSCALL(closedir(open_mount_dir)); + open_mount_dir = NULL; + + // Check 2: Read content file again (must be backed) + TESTERR(dst_content_fd = + s_open(s_pathn(3, s(mount_dir), s(folder1), s(content_file)), + O_RDONLY | O_CLOEXEC), dst_content_fd != -1); + + TESTEQUAL(read(dst_content_fd, content_buffer, strlen(content)), + strlen(content)); + TESTEQUAL(strncmp(content, content_buffer, strlen(content)), 0); + + TESTSYSCALL(close(dst_content_fd)); + dst_content_fd = -1; + } else { + size_t read_size = 0; + struct fuse_in_header *in_header = (struct fuse_in_header *)bytes_in; + struct fuse_read_out *read_out = NULL; + struct fuse_attr attr = {}; + int backing_fd = -1; + DECL_FUSE_IN(open); + DECL_FUSE_IN(getattr); + + TESTFUSEINITFLAGS(FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO); + + // Step 1: Lookup folder 1 with backing + TESTFUSELOOKUP(folder1, 0); + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + TESTSYSCALL(close(backing_fd)); + + // Step 2: Open root dir + TESTFUSEIN(FUSE_OPENDIR, open_in); + TESTFUSEOUT1(fuse_open_out, ((struct fuse_open_out) { + .fh = 100, + .open_flags = open_in->flags + })); + + // Step 2: Handle getattr + TESTFUSEIN(FUSE_GETATTR, getattr_in); + TESTSYSCALL(s_fuse_attr(s(ft_src), &attr)); + TESTFUSEOUT1(fuse_attr_out, ((struct fuse_attr_out) { + .attr_valid = UINT64_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr + })); + + // Step 2: Handle readdirplus + read_size = read(fuse_dev, bytes_in, sizeof(bytes_in)); + TESTEQUAL(in_header->opcode, FUSE_READDIRPLUS); + + struct fuse_direntplus *dirent_plus = + (struct fuse_direntplus *) (bytes_in + read_size); + struct fuse_dirent dirent; + struct fuse_entry_out entry_out; + + read_out = (struct fuse_read_out *) (bytes_in + + sizeof(*in_header) + + sizeof(struct fuse_read_in)); + + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + + dirent = (struct fuse_dirent) { + .ino = attr.ino, + .off = 1, + .namelen = strlen(folder1), + .type = DT_REG + }; + entry_out = (struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr + }; + *dirent_plus = (struct fuse_direntplus) { + .dirent = dirent, + .entry_out = entry_out + }; + + strcpy((char *)(bytes_in + read_size + sizeof(*dirent_plus)), folder1); + read_size += FUSE_DIRENT_ALIGN(sizeof(*dirent_plus) + strlen(folder1) + + 1); + TESTFUSEDIROUTREAD(read_out, + bytes_in + + sizeof(struct fuse_in_header) + + sizeof(struct fuse_read_in) + + sizeof(struct fuse_read_out), + read_size - sizeof(struct fuse_in_header) - + sizeof(struct fuse_read_in) - + sizeof(struct fuse_read_out)); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + 
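/* FUSE_END_DAEMON() defines the out: label, so this cleanup also runs on failure. */ +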
close(fuse_dev); + close(content_fd); + close(src_fd); + umount(mount_dir); + return result; +} + +static int bpf_test_no_readdirplus_without_nodeid(const char *mount_dir) +{ + const char *folder1 = "folder1"; + const char *folder2 = "folder2"; + int result = TEST_FAILURE; + int fuse_dev = -1; + int src_fd = -1; + int content_fd = -1; + int bpf_fd = -1; + FUSE_DECLARE_DAEMON; + + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_readdirplus", + &bpf_fd, NULL, NULL), 0); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder1)), 0777)); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder2)), 0777)); + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, -1, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + DIR *open_dir = NULL; + struct dirent *dirent; + + // Folder 1: readdir with no nodeid, daemon replies -EINVAL + TEST(open_dir = s_opendir(s_path(s(ft_dst), s(folder1))), + open_dir != NULL); + TEST(dirent = readdir(open_dir), dirent == NULL); + TESTCOND(errno == EINVAL); + TESTSYSCALL(closedir(open_dir)); + open_dir = NULL; + + // Folder 2: readdir with a nodeid, daemon also replies -EINVAL + TEST(open_dir = s_opendir(s_path(s(ft_dst), s(folder2))), + open_dir != NULL); + TEST(dirent = readdir(open_dir), dirent == NULL); + TESTCOND(errno == EINVAL); + TESTSYSCALL(closedir(open_dir)); + open_dir = NULL; + } else { + size_t read_size; + struct fuse_in_header *in_header = (struct fuse_in_header *)bytes_in; + struct fuse_attr attr = {}; + int backing_fd = -1; + + TESTFUSEINITFLAGS(FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO); + + // Folder 1: set nodeid to 0, expect FUSE_READDIR + TESTFUSELOOKUP(folder1, 0); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = 0, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + .bpf_action = FUSE_ACTION_REPLACE, + .bpf_fd = bpf_fd, + })); + TESTSYSCALL(close(backing_fd)); + TEST(read_size = read(fuse_dev, bytes_in, sizeof(bytes_in)), read_size > 0); + TESTEQUAL(in_header->opcode, FUSE_READDIR); + TESTFUSEOUTERROR(-EINVAL); + + // Folder 2: set nodeid to 10, expect FUSE_READDIRPLUS + TESTFUSELOOKUP(folder2, 0); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder2)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = 10, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + .bpf_action = FUSE_ACTION_REPLACE, + .bpf_fd = bpf_fd, + })); + TESTSYSCALL(close(backing_fd)); + TEST(read_size = read(fuse_dev, bytes_in, sizeof(bytes_in)), read_size > 0); + TESTEQUAL(in_header->opcode, FUSE_READDIRPLUS); + TESTFUSEOUTERROR(-EINVAL); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(content_fd); + close(src_fd); + close(bpf_fd); + umount(mount_dir); + return result; +} + +/* + * State: + * Original: dst/folder1/content.txt + * ^ + * | + * | + * Backing: src/folder1/content.txt + * + * Step 1: open(folder1) - lookup folder1 with entry_timeout set to 0 + * Step 2: open(folder1) - lookup folder1 again to trigger revalidate which will + * set backing fd
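+ * (A revalidate is not allowed to change an existing backing node, so the
+ * daemon below answers two further lookups before the open succeeds.)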
+ * + * Check 1: cat(content.txt) - check not receiving call on the fuse daemon + * and content is the same + */ +static int bpf_test_revalidate_handle_backing_fd(const char *mount_dir) +{ + const char *folder1 = "folder1"; + const char *content_file = "content.txt"; + const char *content = "hello world"; + int result = TEST_FAILURE; + int fuse_dev = -1; + int src_fd = -1; + int content_fd = -1; + FUSE_DECLARE_DAEMON; + + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder1)), 0777)); + TEST(content_fd = s_creat(s_pathn(3, s(ft_src), s(folder1), s(content_file)), 0777), + content_fd != -1); + TESTEQUAL(write(content_fd, content, strlen(content)), strlen(content)); + TESTSYSCALL(close(content_fd)); + content_fd = -1; + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, -1, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + int dst_folder1_fd = -1; + int dst_content_fd = -1; + char content_buffer[9] = {0}; + // Step 1: Lookup folder1 + TESTERR(dst_folder1_fd = s_open(s_path(s(mount_dir), s(folder1)), + O_RDONLY | O_CLOEXEC), dst_folder1_fd != -1); + TESTSYSCALL(close(dst_folder1_fd)); + dst_folder1_fd = -1; + // Step 2: Lookup folder1 again + TESTERR(dst_folder1_fd = s_open(s_path(s(mount_dir), s(folder1)), + O_RDONLY | O_CLOEXEC), dst_folder1_fd != -1); + TESTSYSCALL(close(dst_folder1_fd)); + dst_folder1_fd = -1; + // Check 1: Read content file (must be backed) + TESTERR(dst_content_fd = + s_open(s_pathn(3, s(mount_dir), s(folder1), s(content_file)), + O_RDONLY | O_CLOEXEC), dst_content_fd != -1); + TESTEQUAL(read(dst_content_fd, content_buffer, strlen(content)), + strlen(content)); + TESTEQUAL(strncmp(content, content_buffer, strlen(content)), 0); + TESTSYSCALL(close(dst_content_fd)); + dst_content_fd = -1; + } else { + struct fuse_attr attr = {}; + int backing_fd = -1; + + TESTFUSEINITFLAGS(FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO); + // Step 1: Lookup folder1 set entry_timeout to 0 to trigger + // revalidate later + TESTFUSELOOKUP(folder1, 0); + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = 0, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = 0, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + TESTSYSCALL(close(backing_fd)); + // Step 1: Lookup folder1 as a reaction to revalidate call + // This attempts to change the backing node, which is not allowed on revalidate + TESTFUSELOOKUP(folder1, 0); + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + TESTSYSCALL(close(backing_fd)); + + // Lookup folder1 as a reaction to failed revalidate + TESTFUSELOOKUP(folder1, 0); + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | 
O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + TESTSYSCALL(close(backing_fd)); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(content_fd); + close(src_fd); + umount(mount_dir); + return result; +} + +static int bpf_test_lookup_postfilter(const char *mount_dir) +{ + const char *file1_name = "file1"; + const char *file2_name = "file2"; + const char *file3_name = "file3"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + int file_fd = -1; + FUSE_DECLARE_DAEMON; + + TEST(file_fd = s_creat(s_path(s(ft_src), s(file1_name)), 0777), + file_fd != -1); + TESTSYSCALL(close(file_fd)); + TEST(file_fd = s_creat(s_path(s(ft_src), s(file2_name)), 0777), + file_fd != -1); + TESTSYSCALL(close(file_fd)); + file_fd = -1; + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_lookup_postfilter", + &bpf_fd, NULL, NULL), 0); + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + int fd = -1; + + TESTEQUAL(s_open(s_path(s(mount_dir), s(file1_name)), O_RDONLY), + -1); + TESTEQUAL(errno, ENOENT); + TEST(fd = s_open(s_path(s(mount_dir), s(file2_name)), O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + TESTEQUAL(s_open(s_path(s(mount_dir), s(file3_name)), O_RDONLY), + -1); + } else { + struct fuse_in_postfilter_header *in_header = + (struct fuse_in_postfilter_header *)bytes_in; + struct fuse_entry_out *feo; + struct fuse_entry_bpf_out *febo; + + TESTFUSELOOKUP(file1_name, FUSE_POSTFILTER); + TESTFUSEOUTERROR(-ENOENT); + + TESTFUSELOOKUP(file2_name, FUSE_POSTFILTER); + feo = (struct fuse_entry_out *) (bytes_in + + sizeof(struct fuse_in_header) + strlen(file2_name) + 1); + febo = (struct fuse_entry_bpf_out *) ((char *)feo + + sizeof(*feo)); + TESTFUSEOUT2(fuse_entry_out, *feo, fuse_entry_bpf_out, *febo); + + TESTFUSELOOKUP(file3_name, FUSE_POSTFILTER); + TESTEQUAL(in_header->error_in, -ENOENT); + TESTFUSEOUTERROR(-ENOENT); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(file_fd); + close(fuse_dev); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static void parse_range(const char *ranges, bool *run_test, size_t tests) +{ + size_t i; + char *range; + + for (i = 0; i < tests; ++i) + run_test[i] = false; + + range = strtok(optarg, ","); + while (range) { + char *dash = strchr(range, '-'); + + if (dash) { + size_t start = 1, end = tests; + char *end_ptr; + + if (dash > range) { + start = strtol(range, &end_ptr, 10); + if (*end_ptr != '-' || start <= 0 || start > tests) + ksft_exit_fail_msg("Bad range\n"); + } + + if (dash[1]) { + end = strtol(dash + 1, &end_ptr, 10); + if (*end_ptr || end <= start || end > tests) + ksft_exit_fail_msg("Bad range\n"); + } + + for (i = start; i <= end; ++i) + run_test[i - 1] = true; + } else { + char *end; + long value = strtol(range, &end, 10); + + if (*end || value <= 0 || value > tests) + ksft_exit_fail_msg("Bad range\n"); + run_test[value - 1] = true; + } + range = strtok(NULL, ","); + } +} + +static int parse_options(int argc, char *const *argv, bool *run_test, + size_t tests) +{ + signed char c; + + 
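/* e.g. "-t1,3-5 -v" runs tests 1, 3, 4 and 5 with verbose output. */ +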
while ((c = getopt(argc, argv, "f:t:v")) != -1) + switch (c) { + case 'f': + test_options.file = strtol(optarg, NULL, 10); + break; + + case 't': + parse_range(optarg, run_test, tests); + break; + + case 'v': + test_options.verbose = true; + break; + + default: + return -EINVAL; + } + + return 0; +} + +struct test_case { + int (*pfunc)(const char *dir); + const char *name; +}; + +static void run_one_test(const char *mount_dir, + const struct test_case *test_case) +{ + ksft_print_msg("Running %s\n", test_case->name); + if (test_case->pfunc(mount_dir) == TEST_SUCCESS) + ksft_test_result_pass("%s\n", test_case->name); + else + ksft_test_result_fail("%s\n", test_case->name); +} + +int main(int argc, char *argv[]) +{ + char *mount_dir = NULL; + char *src_dir = NULL; + int i; + int fd, count; + +#define MAKE_TEST(test) \ + { \ + test, #test \ + } + const struct test_case cases[] = { + MAKE_TEST(basic_test), + MAKE_TEST(bpf_test_real), + MAKE_TEST(bpf_test_partial), + MAKE_TEST(bpf_test_attrs), + MAKE_TEST(bpf_test_readdir), + MAKE_TEST(bpf_test_creat), + MAKE_TEST(bpf_test_hidden_entries), + MAKE_TEST(bpf_test_dir), + MAKE_TEST(bpf_test_file_early_close), + MAKE_TEST(bpf_test_file_late_close), + MAKE_TEST(bpf_test_mknod), + MAKE_TEST(bpf_test_largedir), + MAKE_TEST(bpf_test_link), + MAKE_TEST(bpf_test_symlink), + MAKE_TEST(bpf_test_xattr), + MAKE_TEST(bpf_test_redact_readdir), + MAKE_TEST(bpf_test_set_backing), + MAKE_TEST(bpf_test_remove_backing), + MAKE_TEST(bpf_test_dir_rename), + MAKE_TEST(bpf_test_file_rename), + MAKE_TEST(bpf_test_alter_errcode_bpf), + MAKE_TEST(bpf_test_alter_errcode_userspace), + MAKE_TEST(mmap_test), + MAKE_TEST(readdir_perms_test), + MAKE_TEST(inotify_test), + MAKE_TEST(bpf_test_statfs), + MAKE_TEST(bpf_test_lseek), + MAKE_TEST(bpf_test_readdirplus_not_overriding_backing), + MAKE_TEST(bpf_test_no_readdirplus_without_nodeid), + MAKE_TEST(bpf_test_revalidate_handle_backing_fd), + MAKE_TEST(bpf_test_lookup_postfilter), + }; +#undef MAKE_TEST + + bool run_test[ARRAY_SIZE(cases)]; + + for (int i = 0; i < ARRAY_SIZE(cases); ++i) + run_test[i] = true; + + if (parse_options(argc, argv, run_test, ARRAY_SIZE(cases))) + ksft_exit_fail_msg("Bad options\n"); + + // Seed randomness pool for testing on QEMU + // NOTE - this abuses the concept of randomness - do *not* ever do this + // on a machine for production use - the device will think it has good + // randomness when it does not. + fd = open("/dev/urandom", O_WRONLY | O_CLOEXEC); + count = 4096; + for (int i = 0; i < 128; ++i) + ioctl(fd, RNDADDTOENTCNT, &count); + close(fd); + + ksft_print_header(); + + if (geteuid() != 0) + ksft_print_msg("Not running as root; mounts may fail.\n"); + + if (tracing_on() != TEST_SUCCESS) + ksft_exit_fail_msg("Can't turn on tracing\n"); + + src_dir = setup_mount_dir(ft_src); + mount_dir = setup_mount_dir(ft_dst); + if (src_dir == NULL || mount_dir == NULL) + ksft_exit_fail_msg("Can't create a mount dir\n"); + + ksft_set_plan(ARRAY_SIZE(run_test)); + + for (i = 0; i < ARRAY_SIZE(run_test); ++i) + if (run_test[i]) { + delete_dir_tree(mount_dir, false); + delete_dir_tree(src_dir, false); + run_one_test(mount_dir, &cases[i]); + } else + ksft_cnt.ksft_xskip++; + + umount2(mount_dir, MNT_FORCE); + delete_dir_tree(mount_dir, true); + delete_dir_tree(src_dir, true); + return !ksft_get_fail_cnt() ? 
ksft_exit_pass() : ksft_exit_fail(); +} diff --git a/tools/testing/selftests/filesystems/fuse/test_bpf.c b/tools/testing/selftests/filesystems/fuse/test_bpf.c new file mode 100644 index 000000000000..032cb1178f9f --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/test_bpf.c @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2022 Google LLC + +#include "test_fuse_bpf.h" + +SEC("test_readdir_redact") +/* return FUSE_BPF_BACKING to use backing fs, 0 to pass to usermode */ +int readdir_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_READDIR | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("readdir %d", fri->fh); + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + } + + case FUSE_READDIR | FUSE_POSTFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("readdir postfilter %x", fri->fh); + return FUSE_BPF_USER_FILTER; + } + + default: + bpf_printk("opcode %d", fa->opcode); + return FUSE_BPF_BACKING; + } +} + +SEC("test_trace") +/* return FUSE_BPF_BACKING to use backing fs, 0 to pass to usermode */ +int trace_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: { + /* real and partial use backing file */ + const char *name = fa->in_args[0].value; + bool backing = false; + + if (strcmp(name, "real") == 0 || strcmp(name, "partial") == 0) + backing = true; + + if (strcmp(name, "dir") == 0) + backing = true; + if (strcmp(name, "dir2") == 0) + backing = true; + + if (strcmp(name, "file1") == 0) + backing = true; + if (strcmp(name, "file2") == 0) + backing = true; + + bpf_printk("lookup %s %d", name, backing); + return backing ? FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER : 0; + } + + case FUSE_LOOKUP | FUSE_POSTFILTER: { + const char *name = fa->in_args[0].value; + struct fuse_entry_out *feo = fa->out_args[0].value; + + if (strcmp(name, "real") == 0) + feo->nodeid = 5; + else if (strcmp(name, "partial") == 0) + feo->nodeid = 6; + + bpf_printk("post-lookup %s %d", name, feo->nodeid); + return 0; + } + + case FUSE_ACCESS | FUSE_PREFILTER: { + bpf_printk("Access: %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + case FUSE_CREATE | FUSE_PREFILTER: + bpf_printk("Create: %d", fa->nodeid); + return FUSE_BPF_BACKING; + + case FUSE_MKNOD | FUSE_PREFILTER: { + const struct fuse_mknod_in *fmi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("mknod %s %x %x", name, fmi->rdev | fmi->mode, fmi->umask); + return FUSE_BPF_BACKING; + } + + case FUSE_MKDIR | FUSE_PREFILTER: { + const struct fuse_mkdir_in *fmi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("mkdir %s %x %x", name, fmi->mode, fmi->umask); + return FUSE_BPF_BACKING; + } + + case FUSE_RMDIR | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("rmdir %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_RENAME | FUSE_PREFILTER: { + const char *oldname = fa->in_args[1].value; + const char *newname = fa->in_args[2].value; + + bpf_printk("rename from %s", oldname); + bpf_printk("rename to %s", newname); + return FUSE_BPF_BACKING; + } + + case FUSE_RENAME2 | FUSE_PREFILTER: { + const struct fuse_rename2_in *fri = fa->in_args[0].value; + uint32_t flags = fri->flags; + const char *oldname = fa->in_args[1].value; + const char *newname = fa->in_args[2].value; + + bpf_printk("rename(%x) from %s", flags, oldname); + bpf_printk("rename to %s", newname); + return FUSE_BPF_BACKING; + } + + case 
FUSE_UNLINK | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("unlink %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_LINK | FUSE_PREFILTER: { + const struct fuse_link_in *fli = fa->in_args[0].value; + const char *link_name = fa->in_args[1].value; + + bpf_printk("link %d %s", fli->oldnodeid, link_name); + return FUSE_BPF_BACKING; + } + + case FUSE_SYMLINK | FUSE_PREFILTER: { + const char *link_name = fa->in_args[0].value; + const char *link_dest = fa->in_args[1].value; + + bpf_printk("symlink from %s", link_name); + bpf_printk("symlink to %s", link_dest); + return FUSE_BPF_BACKING; + } + + case FUSE_READLINK | FUSE_PREFILTER: { + const char *link_name = fa->in_args[0].value; + + bpf_printk("readlink from %s", link_name); + return FUSE_BPF_BACKING; + } + + case FUSE_OPEN | FUSE_PREFILTER: { + int backing = 0; + + switch (fa->nodeid) { + case 5: + backing = FUSE_BPF_BACKING; + break; + + case 6: + backing = FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + break; + + default: + break; + } + + bpf_printk("open %d %d", fa->nodeid, backing); + return backing; + } + + case FUSE_OPEN | FUSE_POSTFILTER: + bpf_printk("open postfilter"); + return FUSE_BPF_USER_FILTER; + + case FUSE_READ | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("read %llu %llu", fri->fh, fri->offset); + if (fri->fh == 1 && fri->offset == 0) + return 0; + return FUSE_BPF_BACKING; + } + + case FUSE_GETATTR | FUSE_PREFILTER: { + /* real and partial use backing file */ + int backing = 0; + + switch (fa->nodeid) { + case 1: + case 5: + case 6: + /* + * TODO: Find better solution + * Add 100 to stop clang compiling to jump table which bpf hates + */ + case 100: + backing = FUSE_BPF_BACKING; + break; + } + + bpf_printk("getattr %d %d", fa->nodeid, backing); + return backing; + } + + case FUSE_SETATTR | FUSE_PREFILTER: { + /* real and partial use backing file */ + int backing = 0; + + switch (fa->nodeid) { + case 1: + case 5: + case 6: + /* TODO See above */ + case 100: + backing = FUSE_BPF_BACKING; + break; + } + + bpf_printk("setattr %d %d", fa->nodeid, backing); + return backing; + } + + case FUSE_OPENDIR | FUSE_PREFILTER: { + int backing = 0; + + switch (fa->nodeid) { + case 1: + backing = FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + break; + } + + bpf_printk("opendir %d %d", fa->nodeid, backing); + return backing; + } + + case FUSE_OPENDIR | FUSE_POSTFILTER: { + struct fuse_open_out *foo = fa->out_args[0].value; + + foo->fh = 2; + bpf_printk("opendir postfilter"); + return 0; + } + + case FUSE_READDIR | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + int backing = 0; + + if (fri->fh == 2) + backing = FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + + bpf_printk("readdir %d %d", fri->fh, backing); + return backing; + } + + case FUSE_READDIR | FUSE_POSTFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + int backing = 0; + + if (fri->fh == 2) + backing = FUSE_BPF_USER_FILTER | FUSE_BPF_BACKING | + FUSE_BPF_POST_FILTER; + + bpf_printk("readdir postfilter %d %d", fri->fh, backing); + return backing; + } + + case FUSE_FLUSH | FUSE_PREFILTER: { + const struct fuse_flush_in *ffi = fa->in_args[0].value; + + bpf_printk("Flush %d", ffi->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_GETXATTR | FUSE_PREFILTER: { + const struct fuse_flush_in *ffi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("getxattr %d %s", ffi->fh, name); + return FUSE_BPF_BACKING; + } + + case FUSE_LISTXATTR | FUSE_PREFILTER: 
{ + const struct fuse_flush_in *ffi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("listxattr %d %s", ffi->fh, name); + return FUSE_BPF_BACKING; + } + + case FUSE_SETXATTR | FUSE_PREFILTER: { + const struct fuse_flush_in *ffi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + unsigned int size = fa->in_args[2].size; + + bpf_printk("setxattr %d %s %u", ffi->fh, name, size); + return FUSE_BPF_BACKING; + } + + case FUSE_REMOVEXATTR | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("removexattr %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_CANONICAL_PATH | FUSE_PREFILTER: { + bpf_printk("canonical_path"); + return FUSE_BPF_BACKING; + } + + case FUSE_STATFS | FUSE_PREFILTER: { + bpf_printk("statfs"); + return FUSE_BPF_BACKING; + } + + case FUSE_LSEEK | FUSE_PREFILTER: { + const struct fuse_lseek_in *fli = fa->in_args[0].value; + + bpf_printk("lseek type:%d, offset:%lld", fli->whence, fli->offset); + return FUSE_BPF_BACKING; + } + + default: + bpf_printk("Unknown opcode %d", fa->opcode); + return 0; + } +} + +SEC("test_hidden") +int trace_hidden(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("Lookup: %s", name); + if (!strcmp(name, "show")) + return FUSE_BPF_BACKING; + if (!strcmp(name, "hide")) + return -ENOENT; + + return FUSE_BPF_BACKING; + } + + case FUSE_ACCESS | FUSE_PREFILTER: { + bpf_printk("Access: %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + case FUSE_CREATE | FUSE_PREFILTER: + bpf_printk("Create: %d", fa->nodeid); + return FUSE_BPF_BACKING; + + case FUSE_WRITE | FUSE_PREFILTER: + // TODO: Clang combines similar printk calls, causing BPF to complain + // bpf_printk("Write: %d", fa->nodeid); + return FUSE_BPF_BACKING; + + case FUSE_FLUSH | FUSE_PREFILTER: { + // const struct fuse_flush_in *ffi = fa->in_args[0].value; + + // bpf_printk("Flush %d", ffi->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_RELEASE | FUSE_PREFILTER: { + // const struct fuse_release_in *fri = fa->in_args[0].value; + + // bpf_printk("Release %d", fri->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_FALLOCATE | FUSE_PREFILTER: + // bpf_printk("fallocate %d", fa->nodeid); + return FUSE_BPF_BACKING; + + case FUSE_CANONICAL_PATH | FUSE_PREFILTER: { + return FUSE_BPF_BACKING; + } + default: + bpf_printk("Unknown opcode: %d", fa->opcode); + return 0; + } +} + +SEC("test_simple") +int trace_simple(struct fuse_bpf_args *fa) +{ + if (fa->opcode & FUSE_PREFILTER) + bpf_printk("prefilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else if (fa->opcode & FUSE_POSTFILTER) + bpf_printk("postfilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else + bpf_printk("*** UNKNOWN *** opcode: %d", fa->opcode); + return FUSE_BPF_BACKING; +} + +SEC("test_passthrough") +int trace_daemon(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("Lookup prefilter: %lx %s", fa->nodeid, name); + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + } + + case FUSE_LOOKUP | FUSE_POSTFILTER: { + const char *name = fa->in_args[0].value; + struct fuse_entry_bpf_out *febo = fa->out_args[1].value; + + bpf_printk("Lookup postfilter: %lx %s", fa->nodeid, name); + febo->bpf_action = FUSE_ACTION_REMOVE; + + return FUSE_BPF_USER_FILTER; + } + + default: + if (fa->opcode & FUSE_PREFILTER) + bpf_printk("prefilter opcode: %d", + fa->opcode & 
FUSE_OPCODE_FILTER); + else if (fa->opcode & FUSE_POSTFILTER) + bpf_printk("postfilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else + bpf_printk("*** UNKNOWN *** opcode: %d", fa->opcode); + return FUSE_BPF_BACKING; + } +} + +SEC("test_error") +/* return FUSE_BPF_BACKING to use backing fs, 0 to pass to usermode */ +int error_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_MKDIR | FUSE_PREFILTER: { + bpf_printk("mkdir"); + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + } + case FUSE_MKDIR | FUSE_POSTFILTER: { + bpf_printk("mkdir postfilter"); + if (fa->error_in == -EEXIST) + return -EPERM; + + return 0; + } + + case FUSE_LOOKUP | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("lookup prefilter %s", name); + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + } + case FUSE_LOOKUP | FUSE_POSTFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("lookup postfilter %s %d", name, fa->error_in); + if (strcmp(name, "doesnotexist") == 0/* && fa->error_in == -EEXIST*/) { + bpf_printk("lookup postfilter doesnotexist"); + return FUSE_BPF_USER_FILTER; + } + bpf_printk("meh"); + return 0; + } + + default: + if (fa->opcode & FUSE_PREFILTER) + bpf_printk("prefilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else if (fa->opcode & FUSE_POSTFILTER) + bpf_printk("postfilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else + bpf_printk("*** UNKNOWN *** opcode: %d", fa->opcode); + return FUSE_BPF_BACKING; + } +} + +SEC("test_readdirplus") +int readdirplus_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_READDIR | FUSE_PREFILTER: { + return 0; + } + } + return FUSE_BPF_BACKING; +} + +SEC("test_lookup_postfilter") +int lookuppostfilter_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + case FUSE_LOOKUP | FUSE_POSTFILTER: + return FUSE_BPF_USER_FILTER; + default: + return FUSE_BPF_BACKING; + } +} diff --git a/tools/testing/selftests/filesystems/fuse/test_framework.h b/tools/testing/selftests/filesystems/fuse/test_framework.h new file mode 100644 index 000000000000..47047c226559 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/test_framework.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021 Google LLC + */ + +#ifndef _TEST_FRAMEWORK_H +#define _TEST_FRAMEWORK_H + +#include <stdbool.h> +#include <stdio.h> +#include <string.h> + +#ifdef __ANDROID__ +static int test_case_pass; +static int test_case_fail; +#define ksft_print_msg printf +#define ksft_test_result_pass(...) ({test_case_pass++; printf(__VA_ARGS__); }) +#define ksft_test_result_fail(...) ({test_case_fail++; printf(__VA_ARGS__); }) +#define ksft_exit_fail_msg(...) printf(__VA_ARGS__) +#define ksft_print_header() +#define ksft_set_plan(cnt) +#define ksft_get_fail_cnt() test_case_fail +#define ksft_exit_pass() 0 +#define ksft_exit_fail() 1 +#else +#include <kselftest.h> +#endif + +#define TEST_FAILURE 1 +#define TEST_SUCCESS 0 + +#define ptr_to_u64(p) ((__u64)p) + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define le16_to_cpu(x) (x) +#define le32_to_cpu(x) (x) +#define le64_to_cpu(x) (x) +#else +#error Big endian not supported!
+#endif + +struct _test_options { + int file; + bool verbose; +}; + +extern struct _test_options test_options; + +#define TESTCOND(condition) \ + do { \ + if (!(condition)) { \ + ksft_print_msg("%s failed %d\n", \ + __func__, __LINE__); \ + goto out; \ + } else if (test_options.verbose) \ + ksft_print_msg("%s succeeded %d\n", \ + __func__, __LINE__); \ + } while (false) + +#define TESTCONDERR(condition) \ + do { \ + if (!(condition)) { \ + ksft_print_msg("%s failed %d\n", \ + __func__, __LINE__); \ + ksft_print_msg("Error %d (\"%s\")\n", \ + errno, strerror(errno)); \ + goto out; \ + } else if (test_options.verbose) \ + ksft_print_msg("%s succeeded %d\n", \ + __func__, __LINE__); \ + } while (false) + +#define TEST(statement, condition) \ + do { \ + statement; \ + TESTCOND(condition); \ + } while (false) + +#define TESTERR(statement, condition) \ + do { \ + statement; \ + TESTCONDERR(condition); \ + } while (false) + +enum _operator { + _eq, + _ne, + _ge, +}; + +static const char * const _operator_name[] = { + "==", + "!=", + ">=", +}; + +#define _TEST_OPERATOR(name, _type, format_specifier) \ +static inline int _test_operator_##name(const char *func, int line, \ + _type a, _type b, enum _operator o) \ +{ \ + bool pass; \ + switch (o) { \ + case _eq: \ + pass = a == b; \ + break; \ + case _ne: \ + pass = a != b; \ + break; \ + case _ge: \ + pass = a >= b; \ + break; \ + } \ + \ + if (!pass) \ + ksft_print_msg("Failed: %s at line %d, " \ + format_specifier " %s " \ + format_specifier "\n", \ + func, line, a, _operator_name[o], b); \ + else if (test_options.verbose) \ + ksft_print_msg("Passed: %s at line %d, " \ + format_specifier " %s " \ + format_specifier "\n", \ + func, line, a, _operator_name[o], b); \ + \ + return pass ? TEST_SUCCESS : TEST_FAILURE; \ +} + +_TEST_OPERATOR(i, int, "%d") +_TEST_OPERATOR(ui, unsigned int, "%u") +_TEST_OPERATOR(lui, unsigned long, "%lu") +_TEST_OPERATOR(ss, ssize_t, "%zd") +_TEST_OPERATOR(vp, void *, "%px") +_TEST_OPERATOR(cp, char *, "%px") + +#define _CALL_TO(_type, name, a, b, o) \ + _test_operator_##name(__func__, __LINE__, \ + (_type) (long long) (a), \ + (_type) (long long) (b), o) + +#define TESTOPERATOR(a, b, o) \ + do { \ + if (_Generic((a), \ + int : _CALL_TO(int, i, a, b, o), \ + unsigned int : _CALL_TO(unsigned int, ui, a, b, o), \ + unsigned long : _CALL_TO(unsigned long, lui, a, b, o), \ + ssize_t : _CALL_TO(ssize_t, ss, a, b, o), \ + void * : _CALL_TO(void *, vp, a, b, o), \ + char * : _CALL_TO(char *, cp, a, b, o) \ + )) \ + goto out; \ + } while (false) + +#define TESTEQUAL(a, b) TESTOPERATOR(a, b, _eq) +#define TESTNE(a, b) TESTOPERATOR(a, b, _ne) +#define TESTGE(a, b) TESTOPERATOR(a, b, _ge) + +/* For testing a syscall that returns 0 on success and sets errno otherwise */ +#define TESTSYSCALL(statement) TESTCONDERR((statement) == 0) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) + +static inline void print_bytes(const void *data, size_t size) +{ + const char *bytes = data; + int i; + + for (i = 0; i < size; ++i) { + if (i % 0x10 == 0) + printf("%08x:", i); + printf("%02x ", (unsigned int) (unsigned char) bytes[i]); + if (i % 0x10 == 0x0f) + printf("\n"); + } + + if (i % 0x10 != 0) + printf("\n"); +} + + + +#endif diff --git a/tools/testing/selftests/filesystems/fuse/test_fuse.h b/tools/testing/selftests/filesystems/fuse/test_fuse.h new file mode 100644 index 000000000000..69dadc9c7e45 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/test_fuse.h @@ -0,0 +1,337 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * 
Copyright (c) 2021 Google LLC + */ + +#ifndef TEST_FUSE__H +#define TEST_FUSE__H + +#define _GNU_SOURCE + +#include "test_framework.h" + +#include +#include +#include +#include + +#include +#include + +#define PAGE_SIZE 4096 +#define FUSE_POSTFILTER 0x20000 + +extern struct _test_options test_options; + +/* Slow but semantically easy string functions */ + +/* + * struct s just wraps a char pointer + * It is a pointer to a malloc'd string, or null + * All consumers handle null input correctly + * All consumers free the string + */ +struct s { + char *s; +}; + +struct s s(const char *s1); +struct s sn(const char *s1, const char *s2); +int s_cmp(struct s s1, struct s s2); +struct s s_cat(struct s s1, struct s s2); +struct s s_splitleft(struct s s1, char c); +struct s s_splitright(struct s s1, char c); +struct s s_word(struct s s1, char c, size_t n); +struct s s_path(struct s s1, struct s s2); +struct s s_pathn(size_t n, struct s s1, ...); +int s_link(struct s src_pathname, struct s dst_pathname); +int s_symlink(struct s src_pathname, struct s dst_pathname); +int s_mkdir(struct s pathname, mode_t mode); +int s_rmdir(struct s pathname); +int s_unlink(struct s pathname); +int s_open(struct s pathname, int flags, ...); +int s_openat(int dirfd, struct s pathname, int flags, ...); +int s_creat(struct s pathname, mode_t mode); +int s_mkfifo(struct s pathname, mode_t mode); +int s_stat(struct s pathname, struct stat *st); +int s_statfs(struct s pathname, struct statfs *st); +int s_fuse_attr(struct s pathname, struct fuse_attr *fuse_attr_out); +DIR *s_opendir(struct s pathname); +int s_getxattr(struct s pathname, const char name[], void *value, size_t size, + ssize_t *ret_size); +int s_listxattr(struct s pathname, void *list, size_t size, ssize_t *ret_size); +int s_setxattr(struct s pathname, const char name[], const void *value, + size_t size, int flags); +int s_removexattr(struct s pathname, const char name[]); +int s_rename(struct s oldpathname, struct s newpathname); + +struct s tracing_folder(void); +int tracing_on(void); + +char *concat_file_name(const char *dir, const char *file); +char *setup_mount_dir(const char *name); +int delete_dir_tree(const char *dir_path, bool remove_root); + +#define TESTFUSEINNULL(_opcode) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTEQUAL(in_header->opcode, _opcode); \ + TESTEQUAL(res, sizeof(*in_header)); \ + } while (false) + +#define TESTFUSEIN(_opcode, in_struct) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTEQUAL(in_header->opcode, _opcode); \ + TESTEQUAL(res, sizeof(*in_header) + sizeof(*in_struct));\ + } while (false) + +#define TESTFUSEIN2(_opcode, in_struct1, in_struct2) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTEQUAL(in_header->opcode, _opcode); \ + TESTEQUAL(res, sizeof(*in_header) + sizeof(*in_struct1) \ + + sizeof(*in_struct2)); \ + in_struct1 = (void *)(bytes_in + sizeof(*in_header)); \ + in_struct2 = (void *)(bytes_in + sizeof(*in_header) \ + + sizeof(*in_struct1)); \ + } while (false) + +#define TESTFUSEINEXT(_opcode, in_struct, extra) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ 
+ TESTEQUAL(in_header->opcode, _opcode); \ + TESTEQUAL(res, \ + sizeof(*in_header) + sizeof(*in_struct) + extra);\ + } while (false) + +#define TESTFUSEINUNKNOWN() \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTGE(res, sizeof(*in_header)); \ + TESTEQUAL(in_header->opcode, -1); \ + } while (false) + +/* Special case lookup since it is asymmetric */ +#define TESTFUSELOOKUP(expected, filter) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + char *name = (char *) (bytes_in + sizeof(*in_header)); \ + ssize_t res; \ + \ + TEST(res = read(fuse_dev, &bytes_in, sizeof(bytes_in)), \ + res != -1); \ + /* TODO once we handle forgets properly, remove */ \ + if (in_header->opcode == FUSE_FORGET) \ + continue; \ + if (in_header->opcode == FUSE_BATCH_FORGET) \ + continue; \ + TESTGE(res, sizeof(*in_header)); \ + TESTEQUAL(in_header->opcode, \ + FUSE_LOOKUP | filter); \ + TESTEQUAL(res, \ + sizeof(*in_header) + strlen(expected) + 1 + \ + (filter == FUSE_POSTFILTER ? \ + sizeof(struct fuse_entry_out) + \ + sizeof(struct fuse_entry_bpf_out) : 0));\ + TESTCOND(!strcmp(name, expected)); \ + break; \ + } while (true) + +#define TESTFUSEOUTEMPTY() \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + struct fuse_out_header *out_header = \ + (struct fuse_out_header *)bytes_out; \ + \ + *out_header = (struct fuse_out_header) { \ + .len = sizeof(*out_header), \ + .unique = in_header->unique, \ + }; \ + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), \ + out_header->len); \ + } while (false) + +#define TESTFUSEOUTERROR(errno) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + struct fuse_out_header *out_header = \ + (struct fuse_out_header *)bytes_out; \ + \ + *out_header = (struct fuse_out_header) { \ + .len = sizeof(*out_header), \ + .error = errno, \ + .unique = in_header->unique, \ + }; \ + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), \ + out_header->len); \ + } while (false) + +#define TESTFUSEOUTREAD(data, length) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + struct fuse_out_header *out_header = \ + (struct fuse_out_header *)bytes_out; \ + \ + *out_header = (struct fuse_out_header) { \ + .len = sizeof(*out_header) + length, \ + .unique = in_header->unique, \ + }; \ + memcpy(bytes_out + sizeof(*out_header), data, length); \ + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), \ + out_header->len); \ + } while (false) + +#define TESTFUSEDIROUTREAD(read_out, data, length) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + struct fuse_out_header *out_header = \ + (struct fuse_out_header *)bytes_out; \ + \ + *out_header = (struct fuse_out_header) { \ + .len = sizeof(*out_header) + \ + sizeof(*read_out) + length, \ + .unique = in_header->unique, \ + }; \ + memcpy(bytes_out + sizeof(*out_header) + \ + sizeof(*read_out), data, length); \ + memcpy(bytes_out + sizeof(*out_header), \ + read_out, sizeof(*read_out)); \ + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), \ + out_header->len); \ + } while (false) + +#define TESTFUSEOUT1(type1, obj1) \ + do { \ + *(struct fuse_out_header *) bytes_out \ + = (struct fuse_out_header) { \ + .len = sizeof(struct fuse_out_header) \ + + sizeof(struct type1), \ + .unique = ((struct fuse_in_header *) \ + bytes_in)->unique, \ + }; \ + *(struct 
type1 *) (bytes_out \ + + sizeof(struct fuse_out_header)) \ + = obj1; \ + TESTEQUAL(write(fuse_dev, bytes_out, \ + ((struct fuse_out_header *)bytes_out)->len), \ + ((struct fuse_out_header *)bytes_out)->len); \ + } while (false) + +#define TESTFUSEOUT2(type1, obj1, type2, obj2) \ + do { \ + *(struct fuse_out_header *) bytes_out \ + = (struct fuse_out_header) { \ + .len = sizeof(struct fuse_out_header) \ + + sizeof(struct type1) \ + + sizeof(struct type2), \ + .unique = ((struct fuse_in_header *) \ + bytes_in)->unique, \ + }; \ + *(struct type1 *) (bytes_out \ + + sizeof(struct fuse_out_header)) \ + = obj1; \ + *(struct type2 *) (bytes_out \ + + sizeof(struct fuse_out_header) \ + + sizeof(struct type1)) \ + = obj2; \ + TESTEQUAL(write(fuse_dev, bytes_out, \ + ((struct fuse_out_header *)bytes_out)->len), \ + ((struct fuse_out_header *)bytes_out)->len); \ + } while (false) + +#define TESTFUSEINITFLAGS(fuse_connection_flags) \ + do { \ + DECL_FUSE_IN(init); \ + \ + TESTFUSEIN(FUSE_INIT, init_in); \ + TESTEQUAL(init_in->major, FUSE_KERNEL_VERSION); \ + TESTEQUAL(init_in->minor, FUSE_KERNEL_MINOR_VERSION); \ + TESTFUSEOUT1(fuse_init_out, ((struct fuse_init_out) { \ + .major = FUSE_KERNEL_VERSION, \ + .minor = FUSE_KERNEL_MINOR_VERSION, \ + .max_readahead = 4096, \ + .flags = fuse_connection_flags, \ + .max_background = 0, \ + .congestion_threshold = 0, \ + .max_write = 4096, \ + .time_gran = 1000, \ + .max_pages = 12, \ + .map_alignment = 4096, \ + })); \ + } while (false) + +#define TESTFUSEINIT() \ + TESTFUSEINITFLAGS(0) + +#define DECL_FUSE_IN(name) \ + struct fuse_##name##_in *name##_in = \ + (struct fuse_##name##_in *) \ + (bytes_in + sizeof(struct fuse_in_header)) + +#define DECL_FUSE(name) \ + struct fuse_##name##_in *name##_in __maybe_unused; \ + struct fuse_##name##_out *name##_out __maybe_unused + +#define FUSE_DECLARE_DAEMON \ + int daemon = -1; \ + int status; \ + bool action; \ + uint8_t bytes_in[FUSE_MIN_READ_BUFFER] __maybe_unused; \ + uint8_t bytes_out[FUSE_MIN_READ_BUFFER] __maybe_unused + +#define FUSE_START_DAEMON() \ + do { \ + TEST(daemon = fork(), daemon != -1); \ + action = daemon != 0; \ + } while (false) + +#define FUSE_END_DAEMON() \ + do { \ + TESTEQUAL(waitpid(daemon, &status, 0), daemon); \ + TESTEQUAL(status, TEST_SUCCESS); \ + result = TEST_SUCCESS; \ +out: \ + if (!daemon) \ + exit(TEST_FAILURE); \ + } while (false) + + +struct map_relocation { + char *name; + int fd; + int value; +}; + +int mount_fuse(const char *mount_dir, int bpf_fd, int dir_fd, + int *fuse_dev_ptr); +int mount_fuse_no_init(const char *mount_dir, int bpf_fd, int dir_fd, + int *fuse_dev_ptr); +int install_elf_bpf(const char *file, const char *section, int *fd, + struct map_relocation **map_relocations, size_t *map_count); +#endif diff --git a/tools/testing/selftests/filesystems/fuse/test_fuse_bpf.h b/tools/testing/selftests/filesystems/fuse/test_fuse_bpf.h new file mode 100644 index 000000000000..9097626b7c4d --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/test_fuse_bpf.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Google LLC + */ + +#ifndef TEST_FUSE__BPF__H +#define TEST_FUSE__BPF__H + +#define __EXPORTED_HEADERS__ +#define __KERNEL__ + +#ifdef __ANDROID__ +#include +#endif + +#include +#include +#include +#include +#include + +#define SEC(NAME) __section(NAME) + +struct fuse_bpf_map { + int map_type; + size_t key_size; + size_t value_size; + int max_entries; +}; + +static void *(*bpf_map_lookup_elem)(struct fuse_bpf_map *map, void *key) + = 
(void *) 1; + +static void *(*bpf_map_update_elem)(struct fuse_bpf_map *map, void *key, + void *value, int flags) + = (void *) 2; + +static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) + = (void *) 6; + +static long (*bpf_get_current_pid_tgid)() + = (void *) 14; + +static long (*bpf_get_current_uid_gid)() + = (void *) 15; + +#define bpf_printk(fmt, ...) \ + ({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ + }) + +SEC("dummy") inline int strcmp(const char *a, const char *b) +{ + int i; + + for (i = 0; i < __builtin_strlen(b) + 1; ++i) + if (a[i] != b[i]) + return -1; + + return 0; +} + +#endif diff --git a/tools/testing/selftests/filesystems/incfs/.gitignore b/tools/testing/selftests/filesystems/incfs/.gitignore new file mode 100644 index 000000000000..f0e3cd99d4ac --- /dev/null +++ b/tools/testing/selftests/filesystems/incfs/.gitignore @@ -0,0 +1,3 @@ +incfs_test +incfs_stress +incfs_perf diff --git a/tools/testing/selftests/filesystems/incfs/Makefile b/tools/testing/selftests/filesystems/incfs/Makefile new file mode 100644 index 000000000000..7a44efad0b37 --- /dev/null +++ b/tools/testing/selftests/filesystems/incfs/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -D_FILE_OFFSET_BITS=64 -Wall -Werror -Wno-deprecated-declarations -I../.. -I../../../../.. -fno-omit-frame-pointer -fsanitize=address -g +LDLIBS := -llz4 -lzstd -lcrypto -lpthread -fsanitize=address +TEST_GEN_PROGS := incfs_test incfs_stress incfs_perf + +include ../../lib.mk + +# Put after include ../../lib.mk since that changes $(TEST_GEN_PROGS) +# Otherwise you get multiple targets, this becomes the default, and it's a mess +EXTRA_SOURCES := utils.c +$(TEST_GEN_PROGS) : $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/filesystems/incfs/OWNERS b/tools/testing/selftests/filesystems/incfs/OWNERS new file mode 100644 index 000000000000..f26e11cd5740 --- /dev/null +++ b/tools/testing/selftests/filesystems/incfs/OWNERS @@ -0,0 +1 @@ +file:/fs/incfs/OWNERS diff --git a/tools/testing/selftests/filesystems/incfs/incfs_perf.c b/tools/testing/selftests/filesystems/incfs/incfs_perf.c new file mode 100644 index 000000000000..ed36bbd774e3 --- /dev/null +++ b/tools/testing/selftests/filesystems/incfs/incfs_perf.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +#define err_msg(...) \ + do { \ + fprintf(stderr, "%s: (%d) ", TAG, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " (%s)\n", strerror(errno)); \ + } while (false) + +#define TAG "incfs_perf" + +struct options { + int blocks; /* -b number of diff block sizes */ + bool no_cleanup; /* -c don't clean up after */ + const char *test_dir; /* -d working directory */ + const char *file_types; /* -f sScCvV */ + bool no_native; /* -n don't test native files */ + bool no_random; /* -r don't do random reads*/ + bool no_linear; /* -R random reads only */ + size_t size; /* -s file size as power of 2 */ + int tries; /* -t times to run test*/ +}; + +enum flags { + SHUFFLE = 1, + COMPRESS = 2, + VERIFY = 4, + LAST_FLAG = 8, +}; + +void print_help(void) +{ + puts( + "incfs_perf. 
Performance test tool for incfs\n" + "\tTests read performance of incfs by creating files of various types,\n" + "\tflushing caches, and then reading them back.\n" + "\tEach file is read with different block sizes and the average\n" + "\tthroughput in megabytes/second and memory usage are reported for\n" + "\teach block size.\n" + "\tNative files are tested for comparison.\n" + "\tNative files are created in the native folder; incfs files are created\n" + "\tin the src folder, which is mounted on the dst folder.\n" + "\n" + "\t-bn (default 8) number of different block sizes, starting at 4096\n" + "\t and doubling\n" + "\t-c don't Clean up - leave files and mount point\n" + "\t-d dir create directories in dir\n" + "\t-fs|Sc|Cv|V restrict which files are created.\n" + "\t s blocks not shuffled, S blocks shuffled\n" + "\t c blocks not compressed, C blocks compressed\n" + "\t v files not verified, V files verified\n" + "\t If a letter is omitted, both options are tested\n" + "\t If no letters are given, incfs is not tested\n" + "\t-n Don't test native files\n" + "\t-r No random reads (sequential only)\n" + "\t-R Random reads only (no sequential)\n" + "\t-sn (default 30) File size as a power of 2\n" + "\t-tn (default 5) Number of tries per file. Results are averaged\n" + ); +} + +int parse_options(int argc, char *const *argv, struct options *options) +{ + signed char c; + + /* Set defaults here */ + *options = (struct options){ + .blocks = 8, + .test_dir = ".", + .tries = 5, + .size = 30, + }; + + /* Load options from command line here */ + while ((c = getopt(argc, argv, "b:cd:f::hnrRs:t:")) != -1) { + switch (c) { + case 'b': + options->blocks = strtol(optarg, NULL, 10); + break; + + case 'c': + options->no_cleanup = true; + break; + + case 'd': + options->test_dir = optarg; + break; + + case 'f': + if (optarg) + options->file_types = optarg; + else + options->file_types = "sS"; + break; + + case 'h': + print_help(); + exit(0); + + case 'n': + options->no_native = true; + break; + + case 'r': + options->no_random = true; + break; + + case 'R': + options->no_linear = true; + break; + + case 's': + options->size = strtol(optarg, NULL, 10); + break; + + case 't': + options->tries = strtol(optarg, NULL, 10); + break; + + default: + print_help(); + return -EINVAL; + } + } + + options->size = 1L << options->size; + + return 0; +} + +void shuffle(size_t *buffer, size_t size) +{ + size_t i; + + for (i = 0; i < size; ++i) { + /* Fisher-Yates: swap element i with a uniform pick from [i, size) */ + size_t j = i + random() % (size - i); + size_t temp = buffer[i]; + + buffer[i] = buffer[j]; + buffer[j] = temp; + } +} + +int get_free_memory(void) +{ + FILE *meminfo = fopen("/proc/meminfo", "re"); + char field[256]; + char value[256] = {}; + + if (!meminfo) + return -ENOENT; + + /* Bound both conversions so neither buffer can overflow */ + while (fscanf(meminfo, "%255[^:]: %255s kB\n", field, value) == 2) { + if (!strcmp(field, "MemFree")) + break; + *value = 0; + } + + fclose(meminfo); + + if (!*value) + return -ENOENT; + + return strtol(value, NULL, 10); +} + +int write_data(int cmd_fd, int dir_fd, const char *name, size_t size, int flags) +{ + int fd = openat(dir_fd, name, O_RDWR | O_CLOEXEC); + struct incfs_permit_fill permit_fill = { + .file_descriptor = fd, + }; + int block_count = 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; + size_t *blocks = malloc(sizeof(size_t) * block_count); + int error = 0; + size_t i; + uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE] = {}; + uint8_t compressed_data[INCFS_DATA_FILE_BLOCK_SIZE] = {}; + struct incfs_fill_block fill_block = { + .compression = COMPRESSION_NONE, + .data_len = sizeof(data), + .data = ptr_to_u64(data), + }; + 
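/* + * Fill the file's data through the incfs ioctl protocol: authorize + * this fd with INCFS_IOC_PERMIT_FILL on the command fd, then load + * each block with INCFS_IOC_FILL_BLOCKS, optionally LZ4-compressed + * and in shuffled block order. + */ + 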
if (!blocks) { + err_msg("Out of memory"); + error = -ENOMEM; + goto out; + } + + if (fd == -1) { + err_msg("Could not open file for writing %s", name); + error = -errno; + goto out; + } + + if (ioctl(cmd_fd, INCFS_IOC_PERMIT_FILL, &permit_fill)) { + err_msg("Failed to call PERMIT_FILL"); + error = -errno; + goto out; + } + + for (i = 0; i < block_count; ++i) + blocks[i] = i; + + if (flags & SHUFFLE) + shuffle(blocks, block_count); + + if (flags & COMPRESS) { + size_t comp_size = LZ4_compress_default( + (char *)data, (char *)compressed_data, sizeof(data), + ARRAY_SIZE(compressed_data)); + + if (comp_size <= 0) { + error = -EBADMSG; + goto out; + } + fill_block.compression = COMPRESSION_LZ4; + fill_block.data = ptr_to_u64(compressed_data); + fill_block.data_len = comp_size; + } + + for (i = 0; i < block_count; ++i) { + struct incfs_fill_blocks fill_blocks = { + .count = 1, + .fill_blocks = ptr_to_u64(&fill_block), + }; + int written; + + fill_block.block_index = blocks[i]; + written = ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks); + + if (written != 1) { + error = -errno; + err_msg("Failed to write block %zu in file %s", i, + name); + break; + } + } + +out: + free(blocks); + close(fd); + sync(); + return error; +} + +int measure_read_throughput_internal(const char *tag, int dir, const char *name, + const struct options *options, bool random) +{ + int block; + + if (random) + printf("%32s(random)", tag); + else + printf("%40s", tag); + + for (block = 0; block < options->blocks; ++block) { + size_t buffer_size; + char *buffer; + int try; + double time = 0; + double throughput; + int memory = 0; + + buffer_size = 1 << (block + 12); + buffer = malloc(buffer_size); + if (!buffer) { + err_msg("Not enough memory"); + return -ENOMEM; + } + + for (try = 0; try < options->tries; ++try) { + int err; + struct timespec start_time, end_time; + off_t i; + int fd = -1; + size_t offsets_size = options->size / buffer_size; + size_t *offsets = + malloc(offsets_size * sizeof(*offsets)); + int start_memory, end_memory; + + if (!offsets) { + err_msg("Not enough memory"); + err = -ENOMEM; + goto fail; + } + + for (i = 0; i < offsets_size; ++i) + offsets[i] = i * buffer_size; + + if (random) + shuffle(offsets, offsets_size); + + err = drop_caches(); + if (err) { + err_msg("Failed to drop caches"); + goto fail; + } + + start_memory = get_free_memory(); + if (start_memory < 0) { + err_msg("Failed to get start memory"); + err = start_memory; + goto fail; + } + + fd = openat(dir, name, O_RDONLY | O_CLOEXEC); + if (fd == -1) { + err_msg("Failed to open file"); + err = -errno; + goto fail; + } + + err = clock_gettime(CLOCK_MONOTONIC, &start_time); + if (err) { + err_msg("Failed to get start time"); + goto fail; + } + + for (i = 0; i < offsets_size; ++i) + if (pread(fd, buffer, buffer_size, + offsets[i]) != buffer_size) { + err_msg("Failed to read file"); + err = -errno; + goto fail; + } + + err = clock_gettime(CLOCK_MONOTONIC, &end_time); + if (err) { + err_msg("Failed to get end time"); + goto fail; + } + + end_memory = get_free_memory(); + if (end_memory < 0) { + err_msg("Failed to get end memory"); + err = end_memory; + goto fail; + } + + time += end_time.tv_sec - start_time.tv_sec; + time += (end_time.tv_nsec - start_time.tv_nsec) / 1e9; + + close(fd); + fd = -1; + memory += start_memory - end_memory; + +fail: + free(offsets); + close(fd); + if (err) { + free(buffer); + return err; + } + } + + throughput = options->size * options->tries / time; + printf("%10.3e %10d", throughput, memory / options->tries); + free(buffer); + } + + printf("\n"); + return 0; +} + +int measure_read_throughput(const char *tag, int dir, const char *name, + const struct options *options) +{ + 
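/* Sequential pass first, then a random pass, unless -r/-R disabled one */ + 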
int err = 0; + + if (!options->no_linear) + err = measure_read_throughput_internal(tag, dir, name, options, + false); + + if (!err && !options->no_random) + err = measure_read_throughput_internal(tag, dir, name, options, + true); + return err; +} + +int test_native_file(int dir, const struct options *options) +{ + const char *name = "file"; + int fd; + char buffer[4096] = {}; + off_t i; + int err; + + fd = openat(dir, name, O_CREAT | O_WRONLY | O_CLOEXEC, 0600); + if (fd == -1) { + err_msg("Could not open native file"); + return -errno; + } + + for (i = 0; i < options->size; i += sizeof(buffer)) + if (pwrite(fd, buffer, sizeof(buffer), i) != sizeof(buffer)) { + err_msg("Failed to write file"); + err = -errno; + goto fail; + } + + close(fd); + sync(); + fd = -1; + + err = measure_read_throughput("native", dir, name, options); + +fail: + close(fd); + return err; +} + +struct hash_block { + char data[INCFS_DATA_FILE_BLOCK_SIZE]; +}; + +static struct hash_block *build_mtree(size_t size, char *root_hash, + int *mtree_block_count) +{ + char data[INCFS_DATA_FILE_BLOCK_SIZE] = {}; + const int digest_size = SHA256_DIGEST_SIZE; + const int hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / digest_size; + int block_count = 0; + int hash_block_count = 0; + int total_tree_block_count = 0; + int tree_lvl_index[INCFS_MAX_MTREE_LEVELS] = {}; + int tree_lvl_count[INCFS_MAX_MTREE_LEVELS] = {}; + int levels_count = 0; + int i, level; + struct hash_block *mtree; + + if (size == 0) + return 0; + + block_count = 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; + hash_block_count = block_count; + for (i = 0; hash_block_count > 1; i++) { + hash_block_count = (hash_block_count + hash_per_block - 1) / + hash_per_block; + tree_lvl_count[i] = hash_block_count; + total_tree_block_count += hash_block_count; + } + levels_count = i; + + for (i = 0; i < levels_count; i++) { + int prev_lvl_base = (i == 0) ? total_tree_block_count : + tree_lvl_index[i - 1]; + + tree_lvl_index[i] = prev_lvl_base - tree_lvl_count[i]; + } + + *mtree_block_count = total_tree_block_count; + mtree = calloc(total_tree_block_count, sizeof(*mtree)); + /* Build level 0 hashes. */ + for (i = 0; i < block_count; i++) { + int block_index = tree_lvl_index[0] + i / hash_per_block; + int block_off = (i % hash_per_block) * digest_size; + char *hash_ptr = mtree[block_index].data + block_off; + + sha256(data, INCFS_DATA_FILE_BLOCK_SIZE, hash_ptr); + } + + /* Build higher levels of hash tree. 
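Each level packs hash_per_block digests of the level below into one block, so the loop shrinks the tree until a single root block remains. 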
*/ + for (level = 1; level < levels_count; level++) { + int prev_lvl_base = tree_lvl_index[level - 1]; + int prev_lvl_count = tree_lvl_count[level - 1]; + + for (i = 0; i < prev_lvl_count; i++) { + int block_index = + i / hash_per_block + tree_lvl_index[level]; + int block_off = (i % hash_per_block) * digest_size; + char *hash_ptr = mtree[block_index].data + block_off; + + sha256(mtree[i + prev_lvl_base].data, + INCFS_DATA_FILE_BLOCK_SIZE, hash_ptr); + } + } + + /* Calculate root hash from the top block */ + sha256(mtree[0].data, INCFS_DATA_FILE_BLOCK_SIZE, root_hash); + + return mtree; +} + +static int load_hash_tree(int cmd_fd, int dir, const char *name, + struct hash_block *mtree, int mtree_block_count) +{ + int err; + int i; + int fd; + struct incfs_fill_block *fill_block_array = + calloc(mtree_block_count, sizeof(struct incfs_fill_block)); + struct incfs_fill_blocks fill_blocks = { + .count = mtree_block_count, + .fill_blocks = ptr_to_u64(fill_block_array), + }; + struct incfs_permit_fill permit_fill; + + if (!fill_block_array) + return -ENOMEM; + + for (i = 0; i < fill_blocks.count; i++) { + fill_block_array[i] = (struct incfs_fill_block){ + .block_index = i, + .data_len = INCFS_DATA_FILE_BLOCK_SIZE, + .data = ptr_to_u64(mtree[i].data), + .flags = INCFS_BLOCK_FLAGS_HASH + }; + } + + fd = openat(dir, name, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + goto failure; + } + + permit_fill.file_descriptor = fd; + if (ioctl(cmd_fd, INCFS_IOC_PERMIT_FILL, &permit_fill)) { + err_msg("Failed to call PERMIT_FILL"); + err = -errno; + goto failure; + } + + err = ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks); + close(fd); + if (err < fill_blocks.count) + err = -errno; + else + err = 0; + +failure: + free(fill_block_array); + return err; +} + +int test_incfs_file(int dst_dir, const struct options *options, int flags) +{ + int cmd_file = openat(dst_dir, INCFS_PENDING_READS_FILENAME, + O_RDONLY | O_CLOEXEC); + int err; + char name[4]; + incfs_uuid_t id; + char tag[256]; + + snprintf(name, sizeof(name), "%c%c%c", + flags & SHUFFLE ? 'S' : 's', + flags & COMPRESS ? 'C' : 'c', + flags & VERIFY ? 'V' : 'v'); + + if (cmd_file == -1) { + err_msg("Could not open command file"); + return -errno; + } + + if (flags & VERIFY) { + char root_hash[INCFS_MAX_HASH_SIZE]; + int mtree_block_count; + struct hash_block *mtree = build_mtree(options->size, root_hash, + &mtree_block_count); + + if (!mtree) { + err_msg("Failed to build hash tree"); + err = -ENOMEM; + goto fail; + } + + err = crypto_emit_file(cmd_file, NULL, name, &id, options->size, + root_hash, "add_data"); + + if (!err) + err = load_hash_tree(cmd_file, dst_dir, name, mtree, + mtree_block_count); + + free(mtree); + } else + err = emit_file(cmd_file, NULL, name, &id, options->size, NULL); + + if (err) { + err_msg("Failed to create file %s", name); + goto fail; + } + + err = write_data(cmd_file, dst_dir, name, options->size, flags); + if (err) + goto fail; + + snprintf(tag, sizeof(tag), "incfs%s%s%s", + flags & SHUFFLE ? "(shuffle)" : "", + flags & COMPRESS ? "(compress)" : "", + flags & VERIFY ? "(verify)" : ""); + + err = measure_read_throughput(tag, dst_dir, name, options); + +fail: + close(cmd_file); + return err; +} + +bool skip(struct options const *options, int flag, char c) +{ + if (!options->file_types) + return false; + + if (flag && strchr(options->file_types, tolower(c))) + return true; + + if (!flag && strchr(options->file_types, toupper(c))) + return true; + + return false; +} + +int main(int argc, char *const *argv) +{ + struct options options; + int err; + const char *native_dir = "native"; + const char *src_dir = "src"; + const char *dst_dir = "dst"; + int native_dir_fd = -1; + int src_dir_fd = -1; + int dst_dir_fd = -1; + int block; + int flags; + + err = parse_options(argc, argv, &options); + if (err) + return err; + + err = chdir(options.test_dir); + if (err) { + err_msg("Failed to change to %s", options.test_dir); + return -errno; + } + + /* Clean up any interrupted previous runs */ + while (!umount(dst_dir)) + ; + + err = remove_dir(native_dir) || remove_dir(src_dir) || + remove_dir(dst_dir); + if (err) + return err; + + err = mkdir(native_dir, 0700); + if (err) { + err_msg("Failed to make directory %s", native_dir); + err = -errno; + goto cleanup; + } + + err = mkdir(src_dir, 0700); + if (err) { + err_msg("Failed to make directory %s", src_dir); + err = -errno; + goto cleanup; + } + + err = mkdir(dst_dir, 0700); + if (err) { + err_msg("Failed to make directory %s", dst_dir); + err = -errno; + goto cleanup; + } + + err = mount_fs_opt(dst_dir, src_dir, "readahead=0,rlog_pages=0", 0); + if (err) { + err_msg("Failed to mount incfs"); + goto cleanup; + } + + native_dir_fd = open(native_dir, O_RDONLY | O_CLOEXEC); + src_dir_fd = open(src_dir, O_RDONLY | O_CLOEXEC); + dst_dir_fd = open(dst_dir, O_RDONLY | O_CLOEXEC); + if (native_dir_fd == -1 || src_dir_fd == -1 || dst_dir_fd == -1) { + err_msg("Failed to open native, src or dst dir"); + err = -errno; + goto cleanup; + } + + printf("%40s", ""); + for (block = 0; block < options.blocks; ++block) + printf("%21d", 1 << (block + 12)); + printf("\n"); + + if (!err && !options.no_native) + err = test_native_file(native_dir_fd, &options); + + for (flags = 0; flags < LAST_FLAG && !err; ++flags) { + if (skip(&options, flags & SHUFFLE, 's') || + skip(&options, flags & COMPRESS, 'c') || + skip(&options, flags & VERIFY, 'v')) + continue; + err = test_incfs_file(dst_dir_fd, &options, flags); + } + +cleanup: + close(native_dir_fd); + close(src_dir_fd); + close(dst_dir_fd); + if (!options.no_cleanup) { + umount(dst_dir); + remove_dir(native_dir); + remove_dir(dst_dir); + remove_dir(src_dir); + } + + return err; +} diff --git a/tools/testing/selftests/filesystems/incfs/incfs_stress.c b/tools/testing/selftests/filesystems/incfs/incfs_stress.c new file mode 100644 index 000000000000..a1d491755832 --- /dev/null +++ b/tools/testing/selftests/filesystems/incfs/incfs_stress.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +#define err_msg(...) 
\ + do { \ + fprintf(stderr, "%s: (%d) ", TAG, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " (%s)\n", strerror(errno)); \ + } while (false) + +#define TAG "incfs_stress" + +struct options { + bool no_cleanup; /* -c */ + const char *test_dir; /* -d */ + unsigned int rng_seed; /* -g */ + int num_reads; /* -n */ + int readers; /* -r */ + int size; /* -s */ + int timeout; /* -t */ +}; + +struct read_data { + const char *filename; + int dir_fd; + size_t filesize; + int num_reads; + unsigned int rng_seed; +}; + +int cancel_threads; + +int parse_options(int argc, char *const *argv, struct options *options) +{ + signed char c; + + /* Set defaults here */ + *options = (struct options){ + .test_dir = ".", + .num_reads = 1000, + .readers = 10, + .size = 10, + }; + + /* Load options from command line here */ + while ((c = getopt(argc, argv, "cd:g:n:r:s:t:")) != -1) { + switch (c) { + case 'c': + options->no_cleanup = true; + break; + + case 'd': + options->test_dir = optarg; + break; + + case 'g': + options->rng_seed = strtol(optarg, NULL, 10); + break; + + case 'n': + options->num_reads = strtol(optarg, NULL, 10); + break; + + case 'r': + options->readers = strtol(optarg, NULL, 10); + break; + + case 's': + options->size = strtol(optarg, NULL, 10); + break; + + case 't': + options->timeout = strtol(optarg, NULL, 10); + break; + } + } + + return 0; +} + +void *reader(void *data) +{ + struct read_data *read_data = (struct read_data *)data; + int i; + int fd = -1; + void *buffer = malloc(read_data->filesize); + + if (!buffer) { + err_msg("Failed to alloc read buffer"); + goto out; + } + + fd = openat(read_data->dir_fd, read_data->filename, + O_RDONLY | O_CLOEXEC); + if (fd == -1) { + err_msg("Failed to open file"); + goto out; + } + + for (i = 0; i < read_data->num_reads && !cancel_threads; ++i) { + off_t offset = rnd(read_data->filesize, &read_data->rng_seed); + size_t count = + rnd(read_data->filesize - offset, &read_data->rng_seed); + ssize_t err = pread(fd, buffer, count, offset); + + if (err != count) + err_msg("failed to read with value %lu", err); + } + +out: + close(fd); + free(read_data); + free(buffer); + return NULL; +} + +int write_data(int cmd_fd, int dir_fd, const char *name, size_t size) +{ + int fd = openat(dir_fd, name, O_RDWR | O_CLOEXEC); + struct incfs_permit_fill permit_fill = { + .file_descriptor = fd, + }; + int error = 0; + int i; + int block_count = 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; + + if (fd == -1) { + err_msg("Could not open file for writing %s", name); + return -errno; + } + + if (ioctl(cmd_fd, INCFS_IOC_PERMIT_FILL, &permit_fill)) { + err_msg("Failed to call PERMIT_FILL"); + error = -errno; + goto out; + } + + for (i = 0; i < block_count; ++i) { + uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE] = {}; + size_t block_size = + size > i * INCFS_DATA_FILE_BLOCK_SIZE ? 
size - i * INCFS_DATA_FILE_BLOCK_SIZE > + INCFS_DATA_FILE_BLOCK_SIZE ? + INCFS_DATA_FILE_BLOCK_SIZE : + size - (i * INCFS_DATA_FILE_BLOCK_SIZE) : + 0; /* cap the last, possibly partial, block */ + struct incfs_fill_block fill_block = { + .compression = COMPRESSION_NONE, + .block_index = i, + .data_len = block_size, + .data = ptr_to_u64(data), + }; + struct incfs_fill_blocks fill_blocks = { + .count = 1, + .fill_blocks = ptr_to_u64(&fill_block), + }; + int written = ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks); + + if (written != 1) { + error = -errno; + err_msg("Failed to write block %d in file %s", i, name); + break; + } + } +out: + close(fd); + return error; +} + +int test_files(int src_dir, int dst_dir, struct options const *options) +{ + unsigned int seed = options->rng_seed; + int cmd_file = openat(dst_dir, INCFS_PENDING_READS_FILENAME, + O_RDONLY | O_CLOEXEC); + int err; + const char *name = "001"; + incfs_uuid_t id; + size_t size; + int i; + pthread_t *threads = NULL; + + size = 1 << (rnd(options->size, &seed) + 12); + size += rnd(size, &seed); + + if (cmd_file == -1) { + err_msg("Could not open command file"); + return -errno; + } + + err = emit_file(cmd_file, NULL, name, &id, size, NULL); + if (err) { + err_msg("Failed to create file %s", name); + close(cmd_file); + return err; + } + + threads = malloc(sizeof(pthread_t) * options->readers); + if (!threads) { + err_msg("Could not allocate memory for threads"); + close(cmd_file); + return -ENOMEM; + } + + for (i = 0; i < options->readers; ++i) { + struct read_data *read_data = malloc(sizeof(*read_data)); + + if (!read_data) { + err_msg("Failed to allocate read_data"); + err = -ENOMEM; + break; + } + + *read_data = (struct read_data){ + .filename = name, + .dir_fd = dst_dir, + .filesize = size, + .num_reads = options->num_reads, + .rng_seed = seed, + }; + + rnd(0, &seed); + + err = pthread_create(threads + i, 0, reader, read_data); + if (err) { + err_msg("Failed to create thread"); + free(read_data); + break; + } + } + + if (err) + cancel_threads = 1; + else + err = write_data(cmd_file, dst_dir, name, size); + + for (; i > 0; --i) { + if (pthread_join(threads[i - 1], NULL)) { + err_msg("FATAL: failed to join thread"); + exit(-errno); + } + } + + free(threads); + close(cmd_file); + return err; +} + +int main(int argc, char *const *argv) +{ + struct options options; + int err; + const char *src_dir = "src"; + const char *dst_dir = "dst"; + int src_dir_fd = -1; + int dst_dir_fd = -1; + + err = parse_options(argc, argv, &options); + if (err) + return err; + + err = chdir(options.test_dir); + if (err) { + err_msg("Failed to change to %s", options.test_dir); + return -errno; + } + + err = remove_dir(src_dir) || remove_dir(dst_dir); + if (err) + return err; + + err = mkdir(src_dir, 0700); + if (err) { + err_msg("Failed to make directory %s", src_dir); + err = -errno; + goto cleanup; + } + + err = mkdir(dst_dir, 0700); + if (err) { + err_msg("Failed to make directory %s", dst_dir); + err = -errno; + goto cleanup; + } + + err = mount_fs(dst_dir, src_dir, options.timeout); + if (err) { + err_msg("Failed to mount incfs"); + goto cleanup; + } + + src_dir_fd = open(src_dir, O_RDONLY | O_CLOEXEC); + dst_dir_fd = open(dst_dir, O_RDONLY | O_CLOEXEC); + if (src_dir_fd == -1 || dst_dir_fd == -1) { + err_msg("Failed to open src or dst dir"); + err = -errno; + goto cleanup; + } + + err = test_files(src_dir_fd, dst_dir_fd, &options); + +cleanup: + close(src_dir_fd); + close(dst_dir_fd); + if (!options.no_cleanup) { + umount(dst_dir); + remove_dir(dst_dir); + remove_dir(src_dir); + } + + return err; +} diff --git a/tools/testing/selftests/filesystems/incfs/incfs_test.c 
b/tools/testing/selftests/filesystems/incfs/incfs_test.c new file mode 100644 index 000000000000..fc2f55884972 --- /dev/null +++ b/tools/testing/selftests/filesystems/incfs/incfs_test.c @@ -0,0 +1,5008 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018 Google LLC + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include "utils.h" + +/* Can't include uapi/linux/fs.h because it clashes with mount.h */ +#define FS_IOC_GETFLAGS _IOR('f', 1, long) +#define FS_VERITY_FL 0x00100000 /* Verity protected inode */ + +#define TEST_SKIP 2 +#define TEST_FAILURE 1 +#define TEST_SUCCESS 0 + +#define INCFS_ROOT_INODE 0 + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define le16_to_cpu(x) (x) +#define le32_to_cpu(x) (x) +#define le64_to_cpu(x) (x) +#else +#error Big endian not supported! +#endif + +struct { + int file; + bool verbose; +} options; + +#define TESTCOND(condition) \ + do { \ + if (!(condition)) { \ + ksft_print_msg("%s failed %d\n", \ + __func__, __LINE__); \ + goto out; \ + } else if (options.verbose) \ + ksft_print_msg("%s succeeded %d\n", \ + __func__, __LINE__); \ + } while (false) + +#define TEST(statement, condition) \ + do { \ + statement; \ + TESTCOND(condition); \ + } while (false) + +#define TESTEQUAL(statement, res) \ + TESTCOND((statement) == (res)) + +#define TESTNE(statement, res) \ + TESTCOND((statement) != (res)) + +#define TESTSYSCALL(statement) \ + do { \ + int res = statement; \ + \ + if (res) \ + ksft_print_msg("Failed: %s (%d)\n", \ + strerror(errno), errno); \ + TESTEQUAL(res, 0); \ + } while (false) + +void print_bytes(const void *data, size_t size) +{ + const uint8_t *bytes = data; + int i; + + for (i = 0; i < size; ++i) { + if (i % 0x10 == 0) + printf("%08x:", i); + printf("%02x ", (unsigned int) bytes[i]); + if (i % 0x10 == 0x0f) + printf("\n"); + } + + if (i % 0x10 != 0) + printf("\n"); +} + +struct hash_block { + char data[INCFS_DATA_FILE_BLOCK_SIZE]; +}; + +struct test_signature { + void *data; + size_t size; + + char add_data[100]; + size_t add_data_size; +}; + +struct test_file { + int index; + incfs_uuid_t id; + char *name; + off_t size; + char root_hash[INCFS_MAX_HASH_SIZE]; + struct hash_block *mtree; + int mtree_block_count; + struct test_signature sig; + unsigned char *verity_sig; + size_t verity_sig_size; +}; + +struct test_files_set { + struct test_file *files; + int files_count; +}; + +struct linux_dirent64 { + uint64_t d_ino; + int64_t d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[0]; +} __packed; + +struct test_files_set get_test_files_set(void) +{ + static struct test_file files[] = { + { .index = 0, .name = "file_one_byte", .size = 1 }, + { .index = 1, + .name = "file_one_block", + .size = INCFS_DATA_FILE_BLOCK_SIZE }, + { .index = 2, + .name = "file_one_and_a_half_blocks", + .size = INCFS_DATA_FILE_BLOCK_SIZE + + INCFS_DATA_FILE_BLOCK_SIZE / 2 }, + { .index = 3, + .name = "file_three", + .size = 300 * INCFS_DATA_FILE_BLOCK_SIZE + 3 }, + { .index = 4, + .name = "file_four", + .size = 400 * INCFS_DATA_FILE_BLOCK_SIZE + 7 }, + { .index = 5, + .name = "file_five", + .size = 500 * INCFS_DATA_FILE_BLOCK_SIZE + 7 }, + { .index = 6, + .name = "file_six", + .size = 600 * INCFS_DATA_FILE_BLOCK_SIZE + 7 }, + { .index = 7, + .name = "file_seven", + 
.size = 700 * INCFS_DATA_FILE_BLOCK_SIZE + 7 }, + { .index = 8, + .name = "file_eight", + .size = 800 * INCFS_DATA_FILE_BLOCK_SIZE + 7 }, + { .index = 9, + .name = "file_nine", + .size = 900 * INCFS_DATA_FILE_BLOCK_SIZE + 7 }, + { .index = 10, .name = "file_big", .size = 500 * 1024 * 1024 } + }; + + if (options.file) + return (struct test_files_set) { + .files = files + options.file - 1, + .files_count = 1, + }; + + return (struct test_files_set){ .files = files, + .files_count = ARRAY_SIZE(files) }; +} + +struct test_files_set get_small_test_files_set(void) +{ + static struct test_file files[] = { + { .index = 0, .name = "file_one_byte", .size = 1 }, + { .index = 1, + .name = "file_one_block", + .size = INCFS_DATA_FILE_BLOCK_SIZE }, + { .index = 2, + .name = "file_one_and_a_half_blocks", + .size = INCFS_DATA_FILE_BLOCK_SIZE + + INCFS_DATA_FILE_BLOCK_SIZE / 2 }, + { .index = 3, + .name = "file_three", + .size = 300 * INCFS_DATA_FILE_BLOCK_SIZE + 3 }, + { .index = 4, + .name = "file_four", + .size = 400 * INCFS_DATA_FILE_BLOCK_SIZE + 7 } + }; + return (struct test_files_set){ .files = files, + .files_count = ARRAY_SIZE(files) }; +} + +static int get_file_block_seed(int file, int block) +{ + return 7919 * file + block; +} + +static loff_t min(loff_t a, loff_t b) +{ + return a < b ? a : b; +} + +static int ilog2(size_t n) +{ + int l = 0; + + while (n > 1) { + ++l; + n >>= 1; + } + return l; +} + +static pid_t flush_and_fork(void) +{ + fflush(stdout); + return fork(); +} + +static void print_error(char *msg) +{ + ksft_print_msg("%s: %s\n", msg, strerror(errno)); +} + +static int wait_for_process(pid_t pid) +{ + int status; + int wait_res; + + wait_res = waitpid(pid, &status, 0); + if (wait_res <= 0) { + print_error("Can't wait for the child"); + return -EINVAL; + } + if (!WIFEXITED(status)) { + ksft_print_msg("Unexpected child status pid=%d\n", pid); + return -EINVAL; + } + status = WEXITSTATUS(status); + if (status != 0) + return status; + return 0; +} + +static void rnd_buf(uint8_t *data, size_t len, unsigned int seed) +{ + int i; + + for (i = 0; i < len; i++) { + seed = 1103515245 * seed + 12345; + data[i] = (uint8_t)(seed >> (i % 13)); + } +} + +char *bin2hex(char *dst, const void *src, size_t count) +{ + const unsigned char *_src = src; + static const char hex_asc[] = "0123456789abcdef"; + + while (count--) { + unsigned char x = *_src++; + + *dst++ = hex_asc[(x & 0xf0) >> 4]; + *dst++ = hex_asc[(x & 0x0f)]; + } + *dst = 0; + return dst; +} + +static char *get_index_filename(const char *mnt_dir, incfs_uuid_t id) +{ + char path[FILENAME_MAX]; + char str_id[1 + 2 * sizeof(id)]; + + bin2hex(str_id, id.bytes, sizeof(id.bytes)); + snprintf(path, ARRAY_SIZE(path), "%s/.index/%s", mnt_dir, str_id); + + return strdup(path); +} + +static char *get_incomplete_filename(const char *mnt_dir, incfs_uuid_t id) +{ + char path[FILENAME_MAX]; + char str_id[1 + 2 * sizeof(id)]; + + bin2hex(str_id, id.bytes, sizeof(id.bytes)); + snprintf(path, ARRAY_SIZE(path), "%s/.incomplete/%s", mnt_dir, str_id); + + return strdup(path); +} + +int open_file_by_id(const char *mnt_dir, incfs_uuid_t id, bool use_ioctl) +{ + char *path = get_index_filename(mnt_dir, id); + int cmd_fd = open_commands_file(mnt_dir); + int fd = open(path, O_RDWR | O_CLOEXEC); + struct incfs_permit_fill permit_fill = { + .file_descriptor = fd, + }; + int error = 0; + + if (fd < 0) { + print_error("Can't open file by id."); + error = -errno; + goto out; + } + + if (use_ioctl && ioctl(cmd_fd, INCFS_IOC_PERMIT_FILL, &permit_fill)) { + 
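/* PERMIT_FILL on the command fd is expected to succeed for an index fd */ + 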
print_error("Failed to call PERMIT_FILL"); + error = -errno; + goto out; + } + + if (ioctl(fd, INCFS_IOC_PERMIT_FILL, &permit_fill) != -1) { + print_error( + "Successfully called PERMIT_FILL on non pending_read file"); + error = -EPERM; + goto out; + } + +out: + free(path); + close(cmd_fd); + + if (error) { + close(fd); + return error; + } + + return fd; +} + +int get_file_attr(const char *mnt_dir, incfs_uuid_t id, char *value, int size) +{ + char *path = get_index_filename(mnt_dir, id); + int res; + + res = getxattr(path, INCFS_XATTR_METADATA_NAME, value, size); + if (res < 0) + res = -errno; + + free(path); + return res; +} + +static bool same_id(incfs_uuid_t *id1, incfs_uuid_t *id2) +{ + return !memcmp(id1->bytes, id2->bytes, sizeof(id1->bytes)); +} + +ssize_t ZSTD_compress_default(char *data, char *comp_data, size_t data_size, + size_t comp_size) +{ + return ZSTD_compress(comp_data, comp_size, data, data_size, 1); +} + +static int emit_test_blocks(const char *mnt_dir, struct test_file *file, + int blocks[], int count) +{ + uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE]; + uint8_t comp_data[2 * INCFS_DATA_FILE_BLOCK_SIZE]; + int block_count = (count > 32) ? 32 : count; + int data_buf_size = 2 * INCFS_DATA_FILE_BLOCK_SIZE * block_count; + uint8_t *data_buf = malloc(data_buf_size); + uint8_t *current_data = data_buf; + uint8_t *data_end = data_buf + data_buf_size; + struct incfs_fill_block *block_buf = + calloc(block_count, sizeof(struct incfs_fill_block)); + struct incfs_fill_blocks fill_blocks = { + .count = block_count, + .fill_blocks = ptr_to_u64(block_buf), + }; + ssize_t write_res = 0; + int fd = -1; + int error = 0; + int i = 0; + int blocks_written = 0; + + if (!data_buf || !block_buf) { + error = -ENOMEM; + goto out; + } + + for (i = 0; i < block_count; i++) { + int block_index = blocks[i]; + bool compress_zstd = (file->index + block_index) % 4 == 2; + bool compress_lz4 = (file->index + block_index) % 4 == 0; + int seed = get_file_block_seed(file->index, block_index); + off_t block_offset = + ((off_t)block_index) * INCFS_DATA_FILE_BLOCK_SIZE; + size_t block_size = 0; + + if (block_offset > file->size) { + error = -EINVAL; + break; + } + if (file->size - block_offset > INCFS_DATA_FILE_BLOCK_SIZE) + block_size = INCFS_DATA_FILE_BLOCK_SIZE; + else + block_size = file->size - block_offset; + + rnd_buf(data, block_size, seed); + if (compress_lz4) { + size_t comp_size = LZ4_compress_default((char *)data, + (char *)comp_data, block_size, + ARRAY_SIZE(comp_data)); + + if (comp_size <= 0) { + error = -EBADMSG; + break; + } + if (current_data + comp_size > data_end) { + error = -ENOMEM; + break; + } + memcpy(current_data, comp_data, comp_size); + block_size = comp_size; + block_buf[i].compression = COMPRESSION_LZ4; + } else if (compress_zstd) { + size_t comp_size = ZSTD_compress(comp_data, + ARRAY_SIZE(comp_data), data, block_size, + 1); + + if (comp_size <= 0) { + error = -EBADMSG; + break; + } + if (current_data + comp_size > data_end) { + error = -ENOMEM; + break; + } + memcpy(current_data, comp_data, comp_size); + block_size = comp_size; + block_buf[i].compression = COMPRESSION_ZSTD; + } else { + if (current_data + block_size > data_end) { + error = -ENOMEM; + break; + } + memcpy(current_data, data, block_size); + block_buf[i].compression = COMPRESSION_NONE; + } + + block_buf[i].block_index = block_index; + block_buf[i].data_len = block_size; + block_buf[i].data = ptr_to_u64(current_data); + current_data += block_size; + } + + if (!error) { + fd = open_file_by_id(mnt_dir, file->id, false); + if (fd < 0) { + error = fd; /* open_file_by_id returns a negative error code */ + goto out; + } + write_res = 
ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks); + if (write_res >= 0) { + ksft_print_msg("FILL_BLOCKS on a fd without PERMIT_FILL unexpectedly succeeded\n"); + error = -EPERM; + goto out; + } + + close(fd); + fd = open_file_by_id(mnt_dir, file->id, true); + if (fd < 0) { + error = fd; + goto out; + } + + write_res = ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks); + if (write_res < 0) + error = -errno; + else + blocks_written = write_res; + } + if (error) { + ksft_print_msg( + "Writing data block error. Write returned: %zd. Error: %s\n", + write_res, strerror(-error)); + } + +out: + free(block_buf); + free(data_buf); + close(fd); + return (error < 0) ? error : blocks_written; +} + +static int emit_test_block(const char *mnt_dir, struct test_file *file, + int block_index) +{ + int res = emit_test_blocks(mnt_dir, file, &block_index, 1); + + if (res == 0) + return -EINVAL; + if (res == 1) + return 0; + return res; +} + +static void shuffle(int array[], int count, unsigned int seed) +{ + int i; + + for (i = 0; i < count - 1; i++) { + int items_left = count - i; + int shuffle_index; + int v; + + seed = 1103515245 * seed + 12345; + shuffle_index = i + seed % items_left; + + v = array[shuffle_index]; + array[shuffle_index] = array[i]; + array[i] = v; + } +} + +static int emit_test_file_data(const char *mount_dir, struct test_file *file) +{ + int i; + int block_cnt = 1 + (file->size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; + int *block_indexes = NULL; + int result = 0; + int blocks_written = 0; + + if (file->size == 0) + return 0; + + block_indexes = calloc(block_cnt, sizeof(*block_indexes)); + if (!block_indexes) + return -ENOMEM; + for (i = 0; i < block_cnt; i++) + block_indexes[i] = i; + shuffle(block_indexes, block_cnt, file->index); + + for (i = 0; i < block_cnt; i += blocks_written) { + blocks_written = emit_test_blocks(mount_dir, file, + block_indexes + i, block_cnt - i); + if (blocks_written < 0) { + result = blocks_written; + goto out; + } + if (blocks_written == 0) { + result = -EIO; + goto out; + } + } +out: + free(block_indexes); + return result; +} + +static loff_t read_whole_file(const char *filename) +{ + int fd = -1; + loff_t result; + loff_t bytes_read = 0; + uint8_t buff[16 * 1024]; + + fd = open(filename, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -errno; + + while (1) { + int read_result = read(fd, buff, ARRAY_SIZE(buff)); + + if (read_result < 0) { + print_error("Error during reading from a file."); + result = -errno; + goto cleanup; + } else if (read_result == 0) + break; + + bytes_read += read_result; + } + result = bytes_read; + +cleanup: + close(fd); + return result; +} + +static int read_test_file(uint8_t *buf, size_t len, char *filename, + int block_idx) +{ + int fd = -1; + int result; + int bytes_read = 0; + size_t bytes_to_read = len; + off_t offset = ((off_t)block_idx) * INCFS_DATA_FILE_BLOCK_SIZE; + + fd = open(filename, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -errno; + + if (lseek(fd, offset, SEEK_SET) != offset) { + print_error("Seek error"); + result = -errno; + goto cleanup; + } + + while (bytes_read < bytes_to_read) { + int read_result = + read(fd, buf + bytes_read, bytes_to_read - bytes_read); + if (read_result < 0) { + result = -errno; + goto cleanup; + } else if (read_result == 0) + break; + + bytes_read += read_result; + } + result = bytes_read; + +cleanup: + close(fd); + return result; +} + +static char *create_backing_dir(const char *mount_dir) +{ + struct stat st; + char backing_dir_name[255]; + + snprintf(backing_dir_name, ARRAY_SIZE(backing_dir_name), "%s-src", + mount_dir); + + if (stat(backing_dir_name, &st) == 0) { + if 
(S_ISDIR(st.st_mode)) { + int error = delete_dir_tree(backing_dir_name); + + if (error) { + ksft_print_msg( + "Can't delete existing backing dir. %d\n", + error); + return NULL; + } + } else { + if (unlink(backing_dir_name)) { + print_error("Can't clear backing dir"); + return NULL; + } + } + } + + if (mkdir(backing_dir_name, 0777)) { + if (errno != EEXIST) { + print_error("Can't open/create backing dir"); + return NULL; + } + } + + return strdup(backing_dir_name); +} + +static int validate_test_file_content_with_seed(const char *mount_dir, + struct test_file *file, + unsigned int shuffle_seed) +{ + int error = -1; + char *filename = concat_file_name(mount_dir, file->name); + off_t size = file->size; + loff_t actual_size = get_file_size(filename); + int block_cnt = 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; + int *block_indexes = NULL; + int i; + + block_indexes = alloca(sizeof(int) * block_cnt); + for (i = 0; i < block_cnt; i++) + block_indexes[i] = i; + + if (shuffle_seed != 0) + shuffle(block_indexes, block_cnt, shuffle_seed); + + if (actual_size != size) { + ksft_print_msg( + "File size doesn't match. name: %s expected size:%ld actual size:%ld\n", + filename, size, actual_size); + error = -1; + goto failure; + } + + for (i = 0; i < block_cnt; i++) { + int block_idx = block_indexes[i]; + uint8_t expected_block[INCFS_DATA_FILE_BLOCK_SIZE]; + uint8_t actual_block[INCFS_DATA_FILE_BLOCK_SIZE]; + int seed = get_file_block_seed(file->index, block_idx); + size_t bytes_to_compare = min( + (off_t)INCFS_DATA_FILE_BLOCK_SIZE, + size - ((off_t)block_idx) * INCFS_DATA_FILE_BLOCK_SIZE); + int read_result = + read_test_file(actual_block, INCFS_DATA_FILE_BLOCK_SIZE, + filename, block_idx); + if (read_result < 0) { + ksft_print_msg( + "Error reading block %d from file %s. Error: %s\n", + block_idx, filename, strerror(-read_result)); + error = read_result; + goto failure; + } + rnd_buf(expected_block, INCFS_DATA_FILE_BLOCK_SIZE, seed); + if (memcmp(expected_block, actual_block, bytes_to_compare)) { + ksft_print_msg( + "File contents don't match. 
name: %s block:%d\n", + file->name, block_idx); + error = -2; + goto failure; + } + } + free(filename); + return 0; + +failure: + free(filename); + return error; +} + +static int validate_test_file_content(const char *mount_dir, + struct test_file *file) +{ + return validate_test_file_content_with_seed(mount_dir, file, 0); +} + +static int data_producer(const char *mount_dir, struct test_files_set *test_set) +{ + int ret = 0; + int timeout_ms = 1000; + struct incfs_pending_read_info prs[100] = {}; + int prs_size = ARRAY_SIZE(prs); + int fd = open_commands_file(mount_dir); + + if (fd < 0) + return -errno; + + while ((ret = wait_for_pending_reads(fd, timeout_ms, prs, prs_size)) > + 0) { + int read_count = ret; + int i; + + for (i = 0; i < read_count; i++) { + int j = 0; + struct test_file *file = NULL; + + for (j = 0; j < test_set->files_count; j++) { + bool same = same_id(&(test_set->files[j].id), + &(prs[i].file_id)); + + if (same) { + file = &test_set->files[j]; + break; + } + } + if (!file) { + ksft_print_msg( + "Unknown file in pending reads.\n"); + break; + } + + ret = emit_test_block(mount_dir, file, + prs[i].block_index); + if (ret < 0) { + ksft_print_msg("Emitting test data error: %s\n", + strerror(-ret)); + break; + } + } + } + close(fd); + return ret; +} + +static int data_producer2(const char *mount_dir, + struct test_files_set *test_set) +{ + int ret = 0; + int timeout_ms = 1000; + struct incfs_pending_read_info2 prs[100] = {}; + int prs_size = ARRAY_SIZE(prs); + int fd = open_commands_file(mount_dir); + + if (fd < 0) + return -errno; + + while ((ret = wait_for_pending_reads2(fd, timeout_ms, prs, prs_size)) > + 0) { + int read_count = ret; + int i; + + for (i = 0; i < read_count; i++) { + int j = 0; + struct test_file *file = NULL; + + for (j = 0; j < test_set->files_count; j++) { + bool same = same_id(&(test_set->files[j].id), + &(prs[i].file_id)); + + if (same) { + file = &test_set->files[j]; + break; + } + } + if (!file) { + ksft_print_msg( + "Unknown file in pending reads.\n"); + break; + } + + ret = emit_test_block(mount_dir, file, + prs[i].block_index); + if (ret < 0) { + ksft_print_msg("Emitting test data error: %s\n", + strerror(-ret)); + break; + } + } + } + close(fd); + return ret; +} + +static int build_mtree(struct test_file *file) +{ + char data[INCFS_DATA_FILE_BLOCK_SIZE] = {}; + const int digest_size = SHA256_DIGEST_SIZE; + const int hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / digest_size; + int block_count = 0; + int hash_block_count = 0; + int total_tree_block_count = 0; + int tree_lvl_index[INCFS_MAX_MTREE_LEVELS] = {}; + int tree_lvl_count[INCFS_MAX_MTREE_LEVELS] = {}; + int levels_count = 0; + int i, level; + + if (file->size == 0) + return 0; + + block_count = 1 + (file->size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; + hash_block_count = block_count; + for (i = 0; hash_block_count > 1; i++) { + hash_block_count = (hash_block_count + hash_per_block - 1) + / hash_per_block; + tree_lvl_count[i] = hash_block_count; + total_tree_block_count += hash_block_count; + } + levels_count = i; + + for (i = 0; i < levels_count; i++) { + int prev_lvl_base = (i == 0) ? 
total_tree_block_count : + tree_lvl_index[i - 1]; + + tree_lvl_index[i] = prev_lvl_base - tree_lvl_count[i]; + } + + file->mtree_block_count = total_tree_block_count; + if (block_count == 1) { + int seed = get_file_block_seed(file->index, 0); + + memset(data, 0, INCFS_DATA_FILE_BLOCK_SIZE); + rnd_buf((uint8_t *)data, file->size, seed); + sha256(data, INCFS_DATA_FILE_BLOCK_SIZE, file->root_hash); + return 0; + } + + file->mtree = calloc(total_tree_block_count, sizeof(*file->mtree)); + /* Build level 0 hashes. */ + for (i = 0; i < block_count; i++) { + off_t offset = i * INCFS_DATA_FILE_BLOCK_SIZE; + size_t block_size = INCFS_DATA_FILE_BLOCK_SIZE; + int block_index = tree_lvl_index[0] + + i / hash_per_block; + int block_off = (i % hash_per_block) * digest_size; + int seed = get_file_block_seed(file->index, i); + char *hash_ptr = file->mtree[block_index].data + block_off; + + if (file->size - offset < block_size) { + block_size = file->size - offset; + memset(data, 0, INCFS_DATA_FILE_BLOCK_SIZE); + } + + rnd_buf((uint8_t *)data, block_size, seed); + sha256(data, INCFS_DATA_FILE_BLOCK_SIZE, hash_ptr); + } + + /* Build higher levels of hash tree. */ + for (level = 1; level < levels_count; level++) { + int prev_lvl_base = tree_lvl_index[level - 1]; + int prev_lvl_count = tree_lvl_count[level - 1]; + + for (i = 0; i < prev_lvl_count; i++) { + int block_index = + i / hash_per_block + tree_lvl_index[level]; + int block_off = (i % hash_per_block) * digest_size; + char *hash_ptr = + file->mtree[block_index].data + block_off; + + sha256(file->mtree[i + prev_lvl_base].data, + INCFS_DATA_FILE_BLOCK_SIZE, hash_ptr); + } + } + + /* Calculate root hash from the top block */ + sha256(file->mtree[0].data, + INCFS_DATA_FILE_BLOCK_SIZE, file->root_hash); + + return 0; +} + +static int load_hash_tree(const char *mount_dir, struct test_file *file) +{ + int err; + int i; + int fd; + struct incfs_fill_blocks fill_blocks = { + .count = file->mtree_block_count, + }; + struct incfs_fill_block *fill_block_array; + + if (fill_blocks.count == 0) + return 0; + + fill_block_array = + calloc(fill_blocks.count, sizeof(struct incfs_fill_block)); + if (!fill_block_array) + return -ENOMEM; + fill_blocks.fill_blocks = ptr_to_u64(fill_block_array); + + for (i = 0; i < fill_blocks.count; i++) { + fill_block_array[i] = (struct incfs_fill_block){ + .block_index = i, + .data_len = INCFS_DATA_FILE_BLOCK_SIZE, + .data = ptr_to_u64(file->mtree[i].data), + .flags = INCFS_BLOCK_FLAGS_HASH + }; + } + + fd = open_file_by_id(mount_dir, file->id, false); + if (fd < 0) { + err = errno; + goto failure; + } + + err = ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks); + close(fd); + if (err >= 0) { + err = -EPERM; + goto failure; + } + + fd = open_file_by_id(mount_dir, file->id, true); + if (fd < 0) { + err = errno; + goto failure; + } + + err = ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks); + close(fd); + if (err < fill_blocks.count) + err = errno; + else + err = 0; + +failure: + free(fill_block_array); + return err; +} + +static int cant_touch_index_test(const char *mount_dir) +{ + char *file_name = "test_file"; + int file_size = 123; + incfs_uuid_t file_id; + char *index_path = concat_file_name(mount_dir, ".index"); + char *subdir = concat_file_name(index_path, "subdir"); + char *dst_name = concat_file_name(mount_dir, "something"); + char *filename_in_index = NULL; + char *file_path = concat_file_name(mount_dir, file_name); + char *backing_dir; + int cmd_fd = -1; + int err; + + backing_dir = create_backing_dir(mount_dir); + if (!backing_dir) + 
goto failure; + + /* Mount FS and release the backing file. */ + if (mount_fs(mount_dir, backing_dir, 50) != 0) + goto failure; + free(backing_dir); + + cmd_fd = open_commands_file(mount_dir); + if (cmd_fd < 0) + goto failure; + + err = mkdir(subdir, 0777); + if (err == 0 || errno != EBUSY) { + print_error("Shouldn't be able to create subdir in index\n"); + goto failure; + } + + err = rmdir(index_path); + if (err == 0 || errno != EBUSY) { + print_error(".index directory should not be removed\n"); + goto failure; + } + + err = emit_file(cmd_fd, ".index", file_name, &file_id, + file_size, NULL); + if (err != -EBUSY) { + print_error("Shouldn't be able to create a file in index\n"); + goto failure; + } + + err = emit_file(cmd_fd, NULL, file_name, &file_id, + file_size, NULL); + if (err < 0) + goto failure; + filename_in_index = get_index_filename(mount_dir, file_id); + + err = unlink(filename_in_index); + if (err == 0 || errno != EBUSY) { + print_error("Shouldn't be able to delete from index\n"); + goto failure; + } + + err = rename(filename_in_index, dst_name); + if (err == 0 || errno != EBUSY) { + print_error("Shouldn't be able to move from index\n"); + goto failure; + } + + free(filename_in_index); + filename_in_index = concat_file_name(index_path, "abc"); + err = link(file_path, filename_in_index); + if (err == 0 || errno != EBUSY) { + print_error("Shouldn't be able to link inside index\n"); + goto failure; + } + + err = rename(index_path, dst_name); + if (err == 0 || errno != EBUSY) { + print_error("Shouldn't rename .index directory\n"); + goto failure; + } + + close(cmd_fd); + free(subdir); + free(index_path); + free(dst_name); + free(filename_in_index); + free(file_path); + if (umount(mount_dir) != 0) { + print_error("Can't unmount FS"); + goto failure; + } + + return TEST_SUCCESS; + +failure: + free(subdir); + free(dst_name); + free(index_path); + free(filename_in_index); + free(file_path); + close(cmd_fd); + umount(mount_dir); + return TEST_FAILURE; +} + +static bool iterate_directory(const char *dir_to_iterate, bool root, + int file_count) +{ + struct expected_name { + const char *name; + bool root_only; + bool found; + } names[] = { + {INCFS_LOG_FILENAME, true, false}, + {INCFS_PENDING_READS_FILENAME, true, false}, + {INCFS_BLOCKS_WRITTEN_FILENAME, true, false}, + {".index", true, false}, + {".incomplete", true, false}, + {"..", false, false}, + {".", false, false}, + }; + + bool pass = true, found; + int i; + + /* Test directory iteration */ + int fd = open(dir_to_iterate, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + + if (fd < 0) { + print_error("Can't open directory\n"); + return false; + } + + for (;;) { + /* Enough space for one dirent with any name up to NAME_MAX */ + char buf[sizeof(struct linux_dirent64) + NAME_MAX]; + struct linux_dirent64 *dirent = (struct linux_dirent64 *) buf; + int nread; + int i; + + for (i = 0; i < NAME_MAX; ++i) { + nread = syscall(__NR_getdents64, fd, buf, + sizeof(struct linux_dirent64) + i); + + if (nread >= 0) + break; + if (errno != EINVAL) + break; + } + + if (nread == 0) + break; + if (nread < 0) { + print_error("Error iterating directory\n"); + pass = false; + goto failure; + } + + /* Expected size is rounded up to 8 byte boundary. 
The kernel + * aligns each linux_dirent64 record (d_reclen) to 8 bytes, so this + * is a reliable check here. + */ + if (nread != (((sizeof(struct linux_dirent64) + + strlen(dirent->d_name) + 1) + 7) & ~7)) { + print_error("Wrong dirent size"); + pass = false; + goto failure; + } + + found = false; + for (i = 0; i < sizeof(names) / sizeof(*names); ++i) + if (!strcmp(dirent->d_name, names[i].name)) { + if (names[i].root_only && !root) { + print_error("Root file error"); + pass = false; + goto failure; + } + + if (names[i].found) { + print_error("File appears twice"); + pass = false; + goto failure; + } + + names[i].found = true; + found = true; + break; + } + + if (!found) + --file_count; + } + + for (i = 0; i < sizeof(names) / sizeof(*names); ++i) { + if (!names[i].found) + if (root || !names[i].root_only) { + print_error("Expected file not present"); + pass = false; + goto failure; + } + } + + if (file_count) { + print_error("Wrong number of files\n"); + pass = false; + goto failure; + } + +failure: + close(fd); + return pass; +} + +static int basic_file_ops_test(const char *mount_dir) +{ + struct test_files_set test = get_test_files_set(); + const int file_num = test.files_count; + char *subdir1 = concat_file_name(mount_dir, "subdir1"); + char *subdir2 = concat_file_name(mount_dir, "subdir2"); + char *backing_dir; + int cmd_fd = -1; + int i, err; + + backing_dir = create_backing_dir(mount_dir); + if (!backing_dir) + goto failure; + + /* Mount FS and release the backing file. */ + if (mount_fs(mount_dir, backing_dir, 50) != 0) + goto failure; + free(backing_dir); + + cmd_fd = open_commands_file(mount_dir); + if (cmd_fd < 0) + goto failure; + + err = mkdir(subdir1, 0777); + if (err < 0 && errno != EEXIST) { + print_error("Can't create subdir1\n"); + goto failure; + } + + err = mkdir(subdir2, 0777); + if (err < 0 && errno != EEXIST) { + print_error("Can't create subdir2\n"); + goto failure; + } + + /* Create all test files in subdir1 directory */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + loff_t size; + char *file_path = concat_file_name(subdir1, file->name); + + err = emit_file(cmd_fd, "subdir1", file->name, &file->id, + file->size, NULL); + if (err < 0) + goto failure; + + size = get_file_size(file_path); + free(file_path); + if (size != file->size) { + ksft_print_msg("Wrong size %lld of %s.\n", + size, file->name); + goto failure; + } + } + + if (!iterate_directory(subdir1, false, file_num)) + goto failure; + + /* Link the files to subdir2 */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + char *src_name = concat_file_name(subdir1, file->name); + char *dst_name = concat_file_name(subdir2, file->name); + loff_t size; + + err = link(src_name, dst_name); + if (err < 0) { + print_error("Can't link file\n"); + goto failure; + } + + size = get_file_size(dst_name); + if (size != file->size) { + ksft_print_msg("Wrong size %lld of %s.\n", + size, file->name); + goto failure; + } + free(src_name); + free(dst_name); + } + + /* Move the files from subdir2 to the mount dir */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + char *src_name = concat_file_name(subdir2, file->name); + char *dst_name = concat_file_name(mount_dir, file->name); + loff_t size; + + err = rename(src_name, dst_name); + if (err < 0) { + print_error("Can't move file\n"); + goto failure; + } + + size = get_file_size(dst_name); + if (size != file->size) { + ksft_print_msg("Wrong size %lld of %s.\n", + size, file->name); + goto failure; + } + 
free(src_name); + free(dst_name); + } + + /* +2 because there are 2 subdirs */ + if (!iterate_directory(mount_dir, true, file_num + 2)) + goto failure; + + /* Open and close all files from the mount dir */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + char *path = concat_file_name(mount_dir, file->name); + int fd; + + fd = open(path, O_RDWR | O_CLOEXEC); + free(path); + if (fd <= 0) { + print_error("Can't open file"); + goto failure; + } + if (close(fd)) { + print_error("Can't close file"); + goto failure; + } + } + + /* Delete all files from the mount dir */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + char *path = concat_file_name(mount_dir, file->name); + + err = unlink(path); + free(path); + if (err < 0) { + print_error("Can't unlink file"); + goto failure; + } + } + + err = delete_dir_tree(subdir1); + if (err) { + ksft_print_msg("Error deleting subdir1 %d\n", err); + goto failure; + } + + err = rmdir(subdir2); + if (err) { + print_error("Error deleting subdir2"); + goto failure; + } + + close(cmd_fd); + cmd_fd = -1; + if (umount(mount_dir) != 0) { + print_error("Can't unmount FS"); + goto failure; + } + + free(subdir1); + free(subdir2); + return TEST_SUCCESS; + +failure: + free(subdir1); + free(subdir2); + close(cmd_fd); + umount(mount_dir); + return TEST_FAILURE; +} + +static int dynamic_files_and_data_test(const char *mount_dir) +{ + struct test_files_set test = get_test_files_set(); + const int file_num = test.files_count; + const int missing_file_idx = 5; + int cmd_fd = -1; + char *backing_dir; + int i; + + backing_dir = create_backing_dir(mount_dir); + if (!backing_dir) + goto failure; + + /* Mount FS and release the backing file. */ + if (mount_fs(mount_dir, backing_dir, 50) != 0) + goto failure; + free(backing_dir); + + cmd_fd = open_commands_file(mount_dir); + if (cmd_fd < 0) + goto failure; + + /* Check that test files don't exist in the filesystem. */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + char *filename = concat_file_name(mount_dir, file->name); + + if (access(filename, F_OK) != -1) { + ksft_print_msg( + "File %s somehow already exists in a clean FS.\n", + filename); + goto failure; + } + free(filename); + } + + /* Write test data into the command file. */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + int res; + + res = emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, NULL); + if (res < 0) { + ksft_print_msg("Error %s emitting file %s.\n", + strerror(-res), file->name); + goto failure; + } + + /* Skip writing data to one file so we can check that it's + * missing later. + */ + if (i == missing_file_idx) + continue; + + res = emit_test_file_data(mount_dir, file); + if (res) { + ksft_print_msg("Error %s emitting data for %s.\n", + strerror(-res), file->name); + goto failure; + } + } + + /* Validate contents of the FS */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + + if (i == missing_file_idx) { + /* No data has been written to this file. 
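Reads must therefore fail with -ETIME once the pending read times out, instead of returning data. 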
*/ + /* Check for read error; */ + uint8_t buf; + char *filename = + concat_file_name(mount_dir, file->name); + int res = read_test_file(&buf, 1, filename, 0); + + free(filename); + if (res > 0) { + ksft_print_msg( + "Data present, even though never writtern.\n"); + goto failure; + } + if (res != -ETIME) { + ksft_print_msg("Wrong error code: %d.\n", res); + goto failure; + } + } else { + if (validate_test_file_content(mount_dir, file) < 0) + goto failure; + } + } + + close(cmd_fd); + cmd_fd = -1; + if (umount(mount_dir) != 0) { + print_error("Can't unmout FS"); + goto failure; + } + + return TEST_SUCCESS; + +failure: + close(cmd_fd); + umount(mount_dir); + return TEST_FAILURE; +} + +static int concurrent_reads_and_writes_test(const char *mount_dir) +{ + struct test_files_set test = get_test_files_set(); + const int file_num = test.files_count; + /* Validate each file from that many child processes. */ + const int child_multiplier = 3; + int cmd_fd = -1; + char *backing_dir; + int status; + int i; + pid_t producer_pid; + pid_t *child_pids = alloca(child_multiplier * file_num * sizeof(pid_t)); + + backing_dir = create_backing_dir(mount_dir); + if (!backing_dir) + goto failure; + + /* Mount FS and release the backing file. */ + if (mount_fs(mount_dir, backing_dir, 500) != 0) + goto failure; + free(backing_dir); + + cmd_fd = open_commands_file(mount_dir); + if (cmd_fd < 0) + goto failure; + + /* Tell FS about the files, without actually providing the data. */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + int res; + + res = emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, NULL); + if (res) + goto failure; + } + + /* Start child processes acessing data in the files */ + for (i = 0; i < file_num * child_multiplier; i++) { + struct test_file *file = &test.files[i / child_multiplier]; + pid_t child_pid = flush_and_fork(); + + if (child_pid == 0) { + /* This is a child process, do the data validation. */ + int ret = validate_test_file_content_with_seed( + mount_dir, file, i); + if (ret >= 0) { + /* Zero exit status if data is valid. */ + exit(0); + } + + /* Positive status if validation error found. */ + exit(-ret); + } else if (child_pid > 0) { + child_pids[i] = child_pid; + } else { + print_error("Fork error"); + goto failure; + } + } + + producer_pid = flush_and_fork(); + if (producer_pid == 0) { + int ret; + /* + * This is a child that should provide data to + * pending reads. + */ + + ret = data_producer(mount_dir, &test); + exit(-ret); + } else { + status = wait_for_process(producer_pid); + if (status != 0) { + ksft_print_msg("Data produces failed. 
%d(%s) ", status, + strerror(status)); + goto failure; + } + } + + /* Check that all children has finished with 0 exit status */ + for (i = 0; i < file_num * child_multiplier; i++) { + struct test_file *file = &test.files[i / child_multiplier]; + + status = wait_for_process(child_pids[i]); + if (status != 0) { + ksft_print_msg( + "Validation for the file %s failed with code %d (%s)\n", + file->name, status, strerror(status)); + goto failure; + } + } + + /* Check that there are no pending reads left */ + { + struct incfs_pending_read_info prs[1] = {}; + int timeout = 0; + int read_count = wait_for_pending_reads(cmd_fd, timeout, prs, + ARRAY_SIZE(prs)); + + if (read_count) { + ksft_print_msg( + "Pending reads pending when all data written\n"); + goto failure; + } + } + + close(cmd_fd); + cmd_fd = -1; + if (umount(mount_dir) != 0) { + print_error("Can't unmout FS"); + goto failure; + } + + return TEST_SUCCESS; + +failure: + close(cmd_fd); + umount(mount_dir); + return TEST_FAILURE; +} + +static int work_after_remount_test(const char *mount_dir) +{ + struct test_files_set test = get_test_files_set(); + const int file_num = test.files_count; + const int file_num_stage1 = file_num / 2; + const int file_num_stage2 = file_num; + char *backing_dir = NULL; + int i = 0; + int cmd_fd = -1; + + backing_dir = create_backing_dir(mount_dir); + if (!backing_dir) + goto failure; + + /* Mount FS and release the backing file. */ + if (mount_fs(mount_dir, backing_dir, 50) != 0) + goto failure; + + cmd_fd = open_commands_file(mount_dir); + if (cmd_fd < 0) + goto failure; + + /* Write first half of the data into the command file. (stage 1) */ + for (i = 0; i < file_num_stage1; i++) { + struct test_file *file = &test.files[i]; + + if (emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, NULL)) + goto failure; + + if (emit_test_file_data(mount_dir, file)) + goto failure; + } + + /* Unmount and mount again, to see that data is persistent. */ + close(cmd_fd); + cmd_fd = -1; + if (umount(mount_dir) != 0) { + print_error("Can't unmout FS"); + goto failure; + } + + if (mount_fs(mount_dir, backing_dir, 50) != 0) + goto failure; + + cmd_fd = open_commands_file(mount_dir); + if (cmd_fd < 0) + goto failure; + + /* Write the second half of the data into the command file. 
+	for (; i < file_num_stage2; i++) {
+		struct test_file *file = &test.files[i];
+		int res = emit_file(cmd_fd, NULL, file->name, &file->id,
+				    file->size, NULL);
+
+		if (res)
+			goto failure;
+
+		if (emit_test_file_data(mount_dir, file))
+			goto failure;
+	}
+
+	/* Validate contents of the FS */
+	for (i = 0; i < file_num_stage2; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (validate_test_file_content(mount_dir, file) < 0)
+			goto failure;
+	}
+
+	/* Delete all files */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+		char *filename = concat_file_name(mount_dir, file->name);
+		char *filename_in_index = get_index_filename(mount_dir,
+							     file->id);
+
+		if (access(filename, F_OK) != 0) {
+			ksft_print_msg("File %s is not visible.\n", filename);
+			goto failure;
+		}
+
+		if (access(filename_in_index, F_OK) != 0) {
+			ksft_print_msg("File %s is not visible.\n",
+				       filename_in_index);
+			goto failure;
+		}
+
+		unlink(filename);
+
+		if (access(filename, F_OK) != -1) {
+			ksft_print_msg("File %s is still present.\n", filename);
+			goto failure;
+		}
+
+		if (access(filename_in_index, F_OK) != -1) {
+			ksft_print_msg("File %s is still present.\n",
+				       filename_in_index);
+			goto failure;
+		}
+		free(filename);
+		free(filename_in_index);
+	}
+
+	/* Unmount and mount again, to see that deleted files stay deleted. */
+	close(cmd_fd);
+	cmd_fd = -1;
+	if (umount(mount_dir) != 0) {
+		print_error("Can't unmount FS");
+		goto failure;
+	}
+
+	if (mount_fs(mount_dir, backing_dir, 50) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	/* Validate all deleted files are still deleted. */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+		char *filename = concat_file_name(mount_dir, file->name);
+
+		if (access(filename, F_OK) != -1) {
+			ksft_print_msg("File %s is still visible.\n", filename);
+			goto failure;
+		}
+		free(filename);
+	}
+
+	/* Final unmount */
+	close(cmd_fd);
+	free(backing_dir);
+	cmd_fd = -1;
+	if (umount(mount_dir) != 0) {
+		print_error("Can't unmount FS");
+		goto failure;
+	}
+
+	return TEST_SUCCESS;
+
+failure:
+	close(cmd_fd);
+	free(backing_dir);
+	umount(mount_dir);
+	return TEST_FAILURE;
+}
+
+static int attribute_test(const char *mount_dir)
+{
+	char file_attr[] = "metadata123123";
+	char attr_buf[INCFS_MAX_FILE_ATTR_SIZE] = {};
+	int cmd_fd = -1;
+	incfs_uuid_t file_id;
+	int attr_res = 0;
+	char *backing_dir;
+
+	backing_dir = create_backing_dir(mount_dir);
+	if (!backing_dir)
+		goto failure;
+
+	/* Mount FS and release the backing file. */
+	if (mount_fs(mount_dir, backing_dir, 50) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	if (emit_file(cmd_fd, NULL, "file", &file_id, 12, file_attr))
+		goto failure;
+
+	/* Test attribute values */
+	attr_res = get_file_attr(mount_dir, file_id, attr_buf,
+				 ARRAY_SIZE(attr_buf));
+	if (attr_res != strlen(file_attr)) {
+		ksft_print_msg("Get file attr error: %d\n", attr_res);
+		goto failure;
+	}
+	if (strcmp(attr_buf, file_attr) != 0) {
+		ksft_print_msg("Incorrect file attr value: '%s'", attr_buf);
+		goto failure;
+	}
+
+	/* Unmount and mount again, to see that attributes are persistent. */
+	close(cmd_fd);
+	cmd_fd = -1;
+	if (umount(mount_dir) != 0) {
+		print_error("Can't unmount FS");
+		goto failure;
+	}
+
+	if (mount_fs(mount_dir, backing_dir, 50) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	/* Test attribute values again after remount */
+	attr_res = get_file_attr(mount_dir, file_id, attr_buf,
+				 ARRAY_SIZE(attr_buf));
+	if (attr_res != strlen(file_attr)) {
+		ksft_print_msg("Get file attr error: %d\n", attr_res);
+		goto failure;
+	}
+	if (strcmp(attr_buf, file_attr) != 0) {
+		ksft_print_msg("Incorrect file attr value: '%s'", attr_buf);
+		goto failure;
+	}
+
+	/* Final unmount */
+	close(cmd_fd);
+	free(backing_dir);
+	cmd_fd = -1;
+	if (umount(mount_dir) != 0) {
+		print_error("Can't unmount FS");
+		goto failure;
+	}
+
+	return TEST_SUCCESS;
+
+failure:
+	close(cmd_fd);
+	free(backing_dir);
+	umount(mount_dir);
+	return TEST_FAILURE;
+}
+
+static int child_procs_waiting_for_data_test(const char *mount_dir)
+{
+	struct test_files_set test = get_test_files_set();
+	const int file_num = test.files_count;
+	int cmd_fd = -1;
+	int i;
+	pid_t *child_pids = alloca(file_num * sizeof(pid_t));
+	char *backing_dir;
+
+	backing_dir = create_backing_dir(mount_dir);
+	if (!backing_dir)
+		goto failure;
+
+	/* Mount FS and release the backing file. (10s wait time) */
+	if (mount_fs(mount_dir, backing_dir, 10000) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	/* Tell FS about the files, without actually providing the data. */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		emit_file(cmd_fd, NULL, file->name, &file->id,
+			  file->size, NULL);
+	}
+
+	/* Start child processes accessing data in the files */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+		pid_t child_pid = flush_and_fork();
+
+		if (child_pid == 0) {
+			/* This is a child process, do the data validation. */
+			int ret = validate_test_file_content(mount_dir, file);
+
+			if (ret >= 0) {
+				/* Zero exit status if data is valid. */
+				exit(0);
+			}
+
+			/* Positive status if validation error found. */
+			exit(-ret);
+		} else if (child_pid > 0) {
+			child_pids[i] = child_pid;
+		} else {
+			print_error("Fork error");
+			goto failure;
+		}
+	}
+
+	/* Write test data into the command file. */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (emit_test_file_data(mount_dir, file))
+			goto failure;
+	}
+
+	/* Check that all children have finished with 0 exit status */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+		int status = wait_for_process(child_pids[i]);
+
+		if (status != 0) {
+			ksft_print_msg(
+				"Validation for the file %s failed with code %d (%s)\n",
+				file->name, status, strerror(status));
+			goto failure;
+		}
+	}
+
+	close(cmd_fd);
+	free(backing_dir);
+	cmd_fd = -1;
+	if (umount(mount_dir) != 0) {
+		print_error("Can't unmount FS");
+		goto failure;
+	}
+
+	return TEST_SUCCESS;
+
+failure:
+	close(cmd_fd);
+	free(backing_dir);
+	umount(mount_dir);
+	return TEST_FAILURE;
+}
+
+static int multiple_providers_test(const char *mount_dir)
+{
+	struct test_files_set test = get_test_files_set();
+	const int file_num = test.files_count;
+	const int producer_count = 5;
+	int cmd_fd = -1;
+	int status;
+	int i;
+	pid_t *producer_pids = alloca(producer_count * sizeof(pid_t));
+	char *backing_dir;
+
+	backing_dir = create_backing_dir(mount_dir);
+	if (!backing_dir)
+		goto failure;
+
+	/* Mount FS and release the backing file. (10s wait time) */
+	if (mount_fs_opt(mount_dir, backing_dir,
+			 "read_timeout_ms=10000,report_uid", false) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	/* Tell FS about the files, without actually providing the data. */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (emit_file(cmd_fd, NULL, file->name, &file->id,
+			      file->size, NULL) < 0)
+			goto failure;
+	}
+
+	/* Start producer processes */
+	for (i = 0; i < producer_count; i++) {
+		pid_t producer_pid = flush_and_fork();
+
+		if (producer_pid == 0) {
+			int ret;
+			/*
+			 * This is a child that should provide data to
+			 * pending reads.
+			 */
+
+			ret = data_producer2(mount_dir, &test);
+			exit(-ret);
+		} else if (producer_pid > 0) {
+			producer_pids[i] = producer_pid;
+		} else {
+			print_error("Fork error");
+			goto failure;
+		}
+	}
+
+	/* Validate FS content */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+		char *filename = concat_file_name(mount_dir, file->name);
+		loff_t read_result = read_whole_file(filename);
+
+		free(filename);
+		if (read_result != file->size) {
+			ksft_print_msg(
+				"Error validating file %s. Result: %lld\n",
+				file->name, read_result);
+			goto failure;
+		}
+	}
+
+	/* Check that all producers have finished with 0 exit status */
+	for (i = 0; i < producer_count; i++) {
+		status = wait_for_process(producer_pids[i]);
+		if (status != 0) {
+			ksft_print_msg("Producer %d failed with code %d (%s)\n",
+				       i, status, strerror(status));
+			goto failure;
+		}
+	}
+
+	close(cmd_fd);
+	free(backing_dir);
+	cmd_fd = -1;
+	if (umount(mount_dir) != 0) {
+		print_error("Can't unmount FS");
+		goto failure;
+	}
+
+	return TEST_SUCCESS;
+
+failure:
+	close(cmd_fd);
+	free(backing_dir);
+	umount(mount_dir);
+	return TEST_FAILURE;
+}
+
+static int validate_hash_tree(const char *mount_dir, struct test_file *file)
+{
+	int result = TEST_FAILURE;
+	char *filename = NULL;
+	int fd = -1;
+	unsigned char *buf = NULL;
+	int i, err;
+
+	TEST(filename = concat_file_name(mount_dir, file->name), filename);
+	TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1);
+	TEST(buf = malloc(INCFS_DATA_FILE_BLOCK_SIZE * 8), buf);
+
+	for (i = 0; i < file->mtree_block_count; ) {
+		int blocks_to_read = i % 7 + 1;
+		struct fsverity_read_metadata_arg args = {
+			.metadata_type = FS_VERITY_METADATA_TYPE_MERKLE_TREE,
+			.offset = i * INCFS_DATA_FILE_BLOCK_SIZE,
+			.length = blocks_to_read * INCFS_DATA_FILE_BLOCK_SIZE,
+			.buf_ptr = ptr_to_u64(buf),
+		};
+
+		TEST(err = ioctl(fd, FS_IOC_READ_VERITY_METADATA, &args),
+		     err == min(args.length, (file->mtree_block_count - i) *
+					     INCFS_DATA_FILE_BLOCK_SIZE));
+		TESTEQUAL(memcmp(buf, file->mtree[i].data, err), 0);
+
+		i += blocks_to_read;
+	}
+
+	result = TEST_SUCCESS;
+
+out:
+	free(buf);
+	close(fd);
+	free(filename);
+	return result;
+}
+
+static int hash_tree_test(const char *mount_dir)
+{
+	int result = TEST_FAILURE;
+	char *backing_dir;
+	struct test_files_set test = get_test_files_set();
+	const int file_num = test.files_count;
+	const int corrupted_file_idx = 5;
+	int i = 0;
+	int cmd_fd = -1;
+
+	backing_dir = create_backing_dir(mount_dir);
+	if (!backing_dir)
+		goto failure;
+
+	/* Mount FS and release the backing file. */
+	if (mount_fs(mount_dir, backing_dir, 50) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	/* Write hashes and data. */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+		int res;
+
+		build_mtree(file);
+		res = crypto_emit_file(cmd_fd, NULL, file->name, &file->id,
+				       file->size, file->root_hash,
+				       file->sig.add_data);
+
+		if (i == corrupted_file_idx) {
+			/* Corrupt the third block's hash */
+			file->mtree[0].data[2 * SHA256_DIGEST_SIZE] ^= 0xff;
+		}
+		if (emit_test_file_data(mount_dir, file))
+			goto failure;
+
+		res = load_hash_tree(mount_dir, file);
+		if (res) {
+			ksft_print_msg("Can't load hashes for %s. error: %s\n",
+				       file->name, strerror(-res));
+			goto failure;
+		}
+	}
+
+	/* Validate data */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (i == corrupted_file_idx) {
+			uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE];
+			char *filename =
+				concat_file_name(mount_dir, file->name);
+			int res;
+
+			res = read_test_file(data, INCFS_DATA_FILE_BLOCK_SIZE,
+					     filename, 2);
+			free(filename);
+			if (res != -EBADMSG) {
+				ksft_print_msg("Hash violation missed before remount. %d\n",
+					       res);
+				goto failure;
+			}
+		} else if (validate_test_file_content(mount_dir, file) < 0)
+			goto failure;
+		else if (validate_hash_tree(mount_dir, file) < 0)
+			goto failure;
+	}
+
+	/* Unmount and mount again, to see that hashes are persistent. */
+	close(cmd_fd);
+	cmd_fd = -1;
+	if (umount(mount_dir) != 0) {
+		print_error("Can't unmount FS");
+		goto failure;
+	}
+	if (mount_fs(mount_dir, backing_dir, 50) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	/* Validate data again */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (i == corrupted_file_idx) {
+			uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE];
+			char *filename =
+				concat_file_name(mount_dir, file->name);
+			int res;
+
+			res = read_test_file(data, INCFS_DATA_FILE_BLOCK_SIZE,
+					     filename, 2);
+			free(filename);
+			if (res != -EBADMSG) {
+				ksft_print_msg("Hash violation missed after remount. %d\n",
+					       res);
+				goto failure;
+			}
+		} else if (validate_test_file_content(mount_dir, file) < 0)
+			goto failure;
+	}
+	result = TEST_SUCCESS;
+
+failure:
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		free(file->mtree);
+	}
+
+	close(cmd_fd);
+	free(backing_dir);
+	umount(mount_dir);
+	return result;
+}
+
+enum expected_log { FULL_LOG, NO_LOG, PARTIAL_LOG };
+
+static int validate_logs(const char *mount_dir, int log_fd,
+			 struct test_file *file,
+			 enum expected_log expected_log,
+			 bool report_uid, bool expect_data)
+{
+	int result = TEST_FAILURE;
+	uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE];
+	struct incfs_pending_read_info prs[2048] = {};
+	struct incfs_pending_read_info2 prs2[2048] = {};
+	struct incfs_pending_read_info *previous_record = NULL;
+	int prs_size = ARRAY_SIZE(prs);
+	int block_count = 1 + (file->size - 1) / INCFS_DATA_FILE_BLOCK_SIZE;
+	int expected_read_count, read_count, block_index, read_index;
+	char *filename = NULL;
+	int fd = -1;
+
+	TEST(filename = concat_file_name(mount_dir, file->name), filename);
+	TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1);
+
+	if (block_count > prs_size)
+		block_count = prs_size;
+	expected_read_count = block_count;
+
+	for (block_index = 0; block_index < block_count; block_index++) {
+		int nread = pread(fd, data, sizeof(data),
+				  INCFS_DATA_FILE_BLOCK_SIZE * block_index);
+
+		/* Make some read logs of type SAME_FILE_NEXT_BLOCK */
+		if (block_index % 100 == 10)
+			usleep(20000);
+
+		/* Skip some blocks to make logs of type SAME_FILE */
+		if (block_index % 10 == 5) {
+			++block_index;
+			--expected_read_count;
+		}
+
+		if (expect_data)
+			TESTCOND(nread > 0);
+
+		if (!expect_data)
+			TESTEQUAL(nread, -1);
+	}
+
+	if (report_uid)
+		read_count = wait_for_pending_reads2(log_fd,
+					expected_log == NO_LOG ? 10 : 0,
+					prs2, prs_size);
+	else
+		read_count = wait_for_pending_reads(log_fd,
+					expected_log == NO_LOG ? 10 : 0,
+					prs, prs_size);
+
+	if (expected_log == NO_LOG)
+		TESTEQUAL(read_count, 0);
+
+	if (expected_log == PARTIAL_LOG)
+		TESTCOND(read_count > 0 &&
+			 read_count <= expected_read_count);
+
+	if (expected_log == FULL_LOG)
+		TESTEQUAL(read_count, expected_read_count);
+
+	/* If fewer records than expected were logged, advance block_index
+	 * accordingly.
+	 */
+	for (block_index = 0, read_index = 0;
+	     read_index < expected_read_count - read_count;
+	     block_index++, read_index++)
+		if (block_index % 10 == 5)
+			++block_index;
+
+	for (read_index = 0; read_index < read_count;
+	     block_index++, read_index++) {
+		struct incfs_pending_read_info *record = report_uid ?
+ (struct incfs_pending_read_info *) &prs2[read_index] : + &prs[read_index]; + + TESTCOND(same_id(&record->file_id, &file->id)); + TESTEQUAL(record->block_index, block_index); + TESTNE(record->timestamp_us, 0); + if (previous_record) + TESTEQUAL(record->serial_number, + previous_record->serial_number + 1); + + previous_record = record; + if (block_index % 10 == 5) + ++block_index; + } + + result = TEST_SUCCESS; +out: + close(fd); + free(filename); + return result; +} + +static int read_log_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + struct test_files_set test = get_test_files_set(); + const int file_num = test.files_count; + int i = 0; + int cmd_fd = -1, log_fd = -1; + char *backing_dir = NULL; + + /* Create files */ + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, + "readahead=0,report_uid,read_timeout_ms=0", + false), 0); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + + TESTEQUAL(emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, NULL), 0); + } + close(cmd_fd); + cmd_fd = -1; + + /* Validate logs */ + TEST(log_fd = open_log_file(mount_dir), log_fd != -1); + for (i = 0; i < file_num; i++) + TESTEQUAL(validate_logs(mount_dir, log_fd, &test.files[i], + FULL_LOG, true, false), 0); + + /* Unmount and mount again without report_uid */ + close(log_fd); + log_fd = -1; + TESTEQUAL(umount(mount_dir), 0); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, + "readahead=0,read_timeout_ms=0", false), 0); + + TEST(log_fd = open_log_file(mount_dir), log_fd != -1); + for (i = 0; i < file_num; i++) + TESTEQUAL(validate_logs(mount_dir, log_fd, &test.files[i], + FULL_LOG, false, false), 0); + + /* No read log to make sure poll doesn't crash */ + close(log_fd); + log_fd = -1; + TESTEQUAL(umount(mount_dir), 0); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, + "readahead=0,rlog_pages=0,read_timeout_ms=0", + false), 0); + + TEST(log_fd = open_log_file(mount_dir), log_fd != -1); + for (i = 0; i < file_num; i++) + TESTEQUAL(validate_logs(mount_dir, log_fd, &test.files[i], + NO_LOG, false, false), 0); + + /* Remount and check that logs start working again */ + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, + "readahead=0,rlog_pages=1,read_timeout_ms=0", + true), 0); + for (i = 0; i < file_num; i++) + TESTEQUAL(validate_logs(mount_dir, log_fd, &test.files[i], + PARTIAL_LOG, false, false), 0); + + /* Remount and check that logs continue working */ + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, + "readahead=0,rlog_pages=4,read_timeout_ms=0", + true), 0); + for (i = 0; i < file_num; i++) + TESTEQUAL(validate_logs(mount_dir, log_fd, &test.files[i], + FULL_LOG, false, false), 0); + + /* Check logs work with data */ + for (i = 0; i < file_num; i++) { + TESTEQUAL(emit_test_file_data(mount_dir, &test.files[i]), 0); + TESTEQUAL(validate_logs(mount_dir, log_fd, &test.files[i], + FULL_LOG, false, true), 0); + } + + /* Final unmount */ + close(log_fd); + log_fd = -1; + TESTEQUAL(umount(mount_dir), 0); + + result = TEST_SUCCESS; +out: + close(cmd_fd); + close(log_fd); + free(backing_dir); + umount(mount_dir); + return result; +} + +static int emit_partial_test_file_data(const char *mount_dir, + struct test_file *file) +{ + int i, j; + int block_cnt = 1 + (file->size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; + int *block_indexes = NULL; + int result = 0; + int blocks_written = 0; + int bw_fd = -1; + char buffer[20]; + struct pollfd pollfd; + long 
blocks_written_total, blocks_written_new_total;
+
+	if (file->size == 0)
+		return 0;
+
+	bw_fd = open_blocks_written_file(mount_dir);
+	if (bw_fd == -1)
+		return -errno;
+
+	result = read(bw_fd, buffer, sizeof(buffer) - 1);
+	if (result <= 0) {
+		result = -EIO;
+		goto out;
+	}
+
+	buffer[result] = 0;
+	blocks_written_total = strtol(buffer, NULL, 10);
+	result = 0;
+
+	pollfd = (struct pollfd) {
+		.fd = bw_fd,
+		.events = POLLIN,
+	};
+
+	result = poll(&pollfd, 1, 0);
+	if (result) {
+		result = -EIO;
+		goto out;
+	}
+
+	/* Emit 2 blocks, skip 2 blocks, etc. */
+	block_indexes = calloc(block_cnt, sizeof(*block_indexes));
+	for (i = 0, j = 0; i < block_cnt; ++i)
+		if ((i & 2) == 0) {
+			block_indexes[j] = i;
+			++j;
+		}
+
+	for (i = 0; i < j; i += blocks_written) {
+		blocks_written = emit_test_blocks(mount_dir, file,
+						  block_indexes + i, j - i);
+		if (blocks_written < 0) {
+			result = blocks_written;
+			goto out;
+		}
+		if (blocks_written == 0) {
+			result = -EIO;
+			goto out;
+		}
+
+		result = poll(&pollfd, 1, 0);
+		if (result != 1 || pollfd.revents != POLLIN) {
+			result = -EIO;
+			goto out;
+		}
+
+		result = read(bw_fd, buffer, sizeof(buffer) - 1);
+		buffer[result] = 0;
+		blocks_written_new_total = strtol(buffer, NULL, 10);
+
+		if (blocks_written_new_total - blocks_written_total
+		    != blocks_written) {
+			result = -EIO;
+			goto out;
+		}
+
+		blocks_written_total = blocks_written_new_total;
+		result = 0;
+	}
+out:
+	free(block_indexes);
+	close(bw_fd);
+	return result;
+}
+
+static int validate_ranges(const char *mount_dir, struct test_file *file)
+{
+	int block_cnt = 1 + (file->size - 1) / INCFS_DATA_FILE_BLOCK_SIZE;
+	char *filename = concat_file_name(mount_dir, file->name);
+	int fd;
+	struct incfs_filled_range ranges[128];
+	struct incfs_get_filled_blocks_args fba = {
+		.range_buffer = ptr_to_u64(ranges),
+		.range_buffer_size = sizeof(ranges),
+	};
+	int error = TEST_SUCCESS;
+	int i;
+	int range_cnt;
+	int cmd_fd = -1;
+	struct incfs_permit_fill permit_fill;
+
+	fd = open(filename, O_RDONLY | O_CLOEXEC);
+	free(filename);
+	if (fd <= 0)
+		return TEST_FAILURE;
+
+	error = ioctl(fd, INCFS_IOC_GET_FILLED_BLOCKS, &fba);
+	if (error != -1 || errno != EPERM) {
+		ksft_print_msg("INCFS_IOC_GET_FILLED_BLOCKS not blocked\n");
+		error = -EPERM;
+		goto out;
+	}
+
+	cmd_fd = open_commands_file(mount_dir);
+	permit_fill.file_descriptor = fd;
+	if (ioctl(cmd_fd, INCFS_IOC_PERMIT_FILL, &permit_fill)) {
+		print_error("INCFS_IOC_PERMIT_FILL failed");
+		error = -EPERM;
+		goto out;
+	}
+
+	error = ioctl(fd, INCFS_IOC_GET_FILLED_BLOCKS, &fba);
+	if (error && errno != ERANGE)
+		goto out;
+
+	if (error && errno == ERANGE && block_cnt < 509)
+		goto out;
+
+	if (!error && block_cnt >= 509) {
+		error = -ERANGE;
+		goto out;
+	}
+
+	if (fba.total_blocks_out != block_cnt) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (fba.data_blocks_out != block_cnt) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	range_cnt = (block_cnt + 3) / 4;
+	if (range_cnt > 128)
+		range_cnt = 128;
+	if (range_cnt != fba.range_buffer_size_out / sizeof(*ranges)) {
+		error = -ERANGE;
+		goto out;
+	}
+
+	error = TEST_SUCCESS;
+	for (i = 0; i < fba.range_buffer_size_out / sizeof(*ranges) - 1; ++i)
+		if (ranges[i].begin != i * 4 || ranges[i].end != i * 4 + 2) {
+			error = -EINVAL;
+			goto out;
+		}
+
+	if (ranges[i].begin != i * 4 ||
+	    (ranges[i].end != i * 4 + 1 && ranges[i].end != i * 4 + 2)) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < 64; ++i) {
+		fba.start_index = i * 2;
+		fba.end_index = i * 2 + 2;
+		error = ioctl(fd, INCFS_IOC_GET_FILLED_BLOCKS, &fba);
+		if (error)
+			goto out;
+
+		if (fba.total_blocks_out != block_cnt) {
+			error = -EINVAL;
+			goto out;
+		}
+
+		if (fba.start_index >= block_cnt) {
+			if (fba.index_out != fba.start_index) {
+				error = -EINVAL;
+				goto out;
+			}
+
+			break;
+		}
+
+		if (i % 2) {
+			if (fba.range_buffer_size_out != 0) {
+				error = -EINVAL;
+				goto out;
+			}
+		} else {
+			if (fba.range_buffer_size_out != sizeof(*ranges)) {
+				error = -EINVAL;
+				goto out;
+			}
+
+			if (ranges[0].begin != i * 2) {
+				error = -EINVAL;
+				goto out;
+			}
+
+			if (ranges[0].end != i * 2 + 1 &&
+			    ranges[0].end != i * 2 + 2) {
+				error = -EINVAL;
+				goto out;
+			}
+		}
+	}
+
+out:
+	close(fd);
+	close(cmd_fd);
+	return error;
+}
+
+static int get_blocks_test(const char *mount_dir)
+{
+	char *backing_dir;
+	int cmd_fd = -1;
+	int i;
+	struct test_files_set test = get_test_files_set();
+	const int file_num = test.files_count;
+
+	backing_dir = create_backing_dir(mount_dir);
+	if (!backing_dir)
+		goto failure;
+
+	if (mount_fs_opt(mount_dir, backing_dir, "readahead=0", false) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	/* Write data. */
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (emit_file(cmd_fd, NULL, file->name, &file->id, file->size,
+			      NULL))
+			goto failure;
+
+		if (emit_partial_test_file_data(mount_dir, file))
+			goto failure;
+	}
+
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (validate_ranges(mount_dir, file))
+			goto failure;
+
+		/*
+		 * The smallest files are filled completely, so this checks that
+		 * the fast get_filled_blocks path is not causing issues
+		 */
+		if (validate_ranges(mount_dir, file))
+			goto failure;
+	}
+
+	close(cmd_fd);
+	umount(mount_dir);
+	free(backing_dir);
+	return TEST_SUCCESS;
+
+failure:
+	close(cmd_fd);
+	umount(mount_dir);
+	free(backing_dir);
+	return TEST_FAILURE;
+}
+
+static int emit_partial_test_file_hash(const char *mount_dir,
+				       struct test_file *file)
+{
+	int err;
+	int fd;
+	struct incfs_fill_blocks fill_blocks = {
+		.count = 1,
+	};
+	struct incfs_fill_block *fill_block_array;
+	uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE];
+
+	if (file->size <= 4096 / 32 * 4096)
+		return 0;
+
+	fill_block_array =
+		calloc(fill_blocks.count, sizeof(struct incfs_fill_block));
+	if (!fill_block_array)
+		return -ENOMEM;
+	fill_blocks.fill_blocks = ptr_to_u64(fill_block_array);
+
+	rnd_buf(data, sizeof(data), 0);
+
+	fill_block_array[0] =
+		(struct incfs_fill_block){ .block_index = 1,
+					   .data_len =
+						   INCFS_DATA_FILE_BLOCK_SIZE,
+					   .data = ptr_to_u64(data),
+					   .flags = INCFS_BLOCK_FLAGS_HASH };
+
+	fd = open_file_by_id(mount_dir, file->id, true);
+	if (fd < 0) {
+		err = errno;
+		goto failure;
+	}
+
+	err = ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks);
+	close(fd);
+	if (err < fill_blocks.count)
+		err = errno;
+	else
+		err = 0;
+
+failure:
+	free(fill_block_array);
+	return err;
+}
+
+static int validate_hash_ranges(const char *mount_dir, struct test_file *file)
+{
+	int block_cnt = 1 + (file->size - 1) / INCFS_DATA_FILE_BLOCK_SIZE;
+	char *filename = NULL;
+	int fd;
+	struct incfs_filled_range ranges[128];
+	struct incfs_get_filled_blocks_args fba = {
+		.range_buffer = ptr_to_u64(ranges),
+		.range_buffer_size = sizeof(ranges),
+	};
+	int error = TEST_SUCCESS;
+	int file_blocks = (file->size + INCFS_DATA_FILE_BLOCK_SIZE - 1) /
+			  INCFS_DATA_FILE_BLOCK_SIZE;
+	int cmd_fd = -1;
+	struct incfs_permit_fill permit_fill;
+
+	if (file->size <= 4096 / 32 * 4096)
+		return 0;
+
+	filename = concat_file_name(mount_dir, file->name);
+	fd = open(filename, O_RDONLY | O_CLOEXEC);
+	free(filename);
+	if (fd <= 0)
+		return TEST_FAILURE;
+
+	error = ioctl(fd, INCFS_IOC_GET_FILLED_BLOCKS, &fba);
+	if (error != -1 || errno != EPERM) {
+		ksft_print_msg("INCFS_IOC_GET_FILLED_BLOCKS not blocked\n");
+		error = -EPERM;
+		goto out;
+	}
+
+	cmd_fd = open_commands_file(mount_dir);
+	permit_fill.file_descriptor = fd;
+	if (ioctl(cmd_fd, INCFS_IOC_PERMIT_FILL, &permit_fill)) {
+		print_error("INCFS_IOC_PERMIT_FILL failed");
+		error = -EPERM;
+		goto out;
+	}
+
+	error = ioctl(fd, INCFS_IOC_GET_FILLED_BLOCKS, &fba);
+	if (error)
+		goto out;
+
+	if (fba.total_blocks_out <= block_cnt) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (fba.data_blocks_out != block_cnt) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (fba.range_buffer_size_out != sizeof(struct incfs_filled_range)) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (ranges[0].begin != file_blocks + 1 ||
+	    ranges[0].end != file_blocks + 2) {
+		error = -EINVAL;
+		goto out;
+	}
+
+out:
+	close(cmd_fd);
+	close(fd);
+	return error;
+}
+
+static int get_hash_blocks_test(const char *mount_dir)
+{
+	char *backing_dir;
+	int cmd_fd = -1;
+	int i;
+	struct test_files_set test = get_test_files_set();
+	const int file_num = test.files_count;
+
+	backing_dir = create_backing_dir(mount_dir);
+	if (!backing_dir)
+		goto failure;
+
+	if (mount_fs_opt(mount_dir, backing_dir, "readahead=0", false) != 0)
+		goto failure;
+
+	cmd_fd = open_commands_file(mount_dir);
+	if (cmd_fd < 0)
+		goto failure;
+
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (crypto_emit_file(cmd_fd, NULL, file->name, &file->id,
+				     file->size, file->root_hash,
+				     file->sig.add_data))
+			goto failure;
+
+		if (emit_partial_test_file_hash(mount_dir, file))
+			goto failure;
+	}
+
+	for (i = 0; i < file_num; i++) {
+		struct test_file *file = &test.files[i];
+
+		if (validate_hash_ranges(mount_dir, file))
+			goto failure;
+	}
+
+	close(cmd_fd);
+	umount(mount_dir);
+	free(backing_dir);
+	return TEST_SUCCESS;
+
+failure:
+	close(cmd_fd);
+	umount(mount_dir);
+	free(backing_dir);
+	return TEST_FAILURE;
+}
+
+#define THREE_GB (3LL * 1024 * 1024 * 1024)
+#define FOUR_GB (4LL * 1024 * 1024 * 1024) /* Have 1GB of margin */
+static int large_file_test(const char *mount_dir)
+{
+	char *backing_dir;
+	int cmd_fd = -1;
+	int i;
+	int result = TEST_FAILURE, ret;
+	uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE] = {};
+	int block_count = THREE_GB / INCFS_DATA_FILE_BLOCK_SIZE;
+	struct incfs_fill_block *block_buf = NULL;
+	struct incfs_fill_blocks fill_blocks;
+	incfs_uuid_t id;
+	int fd = -1;
+	struct statvfs svfs;
+	unsigned long long free_disksz;
+
+	ret = statvfs(mount_dir, &svfs);
+	if (ret) {
+		ksft_print_msg("Can't get disk size. Skipping %s...\n",
+			       __func__);
+		return TEST_SKIP;
+	}
+
+	free_disksz = (unsigned long long)svfs.f_bavail * svfs.f_bsize;
+
+	if (FOUR_GB > free_disksz) {
+		ksft_print_msg("Not enough free disk space (%lldMB). 
Skipping %s...\n", + free_disksz >> 20, __func__); + return TEST_SKIP; + } + + backing_dir = create_backing_dir(mount_dir); + if (!backing_dir) + goto failure; + + if (mount_fs_opt(mount_dir, backing_dir, "readahead=0", false) != 0) + goto failure; + + cmd_fd = open_commands_file(mount_dir); + if (cmd_fd < 0) + goto failure; + + if (emit_file(cmd_fd, NULL, "very_large_file", &id, + (uint64_t)block_count * INCFS_DATA_FILE_BLOCK_SIZE, + NULL) < 0) + goto failure; + + block_buf = calloc(block_count, sizeof(struct incfs_fill_block)); + if (!block_buf) + goto failure; + fill_blocks = (struct incfs_fill_blocks) { + .count = block_count, + .fill_blocks = ptr_to_u64(block_buf), + }; + + for (i = 0; i < block_count; i++) { + block_buf[i].compression = COMPRESSION_NONE; + block_buf[i].block_index = i; + block_buf[i].data_len = INCFS_DATA_FILE_BLOCK_SIZE; + block_buf[i].data = ptr_to_u64(data); + } + + fd = open_file_by_id(mount_dir, id, true); + if (fd < 0) + goto failure; + + if (ioctl(fd, INCFS_IOC_FILL_BLOCKS, &fill_blocks) != block_count) + goto failure; + + if (emit_file(cmd_fd, NULL, "very_very_large_file", &id, 1LL << 40, + NULL) < 0) + goto failure; + + result = TEST_SUCCESS; + +failure: + close(fd); + close(cmd_fd); + unlink("very_large_file"); + umount(mount_dir); + free(backing_dir); + free(block_buf); + return result; +} + +static int validate_mapped_file(const char *orig_name, const char *name, + size_t size, size_t offset) +{ + struct stat st; + int orig_fd = -1, fd = -1; + size_t block; + int result = TEST_FAILURE; + + if (stat(name, &st)) { + ksft_print_msg("Failed to stat %s with error %s\n", + name, strerror(errno)); + goto failure; + } + + if (size != st.st_size) { + ksft_print_msg("Mismatched file sizes for file %s - expected %llu, got %llu\n", + name, size, st.st_size); + goto failure; + } + + fd = open(name, O_RDONLY | O_CLOEXEC); + if (fd == -1) { + ksft_print_msg("Failed to open %s with error %s\n", name, + strerror(errno)); + goto failure; + } + + orig_fd = open(orig_name, O_RDONLY | O_CLOEXEC); + if (orig_fd == -1) { + ksft_print_msg("Failed to open %s with error %s\n", orig_name, + strerror(errno)); + goto failure; + } + + for (block = 0; block < size; block += INCFS_DATA_FILE_BLOCK_SIZE) { + uint8_t orig_data[INCFS_DATA_FILE_BLOCK_SIZE]; + uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE]; + ssize_t orig_read, mapped_read; + + orig_read = pread(orig_fd, orig_data, + INCFS_DATA_FILE_BLOCK_SIZE, block + offset); + mapped_read = pread(fd, data, INCFS_DATA_FILE_BLOCK_SIZE, + block); + + if (orig_read < mapped_read || + mapped_read != min(size - block, + INCFS_DATA_FILE_BLOCK_SIZE)) { + ksft_print_msg("Failed to read enough data: %llu %llu %llu %lld %lld\n", + block, size, offset, orig_read, + mapped_read); + goto failure; + } + + if (memcmp(orig_data, data, mapped_read)) { + ksft_print_msg("Data doesn't match: %llu %llu %llu %lld %lld\n", + block, size, offset, orig_read, + mapped_read); + goto failure; + } + } + + result = TEST_SUCCESS; + +failure: + close(orig_fd); + close(fd); + return result; +} + +static int mapped_file_test(const char *mount_dir) +{ + char *backing_dir; + int result = TEST_FAILURE; + int cmd_fd = -1; + int i; + struct test_files_set test = get_test_files_set(); + const int file_num = test.files_count; + + backing_dir = create_backing_dir(mount_dir); + if (!backing_dir) + goto failure; + + if (mount_fs_opt(mount_dir, backing_dir, "readahead=0", false) != 0) + goto failure; + + cmd_fd = open_commands_file(mount_dir); + if (cmd_fd < 0) + goto failure; + + for (i = 
0; i < file_num; ++i) { + struct test_file *file = &test.files[i]; + size_t blocks = file->size / INCFS_DATA_FILE_BLOCK_SIZE; + size_t mapped_offset = blocks / 4 * + INCFS_DATA_FILE_BLOCK_SIZE; + size_t mapped_size = file->size / 4 * 3 - mapped_offset; + struct incfs_create_mapped_file_args mfa; + char mapped_file_name[FILENAME_MAX]; + char orig_file_path[PATH_MAX]; + char mapped_file_path[PATH_MAX]; + + if (emit_file(cmd_fd, NULL, file->name, &file->id, file->size, + NULL) < 0) + goto failure; + + if (emit_test_file_data(mount_dir, file)) + goto failure; + + if (snprintf(mapped_file_name, ARRAY_SIZE(mapped_file_name), + "%s.mapped", file->name) < 0) + goto failure; + + mfa = (struct incfs_create_mapped_file_args) { + .size = mapped_size, + .mode = 0664, + .file_name = ptr_to_u64(mapped_file_name), + .source_file_id = file->id, + .source_offset = mapped_offset, + }; + + result = ioctl(cmd_fd, INCFS_IOC_CREATE_MAPPED_FILE, &mfa); + if (result) { + ksft_print_msg( + "Failed to create mapped file with error %d\n", + result); + goto failure; + } + + result = snprintf(orig_file_path, + ARRAY_SIZE(orig_file_path), "%s/%s", + mount_dir, file->name); + + if (result < 0 || result >= ARRAY_SIZE(mapped_file_path)) { + result = TEST_FAILURE; + goto failure; + } + + result = snprintf(mapped_file_path, + ARRAY_SIZE(mapped_file_path), "%s/%s", + mount_dir, mapped_file_name); + + if (result < 0 || result >= ARRAY_SIZE(mapped_file_path)) { + result = TEST_FAILURE; + goto failure; + } + + result = validate_mapped_file(orig_file_path, mapped_file_path, + mapped_size, mapped_offset); + if (result) + goto failure; + } + +failure: + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static const char v1_file[] = { + /* Header */ + /* 0x00: Magic number */ + 0x49, 0x4e, 0x43, 0x46, 0x53, 0x00, 0x00, 0x00, + /* 0x08: Version */ + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x10: Header size */ + 0x38, 0x00, + /* 0x12: Block size */ + 0x00, 0x10, + /* 0x14: Flags */ + 0x00, 0x00, 0x00, 0x00, + /* 0x18: First md offset */ + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x20: File size */ + 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x28: UUID */ + 0x8c, 0x7d, 0xd9, 0x22, 0xad, 0x47, 0x49, 0x4f, + 0xc0, 0x2c, 0x38, 0x8e, 0x12, 0xc0, 0x0e, 0xac, + + /* 0x38: Attribute */ + 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, + 0x31, 0x32, 0x33, 0x31, 0x32, 0x33, + + /* Attribute md record */ + /* 0x46: Type */ + 0x02, + /* 0x47: Size */ + 0x25, 0x00, + /* 0x49: CRC */ + 0x9a, 0xef, 0xef, 0x72, + /* 0x4d: Next md offset */ + 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x55: Prev md offset */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x5d: fa_offset */ + 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x65: fa_size */ + 0x0e, 0x00, + /* 0x67: fa_crc */ + 0xfb, 0x5e, 0x72, 0x89, + + /* Blockmap table */ + /* 0x6b: First 10-byte entry */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + + /* Blockmap md record */ + /* 0x75: Type */ + 0x01, + /* 0x76: Size */ + 0x23, 0x00, + /* 0x78: CRC */ + 0x74, 0x45, 0xd3, 0xb9, + /* 0x7c: Next md offset */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x84: Prev md offset */ + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x8c: blockmap offset */ + 0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* 0x94: blockmap count */ + 0x01, 0x00, 0x00, 0x00, +}; + +static int compatibility_test(const char *mount_dir) +{ + static const char *name = "file"; + int result = TEST_FAILURE; + 
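+	/*
+	 * v1_file above is a hand-assembled backing file in the original
+	 * (version 1) on-disk format. The test plants it in the backing
+	 * directory, sets the size xattr, and expects a current driver to
+	 * mount the volume and open the file.
+	 */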
char *backing_dir = NULL; + char *filename = NULL; + int fd = -1; + uint64_t size = 0x0c; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TEST(filename = concat_file_name(backing_dir, name), filename); + TEST(fd = open(filename, O_CREAT | O_WRONLY | O_CLOEXEC, 0777), + fd != -1); + TESTEQUAL(write(fd, v1_file, sizeof(v1_file)), sizeof(v1_file)); + TESTEQUAL(fsetxattr(fd, INCFS_XATTR_SIZE_NAME, &size, sizeof(size), 0), + 0); + TESTEQUAL(mount_fs(mount_dir, backing_dir, 50), 0); + free(filename); + TEST(filename = concat_file_name(mount_dir, name), filename); + close(fd); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + + result = TEST_SUCCESS; +out: + close(fd); + umount(mount_dir); + free(backing_dir); + free(filename); + return result; +} + +static int zero_blocks_written_count(int fd, uint32_t data_blocks_written, + uint32_t hash_blocks_written) +{ + int test_result = TEST_FAILURE; + uint64_t offset; + uint8_t type; + uint32_t bw; + + /* Get first md record */ + TESTEQUAL(pread(fd, &offset, sizeof(offset), 24), sizeof(offset)); + + /* Find status md record */ + for (;;) { + TESTNE(offset, 0); + TESTEQUAL(pread(fd, &type, sizeof(type), le64_to_cpu(offset)), + sizeof(type)); + if (type == 4) + break; + TESTEQUAL(pread(fd, &offset, sizeof(offset), + le64_to_cpu(offset) + 7), + sizeof(offset)); + } + + /* Read blocks_written */ + offset = le64_to_cpu(offset); + TESTEQUAL(pread(fd, &bw, sizeof(bw), offset + 23), sizeof(bw)); + TESTEQUAL(le32_to_cpu(bw), data_blocks_written); + TESTEQUAL(pread(fd, &bw, sizeof(bw), offset + 27), sizeof(bw)); + TESTEQUAL(le32_to_cpu(bw), hash_blocks_written); + + /* Write out zero */ + bw = 0; + TESTEQUAL(pwrite(fd, &bw, sizeof(bw), offset + 23), sizeof(bw)); + TESTEQUAL(pwrite(fd, &bw, sizeof(bw), offset + 27), sizeof(bw)); + + test_result = TEST_SUCCESS; +out: + return test_result; +} + +static int validate_block_count(const char *mount_dir, const char *backing_dir, + struct test_file *file, + int total_data_blocks, int filled_data_blocks, + int total_hash_blocks, int filled_hash_blocks) +{ + char *filename = NULL; + char *backing_filename = NULL; + int fd = -1; + struct incfs_get_block_count_args bca = {}; + int test_result = TEST_FAILURE; + struct incfs_filled_range ranges[128]; + struct incfs_get_filled_blocks_args fba = { + .range_buffer = ptr_to_u64(ranges), + .range_buffer_size = sizeof(ranges), + }; + int cmd_fd = -1; + struct incfs_permit_fill permit_fill; + + TEST(filename = concat_file_name(mount_dir, file->name), filename); + TEST(backing_filename = concat_file_name(backing_dir, file->name), + backing_filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + + TESTEQUAL(ioctl(fd, INCFS_IOC_GET_BLOCK_COUNT, &bca), 0); + TESTEQUAL(bca.total_data_blocks_out, total_data_blocks); + TESTEQUAL(bca.filled_data_blocks_out, filled_data_blocks); + TESTEQUAL(bca.total_hash_blocks_out, total_hash_blocks); + TESTEQUAL(bca.filled_hash_blocks_out, filled_hash_blocks); + + close(fd); + TESTEQUAL(umount(mount_dir), 0); + TEST(fd = open(backing_filename, O_RDWR | O_CLOEXEC), fd != -1); + TESTEQUAL(zero_blocks_written_count(fd, filled_data_blocks, + filled_hash_blocks), + TEST_SUCCESS); + close(fd); + fd = -1; + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "readahead=0", false), + 0); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + + TESTEQUAL(ioctl(fd, INCFS_IOC_GET_BLOCK_COUNT, &bca), 0); + TESTEQUAL(bca.total_data_blocks_out, total_data_blocks); + TESTEQUAL(bca.filled_data_blocks_out, 0); + 
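+	/*
+	 * zero_blocks_written_count() rewrote the persisted counters before
+	 * this remount, so every filled count must read back as zero while
+	 * the totals are unchanged.
+	 */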
TESTEQUAL(bca.total_hash_blocks_out, total_hash_blocks); + TESTEQUAL(bca.filled_hash_blocks_out, 0); + + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + permit_fill.file_descriptor = fd; + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_PERMIT_FILL, &permit_fill), 0); + do { + ioctl(fd, INCFS_IOC_GET_FILLED_BLOCKS, &fba); + fba.start_index = fba.index_out + 1; + } while (fba.index_out < fba.total_blocks_out); + + TESTEQUAL(ioctl(fd, INCFS_IOC_GET_BLOCK_COUNT, &bca), 0); + TESTEQUAL(bca.total_data_blocks_out, total_data_blocks); + TESTEQUAL(bca.filled_data_blocks_out, filled_data_blocks); + TESTEQUAL(bca.total_hash_blocks_out, total_hash_blocks); + TESTEQUAL(bca.filled_hash_blocks_out, filled_hash_blocks); + + test_result = TEST_SUCCESS; +out: + close(cmd_fd); + close(fd); + free(filename); + free(backing_filename); + return test_result; +} + + + +static int validate_data_block_count(const char *mount_dir, + const char *backing_dir, + struct test_file *file) +{ + const int total_data_blocks = 1 + (file->size - 1) / + INCFS_DATA_FILE_BLOCK_SIZE; + const int filled_data_blocks = (total_data_blocks + 1) / 2; + + int test_result = TEST_FAILURE; + char *filename = NULL; + char *incomplete_filename = NULL; + struct stat stat_buf_incomplete, stat_buf_file; + int fd = -1; + struct incfs_get_block_count_args bca = {}; + int i; + + TEST(filename = concat_file_name(mount_dir, file->name), filename); + TEST(incomplete_filename = get_incomplete_filename(mount_dir, file->id), + incomplete_filename); + + TESTEQUAL(stat(filename, &stat_buf_file), 0); + TESTEQUAL(stat(incomplete_filename, &stat_buf_incomplete), 0); + TESTEQUAL(stat_buf_file.st_ino, stat_buf_incomplete.st_ino); + TESTEQUAL(stat_buf_file.st_nlink, 3); + + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(ioctl(fd, INCFS_IOC_GET_BLOCK_COUNT, &bca), 0); + TESTEQUAL(bca.total_data_blocks_out, total_data_blocks); + TESTEQUAL(bca.filled_data_blocks_out, 0); + TESTEQUAL(bca.total_hash_blocks_out, 0); + TESTEQUAL(bca.filled_hash_blocks_out, 0); + + for (i = 0; i < total_data_blocks; i += 2) + TESTEQUAL(emit_test_block(mount_dir, file, i), 0); + + TESTEQUAL(ioctl(fd, INCFS_IOC_GET_BLOCK_COUNT, &bca), 0); + TESTEQUAL(bca.total_data_blocks_out, total_data_blocks); + TESTEQUAL(bca.filled_data_blocks_out, filled_data_blocks); + TESTEQUAL(bca.total_hash_blocks_out, 0); + TESTEQUAL(bca.filled_hash_blocks_out, 0); + close(fd); + fd = -1; + + TESTEQUAL(validate_block_count(mount_dir, backing_dir, file, + total_data_blocks, filled_data_blocks, + 0, 0), + 0); + + for (i = 1; i < total_data_blocks; i += 2) + TESTEQUAL(emit_test_block(mount_dir, file, i), 0); + + TESTEQUAL(stat(incomplete_filename, &stat_buf_incomplete), -1); + TESTEQUAL(errno, ENOENT); + + test_result = TEST_SUCCESS; +out: + close(fd); + free(incomplete_filename); + free(filename); + return test_result; +} + +static int data_block_count_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + char *backing_dir; + int cmd_fd = -1; + int i; + struct test_files_set test = get_test_files_set(); + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "readahead=0", false), + 0); + + for (i = 0; i < test.files_count; ++i) { + struct test_file *file = &test.files[i]; + + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, NULL), + 0); + close(cmd_fd); + cmd_fd = -1; + + TESTEQUAL(validate_data_block_count(mount_dir, backing_dir, + 
file), + 0); + } + + result = TEST_SUCCESS; +out: + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static int validate_hash_block_count(const char *mount_dir, + const char *backing_dir, + struct test_file *file) +{ + const int digest_size = SHA256_DIGEST_SIZE; + const int hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / digest_size; + const int total_data_blocks = 1 + (file->size - 1) / + INCFS_DATA_FILE_BLOCK_SIZE; + + int result = TEST_FAILURE; + int hash_layer = total_data_blocks; + int total_hash_blocks = 0; + int filled_hash_blocks; + char *filename = NULL; + int fd = -1; + struct incfs_get_block_count_args bca = {}; + + while (hash_layer > 1) { + hash_layer = (hash_layer + hash_per_block - 1) / hash_per_block; + total_hash_blocks += hash_layer; + } + filled_hash_blocks = total_hash_blocks > 1 ? 1 : 0; + + TEST(filename = concat_file_name(mount_dir, file->name), filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + + TESTEQUAL(ioctl(fd, INCFS_IOC_GET_BLOCK_COUNT, &bca), 0); + TESTEQUAL(bca.total_data_blocks_out, total_data_blocks); + TESTEQUAL(bca.filled_data_blocks_out, 0); + TESTEQUAL(bca.total_hash_blocks_out, total_hash_blocks); + TESTEQUAL(bca.filled_hash_blocks_out, 0); + + TESTEQUAL(emit_partial_test_file_hash(mount_dir, file), 0); + + TESTEQUAL(ioctl(fd, INCFS_IOC_GET_BLOCK_COUNT, &bca), 0); + TESTEQUAL(bca.total_data_blocks_out, total_data_blocks); + TESTEQUAL(bca.filled_data_blocks_out, 0); + TESTEQUAL(bca.total_hash_blocks_out, total_hash_blocks); + TESTEQUAL(bca.filled_hash_blocks_out, filled_hash_blocks); + close(fd); + fd = -1; + + if (filled_hash_blocks) + TESTEQUAL(validate_block_count(mount_dir, backing_dir, file, + total_data_blocks, 0, + total_hash_blocks, filled_hash_blocks), + 0); + + result = TEST_SUCCESS; +out: + close(fd); + free(filename); + return result; +} + +static int hash_block_count_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + char *backing_dir; + int cmd_fd = -1; + int i; + struct test_files_set test = get_test_files_set(); + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "readahead=0", false), + 0); + + for (i = 0; i < test.files_count; i++) { + struct test_file *file = &test.files[i]; + + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(crypto_emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, file->root_hash, + file->sig.add_data), + 0); + close(cmd_fd); + cmd_fd = -1; + + TESTEQUAL(validate_hash_block_count(mount_dir, backing_dir, + &test.files[i]), + 0); + } + + result = TEST_SUCCESS; +out: + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static int is_close(struct timespec *start, int expected_ms) +{ + const int allowed_variance = 500; + int result = TEST_FAILURE; + struct timespec finish; + int diff; + + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &finish), 0); + diff = (finish.tv_sec - start->tv_sec) * 1000 + + (finish.tv_nsec - start->tv_nsec) / 1000000; + + if(diff < expected_ms - allowed_variance || diff > expected_ms + allowed_variance) { + printf("%d %d %d\n", diff, expected_ms, allowed_variance); + } + + TESTCOND(diff >= expected_ms); + TESTCOND(diff <= expected_ms + allowed_variance); + result = TEST_SUCCESS; +out: + return result; +} + +static int per_uid_read_timeouts_test(const char *mount_dir) +{ + struct test_file file = { + .name = "file", + .size = 16 * INCFS_DATA_FILE_BLOCK_SIZE + }; + + int result = TEST_FAILURE; + char *backing_dir = 
NULL; + int pid = -1; + int cmd_fd = -1; + char *filename = NULL; + int fd = -1; + struct timespec start; + char buffer[4096]; + struct incfs_per_uid_read_timeouts purt_get[1]; + struct incfs_get_read_timeouts_args grt = { + ptr_to_u64(purt_get), + sizeof(purt_get) + }; + struct incfs_per_uid_read_timeouts purt_set[] = { + { + .uid = 0, + .min_time_us = 1000000, + .min_pending_time_us = 2000000, + .max_pending_time_us = 3000000, + }, + }; + struct incfs_set_read_timeouts_args srt = { + ptr_to_u64(purt_set), + sizeof(purt_set) + }; + int status; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, + "read_timeout_ms=1000,readahead=0", false), 0); + + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(emit_file(cmd_fd, NULL, file.name, &file.id, file.size, + NULL), 0); + + TEST(filename = concat_file_name(mount_dir, file.name), filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC), 0); + + /* Default mount options read failure is 1000 */ + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &start), 0); + TESTEQUAL(pread(fd, buffer, sizeof(buffer), 0), -1); + TESTEQUAL(is_close(&start, 1000), 0); + + grt.timeouts_array_size = 0; + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_GET_READ_TIMEOUTS, &grt), 0); + TESTEQUAL(grt.timeouts_array_size_out, 0); + + /* Set it to 3000 */ + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_SET_READ_TIMEOUTS, &srt), 0); + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &start), 0); + TESTEQUAL(pread(fd, buffer, sizeof(buffer), 0), -1); + TESTEQUAL(is_close(&start, 3000), 0); + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_GET_READ_TIMEOUTS, &grt), -1); + TESTEQUAL(errno, E2BIG); + TESTEQUAL(grt.timeouts_array_size_out, sizeof(purt_get)); + grt.timeouts_array_size = sizeof(purt_get); + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_GET_READ_TIMEOUTS, &grt), 0); + TESTEQUAL(grt.timeouts_array_size_out, sizeof(purt_get)); + TESTEQUAL(purt_get[0].uid, purt_set[0].uid); + TESTEQUAL(purt_get[0].min_time_us, purt_set[0].min_time_us); + TESTEQUAL(purt_get[0].min_pending_time_us, + purt_set[0].min_pending_time_us); + TESTEQUAL(purt_get[0].max_pending_time_us, + purt_set[0].max_pending_time_us); + + /* Still 1000 in UID 2 */ + TEST(pid = fork(), pid != -1); + if (pid == 0) { + TESTEQUAL(setuid(2), 0); + TESTEQUAL(pread(fd, buffer, sizeof(buffer), 0), -1); + exit(0); + } + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &start), 0); + TESTNE(wait(&status), -1); + TESTEQUAL(WEXITSTATUS(status), 0); + TESTEQUAL(is_close(&start, 1000), 0); + + /* Set it to default */ + purt_set[0].max_pending_time_us = UINT32_MAX; + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_SET_READ_TIMEOUTS, &srt), 0); + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &start), 0); + TESTEQUAL(pread(fd, buffer, sizeof(buffer), 0), -1); + TESTEQUAL(is_close(&start, 1000), 0); + + /* Test min read time */ + TESTEQUAL(emit_test_block(mount_dir, &file, 0), 0); + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &start), 0); + TESTEQUAL(pread(fd, buffer, sizeof(buffer), 0), sizeof(buffer)); + TESTEQUAL(is_close(&start, 1000), 0); + + /* Test min pending time */ + purt_set[0].uid = 2; + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_SET_READ_TIMEOUTS, &srt), 0); + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &start), 0); + TEST(pid = fork(), pid != -1); + if (pid == 0) { + TESTEQUAL(setuid(2), 0); + TESTEQUAL(pread(fd, buffer, sizeof(buffer), sizeof(buffer)), + sizeof(buffer)); + exit(0); + } + sleep(1); + TESTEQUAL(emit_test_block(mount_dir, &file, 1), 0); + 
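+	/*
+	 * The parent releases block 1 about 1s into the child's pread(), but
+	 * uid 2 now has min_pending_time_us = 2s, so the total wall time
+	 * checked below should be close to 2000ms.
+	 */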
TESTNE(wait(&status), -1); + TESTEQUAL(WEXITSTATUS(status), 0); + TESTEQUAL(is_close(&start, 2000), 0); + + /* Clear timeouts */ + srt.timeouts_array_size = 0; + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_SET_READ_TIMEOUTS, &srt), 0); + grt.timeouts_array_size = 0; + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_GET_READ_TIMEOUTS, &grt), 0); + TESTEQUAL(grt.timeouts_array_size_out, 0); + + result = TEST_SUCCESS; +out: + close(fd); + + if (pid == 0) + exit(result); + + free(filename); + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +#define DIRS 3 +static int inotify_test(const char *mount_dir) +{ + const char *mapped_file_name = "mapped_name"; + struct test_file file = { + .name = "file", + .size = 16 * INCFS_DATA_FILE_BLOCK_SIZE + }; + + int result = TEST_FAILURE; + char *backing_dir = NULL, *index_dir = NULL, *incomplete_dir = NULL; + char *file_name = NULL; + int cmd_fd = -1; + int notify_fd = -1; + int wds[DIRS]; + char buffer[DIRS * (sizeof(struct inotify_event) + NAME_MAX + 1)]; + char *ptr = buffer; + struct inotify_event *event; + struct inotify_event *events[DIRS] = {}; + const char *names[DIRS] = {}; + int res; + char id[sizeof(incfs_uuid_t) * 2 + 1]; + struct incfs_create_mapped_file_args mfa; + + /* File creation triggers inotify events in .index and .incomplete? */ + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TEST(index_dir = concat_file_name(mount_dir, ".index"), index_dir); + TEST(incomplete_dir = concat_file_name(mount_dir, ".incomplete"), + incomplete_dir); + TESTEQUAL(mount_fs(mount_dir, backing_dir, 50), 0); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TEST(notify_fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC), + notify_fd != -1); + TEST(wds[0] = inotify_add_watch(notify_fd, mount_dir, + IN_CREATE | IN_DELETE), + wds[0] != -1); + TEST(wds[1] = inotify_add_watch(notify_fd, index_dir, + IN_CREATE | IN_DELETE), + wds[1] != -1); + TEST(wds[2] = inotify_add_watch(notify_fd, incomplete_dir, + IN_CREATE | IN_DELETE), + wds[2] != -1); + TESTEQUAL(emit_file(cmd_fd, NULL, file.name, &file.id, file.size, + NULL), 0); + TEST(res = read(notify_fd, buffer, sizeof(buffer)), res != -1); + + while (ptr < buffer + res) { + int i; + + event = (struct inotify_event *) ptr; + TESTCOND(ptr + sizeof(*event) <= buffer + res); + for (i = 0; i < DIRS; ++i) + if (event->wd == wds[i]) { + TESTEQUAL(events[i], NULL); + events[i] = event; + ptr += sizeof(*event); + names[i] = ptr; + ptr += events[i]->len; + TESTCOND(ptr <= buffer + res); + break; + } + TESTCOND(i < DIRS); + } + + TESTNE(events[0], NULL); + TESTNE(events[1], NULL); + TESTNE(events[2], NULL); + + bin2hex(id, file.id.bytes, sizeof(incfs_uuid_t)); + + TESTEQUAL(events[0]->mask, IN_CREATE); + TESTEQUAL(events[1]->mask, IN_CREATE); + TESTEQUAL(events[2]->mask, IN_CREATE); + TESTEQUAL(strcmp(names[0], file.name), 0); + TESTEQUAL(strcmp(names[1], id), 0); + TESTEQUAL(strcmp(names[2], id), 0); + + /* Creating a mapped file triggers inotify event */ + mfa = (struct incfs_create_mapped_file_args) { + .size = INCFS_DATA_FILE_BLOCK_SIZE, + .mode = 0664, + .file_name = ptr_to_u64(mapped_file_name), + .source_file_id = file.id, + .source_offset = INCFS_DATA_FILE_BLOCK_SIZE, + }; + + TEST(res = ioctl(cmd_fd, INCFS_IOC_CREATE_MAPPED_FILE, &mfa), + res != -1); + TEST(res = read(notify_fd, buffer, sizeof(buffer)), res != -1); + event = (struct inotify_event *) buffer; + TESTEQUAL(event->wd, wds[0]); + TESTEQUAL(event->mask, IN_CREATE); + TESTEQUAL(strcmp(event->name, mapped_file_name), 0); + + /* File 
completion triggers inotify event in .incomplete? */ + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TEST(res = read(notify_fd, buffer, sizeof(buffer)), res != -1); + event = (struct inotify_event *) buffer; + TESTEQUAL(event->wd, wds[2]); + TESTEQUAL(event->mask, IN_DELETE); + TESTEQUAL(strcmp(event->name, id), 0); + + /* File unlinking triggers inotify event in .index? */ + TEST(file_name = concat_file_name(mount_dir, file.name), file_name); + TESTEQUAL(unlink(file_name), 0); + TEST(res = read(notify_fd, buffer, sizeof(buffer)), res != -1); + memset(events, 0, sizeof(events)); + memset(names, 0, sizeof(names)); + for (ptr = buffer; ptr < buffer + res;) { + event = (struct inotify_event *) ptr; + int i; + + TESTCOND(ptr + sizeof(*event) <= buffer + res); + for (i = 0; i < DIRS; ++i) + if (event->wd == wds[i]) { + TESTEQUAL(events[i], NULL); + events[i] = event; + ptr += sizeof(*event); + names[i] = ptr; + ptr += events[i]->len; + TESTCOND(ptr <= buffer + res); + break; + } + TESTCOND(i < DIRS); + } + + TESTNE(events[0], NULL); + TESTNE(events[1], NULL); + TESTEQUAL(events[2], NULL); + + TESTEQUAL(events[0]->mask, IN_DELETE); + TESTEQUAL(events[1]->mask, IN_DELETE); + TESTEQUAL(strcmp(names[0], file.name), 0); + TESTEQUAL(strcmp(names[1], id), 0); + + result = TEST_SUCCESS; +out: + free(file_name); + close(notify_fd); + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + free(index_dir); + free(incomplete_dir); + return result; +} + +static EVP_PKEY *create_key(void) +{ + EVP_PKEY *pkey = NULL; + RSA *rsa = NULL; + BIGNUM *bn = NULL; + + pkey = EVP_PKEY_new(); + if (!pkey) + goto fail; + + bn = BN_new(); + BN_set_word(bn, RSA_F4); + + rsa = RSA_new(); + if (!rsa) + goto fail; + + RSA_generate_key_ex(rsa, 4096, bn, NULL); + EVP_PKEY_assign_RSA(pkey, rsa); + + BN_free(bn); + return pkey; + +fail: + BN_free(bn); + EVP_PKEY_free(pkey); + return NULL; +} + +static X509 *get_cert(EVP_PKEY *key) +{ + X509 *x509 = NULL; + X509_NAME *name = NULL; + + x509 = X509_new(); + if (!x509) + return NULL; + + ASN1_INTEGER_set(X509_get_serialNumber(x509), 1); + X509_gmtime_adj(X509_get_notBefore(x509), 0); + X509_gmtime_adj(X509_get_notAfter(x509), 31536000L); + X509_set_pubkey(x509, key); + + name = X509_get_subject_name(x509); + X509_NAME_add_entry_by_txt(name, "C", MBSTRING_ASC, + (const unsigned char *)"US", -1, -1, 0); + X509_NAME_add_entry_by_txt(name, "ST", MBSTRING_ASC, + (const unsigned char *)"CA", -1, -1, 0); + X509_NAME_add_entry_by_txt(name, "L", MBSTRING_ASC, + (const unsigned char *)"San Jose", -1, -1, 0); + X509_NAME_add_entry_by_txt(name, "O", MBSTRING_ASC, + (const unsigned char *)"Example", -1, -1, 0); + X509_NAME_add_entry_by_txt(name, "OU", MBSTRING_ASC, + (const unsigned char *)"Org", -1, -1, 0); + X509_NAME_add_entry_by_txt(name, "CN", MBSTRING_ASC, + (const unsigned char *)"www.example.com", -1, -1, 0); + X509_set_issuer_name(x509, name); + + if (!X509_sign(x509, key, EVP_sha256())) + return NULL; + + return x509; +} + +static int sign(EVP_PKEY *key, X509 *cert, const char *data, size_t len, + unsigned char **sig, size_t *sig_len) +{ + const int pkcs7_flags = PKCS7_BINARY | PKCS7_NOATTR | PKCS7_PARTIAL | + PKCS7_DETACHED; + const EVP_MD *md = EVP_sha256(); + + int result = TEST_FAILURE; + + BIO *bio = NULL; + PKCS7 *p7 = NULL; + unsigned char *bio_buffer; + + TEST(bio = BIO_new_mem_buf(data, len), bio); + TEST(p7 = PKCS7_sign(NULL, NULL, NULL, bio, pkcs7_flags), p7); + TESTNE(PKCS7_sign_add_signer(p7, cert, key, md, pkcs7_flags), 0); + TESTEQUAL(PKCS7_final(p7, bio, 
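+ /* i2d_PKCS7() below uses the usual OpenSSL two-call pattern: with a NULL output pointer it only reports the DER length, so the caller can malloc() an exact-size buffer and call it again to serialize; the second call advances the pointer it is handed, which is why bio_buffer is saved into *sig before encoding. */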
pkcs7_flags), 1); + TEST(*sig_len = i2d_PKCS7(p7, NULL), *sig_len); + TEST(bio_buffer = malloc(*sig_len), bio_buffer); + *sig = bio_buffer; + TEST(*sig_len = i2d_PKCS7(p7, &bio_buffer), *sig_len); + TESTEQUAL(PKCS7_verify(p7, NULL, NULL, bio, NULL, + pkcs7_flags | PKCS7_NOVERIFY | PKCS7_NOSIGS), 1); + + result = TEST_SUCCESS; +out: + PKCS7_free(p7); + BIO_free(bio); + return result; +} + +static int verity_installed(const char *mount_dir, int cmd_fd, bool *installed) +{ + int result = TEST_FAILURE; + char *filename = NULL; + int fd = -1; + struct test_file *file = &get_test_files_set().files[0]; + + TESTEQUAL(emit_file(cmd_fd, NULL, file->name, &file->id, file->size, + NULL), 0); + TEST(filename = concat_file_name(mount_dir, file->name), filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(ioctl(fd, FS_IOC_ENABLE_VERITY, NULL), -1); + *installed = errno != EOPNOTSUPP; + + result = TEST_SUCCESS; +out: + close(fd); + if (filename) + remove(filename); + free(filename); + return result; +} + +static int enable_verity(const char *mount_dir, struct test_file *file, + EVP_PKEY *key, X509 *cert, bool use_signatures) +{ + int result = TEST_FAILURE; + char *filename = NULL; + int fd = -1; + struct fsverity_enable_arg fear = { + .version = 1, + .hash_algorithm = FS_VERITY_HASH_ALG_SHA256, + .block_size = INCFS_DATA_FILE_BLOCK_SIZE, + .sig_size = 0, + .sig_ptr = 0, + }; + struct { + __u8 version; /* must be 1 */ + __u8 hash_algorithm; /* Merkle tree hash algorithm */ + __u8 log_blocksize; /* log2 of size of data and tree blocks */ + __u8 salt_size; /* size of salt in bytes; 0 if none */ + __le32 sig_size; /* must be 0 */ + __le64 data_size; /* size of file the Merkle tree is built over */ + __u8 root_hash[64]; /* Merkle tree root hash */ + __u8 salt[32]; /* salt prepended to each hashed block */ + __u8 __reserved[144]; /* must be 0's */ + } __packed fsverity_descriptor = { + .version = 1, + .hash_algorithm = 1, + .log_blocksize = 12, + .data_size = file->size, + }; + struct { + char magic[8]; /* must be "FSVerity" */ + __le16 digest_algorithm; + __le16 digest_size; + __u8 digest[32]; + } __packed fsverity_signed_digest = { + .digest_algorithm = 1, + .digest_size = 32 + }; + unsigned char *sig = NULL; + size_t sig_size = 0; + uint64_t flags; + struct statx statxbuf = {}; + + memcpy(fsverity_signed_digest.magic, "FSVerity", 8); + + TEST(filename = concat_file_name(mount_dir, file->name), filename); + TESTEQUAL(syscall(__NR_statx, AT_FDCWD, filename, 0, STATX_ALL, + &statxbuf), 0); + TESTEQUAL(statxbuf.stx_attributes_mask & STATX_ATTR_VERITY, + STATX_ATTR_VERITY); + TESTEQUAL(statxbuf.stx_attributes & STATX_ATTR_VERITY, 0); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(ioctl(fd, FS_IOC_GETFLAGS, &flags), 0); + TESTEQUAL(flags & FS_VERITY_FL, 0); + + /* First try to enable verity with random digest */ + if (key) { + TESTEQUAL(sign(key, cert, (void *)&fsverity_signed_digest, + sizeof(fsverity_signed_digest), &sig, &sig_size), + 0); + + fear.sig_size = sig_size; + fear.sig_ptr = ptr_to_u64(sig); + TESTEQUAL(ioctl(fd, FS_IOC_ENABLE_VERITY, &fear), -1); + } + + /* Now try with correct digest */ + memcpy(fsverity_descriptor.root_hash, file->root_hash, 32); + sha256((char *)&fsverity_descriptor, sizeof(fsverity_descriptor), + (char *)fsverity_signed_digest.digest); + + if (ioctl(fd, FS_IOC_ENABLE_VERITY, NULL) == -1 && + errno == EOPNOTSUPP) { + result = TEST_SUCCESS; + goto out; + } + + free(sig); + sig = NULL; + + if (key) + TESTEQUAL(sign(key, cert, 
(void *)&fsverity_signed_digest, + sizeof(fsverity_signed_digest), + &sig, &sig_size), + 0); + + if (use_signatures) { + fear.sig_size = sig_size; + file->verity_sig_size = sig_size; + fear.sig_ptr = ptr_to_u64(sig); + file->verity_sig = sig; + sig = NULL; + } else { + fear.sig_size = 0; + fear.sig_ptr = 0; + } + TESTEQUAL(ioctl(fd, FS_IOC_ENABLE_VERITY, &fear), 0); + + result = TEST_SUCCESS; +out: + free(sig); + close(fd); + free(filename); + return result; +} + +static int memzero(const unsigned char *buf, size_t size) +{ + size_t i; + + for (i = 0; i < size; ++i) + if (buf[i]) + return -1; + return 0; +} + +static int validate_verity(const char *mount_dir, struct test_file *file) +{ + int result = TEST_FAILURE; + char *filename = 0; + int fd = -1; + uint64_t flags; + struct fsverity_digest *digest; + struct statx statxbuf = {}; + struct fsverity_read_metadata_arg frma = {}; + uint8_t *buf = NULL; + struct fsverity_descriptor desc; + + TEST(digest = malloc(sizeof(struct fsverity_digest) + + INCFS_MAX_HASH_SIZE), digest != NULL); + TEST(filename = concat_file_name(mount_dir, file->name), filename); + TESTEQUAL(syscall(__NR_statx, AT_FDCWD, filename, 0, STATX_ALL, + &statxbuf), 0); + TESTEQUAL(statxbuf.stx_attributes & STATX_ATTR_VERITY, + STATX_ATTR_VERITY); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(ioctl(fd, FS_IOC_GETFLAGS, &flags), 0); + TESTEQUAL(flags & FS_VERITY_FL, FS_VERITY_FL); + digest->digest_size = INCFS_MAX_HASH_SIZE; + TESTEQUAL(ioctl(fd, FS_IOC_MEASURE_VERITY, digest), 0); + TESTEQUAL(digest->digest_algorithm, FS_VERITY_HASH_ALG_SHA256); + TESTEQUAL(digest->digest_size, 32); + + if (file->verity_sig) { + TEST(buf = malloc(file->verity_sig_size), buf); + frma = (struct fsverity_read_metadata_arg) { + .metadata_type = FS_VERITY_METADATA_TYPE_SIGNATURE, + .length = file->verity_sig_size, + .buf_ptr = ptr_to_u64(buf), + }; + TESTEQUAL(ioctl(fd, FS_IOC_READ_VERITY_METADATA, &frma), + file->verity_sig_size); + TESTEQUAL(memcmp(buf, file->verity_sig, file->verity_sig_size), + 0); + } else { + frma = (struct fsverity_read_metadata_arg) { + .metadata_type = FS_VERITY_METADATA_TYPE_SIGNATURE, + }; + TESTEQUAL(ioctl(fd, FS_IOC_READ_VERITY_METADATA, &frma), -1); + TESTEQUAL(errno, ENODATA); + } + + frma = (struct fsverity_read_metadata_arg) { + .metadata_type = FS_VERITY_METADATA_TYPE_DESCRIPTOR, + .length = sizeof(desc), + .buf_ptr = ptr_to_u64(&desc), + }; + TESTEQUAL(ioctl(fd, FS_IOC_READ_VERITY_METADATA, &frma), + sizeof(desc)); + TESTEQUAL(desc.version, 1); + TESTEQUAL(desc.hash_algorithm, FS_VERITY_HASH_ALG_SHA256); + TESTEQUAL(desc.log_blocksize, ilog2(INCFS_DATA_FILE_BLOCK_SIZE)); + TESTEQUAL(desc.salt_size, 0); + TESTEQUAL(desc.__reserved_0x04, 0); + TESTEQUAL(desc.data_size, file->size); + TESTEQUAL(memcmp(desc.root_hash, file->root_hash, SHA256_DIGEST_SIZE), + 0); + TESTEQUAL(memzero(desc.root_hash + SHA256_DIGEST_SIZE, + sizeof(desc.root_hash) - SHA256_DIGEST_SIZE), 0); + TESTEQUAL(memzero(desc.salt, sizeof(desc.salt)), 0); + TESTEQUAL(memzero(desc.__reserved, sizeof(desc.__reserved)), 0); + + result = TEST_SUCCESS; +out: + free(buf); + close(fd); + free(filename); + free(digest); + return result; +} + +static int verity_test_optional_sigs(const char *mount_dir, bool use_signatures) +{ + int result = TEST_FAILURE; + char *backing_dir = NULL; + bool installed; + int cmd_fd = -1; + int i; + struct test_files_set test = get_test_files_set(); + const int file_num = test.files_count; + EVP_PKEY *key = NULL; + X509 *cert = NULL; + BIO *mem = NULL; + 
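+ /* The /proc/keys scan below looks for the ".fs-verity" keyring and, when one exists, loads the freshly generated certificate into it via add_key(2) so that signature-backed FS_IOC_ENABLE_VERITY calls can validate against our self-signed chain. */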
long len; + void *ptr; + FILE *proc_key_fd = NULL; + char *line = NULL; + size_t read = 0; + int key_id = -1; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "readahead=0", false), + 0); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(verity_installed(mount_dir, cmd_fd, &installed), 0); + if (!installed) { + result = TEST_SUCCESS; + goto out; + } + TEST(key = create_key(), key); + TEST(cert = get_cert(key), cert); + + TEST(proc_key_fd = fopen("/proc/keys", "r"), proc_key_fd != NULL); + while (getline(&line, &read, proc_key_fd) != -1) + if (strstr(line, ".fs-verity")) + key_id = strtol(line, NULL, 16); + + TEST(mem = BIO_new(BIO_s_mem()), mem != NULL); + TESTEQUAL(i2d_X509_bio(mem, cert), 1); + TEST(len = BIO_get_mem_data(mem, &ptr), len != 0); + TESTCOND(key_id == -1 + || syscall(__NR_add_key, "asymmetric", "test:key", ptr, len, + key_id) != -1); + + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + + build_mtree(file); + TESTEQUAL(crypto_emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, file->root_hash, + file->sig.add_data), 0); + + TESTEQUAL(load_hash_tree(mount_dir, file), 0); + TESTEQUAL(enable_verity(mount_dir, file, key, cert, + use_signatures), + 0); + } + + for (i = 0; i < file_num; i++) + TESTEQUAL(validate_verity(mount_dir, &test.files[i]), 0); + + close(cmd_fd); + cmd_fd = -1; + TESTEQUAL(umount(mount_dir), 0); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "readahead=0", false), + 0); + + for (i = 0; i < file_num; i++) + TESTEQUAL(validate_verity(mount_dir, &test.files[i]), 0); + + result = TEST_SUCCESS; +out: + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + + free(file->mtree); + free(file->verity_sig); + + file->mtree = NULL; + file->verity_sig = NULL; + } + + free(line); + BIO_free(mem); + X509_free(cert); + EVP_PKEY_free(key); + if (proc_key_fd) + fclose(proc_key_fd); + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static int verity_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + + TESTEQUAL(verity_test_optional_sigs(mount_dir, true), TEST_SUCCESS); + TESTEQUAL(verity_test_optional_sigs(mount_dir, false), TEST_SUCCESS); + result = TEST_SUCCESS; +out: + return result; +} + +static int verity_file_valid(const char *mount_dir, struct test_file *file) +{ + int result = TEST_FAILURE; + char *filename = NULL; + int fd = -1; + uint8_t buffer[INCFS_DATA_FILE_BLOCK_SIZE]; + struct incfs_get_file_sig_args gfsa = { + .file_signature = ptr_to_u64(buffer), + .file_signature_buf_size = sizeof(buffer), + }; + int i; + + TEST(filename = concat_file_name(mount_dir, file->name), filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(ioctl(fd, INCFS_IOC_READ_FILE_SIGNATURE, &gfsa), 0); + for (i = 0; i < file->size; i += sizeof(buffer)) + TESTEQUAL(pread(fd, buffer, sizeof(buffer), i), + file->size - i > sizeof(buffer) ? 
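+ /* expect a full block, or the short tail on the final read */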
+ sizeof(buffer) : file->size - i); + + result = TEST_SUCCESS; +out: + close(fd); + free(filename); + return result; +} + +static int enable_verity_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + char *backing_dir = NULL; + bool installed; + int cmd_fd = -1; + struct test_files_set test = get_test_files_set(); + int i; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs(mount_dir, backing_dir, 0), 0); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(verity_installed(mount_dir, cmd_fd, &installed), 0); + if (!installed) { + result = TEST_SUCCESS; + goto out; + } + for (i = 0; i < test.files_count; ++i) { + struct test_file *file = &test.files[i]; + + TESTEQUAL(emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, NULL), 0); + TESTEQUAL(emit_test_file_data(mount_dir, file), 0); + TESTEQUAL(enable_verity(mount_dir, file, NULL, NULL, false), 0); + } + + /* Check files are valid on disk */ + close(cmd_fd); + cmd_fd = -1; + TESTEQUAL(umount(mount_dir), 0); + TESTEQUAL(mount_fs(mount_dir, backing_dir, 0), 0); + for (i = 0; i < test.files_count; ++i) + TESTEQUAL(verity_file_valid(mount_dir, &test.files[i]), 0); + + result = TEST_SUCCESS; +out: + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static int mmap_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + char *backing_dir = NULL; + int cmd_fd = -1; + /* + * File is big enough to have a two layer tree with two hashes in the + * higher level, so we can corrupt the second one + */ + int shas_per_block = INCFS_DATA_FILE_BLOCK_SIZE / SHA256_DIGEST_SIZE; + struct test_file file = { + .name = "file", + .size = INCFS_DATA_FILE_BLOCK_SIZE * shas_per_block * 2, + }; + char *filename = NULL; + int fd = -1; + char *addr = (void *)-1; + int pid = -1; + int status; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs(mount_dir, backing_dir, 0), 0); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + + TESTEQUAL(build_mtree(&file), 0); + file.mtree[1].data[INCFS_DATA_FILE_BLOCK_SIZE] ^= 0xff; + TESTEQUAL(crypto_emit_file(cmd_fd, NULL, file.name, &file.id, + file.size, file.root_hash, + file.sig.add_data), 0); + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TESTEQUAL(load_hash_tree(mount_dir, &file), 0); + TEST(filename = concat_file_name(mount_dir, file.name), filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TEST(addr = mmap(NULL, file.size, PROT_READ, MAP_PRIVATE, fd, 0), + addr != (void *) -1); + TESTEQUAL(addr[0], 57); + + TEST(pid = fork(), pid != -1); + if (pid == 0) { + fclose(stderr); + TESTEQUAL(addr[shas_per_block * INCFS_DATA_FILE_BLOCK_SIZE], + -71); + exit(0); + } + TESTNE(wait(&status), -1); + TESTEQUAL(status, 256); + + TESTEQUAL(addr[(shas_per_block - 1) * INCFS_DATA_FILE_BLOCK_SIZE], 76); + TESTEQUAL(munmap(addr, file.size), 0); + addr = (void *) -1; + + result = TEST_SUCCESS; +out: + if (addr != (void *) -1) + munmap(addr, file.size); + free(file.mtree); + close(fd); + free(filename); + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static int truncate_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + char *backing_dir = NULL; + int cmd_fd = -1; + struct test_file file = { + .name = "file", + .size = INCFS_DATA_FILE_BLOCK_SIZE, + }; + char *backing_file = NULL; + int fd = -1; + struct stat st; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs(mount_dir, 
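+ /* truncate_test: fallocate() below grows the backing file to 16MB with FALLOC_FL_KEEP_SIZE; once the real data blocks are emitted, incfs should trim the preallocation away again, which the final st_blocks check verifies. */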
backing_dir, 0), 0); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(emit_file(cmd_fd, NULL, file.name, &file.id, file.size, NULL), + 0); + TEST(backing_file = concat_file_name(backing_dir, file.name), + backing_file); + TEST(fd = open(backing_file, O_RDWR | O_CLOEXEC), fd != -1); + TESTEQUAL(stat(backing_file, &st), 0); + TESTCOND(st.st_blocks < 128); + TESTEQUAL(fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 24), 0); + TESTEQUAL(stat(backing_file, &st), 0); + TESTCOND(st.st_blocks > 32768); + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TESTEQUAL(stat(backing_file, &st), 0); + TESTCOND(st.st_blocks < 128); + + result = TEST_SUCCESS; +out: + close(fd); + free(backing_file); + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static int stat_file_test(const char *mount_dir, int cmd_fd, + struct test_file *file) +{ + int result = TEST_FAILURE; + struct stat st; + char *filename = NULL; + + TESTEQUAL(emit_file(cmd_fd, NULL, file->name, &file->id, + file->size, NULL), 0); + TEST(filename = concat_file_name(mount_dir, file->name), filename); + TESTEQUAL(stat(filename, &st), 0); + TESTCOND(st.st_blocks < 32); + TESTEQUAL(emit_test_file_data(mount_dir, file), 0); + TESTEQUAL(stat(filename, &st), 0); + TESTCOND(st.st_blocks > file->size / 512); + + result = TEST_SUCCESS; +out: + free(filename); + return result; +} + +static int stat_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + char *backing_dir = NULL; + int cmd_fd = -1; + int i; + struct test_files_set test = get_test_files_set(); + const int file_num = test.files_count; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs(mount_dir, backing_dir, 0), 0); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + + TESTEQUAL(stat_file_test(mount_dir, cmd_fd, file), 0); + } + + result = TEST_SUCCESS; +out: + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +#define SYSFS_DIR "/sys/fs/incremental-fs/instances/test_node/" + +static int sysfs_test_value(const char *name, uint64_t value) +{ + int result = TEST_FAILURE; + char *filename = NULL; + FILE *file = NULL; + uint64_t res; + + TEST(filename = concat_file_name(SYSFS_DIR, name), filename); + TEST(file = fopen(filename, "re"), file); + TESTEQUAL(fscanf(file, "%lu", &res), 1); + TESTEQUAL(res, value); + + result = TEST_SUCCESS; +out: + if (file) + fclose(file); + free(filename); + return result; +} + +static int sysfs_test_value_range(const char *name, uint64_t low, uint64_t high) +{ + int result = TEST_FAILURE; + char *filename = NULL; + FILE *file = NULL; + uint64_t res; + + TEST(filename = concat_file_name(SYSFS_DIR, name), filename); + TEST(file = fopen(filename, "re"), file); + TESTEQUAL(fscanf(file, "%lu", &res), 1); + TESTCOND(res >= low && res <= high); + + result = TEST_SUCCESS; +out: + if (file) + fclose(file); + free(filename); + return result; +} + +static int ioctl_test_last_error(int cmd_fd, const incfs_uuid_t *file_id, + int page, int error) +{ + int result = TEST_FAILURE; + struct incfs_get_last_read_error_args glre; + + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_GET_LAST_READ_ERROR, &glre), 0); + if (file_id) + TESTEQUAL(memcmp(&glre.file_id_out, file_id, sizeof(*file_id)), + 0); + + TESTEQUAL(glre.page_out, page); + TESTEQUAL(glre.errno_out, error); + result = TEST_SUCCESS; +out: + return result; +} + +static int sysfs_test(const char *mount_dir) +{ + int result = 
TEST_FAILURE; + char *backing_dir = NULL; + int cmd_fd = -1; + struct test_file file = { + .name = "file", + .size = INCFS_DATA_FILE_BLOCK_SIZE, + }; + char *filename = NULL; + int fd = -1; + int pid = -1; + char buffer[32]; + char *null_buf = NULL; + int status; + struct incfs_per_uid_read_timeouts purt_set[] = { + { + .uid = 0, + .min_time_us = 1000000, + .min_pending_time_us = 1000000, + .max_pending_time_us = 2000000, + }, + }; + struct incfs_set_read_timeouts_args srt = { + ptr_to_u64(purt_set), + sizeof(purt_set) + }; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "sysfs_name=test_node", + false), + 0); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(build_mtree(&file), 0); + file.root_hash[0] ^= 0xff; + TESTEQUAL(crypto_emit_file(cmd_fd, NULL, file.name, &file.id, file.size, + file.root_hash, file.sig.add_data), + 0); + TEST(filename = concat_file_name(mount_dir, file.name), filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(ioctl_test_last_error(cmd_fd, NULL, 0, 0), 0); + TESTEQUAL(sysfs_test_value("reads_failed_timed_out", 0), 0); + TESTEQUAL(read(fd, null_buf, 1), -1); + TESTEQUAL(ioctl_test_last_error(cmd_fd, &file.id, 0, -ETIME), 0); + TESTEQUAL(sysfs_test_value("reads_failed_timed_out", 2), 0); + + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TESTEQUAL(sysfs_test_value("reads_failed_hash_verification", 0), 0); + TESTEQUAL(read(fd, null_buf, 1), -1); + TESTEQUAL(sysfs_test_value("reads_failed_hash_verification", 1), 0); + TESTSYSCALL(close(fd)); + fd = -1; + + TESTSYSCALL(unlink(filename)); + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, + "read_timeout_ms=10000,sysfs_name=test_node", + true), + 0); + TESTEQUAL(emit_file(cmd_fd, NULL, file.name, &file.id, file.size, NULL), + 0); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTSYSCALL(fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC)); + TEST(pid = fork(), pid != -1); + if (pid == 0) { + TESTEQUAL(read(fd, buffer, sizeof(buffer)), sizeof(buffer)); + exit(0); + } + sleep(1); + TESTEQUAL(sysfs_test_value("reads_delayed_pending", 0), 0); + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TESTNE(wait(&status), -1); + TESTEQUAL(status, 0); + TESTEQUAL(sysfs_test_value("reads_delayed_pending", 1), 0); + /* Allow +/- 10% */ + TESTEQUAL(sysfs_test_value_range("reads_delayed_pending_us", 900000, 1100000), + 0); + + TESTSYSCALL(close(fd)); + fd = -1; + + TESTSYSCALL(unlink(filename)); + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_SET_READ_TIMEOUTS, &srt), 0); + TESTEQUAL(emit_file(cmd_fd, NULL, file.name, &file.id, file.size, NULL), + 0); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(sysfs_test_value("reads_delayed_min", 0), 0); + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TESTEQUAL(read(fd, buffer, sizeof(buffer)), sizeof(buffer)); + TESTEQUAL(sysfs_test_value("reads_delayed_min", 1), 0); + /* This should be exact */ + TESTEQUAL(sysfs_test_value("reads_delayed_min_us", 1000000), 0); + + TESTSYSCALL(close(fd)); + fd = -1; + + TESTSYSCALL(unlink(filename)); + TESTEQUAL(emit_file(cmd_fd, NULL, file.name, &file.id, file.size, NULL), + 0); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTSYSCALL(fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC)); + TEST(pid = fork(), pid != -1); + if (pid == 0) { + TESTEQUAL(read(fd, buffer, sizeof(buffer)), sizeof(buffer)); + exit(0); + } + usleep(500000); + 
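+ /* Halfway through this read only the earlier delays are on record; once the data lands and the child exits, both counters should reach 2, with the additional waits reflected in the _us totals checked below. */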
TESTEQUAL(sysfs_test_value("reads_delayed_pending", 1), 0); + TESTEQUAL(sysfs_test_value("reads_delayed_min", 1), 0); + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TESTNE(wait(&status), -1); + TESTEQUAL(status, 0); + TESTEQUAL(sysfs_test_value("reads_delayed_pending", 2), 0); + TESTEQUAL(sysfs_test_value("reads_delayed_min", 2), 0); + /* Exact 1000000 plus 500000 +/- 10% */ + TESTEQUAL(sysfs_test_value_range("reads_delayed_min_us", 1450000, 1550000), 0); + /* Allow +/- 10% */ + TESTEQUAL(sysfs_test_value_range("reads_delayed_pending_us", 1350000, 1650000), + 0); + + result = TEST_SUCCESS; +out: + if (pid == 0) + exit(result); + free(file.mtree); + free(filename); + close(fd); + close(cmd_fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static int sysfs_test_directories(bool one_present, bool two_present) +{ + int result = TEST_FAILURE; + struct stat st; + + TESTEQUAL(stat("/sys/fs/incremental-fs/instances/1", &st), + one_present ? 0 : -1); + if (one_present) + TESTCOND(S_ISDIR(st.st_mode)); + else + TESTEQUAL(errno, ENOENT); + TESTEQUAL(stat("/sys/fs/incremental-fs/instances/2", &st), + two_present ? 0 : -1); + if (two_present) + TESTCOND(S_ISDIR(st.st_mode)); + else + TESTEQUAL(errno, ENOENT); + + result = TEST_SUCCESS; +out: + return result; +} + +static int sysfs_rename_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + char *backing_dir = NULL; + char *mount_dir2 = NULL; + int fd = -1; + char c; + + /* Mount with no node */ + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs(mount_dir, backing_dir, 0), 0); + TESTEQUAL(sysfs_test_directories(false, false), 0); + + /* Remount with node */ + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "sysfs_name=1", true), + 0); + TESTEQUAL(sysfs_test_directories(true, false), 0); + TEST(fd = open("/sys/fs/incremental-fs/instances/1/reads_delayed_min", + O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(pread(fd, &c, 1, 0), 1); + TESTEQUAL(c, '0'); + TESTEQUAL(pread(fd, &c, 1, 0), 1); + TESTEQUAL(c, '0'); + + /* Rename node */ + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "sysfs_name=2", true), + 0); + TESTEQUAL(sysfs_test_directories(false, true), 0); + TESTEQUAL(pread(fd, &c, 1, 0), -1); + + /* Try mounting another instance with same node name */ + TEST(mount_dir2 = concat_file_name(backing_dir, "incfs-mount-dir2"), + mount_dir2); + rmdir(mount_dir2); /* In case we crashed before */ + TESTSYSCALL(mkdir(mount_dir2, 0777)); + TEST(mount_fs_opt(mount_dir2, backing_dir, "sysfs_name=2", false), + -1); + + /* Try mounting another instance then remounting with existing name */ + TESTEQUAL(mount_fs(mount_dir2, backing_dir, 0), 0); + TESTEQUAL(mount_fs_opt(mount_dir2, backing_dir, "sysfs_name=2", true), + -1); + + /* Remount with no node */ + TESTEQUAL(mount_fs_opt(mount_dir, backing_dir, "", true), + 0); + TESTEQUAL(sysfs_test_directories(false, false), 0); + + result = TEST_SUCCESS; +out: + umount(mount_dir2); + rmdir(mount_dir2); + free(mount_dir2); + close(fd); + umount(mount_dir); + free(backing_dir); + return result; +} + +static int stacked_mount_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + char *backing_dir = NULL; + + /* Mount with no node */ + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTEQUAL(mount_fs(mount_dir, backing_dir, 0), 0); + /* Try mounting another instance with same name */ + TESTEQUAL(mount_fs(mount_dir, backing_dir, 0), 0); + /* Try unmounting the first instance */ + TESTEQUAL(umount_fs(mount_dir), 0); + /* Try 
unmounting the second instance */ + TESTEQUAL(umount_fs(mount_dir), 0); + result = TEST_SUCCESS; +out: + /* Cleanup */ + rmdir(backing_dir); + free(backing_dir); + return result; +} + +static int throttled_read_doesnt_block_read_test(const char *mount_dir) +{ + struct test_file file = { + .name = "file", + .size = INCFS_DATA_FILE_BLOCK_SIZE, + }; + int result = TEST_FAILURE; + char *backing_dir = NULL; + int cmd_fd = -1; + char *filename = NULL; + int fd = -1; + int pid = -1; + int status; + struct timespec start; + struct incfs_per_uid_read_timeouts purt_set[] = { + { + .uid = 2, + .min_time_us = 0, + .min_pending_time_us = 2000000, + .max_pending_time_us = 2000000, + }, + }; + struct incfs_set_read_timeouts_args srt = { + ptr_to_u64(purt_set), + sizeof(purt_set) + }; + char buffer[4096]; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTSYSCALL(mount_fs_opt(mount_dir, backing_dir, + "read_timeout_ms=10000,report_uid", false)); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(emit_file(cmd_fd, NULL, file.name, &file.id, + file.size, NULL), 0); + + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_SET_READ_TIMEOUTS, &srt), 0); + + TEST(filename = concat_file_name(mount_dir, file.name), filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + + TEST(pid = fork(), pid != -1); + if (pid == 0) { + TESTEQUAL(setuid(2), 0); + TESTEQUAL(pread(fd, buffer, sizeof(buffer), 0), sizeof(buffer)); + exit(0); + } + + sleep(1); + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &start), 0); + TESTEQUAL(pread(fd, buffer, sizeof(buffer), 0), sizeof(buffer)); + TESTEQUAL(is_close(&start, 0), 0); + + TESTNE(wait(&status), -1); + TESTEQUAL(status, 0); + TESTEQUAL(is_close(&start, 1000), 0); + result = TEST_SUCCESS; +out: + close(fd); + free(filename); + close(cmd_fd); + umount(mount_dir); + rmdir(backing_dir); + free(backing_dir); + return result; +} + +static int throttled_mmap_doesnt_block_mmap_test(const char *mount_dir) +{ + struct test_file file = { + .name = "file", + .size = INCFS_DATA_FILE_BLOCK_SIZE, + }; + int result = TEST_FAILURE; + char *backing_dir = NULL; + int cmd_fd = -1; + char *filename = NULL; + int fd = -1; + char *addr = (void *)-1; + int pid = -1; + int status; + struct timespec start; + struct incfs_per_uid_read_timeouts purt_set[] = { + { + .uid = 2, + .min_time_us = 0, + .min_pending_time_us = 2000000, + .max_pending_time_us = 2000000, + }, + }; + struct incfs_set_read_timeouts_args srt = { + ptr_to_u64(purt_set), + sizeof(purt_set) + }; + + TEST(backing_dir = create_backing_dir(mount_dir), backing_dir); + TESTSYSCALL(mount_fs_opt(mount_dir, backing_dir, + "read_timeout_ms=10000,report_uid", false)); + TEST(cmd_fd = open_commands_file(mount_dir), cmd_fd != -1); + TESTEQUAL(emit_file(cmd_fd, NULL, file.name, &file.id, + file.size, NULL), 0); + + TESTEQUAL(ioctl(cmd_fd, INCFS_IOC_SET_READ_TIMEOUTS, &srt), 0); + + TEST(filename = concat_file_name(mount_dir, file.name), filename); + TEST(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TEST(addr = mmap(NULL, file.size, PROT_READ, MAP_PRIVATE, fd, 0), + addr != (void *) -1); + + TEST(pid = fork(), pid != -1); + if (pid == 0) { + TESTEQUAL(setuid(2), 0); + TESTEQUAL(addr[0], 57); + exit(0); + } + + sleep(1); + TESTEQUAL(emit_test_file_data(mount_dir, &file), 0); + TESTEQUAL(clock_gettime(CLOCK_MONOTONIC, &start), 0); + TESTEQUAL(addr[0], 57); + TESTEQUAL(is_close(&start, 0), 0); + + TESTNE(wait(&status), -1); + TESTEQUAL(status, 0); + 
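+ /* The parent's fault was served instantly, but the uid-2 child is still held by its 2000000us min_pending_time, so wait() should only return about a second after the parent's unthrottled access. */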
TESTEQUAL(is_close(&start, 1000), 0); + + result = TEST_SUCCESS; +out: + if (addr != (void *) -1) + munmap(addr, file.size); + close(fd); + free(filename); + close(cmd_fd); + umount(mount_dir); + rmdir(backing_dir); + free(backing_dir); + return result; +} + + +static char *setup_mount_dir(void) +{ + struct stat st; + char *current_dir = getcwd(NULL, 0); + char *mount_dir = concat_file_name(current_dir, "incfs-mount-dir"); + + free(current_dir); + if (stat(mount_dir, &st) == 0) { + if (S_ISDIR(st.st_mode)) + return mount_dir; + + ksft_print_msg("%s is a file, not a dir.\n", mount_dir); + return NULL; + } + + if (mkdir(mount_dir, 0777)) { + print_error("Can't create mount dir."); + return NULL; + } + + return mount_dir; +} + +static void parse_range(char *ranges, bool *run_test, size_t tests) +{ + size_t i; + char *range; + + for (i = 0; i < tests; ++i) + run_test[i] = false; + + range = strtok(ranges, ","); + while (range) { + char *dash = strchr(range, '-'); + + if (dash) { + size_t start = 1, end = tests; + char *end_ptr; + + if (dash > range) { + start = strtol(range, &end_ptr, 10); + if (*end_ptr != '-' || start <= 0 || start > tests) + ksft_exit_fail_msg("Bad range\n"); + } + + if (dash[1]) { + end = strtol(dash + 1, &end_ptr, 10); + if (*end_ptr || end <= start || end > tests) + ksft_exit_fail_msg("Bad range\n"); + } + + for (i = start; i <= end; ++i) + run_test[i - 1] = true; + } else { + char *end; + long value = strtol(range, &end, 10); + + if (*end || value <= 0 || value > tests) + ksft_exit_fail_msg("Bad range\n"); + run_test[value - 1] = true; + } + range = strtok(NULL, ","); + } +} + +int parse_options(int argc, char *const *argv, bool *run_test, + size_t tests) +{ + signed char c; + + while ((c = getopt(argc, argv, "f:t:v")) != -1) + switch (c) { + case 'f': + options.file = strtol(optarg, NULL, 10); + break; + + case 't': + parse_range(optarg, run_test, tests); + break; + + case 'v': + options.verbose = true; + break; + + default: + return -EINVAL; + } + + return 0; +} + +struct test_case { + int (*pfunc)(const char *dir); + const char *name; +}; + +void run_one_test(const char *mount_dir, struct test_case *test_case) +{ + int ret; + + ksft_print_msg("Running %s\n", test_case->name); + ret = test_case->pfunc(mount_dir); + + if (ret == TEST_SUCCESS) + ksft_test_result_pass("%s\n", test_case->name); + else if (ret == TEST_SKIP) + ksft_test_result_skip("%s\n", test_case->name); + else + ksft_test_result_fail("%s\n", test_case->name); +} + +int main(int argc, char *argv[]) +{ + char *mount_dir = NULL; + int i; + int fd, count; + + // Seed randomness pool for testing on QEMU + // NOTE - this abuses the concept of randomness - do *not* ever do this + // on a machine for production use - the device will think it has good + // randomness when it does not. 
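+ // RNDADDTOENTCNT only credits entropy to the pool without mixing in new bytes, which is what unblocks /dev/random in a QEMU guest - exactly the abuse the note above warns against.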
+ fd = open("/dev/urandom", O_WRONLY | O_CLOEXEC); + count = 4096; + for (int i = 0; i < 128; ++i) + ioctl(fd, RNDADDTOENTCNT, &count); + close(fd); + + ksft_print_header(); + + if (geteuid() != 0) + ksft_print_msg("Not running as root, mounting might fail.\n"); + + mount_dir = setup_mount_dir(); + if (mount_dir == NULL) + ksft_exit_fail_msg("Can't create a mount dir\n"); + +#define MAKE_TEST(test) \ + { \ + test, #test \ + } + struct test_case cases[] = { + MAKE_TEST(basic_file_ops_test), + MAKE_TEST(cant_touch_index_test), + MAKE_TEST(dynamic_files_and_data_test), + MAKE_TEST(concurrent_reads_and_writes_test), + MAKE_TEST(attribute_test), + MAKE_TEST(work_after_remount_test), + MAKE_TEST(child_procs_waiting_for_data_test), + MAKE_TEST(multiple_providers_test), + MAKE_TEST(hash_tree_test), + MAKE_TEST(read_log_test), + MAKE_TEST(get_blocks_test), + MAKE_TEST(get_hash_blocks_test), + MAKE_TEST(large_file_test), + MAKE_TEST(mapped_file_test), + MAKE_TEST(compatibility_test), + MAKE_TEST(data_block_count_test), + MAKE_TEST(hash_block_count_test), + MAKE_TEST(per_uid_read_timeouts_test), + MAKE_TEST(inotify_test), + MAKE_TEST(verity_test), + MAKE_TEST(enable_verity_test), + MAKE_TEST(mmap_test), + MAKE_TEST(truncate_test), + MAKE_TEST(stat_test), + MAKE_TEST(sysfs_test), + MAKE_TEST(sysfs_rename_test), + MAKE_TEST(stacked_mount_test), + MAKE_TEST(throttled_read_doesnt_block_read_test), + MAKE_TEST(throttled_mmap_doesnt_block_mmap_test), + }; +#undef MAKE_TEST + bool run_test[ARRAY_SIZE(cases)]; + + for (int i = 0; i < ARRAY_SIZE(cases); ++i) + run_test[i] = true; + + if (parse_options(argc, argv, run_test, ARRAY_SIZE(cases))) + ksft_exit_fail_msg("Bad options\n"); + + ksft_set_plan(ARRAY_SIZE(cases)); + for (i = 0; i < ARRAY_SIZE(run_test); ++i) + if (run_test[i]) + run_one_test(mount_dir, &cases[i]); + else + ksft_cnt.ksft_xskip++; + + umount2(mount_dir, MNT_FORCE); + rmdir(mount_dir); + return !ksft_get_fail_cnt() ? 
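+ /* exit code follows the kselftest convention: KSFT_PASS (0) when nothing failed, KSFT_FAIL (1) otherwise */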
ksft_exit_pass() : ksft_exit_fail(); +} diff --git a/tools/testing/selftests/filesystems/incfs/utils.c b/tools/testing/selftests/filesystems/incfs/utils.c new file mode 100644 index 000000000000..d7deb5321c3e --- /dev/null +++ b/tools/testing/selftests/filesystems/incfs/utils.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018 Google LLC + */ +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/ioctl.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include <openssl/md5.h> +#include <openssl/sha.h> + +#include "utils.h" + +#ifndef __S_IFREG +#define __S_IFREG S_IFREG +#endif + +unsigned int rnd(unsigned int max, unsigned int *seed) +{ + return rand_r(seed) * ((uint64_t)max + 1) / RAND_MAX; +} + +int remove_dir(const char *dir) +{ + int err = rmdir(dir); + + if (err && errno == ENOTEMPTY) { + err = delete_dir_tree(dir); + if (err) + return err; + return 0; + } + + if (err && errno != ENOENT) + return -errno; + + return 0; +} + +int drop_caches(void) +{ + int drop_caches = + open("/proc/sys/vm/drop_caches", O_WRONLY | O_CLOEXEC); + int i; + + if (drop_caches == -1) + return -errno; + i = write(drop_caches, "3", 1); + close(drop_caches); + + if (i != 1) + return -errno; + + return 0; +} + +int mount_fs(const char *mount_dir, const char *backing_dir, + int read_timeout_ms) +{ + static const char fs_name[] = INCFS_NAME; + char mount_options[512]; + int result; + + snprintf(mount_options, ARRAY_SIZE(mount_options), + "read_timeout_ms=%u", + read_timeout_ms); + + result = mount(backing_dir, mount_dir, fs_name, 0, mount_options); + if (result != 0) + perror("Error mounting fs."); + return result; +} + +int umount_fs(const char *mount_dir) +{ + int result; + + result = umount(mount_dir); + if (result != 0) + perror("Error unmounting fs."); + return result; +} + +int mount_fs_opt(const char *mount_dir, const char *backing_dir, + const char *opt, bool remount) +{ + static const char fs_name[] = INCFS_NAME; + int result; + + result = mount(backing_dir, mount_dir, fs_name, + remount ? 
MS_REMOUNT : 0, opt); + if (result != 0) + perror("Error mounting fs."); + return result; +} + +struct hash_section { + uint32_t algorithm; + uint8_t log2_blocksize; + uint32_t salt_size; + /* no salt */ + uint32_t hash_size; + uint8_t hash[SHA256_DIGEST_SIZE]; +} __packed; + +struct signature_blob { + uint32_t version; + uint32_t hash_section_size; + struct hash_section hash_section; + uint32_t signing_section_size; + uint8_t signing_section[]; +} __packed; + +size_t format_signature(void **buf, const char *root_hash, const char *add_data) +{ + size_t size = sizeof(struct signature_blob) + strlen(add_data) + 1; + struct signature_blob *sb = malloc(size); + + if (!sb) + return 0; + + *sb = (struct signature_blob){ + .version = INCFS_SIGNATURE_VERSION, + .hash_section_size = sizeof(struct hash_section), + .hash_section = + (struct hash_section){ + .algorithm = INCFS_HASH_TREE_SHA256, + .log2_blocksize = 12, + .salt_size = 0, + .hash_size = SHA256_DIGEST_SIZE, + }, + .signing_section_size = strlen(add_data) + 1, + }; + + memcpy(sb->hash_section.hash, root_hash, SHA256_DIGEST_SIZE); + memcpy((char *)sb->signing_section, add_data, strlen(add_data) + 1); + *buf = sb; + return size; +} + +int crypto_emit_file(int fd, const char *dir, const char *filename, + incfs_uuid_t *id_out, size_t size, const char *root_hash, + const char *add_data) +{ + int mode = __S_IFREG | 0555; + void *signature = NULL; + int error = 0; + + struct incfs_new_file_args args = { + .size = size, + .mode = mode, + .file_name = ptr_to_u64(filename), + .directory_path = ptr_to_u64(dir), + .file_attr = 0, + .file_attr_len = 0 + }; + + args.signature_size = format_signature(&signature, root_hash, add_data); + args.signature_info = ptr_to_u64(signature); + + md5(filename, strlen(filename), (char *)args.file_id.bytes); + + if (ioctl(fd, INCFS_IOC_CREATE_FILE, &args) != 0) { + error = -errno; + goto out; + } + + *id_out = args.file_id; + +out: + free(signature); + return error; +} + +int emit_file(int fd, const char *dir, const char *filename, + incfs_uuid_t *id_out, size_t size, const char *attr) +{ + int mode = __S_IFREG | 0555; + struct incfs_new_file_args args = { .size = size, + .mode = mode, + .file_name = ptr_to_u64(filename), + .directory_path = ptr_to_u64(dir), + .signature_info = ptr_to_u64(NULL), + .signature_size = 0, + .file_attr = ptr_to_u64(attr), + .file_attr_len = + attr ? 
strlen(attr) : 0 }; + + md5(filename, strlen(filename), (char *)args.file_id.bytes); + + if (ioctl(fd, INCFS_IOC_CREATE_FILE, &args) != 0) + return -errno; + + *id_out = args.file_id; + return 0; +} + +int get_file_bmap(int cmd_fd, int ino, unsigned char *buf, int buf_size) +{ + return 0; +} + +int get_file_signature(int fd, unsigned char *buf, int buf_size) +{ + struct incfs_get_file_sig_args args = { + .file_signature = ptr_to_u64(buf), + .file_signature_buf_size = buf_size + }; + + if (ioctl(fd, INCFS_IOC_READ_FILE_SIGNATURE, &args) == 0) + return args.file_signature_len_out; + return -errno; +} + +loff_t get_file_size(const char *name) +{ + struct stat st; + + if (stat(name, &st) == 0) + return st.st_size; + return -ENOENT; +} + +int open_commands_file(const char *mount_dir) +{ + char cmd_file[255]; + int cmd_fd; + + snprintf(cmd_file, ARRAY_SIZE(cmd_file), + "%s/%s", mount_dir, INCFS_PENDING_READS_FILENAME); + cmd_fd = open(cmd_file, O_RDONLY | O_CLOEXEC); + + if (cmd_fd < 0) + perror("Can't open commands file"); + return cmd_fd; +} + +int open_log_file(const char *mount_dir) +{ + char file[255]; + int fd; + + snprintf(file, ARRAY_SIZE(file), "%s/.log", mount_dir); + fd = open(file, O_RDWR | O_CLOEXEC); + if (fd < 0) + perror("Can't open log file"); + return fd; +} + +int open_blocks_written_file(const char *mount_dir) +{ + char file[255]; + int fd; + + snprintf(file, ARRAY_SIZE(file), + "%s/%s", mount_dir, INCFS_BLOCKS_WRITTEN_FILENAME); + fd = open(file, O_RDONLY | O_CLOEXEC); + + if (fd < 0) + perror("Can't open blocks_written file"); + return fd; +} + +int wait_for_pending_reads(int fd, int timeout_ms, + struct incfs_pending_read_info *prs, int prs_count) +{ + ssize_t read_res = 0; + + if (timeout_ms > 0) { + int poll_res = 0; + struct pollfd pollfd = { + .fd = fd, + .events = POLLIN + }; + + poll_res = poll(&pollfd, 1, timeout_ms); + if (poll_res < 0) + return -errno; + if (poll_res == 0) + return 0; + if (!(pollfd.revents & POLLIN)) + return 0; + } + + read_res = read(fd, prs, prs_count * sizeof(*prs)); + if (read_res < 0) + return -errno; + + return read_res / sizeof(*prs); +} + +int wait_for_pending_reads2(int fd, int timeout_ms, + struct incfs_pending_read_info2 *prs, int prs_count) +{ + ssize_t read_res = 0; + + if (timeout_ms > 0) { + int poll_res = 0; + struct pollfd pollfd = { + .fd = fd, + .events = POLLIN + }; + + poll_res = poll(&pollfd, 1, timeout_ms); + if (poll_res < 0) + return -errno; + if (poll_res == 0) + return 0; + if (!(pollfd.revents & POLLIN)) + return 0; + } + + read_res = read(fd, prs, prs_count * sizeof(*prs)); + if (read_res < 0) + return -errno; + + return read_res / sizeof(*prs); +} + +char *concat_file_name(const char *dir, const char *file) +{ + char full_name[FILENAME_MAX] = ""; + + if (snprintf(full_name, ARRAY_SIZE(full_name), "%s/%s", dir, file) < 0) + return NULL; + return strdup(full_name); +} + +int delete_dir_tree(const char *dir_path) +{ + DIR *dir = NULL; + struct dirent *dp; + int result = 0; + + dir = opendir(dir_path); + if (!dir) { + result = -errno; + goto out; + } + + while ((dp = readdir(dir))) { + char *full_path; + + if (!strcmp(dp->d_name, ".") || !strcmp(dp->d_name, "..")) + continue; + + full_path = concat_file_name(dir_path, dp->d_name); + if (dp->d_type == DT_DIR) + result = delete_dir_tree(full_path); + else + result = unlink(full_path); + free(full_path); + if (result) + goto out; + } + +out: + if (dir) + closedir(dir); + if (!result) + rmdir(dir_path); + return result; +} + +void sha256(const char *data, size_t dsize, char 
*hash) +{ + SHA256_CTX ctx; + + SHA256_Init(&ctx); + SHA256_Update(&ctx, data, dsize); + SHA256_Final((unsigned char *)hash, &ctx); +} + +void md5(const char *data, size_t dsize, char *hash) +{ + MD5_CTX ctx; + + MD5_Init(&ctx); + MD5_Update(&ctx, data, dsize); + MD5_Final((unsigned char *)hash, &ctx); +} diff --git a/tools/testing/selftests/filesystems/incfs/utils.h b/tools/testing/selftests/filesystems/incfs/utils.h new file mode 100644 index 000000000000..17a1ac5eef7d --- /dev/null +++ b/tools/testing/selftests/filesystems/incfs/utils.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ +#include <stdbool.h> +#include <sys/stat.h> + +#include <include/uapi/linux/incrementalfs.h> + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) + +#define __packed __attribute__((__packed__)) + +#ifdef __LP64__ +#define ptr_to_u64(p) ((__u64)p) +#else +#define ptr_to_u64(p) ((__u64)(__u32)p) +#endif + +#define SHA256_DIGEST_SIZE 32 +#define INCFS_MAX_MTREE_LEVELS 8 + +unsigned int rnd(unsigned int max, unsigned int *seed); + +int remove_dir(const char *dir); + +int drop_caches(void); + +int mount_fs(const char *mount_dir, const char *backing_dir, + int read_timeout_ms); + +int umount_fs(const char *mount_dir); + +int mount_fs_opt(const char *mount_dir, const char *backing_dir, + const char *opt, bool remount); + +int get_file_bmap(int cmd_fd, int ino, unsigned char *buf, int buf_size); + +int get_file_signature(int fd, unsigned char *buf, int buf_size); + +int emit_node(int fd, char *filename, int *ino_out, int parent_ino, + size_t size, mode_t mode, char *attr); + +int emit_file(int fd, const char *dir, const char *filename, + incfs_uuid_t *id_out, size_t size, const char *attr); + +int crypto_emit_file(int fd, const char *dir, const char *filename, + incfs_uuid_t *id_out, size_t size, const char *root_hash, + const char *add_data); + +loff_t get_file_size(const char *name); + +int open_commands_file(const char *mount_dir); + +int open_log_file(const char *mount_dir); + +int open_blocks_written_file(const char *mount_dir); + +int wait_for_pending_reads(int fd, int timeout_ms, + struct incfs_pending_read_info *prs, int prs_count); + +int wait_for_pending_reads2(int fd, int timeout_ms, + struct incfs_pending_read_info2 *prs, int prs_count); + +char *concat_file_name(const char *dir, const char *file); + +void sha256(const char *data, size_t dsize, char *hash); + +void md5(const char *data, size_t dsize, char *hash); + +int delete_dir_tree(const char *path); diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-expressions.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-expressions.tc new file mode 100644 index 000000000000..05ffba299dbf --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-expressions.tc @@ -0,0 +1,63 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event trigger - test histogram expression parsing +# requires: set_event events/sched/sched_process_fork/trigger events/sched/sched_process_fork/hist error_log "=":README + + +fail() { #msg + echo "$1" + exit_fail +} + +test_hist_expr() { # test_name expression expected_val + trigger="events/sched/sched_process_fork/trigger" + + reset_trigger_file $trigger + + echo "Test hist trigger expressions - $1" + + echo "hist:keys=common_pid:x=$2" > $trigger + + for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done + + actual=`grep -o 'x=[[:digit:]]*' $trigger | awk -F= '{ print $2 }'` + + if [ "$actual" != "$3" ]; then + fail "Failed hist trigger expression evaluation: Expression: $2 
Expected: $3, Actual: $actual" + fi + + reset_trigger_file $trigger +} + +check_error() { # test_name command-with-error-pos-by-^ + trigger="events/sched/sched_process_fork/trigger" + + echo "Test hist trigger expressions - $1" + ftrace_errlog_check 'hist:sched:sched_process_fork' "$2" $trigger +} + +test_hist_expr "Variable assignment" "123" "123" + +test_hist_expr "Subtraction not associative" "16-8-4-2" "2" + +test_hist_expr "Division not associative" "64/8/4/2" "1" + +test_hist_expr "Same precedence operators (+,-) evaluated left to right" "16-8+4+2" "14" + +test_hist_expr "Same precedence operators (*,/) evaluated left to right" "4*3/2*2" "12" + +test_hist_expr "Multiplication evaluated before addition/subtraction" "4+3*2-2" "8" + +test_hist_expr "Division evaluated before addition/subtraction" "4+6/2-2" "5" + +# err pos for "too many subexpressions" is dependent on where +# the last subexpression was detected. This can vary depending +# on how the expression tree was generated. +check_error "Too many subexpressions" 'hist:keys=common_pid:x=32+^10*3/20-4' +check_error "Too many subexpressions" 'hist:keys=common_pid:x=^1+2+3+4+5' + +check_error "Unary minus not supported in subexpression" 'hist:keys=common_pid:x=-(^1)+2' + +check_error "Division by zero" 'hist:keys=common_pid:x=3/^0' + +exit 0 diff --git a/usr/Makefile b/usr/Makefile index b1a81a40eab1..7b89c0175a3a 100644 --- a/usr/Makefile +++ b/usr/Makefile @@ -3,11 +3,6 @@ # kbuild file for usr/ - including initramfs image # -# cmd_bzip2, cmd_lzma, cmd_lzo, cmd_lz4 from scripts/Makefile.lib appends the -# size at the end of the compressed file, which unfortunately does not work -# with unpack_to_rootfs(). Make size_append no-op. -override size_append := : - compress-y := shipped compress-$(CONFIG_INITRAMFS_COMPRESSION_GZIP) := gzip compress-$(CONFIG_INITRAMFS_COMPRESSION_BZIP2) := bzip2 diff --git a/usr/include/Makefile b/usr/include/Makefile index adc6cb258736..6111974688a8 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -10,7 +10,10 @@ UAPI_CFLAGS := -std=c90 -Wall -Werror=implicit-function-declaration # In theory, we do not care -m32 or -m64 for header compile tests. # It is here just because CONFIG_CC_CAN_LINK is tested with -m32 or -m64. -UAPI_CFLAGS += $(filter -m32 -m64, $(KBUILD_CFLAGS)) +UAPI_CFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CFLAGS)) + +# USERCFLAGS might contain sysroot location for CC. 
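+# For example, a cross-build might pass (hypothetical paths):
+#   make CC=clang USERCFLAGS=--sysroot=/path/to/sysroot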
+UAPI_CFLAGS += $(USERCFLAGS) override c_flags = $(UAPI_CFLAGS) -Wp,-MMD,$(depfile) -I$(objtree)/usr/include diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3ffed093d3ea..771fc02753ec 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -261,8 +261,9 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp) { - int i, cpu, me; + int cpu, me; struct kvm_vcpu *vcpu; + unsigned long i; bool called; me = get_cpu(); @@ -439,7 +440,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->last_used_slot = 0; } -void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) +static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_arch_vcpu_destroy(vcpu); kvm_dirty_ring_free(&vcpu->dirty_ring); @@ -454,7 +455,20 @@ void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); } -EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); + +void kvm_destroy_vcpus(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_vcpu_destroy(vcpu); + xa_erase(&kvm->vcpu_array, i); + } + + atomic_set(&kvm->online_vcpus, 0); +} +EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) @@ -954,6 +968,12 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; + /* + * Force subsequent debugfs file creations to fail if the VM directory + * is not created. + */ + kvm->debugfs_dentry = ERR_PTR(-ENOENT); + if (!debugfs_initialized()) return 0; @@ -1067,6 +1087,7 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->slots_arch_lock); spin_lock_init(&kvm->mn_invalidate_lock); rcuwait_init(&kvm->mn_memslots_update_rcuwait); + xa_init(&kvm->vcpu_array); INIT_LIST_HEAD(&kvm->devices); @@ -3491,10 +3512,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; int last_boosted_vcpu = me->kvm->last_boosted_vcpu; + unsigned long i; int yielded = 0; int try = 3; int pass; - int i; kvm_vcpu_set_in_spin_loop(me, true); /* @@ -3701,7 +3722,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) } vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); - BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); + r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); + BUG_ON(r == -EBUSY); + if (r) + goto unlock_vcpu_destroy; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", @@ -3711,15 +3735,14 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); if (r < 0) { + xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); kvm_put_kvm_no_destroy(kvm); goto unlock_vcpu_destroy; } - kvm->vcpus[vcpu->vcpu_idx] = vcpu; - /* - * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus - * before kvm->online_vcpu's incremented value. + * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu + * pointer before kvm->online_vcpu's incremented value. 
*/ smp_wmb(); atomic_inc(&kvm->online_vcpus); @@ -4304,7 +4327,7 @@ static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; int cleared = 0; @@ -5234,7 +5257,7 @@ static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; *val = 0; @@ -5247,7 +5270,7 @@ static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -5547,6 +5570,50 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) return &kvm_running_vcpu; } +#ifdef CONFIG_GUEST_PERF_EVENTS +static unsigned int kvm_guest_state(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + unsigned int state; + + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + state = PERF_GUEST_ACTIVE; + if (!kvm_arch_vcpu_in_kernel(vcpu)) + state |= PERF_GUEST_USER; + + return state; +} + +static unsigned long kvm_guest_get_ip(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ + if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu))) + return 0; + + return kvm_arch_vcpu_get_ip(vcpu); +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .state = kvm_guest_state, + .get_ip = kvm_guest_get_ip, + .handle_intel_pt_intr = NULL, +}; + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) +{ + kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + perf_register_guest_info_callbacks(&kvm_guest_cbs); +} +void kvm_unregister_perf_callbacks(void) +{ + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); +} +#endif + struct kvm_cpu_compat_check { void *opaque; int *ret;